/*
 * Copyright (c) 2004, Nate Nielsen
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *     * Redistributions of source code must retain the above
 *       copyright notice, this list of conditions and the
 *       following disclaimer.
 *     * Redistributions in binary form must reproduce the
 *       above copyright notice, this list of conditions and
 *       the following disclaimer in the documentation and/or
 *       other materials provided with the distribution.
 *     * The names of contributors to this software may not be
 *       used to endorse or promote products derived from this
 *       software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
 * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 *
 * CONTRIBUTORS
 *  Nate Nielsen <nielsen@memberwebs.com>
 *
 */

#include "usuals.h"

#include <stdlib.h>
#include <stdio.h>
#include "rtfparser.h"
#include "internal.h"

/*
 * Maps a single byte of "ANSI" text (as written by \'xx hex escapes) to
 * the unicode character it represents. The table is indexed by the byte
 * value and has exactly 256 entries. A zero entry means the byte has no
 * usable character and nothing should be emitted for it.
 *
 * The 0x80 - 0x9F range follows the Windows-1252 code page; everything
 * from 0x20 up to 0x7F and from 0xA0 up to 0xFF maps to itself.
 */
const wchar_t kAnsiToUnicode[] = {

	/* Mostly invalid, but used for weird things in RTF anyway :( */
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0009, 0x000A, 0x0000, 0x0000, 0x000D, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,

	/* Low 7 bit, same */
	0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
	0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
	0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
	0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
	0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
	0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
	0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,
	0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
	0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
	0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
	0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F,

	/* Changes in the middle here to assorted punctuation and letters */
	0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
	0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
	0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
	0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178,

	/* High stuff, all the same */
	0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
	0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
	0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
	0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
	0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
	0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
	0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
	0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF,
	0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
	0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
	0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
	0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF

};


/* ----------------------------------------------------------------------------------
 *  CONSTRUCTION
 */

/*
 * Creates a parser with no handler attached. Hex escapes are decoded by
 * default, while \u unicode handling is switched off.
 */
RtfParser::RtfParser()
{
	m_handler = NULL;

	// No input is attached until parse() is called. Set explicitly so the
	// pointer never holds an indeterminate value.
	m_file = NULL;

	m_depth = 0;
	m_parseHex = true;
	m_parseUnicode = false;

	// Number of characters still to be skipped after a \u control word
	m_uniEat = 0;

	// Baseline skip count, in effect outside of any group
	m_uniEatStack.push(0);
}

RtfParser::~RtfParser()
{
	// Nothing to release here
}


/* ----------------------------------------------------------------------------------
 *  PUBLIC METHODS
 */

/*
 * Opens the named file for reading and parses it as RTF. Returns false
 * when the file cannot be opened, otherwise whatever parsing the opened
 * stream returns. The file is closed again before returning.
 */
bool RtfParser::parse(string fileName)
{
	bool succeeded = false;

	FILE* input = fopen(fileName.c_str(), "r");
	if(input)
	{
		succeeded = parse(input);
		fclose(input);
	}

	return succeeded;
}

bool RtfParser::parse(FILE* file)
{
    int ch = 0;
    bool isData = false;

    m_depth = 0;
    m_parseErrors = "";
    m_file = file;

    if(m_handler)
        m_handler->startDocument(this);

    while(1)
    {
        ch = fgetc(file);
        if(ch == EOF)
            goto done;

        switch(ch)
        {

        // Starting a control word
        case '\\':
            if(!parseControlWord())
                goto done;
            break;

        // Starting an RTF group
        case '{':
            {
                // Send all previous data
                flushData();

                // Handle any unicode destinations properly
                m_uniEatStack.push(m_uniEatStack.top());

                if(m_handler)
                    m_handler->groupStart();

                m_depth++;
            }
            break;

        case '}':
            {
                // Send all previous data
                flushData();

                if(m_handler)
                    m_handler->groupEnd();

                // Handle any unicode destinations properly
                if(!m_uniEatStack.empty())
                    m_uniEatStack.pop();

                m_depth--;
            }
            break;

        default:
            isData = true;
            break;
        }

        if(isData)
        {
            // We translate tabs into the appropriate control word
            if(ch == '\t')
                sendControlWord("tab", 0, -1);

            // line endings aren't used
            else if(!strchr("\r\n", ch))
               sendData(ch);

            isData = false;
        }
    }


done:

    if(m_depth != 0)
        m_parseErrors.append("unmatched braces\n");

    if(m_handler)
        m_handler->endDocument();

    m_file = NULL;
    m_dataBuffer.resize(0);

    // If any parse errors return failure
    return m_parseErrors.empty();
}


/* ----------------------------------------------------------------------------------
 *  HANDLER CALLS
 */

/*
 * Hands any buffered character data to the handler (when there is one)
 * and empties the buffer. Does nothing when the buffer is empty.
 */
void RtfParser::flushData()
{
	if(m_dataBuffer.empty())
		return;

	if(m_handler)
		m_handler->charData(m_dataBuffer);

	m_dataBuffer.resize(0);
}

/*
 * Adds one character to the data buffer, unless characters are still
 * due to be skipped, in which case it is dropped and counted instead.
 */
void RtfParser::sendData(wchar_t ch)
{
	if(m_uniEat <= 0)
	{
		transcode16to8(ch, m_dataBuffer);
		return;
	}

	// One of the characters we've been asked to skip
	m_uniEat--;
}

/*
 * Adds a run of characters to the data buffer. When characters are
 * still due to be skipped, that many are dropped from the front of
 * the run first and the remaining skip count is reduced accordingly.
 */
void RtfParser::sendData(const wstring& data)
{
	// Nothing to skip, the whole run goes through
	if(m_uniEat <= 0)
	{
		transcode16to8(data, m_dataBuffer);
		return;
	}

	// Skip as much as was asked for, but no more than we were given
	int skip = data.size();
	if(skip > m_uniEat)
		skip = m_uniEat;

	transcode16to8(data.substr(skip), m_dataBuffer);
	m_uniEat -= skip;
}

/*
 * Reports a control word to the handler. Buffered character data is
 * always flushed first, so the handler sees data and control words
 * in the order in which they were sent.
 */
void RtfParser::sendControlWord(const string& cw, int flags, int param)
{
	flushData();

	if(!m_handler)
		return;

	m_handler->controlWord(cw, flags, param);
}


/* ----------------------------------------------------------------------------------
 *  PARSE HELPERS
 */

/*
 * Reads 'num' hex digits from the input and sends the character they
 * spell out. When 'ansi' is set the value is looked up in
 * kAnsiToUnicode, otherwise it is sent as is. If hex parsing is
 * turned off, the digits are sent as an encoded control word instead.
 *
 * Characters that are not hex digits are recorded as parse errors,
 * and nothing is sent for an escape that contained any.
 *
 * Returns false only when the input ends before 'num' characters
 * could be read.
 */
bool RtfParser::parseHexChar(int num, bool ansi)
{
	string data;

	// Ansi is only 256 chars long
	ASSERT(num == 2 || !ansi);

	// Get num chars and put them in the string
	for(int i = 0; i < num; i++)
	{
		// Keep the result as an int. Narrowed to a char, a 0xFF byte
		// looks like the end of the file, and where char is unsigned
		// the end of the file is never seen at all.
		int ch = fgetc(m_file);
		if(ch == EOF)
			return false;

		if((ch >= 'A' && ch <= 'F') ||
		   (ch >= 'a' && ch <= 'f') ||
		   (ch >= '0' && ch <= '9'))
		{
			data.append(1, (char)ch);
		}
		else
		{
			m_parseErrors.append((string)"invalid hex char: " + (char)ch + "\n");
		}
	}

	// The problem has been recorded above. Don't make up a
	// character out of whatever digits did make it through.
	if((int)data.size() != num)
		return true;

	// TODO: Why would we ever want to do this?
	// When not parsing hex just send as a hex control word
	if(!m_parseHex)
	{
		sendControlWord(data, RtfHandler::kIsEncoded, -1);
		return true;
	}

	// Otherwise convert to the appropriate unicode
	char* end = NULL;
	int val = strtol(data.c_str(), &end, 16);
	if(end != data.c_str() + data.size())
	{
		m_parseErrors.append("invalid hex char: " + data + "\n");
	}
	else if(!ansi)
	{
		sendData(val);
	}
	else
	{
		ASSERT((sizeof(kAnsiToUnicode) / sizeof(kAnsiToUnicode[0])) == 256);

		if(val < 0 || val >= 256)
		{
			m_parseErrors.append("invalid ansi char: " + data + "\n");
		}
		else
		{
			// A zero in the table means there's nothing to send
			wchar_t ch = kAnsiToUnicode[val];
			if(ch)
				sendData(ch);
		}
	}

	return true;
}

/*
 * Parses whatever follows a backslash: a control word with its optional
 * numeric parameter, or one of the single character escapes. Called
 * with the backslash itself already consumed.
 *
 * Escapes and the control words that stand for one character are sent
 * as character data. With unicode parsing on, \u and \uc are handled
 * here. Any other control word goes to the handler.
 *
 * Returns false when the input ends inside the control word, in which
 * case nothing is sent for it.
 */
bool RtfParser::parseControlWord()
{
	// Control words that simply stand for one unicode character
	static const struct
	{
		const char* name;
		wchar_t ch;
	}
	kCharWords[] =
	{
		{ "emdash",    0x2014 },
		{ "endash",    0x2013 },
		{ "emspace",   0x2003 },
		{ "enspace",   0x2002 },
		{ "bullet",    0x2022 },
		{ "lquote",    0x2018 },
		{ "rquote",    0x2019 },
		{ "ldblquote", 0x201C },
		{ "rdblquote", 0x201D }
	};

	bool isAsterisk = false;
	string controlword;
	string param;

	while(1)
	{
		// fgetc() signals the end of the file with EOF. WEOF belongs to
		// the wide character functions and need not have the same value.
		int ch = fgetc(m_file);
		if(ch == EOF)
			return false;

		bool empty = controlword.empty();

		// Part of the name of a control word
		// NOTE: Although the RTF specification prohibits uppercase
		// control words, MS Word uses them :-/
		if((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'))
			controlword.append(1, (char)ch);

		// Part of the parameter of a control word
		else if(ch >= '0' && ch <= '9')
			param.append(1, (char)ch);

		// A hyphen right after the name is the sign of the parameter.
		// Anywhere later it ends the control word (see below) and is
		// left in the input as ordinary text.
		else if(!empty && param.empty() && ch == '-')
			param.append(1, (char)ch);

		// Now handle escapes and other special types of
		// control words. These are all only valid at beginning
		// of the "control word"

		// hex spelled out character
		else if(empty && ch == '\'')
		{
			if(!parseHexChar(2, true))
				return false;
			break;
		}

		// Asterisk type destination
		else if(empty && ch == '*')
		{
			isAsterisk = true;

			// Line endings may separate the asterisk from the control
			// word it applies to
			ch = fgetc(m_file);
			while(strchr("\r\n", ch))
				ch = fgetc(m_file);

			// The backslash of the following control word is eaten,
			// anything else is looked at again next time round
			if(ch != '\\')
				ungetc(ch, m_file);
		}

		// Escaped backslash
		else if(empty && ch == '\\')
		{
			sendData(L'\\');
			break;
		}

		// Escaped braces. Like the other escapes these are complete
		// after one character, what follows is not part of them.
		else if(empty && ch == '{')
		{
			sendData(L'{');
			break;
		}

		else if(empty && ch == '}')
		{
			sendData(L'}');
			break;
		}

		// Non breaking space
		else if(empty && ch == '~')
		{
			sendData(0x00A0);
			break;
		}

		// Optional hyphen
		else if(empty && ch == '-')
		{
			sendData(0x00AD);
			break;
		}

		// Space at end a rtf code (it gets eaten)
		else if(ch == ' ')
			break;

		// Anything else (including a backslash ends a control word)
		else
		{
			ungetc(ch, m_file);
			break;
		}
	}

	// An escape, or nothing usable at all
	if(controlword.empty())
		return true;

	int flags = isAsterisk ? RtfHandler::kAsterisk : 0;
	int numPar = -1;

	if(!param.empty())
	{
		// Only a parameter that is a number through and through counts
		char* end = NULL;
		numPar = strtol(param.c_str(), &end, 10);
		if(end == param.c_str() + param.size())
			flags |= RtfHandler::kHasParam;
	}

	// Here we check for common characters
	for(size_t i = 0; i < sizeof(kCharWords) / sizeof(kCharWords[0]); i++)
	{
		if(controlword == kCharWords[i].name)
		{
			sendData(kCharWords[i].ch);
			return true;
		}
	}

	// Unicode values get sent through
	if(m_parseUnicode && (flags & RtfHandler::kHasParam) &&
	   controlword == "u")
	{
		// RTF plays hokey and uses negative values in unicode
		sendData((unsigned short)((short)numPar));

		// Skip the characters that follow as a stand-in. The stack
		// can have been emptied by unbalanced closing braces.
		m_uniEat = m_uniEatStack.empty() ? 0 : m_uniEatStack.top();
	}

	// Skip value for unicode characters
	else if(m_parseUnicode && controlword == "uc")
	{
		// Replaces the skip count of the current group
		if(!m_uniEatStack.empty())
			m_uniEatStack.pop();
		m_uniEatStack.push(numPar);
	}

	// Otherwise we send the control word
	else if(m_handler)
	{
		sendControlWord(controlword, flags, numPar);
	}

	return true;
}