/*
 * Copyright (c) 2004, Nate Nielsen
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *     * Redistributions of source code must retain the above
 *       copyright notice, this list of conditions and the
 *       following disclaimer.
 *     * Redistributions in binary form must reproduce the
 *       above copyright notice, this list of conditions and
 *       the following disclaimer in the documentation and/or
 *       other materials provided with the distribution.
 *     * The names of contributors to this software may not be
 *       used to endorse or promote products derived from this
 *       software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
 * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 *
 * CONTRIBUTORS
 *  Nate Nielsen <nielsen@memberwebs.com>
 *
 */

#include "usuals.h"

#include <stdlib.h>
#include <stdio.h>
#include "rtfreader.h"


// Bit flags handed to RtfHandler::controlWord() in its 'flags' argument.
//  kAsterisk:  the control word was introduced by "\*"
//  kHasParam:  a numeric parameter was parsed and is passed along
//  kIsEncoded: the "control word" is the raw hex digits of a \'hh escape
//              (sent by parseHexChar when hex decoding is turned off)
const int RtfHandler::kAsterisk = 1 << 0;
const int RtfHandler::kHasParam = 1 << 1;
const int RtfHandler::kIsEncoded = 1 << 2;

// Sets up a reader with no handler attached, hex escapes (\'hh) decoded
// into characters, and unicode control words (\u, \ud, \uc) passed
// through to the handler untouched.
RtfReader::RtfReader()
{
	// Bottom entry of the per-group skip count stack: nothing to skip
	// after a \u until a \uc says otherwise
	m_uniEatStack.push(0);
	m_uniEat = 0;

	m_parseUnicode = false;
	m_parseHex = true;

	m_depth = 0;
	m_handler = NULL;
}

// Nothing to release here; in particular m_handler is not deleted.
RtfReader::~RtfReader()
{
}

// Opens the named file, runs it through parse(FILE*) and closes it again.
// Returns false when the file cannot be opened, otherwise whatever
// parse(FILE*) returned.
bool RtfReader::parse(string fileName)
{
	FILE* input = fopen(fileName.c_str(), "r");
	if(input == NULL)
		return false;

	const bool parsedOk = parse(input);
	fclose(input);
	return parsedOk;
}

// Hands any buffered character data to the handler (when one is set)
// and then clears the buffer. Does nothing when the buffer is empty.
void RtfReader::emptyData(RtfContext& cx)
{
	if(cx.data.empty())
		return;

	if(m_handler)
		m_handler->charData(cx.data);

	// The buffer is dropped even when there is no handler to receive it
	cx.data.resize(0);
}

// Buffers a single character of text, unless characters are currently
// being skipped (m_uniEat > 0), in which case the character is discarded
// and the skip counter goes down by one.
void RtfReader::sendData(RtfContext& cx, wchar_t ch)
{
	if(m_uniEat <= 0)
	{
		cx.data.append(1, ch);
		return;
	}

	m_uniEat--;
}

// Buffers a run of text. While characters are being skipped
// (m_uniEat > 0) the leading characters of 'data' are dropped, up to
// the number still to be skipped, and only the remainder is buffered.
void RtfReader::sendData(RtfContext& cx, wstring data)
{
	// Nothing to skip: take the text as it is
	if(m_uniEat <= 0)
	{
		cx.data.append(data);
		return;
	}

	// Skip as many characters as are owed, but no more than we were given
	int available = data.size();
	int skipped = (available > m_uniEat) ? m_uniEat : available;

	cx.data.append(data.substr(skipped));
	m_uniEat -= skipped;
}

// Delivers a control word to the handler. Buffered character data is
// flushed first, so the handler receives text that preceded the control
// word before the control word itself.
void RtfReader::sendControlWord(RtfContext& cx, string cw, int flags, int param)
{
	emptyData(cx);

	if(!m_handler)
		return;

	m_handler->controlWord(cw, flags, param);
}

// Reads the 'num' hex digits that follow a \' escape from cx.file.
//
// Characters that are not hex digits are reported in m_parseErrors and
// left out. When hex parsing is enabled (m_parseHex) the collected
// digits are converted and sent on as a single character; otherwise the
// digits themselves are sent as a control word flagged kIsEncoded.
//
// Returns false if the end of the file is reached while reading the
// digits, true otherwise (even when invalid digits were reported).
bool RtfReader::parseHexChar(RtfContext& cx, int num)
{
	string data;
	for(int i = 0; i < num; i++)
	{
		// Keep the result as an int: squeezing it into a char makes the
		// byte 0xFF indistinguishable from EOF (or EOF undetectable where
		// char is unsigned)
		int ch = fgetc(cx.file);

		if(ch == EOF)
			return false;

		if((ch >= 'A' && ch <= 'F') ||
		   (ch >= 'a' && ch <= 'f') ||
		   (ch >= '0' && ch <= '9'))
		{
			data.append(1, static_cast<char>(ch));
		}
		else
		{
			m_parseErrors.append(string("invalid hex char: ") + static_cast<char>(ch) + "\n");
		}
	}

	// Not a single usable digit: the errors are already recorded above,
	// and there is nothing sensible to send on (strtol would happily
	// "parse" the empty string as zero)
	if(data.empty())
		return true;

	if(m_parseHex)
	{
		char* end = NULL;
		long val = strtol(data.c_str(), &end, 16);
		if(end == data.c_str() + data.size())
			sendData(cx, static_cast<wchar_t>(val));
		else
			m_parseErrors.append("invalid hex char: " + data + "\n");
	}
	else
	{
		sendControlWord(cx, data, RtfHandler::kIsEncoded, -1);
	}

	return true;
}

// Parses one control word or control symbol from cx.file. Called right
// after the introducing backslash has been read.
//
// Control symbols (\\ \{ \} \~ \- and \'hh) are turned into character
// data. For control words the name and optional numeric parameter are
// collected; a handful of well known character words (emdash, bullet,
// quotes ...) are turned into character data, the unicode words (u, ud,
// uc) are handled here when m_parseUnicode is set, and everything else
// is forwarded to the handler.
//
// Returns false when the end of the file is reached before the control
// word is complete, true otherwise.
bool RtfReader::parseControlWord(RtfContext& cx)
{
	bool isAsterisk = false;
	string controlword;
	string param;

	while(1)
	{
		// fgetc() hands back an int, so the end marker to test for is
		// EOF (WEOF belongs to the wide character functions and does not
		// have the same value on every platform)
		int ch = fgetc(cx.file);
		if(ch == EOF)
			return false;

		bool empty = controlword.empty();
		bool isLetter = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');

		// Part of the name of a control word
		// NOTE: Although the RTF specification prohibits uppercase
		// control words, MS Word uses them :-/
		// Once the numeric parameter has started, a letter is no longer
		// part of the name (think "\u8217s"); it drops through to the
		// last branch and ends the control word
		if(isLetter && param.empty())
			controlword.append(1, (char)ch);

		// Part of the parameter of a control word
		else if(ch >= '0' && ch <= '9')
			param.append(1, (char)ch);

		// Now handle escapes and other special types of
		// control words. These are all only valid at beginning
		// of the "control word"

		// hex spelled out character
		else if(empty && ch == '\'')
		{
			parseHexChar(cx, 2);
			break;
		}

		// Asterisk type destination
		else if(empty && ch == '*')
		{
			isAsterisk = true;

			// Skip line breaks between the "\*" and the control word
			// it applies to
			ch = fgetc(cx.file);
			while(ch == '\r' || ch == '\n')
				ch = fgetc(cx.file);

			// The backslash of the following control word is consumed
			// here, anything else goes back for the next round
			if(ch != '\\' && ch != EOF)
				ungetc(ch, cx.file);
		}

		// Escaped backslash
		else if(empty && ch == '\\')
		{
			sendData(cx, L'\\');
			break;
		}

		// Escaped braces. These are complete control symbols, so we
		// have to stop here or the following text would be read as
		// the name of a control word
		else if(empty && ch == '{')
		{
			sendData(cx, L'{');
			break;
		}

		else if(empty && ch == '}')
		{
			sendData(cx, L'}');
			break;
		}

		// Non breaking space
		else if(empty && ch == '~')
		{
			sendData(cx, 0x00A0);
			break;
		}

		// Optional hyphen
		else if(empty && ch == '-')
		{
			sendData(cx, 0x00AD);
			break;
		}

		// a hyphen right after control word is part of number
		else if(!empty && param.empty() && ch == '-')
		{
			param.append(1, (char)ch);
		}

		// TODO: This looks real hokey and acts that
		// way too
#if 0
		// An enter as the first character of a control word
		// makes a paragraph
		else if(strchr("\n\r", ch))
		{
			controlword = "par";
			break;
		}
#endif
		// Space end a rtf code (but get eaten)
		else if(ch == ' ')
			break;

		// Anything else (including a backslash ends a control word)
		else
		{
			ungetc(ch, cx.file);
			break;
		}
	}

	// Empty out the control word buffers
	if(!controlword.empty())
	{
		int flags = isAsterisk ? RtfHandler::kAsterisk : 0;
		int numPar = -1;

		if(!param.empty())
		{
			// Only a parameter that converts completely counts; a lone
			// "-" leaves numPar at -1 with kHasParam unset
			char* end = NULL;
			long parsed = strtol(param.c_str(), &end, 10);
			if(end == param.c_str() + param.size())
			{
				numPar = (int)parsed;
				flags |= RtfHandler::kHasParam;
			}
		}

		// Here we check for common characters
		if(controlword == "emdash")
			sendData(cx, 0x2014);
		else if(controlword == "endash")
			sendData(cx, 0x2013);
		else if(controlword == "emspace")
			sendData(cx, 0x2003);
		else if(controlword == "enspace")
			sendData(cx, 0x2002);
		else if(controlword == "bullet")
			sendData(cx, 0x2022);
		else if(controlword == "lquote")
			sendData(cx, 0x2018);
		else if(controlword == "rquote")
			sendData(cx, 0x2019);
		else if(controlword == "ldblquote")
			sendData(cx, 0x201C);
		else if(controlword == "rdblquote")
			sendData(cx, 0x201D);

		// Unicode values get sent through
		else if(m_parseUnicode && (flags & RtfHandler::kHasParam) &&
			    controlword == "u")
		{
			// The parameter is a signed 16 bit number, so code points
			// above 32767 are written as negative values
			if(numPar < 0 && numPar >= -32768)
				numPar += 65536;

			sendData(cx, static_cast<wchar_t>(numPar));

			// Skip the fallback characters that follow. The stack can
			// have run dry when the document has too many closing braces
			m_uniEat = m_uniEatStack.empty() ? 0 : m_uniEatStack.top();
		}

		// Unicode destination. Swallowed here, the handler does not
		// get to see it
		else if(m_parseUnicode && controlword == "ud")
		{

		}

		// Skip value for unicode characters: replaces the value for
		// the current group
		else if(m_parseUnicode && controlword == "uc")
		{
			if(!m_uniEatStack.empty())
				m_uniEatStack.pop();
			m_uniEatStack.push(numPar);
		}

		// Otherwise we send the control word
		else
		{
			if(m_handler)
				sendControlWord(cx, controlword, flags, numPar);
		}
	}

	return true;
}

bool RtfReader::parse(FILE* file)
{
	m_depth = 0;
	m_parseErrors = "";

	int ch = 0;

	RtfContext cx;
	cx.isData = false;
	cx.file = file;
	cx.data = L"";

	if(m_handler)
		m_handler->startDocument(this);

	while(1)
	{
		ch = fgetc(file);
		if(ch == EOF)
			goto done;

		// Type is undetermined so we figure it out here
		if(!cx.isData)
		{
			switch(ch)
			{
			case '\\':
				if(!parseControlWord(cx))
					goto done;
				break;

			case '{':
				{
					emptyData(cx);

					m_uniEatStack.push(m_uniEatStack.top());

					if(m_handler)
						m_handler->groupStart();

					m_depth++;
				}
				break;

			case '}':
				{
					emptyData(cx);

					if(m_handler)
						m_handler->groupEnd();

					if(!m_uniEatStack.empty())
						m_uniEatStack.pop();

					m_depth--;
				}
				break;

			default:
				cx.isData = true;
				break;
			}
		}

		if(cx.isData)
		{
			// We translate tabs into the appropriate control
			// word
			if(ch == '\t')
				sendControlWord(cx, "tab", 0, -1);

// Don't need this code, the XML outputter
// Takes care of it for us
#if 0
			if(ch == '&')
				sendData(cx, L"&amp;");

			else if(ch == '\'')
				sendData(cx, L"&apos;");

			else if(ch == '"')
				sendData(cx, L"&quot;");

			else if(ch == '<')
				sendData(cx, L"&lt;");

			else if(ch == '>')
				sendData(cx, L"&gt;");
#endif

			// enters a
			else if(!strchr("\r\n", ch))
				sendData(cx, ch);

			cx.isData = false;
		}
	}

done:

	if(m_depth != 0)
		m_parseErrors.append("unmatched braces\n");

	if(m_handler)
		m_handler->endDocument();

	return m_parseErrors.empty();
}