Initial Import

author: Stef Walter <stef@memberwebs.com> 2003-09-17 18:34:42 +0000
committer: Stef Walter <stef@memberwebs.com> 2003-09-17 18:34:42 +0000
commit: 69aa93c828303dcc44253fe88ff3d6024d10817b (patch)
tree: 63d14dacbd3d81363fcbea1036c47a0210b0f397 /src/rtfparser.cpp
parent: 15f3015d2e8305b729d7996faad410b3378497da (diff)
1 files changed, 398 insertions, 0 deletions
diff --git a/src/rtfparser.cpp b/src/rtfparser.cpp
new file mode 100644
index 0000000..2928aa7
--- /dev/null
+++ b/src/rtfparser.cpp
@@ -0,0 +1,398 @@
+// RtfReader.cpp: implementation of the RtfReader class.
+//
+//////////////////////////////////////////////////////////////////////
+
+#include "stdafx.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "RtfReader.h"
+
+//////////////////////////////////////////////////////////////////////
+// Construction/Destruction
+//////////////////////////////////////////////////////////////////////
+
+// Bit flags handed to RtfHandler::controlWord(). Each one occupies its own
+// bit so that several can be combined in a single int.
+const int RtfHandler::kAsterisk  = 1 << 0;	// control word was introduced by "\*"
+const int RtfHandler::kHasParam  = 1 << 1;	// a numeric parameter was parsed
+const int RtfHandler::kIsEncoded = 1 << 2;	// name holds the raw digits of a \' escape
+
+// Builds a reader with no handler attached, \' hex escapes decoded into
+// character data, and \u unicode handling switched off.
+RtfReader::RtfReader()
+	: m_handler(NULL),
+	  m_depth(0),
+	  m_parseHex(true),
+	  m_parseUnicode(false),
+	  m_uniEat(0)
+{
+	// Bottom entry of the skip-count stack: nothing is skipped after \u
+	// until a \uc control word says otherwise
+	m_uniEatStack.push(0);
+}
+
+// Nothing is released here; in particular m_handler is not deleted.
+RtfReader::~RtfReader()
+{
+}
+
+// Opens the named file and parses it as RTF.
+//
+// fileName: path handed to fopen().
+//
+// Returns false when the file cannot be opened; otherwise returns whatever
+// parse(FILE*) returns. The file is closed once parsing has returned.
+bool RtfReader::parse(string fileName)
+{
+	// fopen() wants a C string, so hand over the underlying buffer
+	FILE* file = fopen(fileName.c_str(), "r");
+	if(file == NULL)
+		return false;
+
+	const bool ok = parse(file);
+
+	fclose(file);
+
+	return ok;
+}
+
+// Hands any character data buffered in cx to the handler (when one is set)
+// and leaves the buffer empty. Does nothing when no data is pending.
+void RtfReader::emptyData(RtfContext& cx)
+{
+	if(cx.data.empty())
+		return;
+
+	if(m_handler != NULL)
+		m_handler->charData(cx.data);
+
+	cx.data.resize(0);
+}
+
+// Buffers one character of text in cx, unless characters are still owed
+// to the skip counter m_uniEat, in which case the character is dropped
+// and the counter is reduced by one.
+void RtfReader::sendData(RtfContext& cx, wchar_t ch)
+{
+	if(m_uniEat <= 0)
+	{
+		cx.data += ch;
+		return;
+	}
+
+	m_uniEat--;
+}
+
+// Buffers a run of text in cx. While the skip counter m_uniEat is positive
+// the leading characters of the run are dropped (and counted off) first;
+// only the remainder is buffered.
+void RtfReader::sendData(RtfContext& cx, wstring data)
+{
+	if(m_uniEat <= 0)
+	{
+		cx.data.append(data);
+		return;
+	}
+
+	// Drop as many characters as are still owed, but never more than
+	// the run actually contains
+	int skip = (data.size() < m_uniEat) ? data.size() : m_uniEat;
+	m_uniEat -= skip;
+	cx.data.append(data.substr(skip));
+}
+
+// Reports a control word to the handler, if one is set.
+//
+// cw:    control word name
+// flags: combination of the RtfHandler::k* flag bits
+// param: numeric parameter; callers in this file pass -1 when none was given
+void RtfReader::sendControlWord(RtfContext& cx, string cw, int flags, int param)
+{
+	// Pending text has to reach the handler before the control word does
+	emptyData(cx);
+
+	if(m_handler == NULL)
+		return;
+
+	m_handler->controlWord(cw, flags, param);
+}
+
+// Reads `num` characters following a \' escape and interprets them as hex
+// digits.
+//
+// With m_parseHex set, the digits are decoded and the resulting value is
+// buffered as one character of text. Otherwise the raw digits are passed
+// to the handler as a control word flagged RtfHandler::kIsEncoded.
+//
+// Characters that are not hex digits are consumed, recorded in
+// m_parseErrors and left out of the digit string.
+//
+// Returns false when the end of the file is hit while reading the digits,
+// true in every other case.
+bool RtfReader::parseHexChar(RtfContext& cx, int num)
+{
+	string data;
+	for(int i = 0; i < num; i++)
+	{
+		// Keep fgetc()'s int result: squeezed into a char, EOF is either
+		// confused with byte 0xFF or (unsigned char) never seen at all
+		int ch = fgetc(cx.file);
+
+		if(ch == EOF)
+			return false;
+
+		if((ch >= 'A' && ch <= 'F') ||
+		   (ch >= 'a' && ch <= 'f') ||
+		   (ch >= '0' && ch <= '9'))
+		{
+			data.append(1, static_cast<char>(ch));
+		}
+		else
+		{
+			m_parseErrors.append(string("invalid hex char: ") + static_cast<char>(ch) + "\n");
+		}
+	}
+
+	if(m_parseHex)
+	{
+		char* end = NULL;
+		long val = strtol(data.c_str(), &end, 16);
+
+		// An empty digit string would otherwise "parse" cleanly as 0 and
+		// slip a NUL character into the text
+		if(!data.empty() && end == data.c_str() + data.size())
+			sendData(cx, static_cast<wchar_t>(val));
+		else
+			m_parseErrors.append("invalid hex char: " + data + "\n");
+	}
+	else
+	{
+		sendControlWord(cx, data, RtfHandler::kIsEncoded, -1);
+	}
+
+	return true;
+}
+
+// Parses whatever follows a backslash: either a control word with its
+// optional numeric parameter, or one of the control symbols
+// \'  \*  \\  \{  \}  \~  \-
+//
+// Control symbols and a handful of well known character control words
+// (emdash, bullet, lquote, ...) are turned into character data. With
+// m_parseUnicode set, \u, \ud and \uc are consumed here as well. Every
+// other control word is reported through sendControlWord() when a handler
+// is set.
+//
+// Returns false when the end of the file is reached while reading,
+// true otherwise.
+bool RtfReader::parseControlWord(RtfContext& cx)
+{
+	bool isAsterisk = false;
+	string controlword;
+	string param;
+
+	while(1)
+	{
+		int ch = fgetc(cx.file);
+
+		// fgetc() signals the end of input with EOF. WEOF belongs to the
+		// wide character functions and need not compare equal to it.
+		if(ch == EOF)
+			return false;
+
+		bool empty = controlword.empty();
+
+		// True only for the very first character after the backslash
+		bool atStart = empty && param.empty() && !isAsterisk;
+
+		// Part of the name of a control word
+		// NOTE: Although the RTF specification prohibits uppercase
+		// control words, MS Word uses them :-/
+		if((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'))
+			controlword.append(1, static_cast<char>(ch));
+
+		// Part of the parameter of a control word
+		else if(ch >= '0' && ch <= '9')
+			param.append(1, static_cast<char>(ch));
+
+		// Now handle escapes and other special types of
+		// control words. These are all only valid at beginning
+		// of the "control word"
+
+		// hex spelled out character
+		else if(empty && ch == '\'')
+		{
+			if(!parseHexChar(cx, 2))
+				return false;
+			break;
+		}
+
+		// Asterisk type destination
+		else if(empty && ch == '*')
+		{
+			isAsterisk = true;
+
+			// Skip line breaks between "\*" and the control word it
+			// applies to, then swallow that control word's backslash
+			ch = fgetc(cx.file);
+			while(ch == '\r' || ch == '\n')
+				ch = fgetc(cx.file);
+
+			if(ch != '\\')
+				ungetc(ch, cx.file);
+		}
+
+		// Escaped backslash
+		else if(empty && ch == '\\')
+		{
+			sendData(cx, L'\\');
+			break;
+		}
+
+		// Escaped braces are literal text. Without this they would be
+		// pushed back and then mistaken for a group start or end.
+		else if(atStart && (ch == '{' || ch == '}'))
+		{
+			sendData(cx, static_cast<wchar_t>(ch));
+			break;
+		}
+
+		// Non breaking space
+		else if(empty && ch == '~')
+		{
+			sendData(cx, 0x00A0);
+			break;
+		}
+
+		// Optional hyphen
+		else if(empty && ch == '-')
+		{
+			sendData(cx, 0x00AD);
+			break;
+		}
+
+		// a hyphen right after control word is part of number
+		else if(!empty && param.empty() && ch == '-')
+		{
+			param.append(1, '-');
+		}
+
+		// TODO: This looks real hokey and acts that
+		// way too
+#if 0
+		// An enter as the first character of a control word
+		// makes a paragraph
+		else if(strchr("\n\r", ch))
+		{
+			controlword = "par";
+			break;
+		}
+#endif
+		// A space ends a control word (and gets eaten)
+		else if(ch == ' ')
+			break;
+
+		// Anything else (including a backslash) ends a control word
+		// and is left in the stream for the caller
+		else
+		{
+			ungetc(ch, cx.file);
+			break;
+		}
+	}
+
+	// Nothing more to do for control symbols, they were handled above
+	if(controlword.empty())
+		return true;
+
+	int flags = isAsterisk ? RtfHandler::kAsterisk : 0;
+	int numPar = -1;
+
+	if(!param.empty())
+	{
+		char* end = NULL;
+		long parsed = strtol(param.c_str(), &end, 10);
+
+		// A lone "-" is not a number: report it as "no parameter"
+		if(end == param.c_str() + param.size())
+		{
+			numPar = static_cast<int>(parsed);
+			flags |= RtfHandler::kHasParam;
+		}
+	}
+
+	// Here we check for common characters
+	if(controlword == "emdash")
+		sendData(cx, 0x2014);
+	else if(controlword == "endash")
+		sendData(cx, 0x2013);
+	else if(controlword == "emspace")
+		sendData(cx, 0x2003);
+	else if(controlword == "enspace")
+		sendData(cx, 0x2002);
+	else if(controlword == "bullet")
+		sendData(cx, 0x2022);
+	else if(controlword == "lquote")
+		sendData(cx, 0x2018);
+	else if(controlword == "rquote")
+		sendData(cx, 0x2019);
+	else if(controlword == "ldblquote")
+		sendData(cx, 0x201C);
+	else if(controlword == "rdblquote")
+		sendData(cx, 0x201D);
+
+	// Unicode values get sent through
+	else if(m_parseUnicode && (flags & RtfHandler::kHasParam) &&
+		    controlword == "u")
+	{
+		// \uN is written as a signed 16 bit number, so values above
+		// 0x7FFF arrive negative. Map them back into 0..65535 rather
+		// than relying on wchar_t being 16 bits wide.
+		if(numPar < 0)
+			numPar += 65536;
+
+		sendData(cx, static_cast<wchar_t>(numPar));
+
+		// The characters that follow are the fallback text for readers
+		// without unicode support; arrange for them to be dropped
+		m_uniEat = m_uniEatStack.empty() ? 0 : m_uniEatStack.top();
+	}
+
+	// Unicode destination
+	else if(m_parseUnicode && controlword == "ud")
+	{
+
+	}
+
+	// Skip value for unicode characters
+	else if(m_parseUnicode && controlword == "uc")
+	{
+		// Replace the current group's entry. The stack can have been
+		// emptied by unbalanced closing braces, so check before popping.
+		if(!m_uniEatStack.empty())
+			m_uniEatStack.pop();
+
+		// A missing or negative count skips nothing
+		m_uniEatStack.push(numPar > 0 ? numPar : 0);
+	}
+
+	// Otherwise we send the control word
+	else
+	{
+		if(m_handler)
+			sendControlWord(cx, controlword, flags, numPar);
+	}
+
+	return true;
+}
+
+// Parses RTF from an already opened stream and reports what it finds to
+// the handler (when one is set): startDocument(), then groupStart() /
+// groupEnd() / controlWord() / charData() as the input is read, and
+// finally endDocument().
+//
+// file: stream to read from; it is not closed here.
+//
+// Returns true when no parse errors were recorded in m_parseErrors.
+bool RtfReader::parse(FILE* file)
+{
+	m_depth = 0;
+	m_parseErrors = "";
+
+	// Start from a clean skip state so that a reader can be used for more
+	// than one document, whatever the previous one left behind
+	m_uniEat = 0;
+	while(!m_uniEatStack.empty())
+		m_uniEatStack.pop();
+	m_uniEatStack.push(0);
+
+	// Set once a '}' shows up that has no '{' to close
+	bool strayClose = false;
+
+	RtfContext cx;
+	cx.isData = false;
+	cx.file = file;
+	cx.data = L"";
+
+	if(m_handler)
+		m_handler->startDocument(this);
+
+	bool more = true;
+	while(more)
+	{
+		int ch = fgetc(file);
+		if(ch == EOF)
+			break;
+
+		switch(ch)
+		{
+		case '\\':
+			// false means the input ended inside the control word
+			more = parseControlWord(cx);
+			break;
+
+		case '{':
+			{
+				emptyData(cx);
+
+				// A new group inherits the skip count of its parent
+				int inherited = m_uniEatStack.empty() ? 0 : m_uniEatStack.top();
+				m_uniEatStack.push(inherited);
+
+				if(m_handler)
+					m_handler->groupStart();
+
+				m_depth++;
+			}
+			break;
+
+		case '}':
+			{
+				emptyData(cx);
+
+				if(m_handler)
+					m_handler->groupEnd();
+
+				// Only unwind entries that a '{' pushed. The bottom
+				// entry has to survive unbalanced input, since other
+				// code reads the top of this stack.
+				if(m_depth > 0)
+				{
+					if(!m_uniEatStack.empty())
+						m_uniEatStack.pop();
+				}
+				else
+				{
+					strayClose = true;
+				}
+
+				m_depth--;
+			}
+			break;
+
+		// We translate tabs into the appropriate control word
+		case '\t':
+			sendControlWord(cx, "tab", 0, -1);
+			break;
+
+		// Line breaks in the raw file carry no meaning and are dropped.
+		// NUL bytes have always been dropped here as well.
+		case '\r':
+		case '\n':
+		case '\0':
+			break;
+
+		// Everything else is text. No XML escaping is done here, the
+		// XML outputter takes care of it for us.
+		default:
+			sendData(cx, static_cast<wchar_t>(ch));
+			break;
+		}
+	}
+
+	// Text after the last brace or control word is still sitting in the
+	// buffer; deliver it before the document is closed
+	emptyData(cx);
+
+	if(m_depth != 0 || strayClose)
+		m_parseErrors.append("unmatched braces\n");
+
+	if(m_handler)
+		m_handler->endDocument();
+
+	return m_parseErrors.empty();
+}
+
author	Stef Walter <stef@memberwebs.com>	2003-09-17 18:34:42 +0000
committer	Stef Walter <stef@memberwebs.com>	2003-09-17 18:34:42 +0000
commit	69aa93c828303dcc44253fe88ff3d6024d10817b (patch)
tree	63d14dacbd3d81363fcbea1036c47a0210b0f397 /src/rtfparser.cpp
parent	15f3015d2e8305b729d7996faad410b3378497da (diff)