summaryrefslogtreecommitdiff
path: root/src/rtfparser.cpp
diff options
context:
space:
mode:
authorStef Walter <stef@memberwebs.com>2003-09-17 18:34:42 +0000
committerStef Walter <stef@memberwebs.com>2003-09-17 18:34:42 +0000
commit69aa93c828303dcc44253fe88ff3d6024d10817b (patch)
tree63d14dacbd3d81363fcbea1036c47a0210b0f397 /src/rtfparser.cpp
parent15f3015d2e8305b729d7996faad410b3378497da (diff)
Initial Import
Diffstat (limited to 'src/rtfparser.cpp')
-rw-r--r--src/rtfparser.cpp398
1 files changed, 398 insertions, 0 deletions
diff --git a/src/rtfparser.cpp b/src/rtfparser.cpp
new file mode 100644
index 0000000..2928aa7
--- /dev/null
+++ b/src/rtfparser.cpp
@@ -0,0 +1,398 @@
+// RtfReader.cpp: implementation of the RtfReader class.
+//
+//////////////////////////////////////////////////////////////////////
+
+#include "stdafx.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "RtfReader.h"
+
+//////////////////////////////////////////////////////////////////////
+// Construction/Destruction
+//////////////////////////////////////////////////////////////////////
+
+// Bit flags handed to RtfHandler::controlWord() in its 'flags' argument.
+//
+// kAsterisk:  the control word was introduced by a "\*" prefix
+// kHasParam:  a numeric parameter followed the control word and was parsed
+// kIsEncoded: the "control word" is the raw digits of a \'hh escape that
+//             the reader was asked not to decode itself
+const int RtfHandler::kAsterisk = 1 << 0;
+const int RtfHandler::kHasParam = 1 << 1;
+const int RtfHandler::kIsEncoded = 1 << 2;
+
+// Sets up a reader with no handler attached, hex escapes decoded by the
+// reader itself and special treatment of the unicode control words off.
+RtfReader::RtfReader()
+{
+    // Nobody is listening yet
+    m_handler = NULL;
+
+    // Parsing options
+    m_parseUnicode = false;
+    m_parseHex = true;
+
+    // Parsing state: no open groups, no characters waiting to be
+    // discarded, and a base skip count of zero on the stack
+    m_depth = 0;
+    m_uniEat = 0;
+    m_uniEatStack.push(0);
+}
+
+// The destructor body is intentionally empty.
+RtfReader::~RtfReader()
+{
+}
+
+// Opens the named file for reading and runs it through parse(FILE*).
+//
+// Returns false when the file cannot be opened, otherwise whatever
+// parse(FILE*) returns. The file is closed again before returning.
+bool RtfReader::parse(string fileName)
+{
+    // fopen() wants a C string, not a string object
+    FILE* file = fopen(fileName.c_str(), "r");
+    if(!file)
+        return false;
+
+    bool ret = parse(file);
+
+    fclose(file);
+
+    return ret;
+}
+
+// Flushes the character data collected in cx.data: when there is any, it is
+// passed to the handler's charData() (if a handler is set) and the buffer is
+// emptied afterwards.
+void RtfReader::emptyData(RtfContext& cx)
+{
+    // Nothing buffered, nothing to do
+    if(cx.data.empty())
+        return;
+
+    if(m_handler)
+        m_handler->charData(cx.data);
+
+    cx.data.resize(0);
+}
+
+// Adds one character to the buffered data in cx.data, unless characters are
+// still waiting to be discarded (m_uniEat > 0), in which case the character
+// is dropped and the count goes down by one.
+void RtfReader::sendData(RtfContext& cx, wchar_t ch)
+{
+    if(m_uniEat > 0)
+    {
+        // This character is one of those to be skipped
+        m_uniEat--;
+        return;
+    }
+
+    cx.data.append(1, ch);
+}
+
+// Adds a string to the buffered data in cx.data. When characters are still
+// waiting to be discarded (m_uniEat > 0) that many are cut off the front of
+// the string first, and the count is reduced by the number actually cut.
+void RtfReader::sendData(RtfContext& cx, wstring data)
+{
+    // Common case: nothing to skip, take the whole string
+    if(!(m_uniEat > 0))
+    {
+        cx.data.append(data);
+        return;
+    }
+
+    // Skip as many characters as are pending, but no more than we have
+    size_t available = data.size();
+    size_t pending = m_uniEat;
+    int skipped = static_cast<int>(available < pending ? available : pending);
+
+    cx.data.append(data.substr(skipped));
+    m_uniEat -= skipped;
+}
+
+// Reports a control word to the handler. Buffered character data is flushed
+// first so the handler sees text and control words in document order. The
+// word, flags and param are passed to controlWord() unchanged.
+void RtfReader::sendControlWord(RtfContext& cx, string cw, int flags, int param)
+{
+    emptyData(cx);
+
+    // Without a handler there is nobody to tell
+    if(!m_handler)
+        return;
+
+    m_handler->controlWord(cw, flags, param);
+}
+
+// Reads the 'num' hex digits of a \'hh escape from cx.file.
+//
+// When m_parseHex is set the digits are converted to a number and sent on
+// as one character of data. Otherwise the digits themselves are reported to
+// the handler as a control word flagged with RtfHandler::kIsEncoded.
+//
+// Characters that are not hex digits are consumed and recorded in
+// m_parseErrors; an escape that does not consist of exactly 'num' valid
+// digits produces no output at all.
+//
+// Returns false if the end of the file is reached while reading the
+// digits, true otherwise.
+bool RtfReader::parseHexChar(RtfContext& cx, int num)
+{
+    string data;
+    for(int i = 0; i < num; i++)
+    {
+        // Keep the result as an int: stored in a char, the byte 0xFF and
+        // EOF can no longer be told apart
+        int ch = fgetc(cx.file);
+
+        if(ch == EOF)
+            return false;
+
+        if((ch >= 'A' && ch <= 'F') ||
+           (ch >= 'a' && ch <= 'f') ||
+           (ch >= '0' && ch <= '9'))
+        {
+            data.append(1, static_cast<char>(ch));
+        }
+        else
+        {
+            m_parseErrors.append(string("invalid hex char: ") + static_cast<char>(ch) + "\n");
+        }
+    }
+
+    // An incomplete escape has been reported above already. Converting what
+    // is left would only produce a bogus character (an empty string even
+    // converts "successfully" to a NUL), so send nothing.
+    if(data.empty() || data.size() != static_cast<size_t>(num))
+        return true;
+
+    if(m_parseHex)
+    {
+        char* end = NULL;
+        long val = strtol(data.c_str(), &end, 16);
+        if(end == data.c_str() + data.size())
+            sendData(cx, static_cast<wchar_t>(val));
+        else
+            m_parseErrors.append("invalid hex char: " + data + "\n");
+    }
+    else
+    {
+        sendControlWord(cx, data, RtfHandler::kIsEncoded, -1);
+    }
+
+    return true;
+}
+
+// Control words that simply stand for one character, together with the
+// code point that is sent on as character data in their place.
+struct RtfCharWord
+{
+    const char* word;
+    wchar_t ch;
+};
+
+static const RtfCharWord kRtfCharWords[] =
+{
+    { "emdash",    0x2014 },
+    { "endash",    0x2013 },
+    { "emspace",   0x2003 },
+    { "enspace",   0x2002 },
+    { "bullet",    0x2022 },
+    { "lquote",    0x2018 },
+    { "rquote",    0x2019 },
+    { "ldblquote", 0x201C },
+    { "rdblquote", 0x201D }
+};
+
+// Parses whatever follows a backslash in cx.file: a control word with its
+// optional numeric parameter, or one of the control symbols.
+//
+// Control symbols (\'hh, \\, \{, \}, \~, \-) and the control words in
+// kRtfCharWords are turned into character data. When m_parseUnicode is set,
+// \uN is turned into a character and \uc / \ud are consumed here. Every
+// other control word goes to the handler through sendControlWord(), with
+// RtfHandler::kAsterisk set if it was introduced by "\*" and
+// RtfHandler::kHasParam set if a numeric parameter was parsed. The parameter
+// is passed as -1 when there is none.
+//
+// Returns false when the end of the file is reached in the middle of the
+// control word, true otherwise.
+bool RtfReader::parseControlWord(RtfContext& cx)
+{
+    bool isAsterisk = false;
+    string controlword;
+    string param;
+
+    while(1)
+    {
+        // fgetc() signals the end of the file with EOF (not WEOF)
+        int ch = fgetc(cx.file);
+        if(ch == EOF)
+            return false;
+
+        bool empty = controlword.empty();
+
+        // Part of the name of a control word
+        // NOTE: Although the RTF specification prohibits uppercase
+        // control words, MS Word uses them :-/
+        if((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'))
+        {
+            controlword.append(1, static_cast<char>(ch));
+        }
+
+        // Part of the parameter of a control word
+        else if(ch >= '0' && ch <= '9')
+        {
+            param.append(1, static_cast<char>(ch));
+        }
+
+        // Everything below up to the parameter hyphen is only valid
+        // right at the beginning of the "control word"
+
+        // Hex spelled out character
+        else if(empty && ch == '\'')
+        {
+            if(!parseHexChar(cx, 2))
+                return false;
+            break;
+        }
+
+        // Asterisk type destination. The control word it applies to
+        // follows, possibly on the next line
+        else if(empty && ch == '*')
+        {
+            isAsterisk = true;
+
+            ch = fgetc(cx.file);
+            while(ch == '\r' || ch == '\n')
+                ch = fgetc(cx.file);
+
+            if(ch == EOF)
+                return false;
+
+            // The backslash of the following control word is eaten,
+            // anything else is looked at again by the loop
+            if(ch != '\\')
+                ungetc(ch, cx.file);
+        }
+
+        // Escaped backslash
+        else if(empty && ch == '\\')
+        {
+            sendData(cx, L'\\');
+            break;
+        }
+
+        // Escaped braces are plain text. Pushing them back would make
+        // the caller take them for the start or end of a group
+        else if(empty && param.empty() && !isAsterisk && (ch == '{' || ch == '}'))
+        {
+            sendData(cx, static_cast<wchar_t>(ch));
+            break;
+        }
+
+        // Non breaking space
+        else if(empty && ch == '~')
+        {
+            sendData(cx, static_cast<wchar_t>(0x00A0));
+            break;
+        }
+
+        // Optional hyphen
+        else if(empty && ch == '-')
+        {
+            sendData(cx, static_cast<wchar_t>(0x00AD));
+            break;
+        }
+
+        // A hyphen right after the control word is part of the number
+        else if(!empty && param.empty() && ch == '-')
+        {
+            param.append(1, static_cast<char>(ch));
+        }
+
+        // TODO: A line break directly after the backslash is not turned
+        // into a paragraph. An earlier attempt at that was disabled.
+
+        // A space ends a control word (but gets eaten)
+        else if(ch == ' ')
+        {
+            break;
+        }
+
+        // Anything else (including a backslash) ends a control word
+        // and is left for the caller to read
+        else
+        {
+            ungetc(ch, cx.file);
+            break;
+        }
+    }
+
+    // A control symbol has been dealt with completely above
+    if(controlword.empty())
+        return true;
+
+    int flags = isAsterisk ? RtfHandler::kAsterisk : 0;
+    int numPar = -1;
+
+    if(!param.empty())
+    {
+        // Only a parameter that converts completely counts. A lone
+        // hyphen for instance leaves numPar at "no parameter"
+        char* end = NULL;
+        long parsed = strtol(param.c_str(), &end, 10);
+        if(end == param.c_str() + param.size())
+        {
+            numPar = static_cast<int>(parsed);
+            flags |= RtfHandler::kHasParam;
+        }
+    }
+
+    // Here we check for common characters
+    const size_t numCharWords = sizeof(kRtfCharWords) / sizeof(kRtfCharWords[0]);
+    for(size_t i = 0; i < numCharWords; i++)
+    {
+        if(controlword == kRtfCharWords[i].word)
+        {
+            sendData(cx, kRtfCharWords[i].ch);
+            return true;
+        }
+    }
+
+    // Unicode values get sent through
+    if(m_parseUnicode && (flags & RtfHandler::kHasParam) && controlword == "u")
+    {
+        // Values above 32767 are written as negative numbers in RTF.
+        // Fold them back so the result does not depend on the width
+        // of wchar_t
+        if(numPar < 0)
+            numPar += 65536;
+
+        sendData(cx, static_cast<wchar_t>(numPar));
+
+        // The fallback characters that follow are to be discarded
+        if(m_uniEatStack.empty())
+            m_uniEat = 0;
+        else
+            m_uniEat = m_uniEatStack.top();
+    }
+
+    // Unicode destination, nothing to do for it
+    else if(m_parseUnicode && controlword == "ud")
+    {
+
+    }
+
+    // Skip value for unicode characters, replaces the one in effect
+    // for the current group. Without a usable number it is ignored
+    else if(m_parseUnicode && controlword == "uc")
+    {
+        if((flags & RtfHandler::kHasParam) && numPar >= 0)
+        {
+            if(!m_uniEatStack.empty())
+                m_uniEatStack.pop();
+            m_uniEatStack.push(numPar);
+        }
+    }
+
+    // Otherwise we send the control word. sendControlWord() copes with
+    // a missing handler itself and still flushes the buffered text
+    else
+    {
+        sendControlWord(cx, controlword, flags, numPar);
+    }
+
+    return true;
+}
+
+// Parses the RTF document read from 'file' and reports it to the handler
+// (if one is set) as a sequence of startDocument(), groupStart(),
+// groupEnd(), controlWord(), charData() and endDocument() calls.
+//
+// Tabs are reported as the control word "tab". Carriage returns, line
+// feeds and NUL bytes in the text are dropped.
+//
+// Problems found on the way are collected in m_parseErrors. Returns true
+// when none were recorded, false otherwise (or when 'file' is NULL). The
+// file is not closed.
+bool RtfReader::parse(FILE* file)
+{
+    if(!file)
+        return false;
+
+    m_depth = 0;
+    m_parseErrors = "";
+
+    // Start from a clean skip state, whatever an earlier parse left behind
+    m_uniEat = 0;
+    while(!m_uniEatStack.empty())
+        m_uniEatStack.pop();
+    m_uniEatStack.push(0);
+
+    RtfContext cx;
+    cx.isData = false;
+    cx.file = file;
+    cx.data = L"";
+
+    if(m_handler)
+        m_handler->startDocument(this);
+
+    // Set when a closing brace shows up with no group open. The depth
+    // alone cannot tell, as a later opening brace evens it out again
+    bool strayClose = false;
+
+    while(1)
+    {
+        int ch = fgetc(file);
+        if(ch == EOF)
+            break;
+
+        // Control word or control symbol
+        if(ch == '\\')
+        {
+            // Ends in the middle of a control word
+            if(!parseControlWord(cx))
+                break;
+        }
+
+        // Start of a group, which inherits the current skip count
+        else if(ch == '{')
+        {
+            emptyData(cx);
+
+            m_uniEatStack.push(m_uniEatStack.top());
+
+            if(m_handler)
+                m_handler->groupStart();
+
+            m_depth++;
+        }
+
+        // End of a group
+        else if(ch == '}')
+        {
+            emptyData(cx);
+
+            if(m_handler)
+                m_handler->groupEnd();
+
+            // Only drop the entry pushed by the matching opening brace.
+            // The base entry has to stay, the next group reads it
+            if(m_depth > 0)
+            {
+                if(!m_uniEatStack.empty())
+                    m_uniEatStack.pop();
+            }
+            else
+            {
+                strayClose = true;
+            }
+
+            m_depth--;
+        }
+
+        // We translate tabs into the appropriate control word
+        else if(ch == '\t')
+        {
+            sendControlWord(cx, "tab", 0, -1);
+        }
+
+        // Line breaks (and NUL bytes) in the text carry no meaning and
+        // are dropped, everything else is data. Escaping of special
+        // characters is left to whoever writes the output
+        else if(ch != '\r' && ch != '\n' && ch != '\0')
+        {
+            sendData(cx, static_cast<wchar_t>(ch));
+        }
+    }
+
+    // Hand over text that is still buffered, otherwise the tail of a
+    // truncated document gets lost
+    emptyData(cx);
+
+    if(m_depth != 0 || strayClose)
+        m_parseErrors.append("unmatched braces\n");
+
+    if(m_handler)
+        m_handler->endDocument();
+
+    return m_parseErrors.empty();
+}
+