From 53914f770f1e1dc1ab4342c64846fd995825b7e6 Mon Sep 17 00:00:00 2001 From: Stef Date: Wed, 17 Sep 2003 18:34:42 +0000 Subject: Initial Import --- src/rtfparser.cpp | 398 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 398 insertions(+) create mode 100644 src/rtfparser.cpp (limited to 'src/rtfparser.cpp') diff --git a/src/rtfparser.cpp b/src/rtfparser.cpp new file mode 100644 index 0000000..2928aa7 --- /dev/null +++ b/src/rtfparser.cpp @@ -0,0 +1,398 @@ +// RtfReader.cpp: implementation of the RtfReader class. +// +////////////////////////////////////////////////////////////////////// + +#include "stdafx.h" + +#include +#include +#include "RtfReader.h" + +////////////////////////////////////////////////////////////////////// +// Construction/Destruction +////////////////////////////////////////////////////////////////////// + +const int RtfHandler::kAsterisk = 0x00000001; +const int RtfHandler::kHasParam = 0x00000002; +const int RtfHandler::kIsEncoded = 0x00000004; + +RtfReader::RtfReader() +{ + m_handler = NULL; + m_depth = 0; + m_parseHex = true; + m_parseUnicode = false; + m_uniEat = 0; + m_uniEatStack.push(0); +} + +RtfReader::~RtfReader() +{ + +} + +bool RtfReader::parse(string fileName) +{ + FILE* file = fopen(fileName, "r"); + if(!file) + return false; + + bool ret = parse(file); + + fclose(file); + + return ret; +} + +void RtfReader::emptyData(RtfContext& cx) +{ + if(!cx.data.empty()) + { + if(m_handler) + m_handler->charData(cx.data); + cx.data.resize(0); + } +} + +void RtfReader::sendData(RtfContext& cx, wchar_t ch) +{ + if(m_uniEat > 0) + m_uniEat--; + else + cx.data.append(1, ch); +} + +void RtfReader::sendData(RtfContext& cx, wstring data) +{ + if(m_uniEat > 0) + { + int len = __min(data.size(), m_uniEat); + cx.data.append(data.substr(len)); + m_uniEat -= len; + } + else + { + cx.data.append(data); + } +} + +void RtfReader::sendControlWord(RtfContext& cx, string cw, int flags, int param) +{ + emptyData(cx); + if(m_handler) + m_handler->controlWord(cw, flags, param); +} + +bool RtfReader::parseHexChar(RtfContext& cx, int num) +{ + string data; + for(int i = 0; i < num; i++) + { + char ch = fgetc(cx.file); + + if(ch == -1) + return false; + + if((ch >= 'A' && ch <= 'F') || + (ch >= 'a' && ch <= 'f') || + (ch >= '0' && ch <= '9')) + { + data.append(1, ch); + } + else + { + m_parseErrors.append((string)"invalid hex char: " + ch + "\n"); + } + } + + if(m_parseHex) + { + char* end = NULL; + int val = strtol(data.c_str(), &end, 16); + if(end == data.c_str() + data.size() && m_parseHex) + sendData(cx, val); + else + m_parseErrors.append("invalid hex char: " + data + "\n"); + } + else + { + sendControlWord(cx, data, RtfHandler::kIsEncoded, -1); + } + + return true; +} + +bool RtfReader::parseControlWord(RtfContext& cx) +{ + bool isAsterisk = false; + string controlword; + string param; + + while(1) + { + int ch = fgetc(cx.file); + if(ch == WEOF) + return false; + + bool empty = controlword.empty(); + + // Part of the name of a control word + // NOTE: Although the RTF specification prohibits upercase + // control words, MS Word uses them :-/ + if(ch >= 'a' && ch <= 'z' || ch >= 'A' && ch <= 'Z') + controlword.append(1, ch); + + // Part of the parameter of a control word + else if(ch >= '0' && ch <= '9') + param.append(1, ch); + + // Now handle escapes and other special types of + // control words. These are all only valid at beginning + // of the "control word" + + // hex spelled out character + else if(empty && ch == '\'') + { + parseHexChar(cx, 2); + break; + } + + // Asterisk type destination + else if(empty && ch == '*') + { + isAsterisk = true; + + ch = fgetc(cx.file); + while(strchr("\r\n", ch)) + ch = fgetc(cx.file); + + if(ch != '\\') + ungetc(ch, cx.file); + } + + // Escaped backslash + else if(empty && ch == '\\') + { + sendData(cx, L'\\'); + break; + } + + // Non breaking space + else if(empty && ch == '~') + { + sendData(cx, 0x00A0); + break; + } + + // Optional hyphen + else if(empty && ch == '-') + { + sendData(cx, 0x00AD); + break; + } + + // a hyphen right after control word is part of number + else if(!empty && param.empty() && ch == '-') + { + param.append(1, ch); + } + + // TODO: This looks real hokey and acts that + // way too +#if 0 + // An enter as the first character of a control word + // makes a paragraph + else if(strchr("\n\r", ch)) + { + controlword = "par"; + break; + } +#endif + // Space end a rtf code (but get eaten) + else if(strchr(" ", ch)) + break; + + // Anything else (including a backslash ends a control word) + else + { + ungetc(ch, cx.file); + break; + } + } + + // Empty out the control word buffers + if(!controlword.empty()) + { + int flags = isAsterisk ? RtfHandler::kAsterisk : 0; + int numPar = -1; + + if(!param.empty()) + { + char* end = NULL; + numPar = strtol(param.c_str(), &end, 10); + if(end == param.c_str() + param.size()) + flags += RtfHandler::kHasParam; + } + + // Here we check for common characters + if(controlword == "emdash") + sendData(cx, 0x2014); + else if(controlword == "endash") + sendData(cx, 0x2013); + else if(controlword == "emspace") + sendData(cx, 0x2003); + else if(controlword == "enspace") + sendData(cx, 0x2002); + else if(controlword == "bullet") + sendData(cx, 0x2022); + else if(controlword == "lquote") + sendData(cx, 0x2018); + else if(controlword == "rquote") + sendData(cx, 0x2019); + else if(controlword == "ldblquote") + sendData(cx, 0x201C); + else if(controlword == "rdblquote") + sendData(cx, 0x201D); + + // Unicode values get sent through + else if(m_parseUnicode && flags & RtfHandler::kHasParam && + controlword == "u" ) + { + sendData(cx, numPar); + m_uniEat = m_uniEatStack.top(); + } + + // Unicode destination + else if(m_parseUnicode && controlword == "ud") + { + + } + + // Skip value for unicode characters + else if(m_parseUnicode && controlword == "uc") + { + m_uniEatStack.pop(); + m_uniEatStack.push(numPar); + } + + // Otherwise we send the control word + else + { + if(m_handler) + sendControlWord(cx, controlword, flags, numPar); + } + } + + return true; +} + +bool RtfReader::parse(FILE* file) +{ + m_depth = 0; + m_parseErrors = ""; + + int ch = 0; + + RtfContext cx; + cx.isData = false; + cx.file = file; + cx.data = L""; + + if(m_handler) + m_handler->startDocument(this); + + while(1) + { + ch = fgetc(file); + if(ch == EOF) + goto done; + + // Type is undetermined so we figure it out here + if(!cx.isData) + { + switch(ch) + { + case '\\': + if(!parseControlWord(cx)) + goto done; + break; + + case '{': + { + emptyData(cx); + + m_uniEatStack.push(m_uniEatStack.top()); + + if(m_handler) + m_handler->groupStart(); + + m_depth++; + } + break; + + case '}': + { + emptyData(cx); + + if(m_handler) + m_handler->groupEnd(); + + if(!m_uniEatStack.empty()) + m_uniEatStack.pop(); + + m_depth--; + } + break; + + default: + cx.isData = true; + break; + } + } + + if(cx.isData) + { + // We translate tabs into the appropriate control + // word + if(ch == '\t') + sendControlWord(cx, "tab", 0, -1); + +// Don't need this code, the XML outputter +// Takes care of it for us +#if 0 + if(ch == '&') + sendData(cx, L"&"); + + else if(ch == '\'') + sendData(cx, L"'"); + + else if(ch == '"') + sendData(cx, L"""); + + else if(ch == '<') + sendData(cx, L"<"); + + else if(ch == '>') + sendData(cx, L">"); +#endif + + // enters a + else if(!strchr("\r\n", ch)) + sendData(cx, ch); + + cx.isData = false; + } + } + +done: + + if(m_depth != 0) + m_parseErrors.append("unmatched braces\n"); + + // TODO: Check depth and give errors if screwy + if(m_handler) + m_handler->endDocument(); + + return m_parseErrors.empty(); +} + -- cgit v1.2.3