/*
 * Copyright (c) 2004, Nate Nielsen
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *     * Redistributions of source code must retain the above
 *       copyright notice, this list of conditions and the
 *       following disclaimer.
 *     * Redistributions in binary form must reproduce the
 *       above copyright notice, this list of conditions and
 *       the following disclaimer in the documentation and/or
 *       other materials provided with the distribution.
 *     * The names of contributors to this software may not be
 *       used to endorse or promote products derived from this
 *       software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
 * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 *
 * CONTRIBUTORS
 *  Nate Nielsen
 *
 */

#include "usuals.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "rtfreader.h"

/* Flag bits passed to RtfHandler::controlWord() */
const int RtfHandler::kAsterisk = 0x00000001;
const int RtfHandler::kHasParam = 0x00000002;
const int RtfHandler::kIsEncoded = 0x00000004;

/*
 * Creates a reader with no handler attached. Hex escapes are decoded
 * into characters by default, unicode control words are passed through
 * to the handler by default.
 */
RtfReader::RtfReader()
{
	m_handler = NULL;
	m_depth = 0;
	m_parseHex = true;
	m_parseUnicode = false;
	m_uniEat = 0;

	// Base entry of the unicode skip-count stack
	m_uniEatStack.push(0);
}

RtfReader::~RtfReader()
{
}

/*
 * Opens the named file, parses it and closes it again.
 *
 * Returns false when the file cannot be opened, otherwise the
 * result of parse(FILE*).
 */
bool RtfReader::parse(string fileName)
{
	FILE* file = fopen(fileName.c_str(), "r");
	if(!file)
		return false;

	bool ret = parse(file);
	fclose(file);

	return ret;
}

/*
 * Hands any buffered character data to the handler (when there
 * is one) and clears the buffer.
 */
void RtfReader::emptyData(RtfContext& cx)
{
	if(cx.data.empty())
		return;

	if(m_handler)
		m_handler->charData(cx.data);

	cx.data.resize(0);
}

/*
 * Buffers a single character, unless characters are currently
 * being skipped (m_uniEat > 0), in which case the character is
 * dropped and the skip count is decremented.
 */
void RtfReader::sendData(RtfContext& cx, wchar_t ch)
{
	if(m_uniEat > 0)
		m_uniEat--;
	else
		cx.data.append(1, ch);
}

/*
 * Buffers a string of characters. When characters are currently
 * being skipped, up to m_uniEat leading characters are dropped
 * and only the remainder is buffered.
 */
void RtfReader::sendData(RtfContext& cx, wstring data)
{
	if(m_uniEat > 0)
	{
		int len = static_cast<int>(data.size());
		if(len > m_uniEat)
			len = m_uniEat;

		cx.data.append(data.substr(len));
		m_uniEat -= len;
	}
	else
	{
		cx.data.append(data);
	}
}

/*
 * Flushes pending character data and then reports a control word
 * to the handler, so that the handler sees both in document order.
 */
void RtfReader::sendControlWord(RtfContext& cx, string cw, int flags, int param)
{
	emptyData(cx);

	if(m_handler)
		m_handler->controlWord(cw, flags, param);
}

/*
 * Reads 'num' hex digits following a \' escape.
 *
 * With m_parseHex set the digits are converted to a character and
 * buffered as data, otherwise the raw digits are sent to the handler
 * as a control word flagged kIsEncoded.
 *
 * Returns false when the end of the file is reached while reading.
 */
bool RtfReader::parseHexChar(RtfContext& cx, int num)
{
	string data;

	for(int i = 0; i < num; i++)
	{
		// fgetc() returns an int: keeping it as one lets us tell
		// EOF apart from a 0xFF byte whatever the signedness of char
		int ch = fgetc(cx.file);
		if(ch == EOF)
			return false;

		if((ch >= 'A' && ch <= 'F') ||
		   (ch >= 'a' && ch <= 'f') ||
		   (ch >= '0' && ch <= '9'))
		{
			data.append(1, static_cast<char>(ch));
		}
		else
		{
			m_parseErrors.append((string)"invalid hex char: " + static_cast<char>(ch) + "\n");
		}
	}

	if(m_parseHex)
	{
		char* end = NULL;
		long val = strtol(data.c_str(), &end, 16);

		// An empty digit string would pass the end check below,
		// don't emit a NUL character for it
		if(!data.empty() && end == data.c_str() + data.size())
			sendData(cx, static_cast<wchar_t>(val));
		else
			m_parseErrors.append("invalid hex char: " + data + "\n");
	}
	else
	{
		sendControlWord(cx, data, RtfHandler::kIsEncoded, -1);
	}

	return true;
}

/*
 * Parses a control word or control symbol. The introducing
 * backslash has already been consumed by the caller.
 *
 * Escapes and a handful of well known character control words are
 * turned into character data, unicode control words are handled here
 * when m_parseUnicode is set, and everything else goes to the handler.
 *
 * Returns false when the end of the file is reached inside the
 * control word.
 */
bool RtfReader::parseControlWord(RtfContext& cx)
{
	bool isAsterisk = false;
	string controlword;
	string param;

	for(;;)
	{
		int ch = fgetc(cx.file);
		if(ch == EOF)
			return false;

		bool empty = controlword.empty();

		// Part of the name of a control word
		// NOTE: Although the RTF specification prohibits uppercase
		// control words, MS Word uses them :-/
		if((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'))
			controlword.append(1, static_cast<char>(ch));

		// Part of the parameter of a control word
		else if(ch >= '0' && ch <= '9')
			param.append(1, static_cast<char>(ch));

		// Now handle escapes and other special types of
		// control words. These are all only valid at beginning
		// of the "control word"

		// hex spelled out character
		else if(empty && ch == '\'')
		{
			parseHexChar(cx, 2);
			break;
		}

		// Asterisk type destination
		else if(empty && ch == '*')
		{
			isAsterisk = true;

			// Skip line breaks between the asterisk and the
			// backslash of the control word it applies to
			ch = fgetc(cx.file);
			while(ch != EOF && strchr("\r\n", ch))
				ch = fgetc(cx.file);

			if(ch != '\\')
				ungetc(ch, cx.file);
		}

		// Escaped backslash
		else if(empty && ch == '\\')
		{
			sendData(cx, L'\\');
			break;
		}

		// Escaped braces. These are complete on their own, so
		// whatever follows must not be read as a control word name
		else if(empty && ch == '{')
		{
			sendData(cx, L'{');
			break;
		}

		else if(empty && ch == '}')
		{
			sendData(cx, L'}');
			break;
		}

		// Non breaking space
		else if(empty && ch == '~')
		{
			sendData(cx, 0x00A0);
			break;
		}

		// Optional hyphen
		else if(empty && ch == '-')
		{
			sendData(cx, 0x00AD);
			break;
		}

		// a hyphen right after control word is part of number
		else if(!empty && param.empty() && ch == '-')
		{
			param.append(1, static_cast<char>(ch));
		}

		// TODO: This looks real hokey and acts that
		// way too
#if 0
		// An enter as the first character of a control word
		// makes a paragraph
		else if(strchr("\n\r", ch))
		{
			controlword = "par";
			break;
		}
#endif

		// Space ends a rtf code (but gets eaten)
		else if(strchr(" ", ch))
			break;

		// Anything else (including a backslash) ends a control word
		else
		{
			ungetc(ch, cx.file);
			break;
		}
	}

	// Empty out the control word buffers
	if(!controlword.empty())
	{
		int flags = isAsterisk ? RtfHandler::kAsterisk : 0;
		int numPar = -1;

		if(!param.empty())
		{
			char* end = NULL;
			numPar = strtol(param.c_str(), &end, 10);
			if(end == param.c_str() + param.size())
				flags |= RtfHandler::kHasParam;
		}

		// Here we check for common characters
		if(controlword == "emdash")
			sendData(cx, 0x2014);
		else if(controlword == "endash")
			sendData(cx, 0x2013);
		else if(controlword == "emspace")
			sendData(cx, 0x2003);
		else if(controlword == "enspace")
			sendData(cx, 0x2002);
		else if(controlword == "bullet")
			sendData(cx, 0x2022);
		else if(controlword == "lquote")
			sendData(cx, 0x2018);
		else if(controlword == "rquote")
			sendData(cx, 0x2019);
		else if(controlword == "ldblquote")
			sendData(cx, 0x201C);
		else if(controlword == "rdblquote")
			sendData(cx, 0x201D);

		// Unicode values get sent through
		else if(m_parseUnicode && (flags & RtfHandler::kHasParam) && controlword == "u")
		{
			// The parameter is a signed 16 bit number, so code
			// points above 32767 are written as negative values
			if(numPar < 0)
				numPar += 65536;

			sendData(cx, static_cast<wchar_t>(numPar));

			// Skip the fallback characters that follow
			m_uniEat = m_uniEatStack.empty() ? 0 : m_uniEatStack.top();
		}

		// Unicode destination
		else if(m_parseUnicode && controlword == "ud")
		{

		}

		// Skip value for unicode characters
		else if(m_parseUnicode && controlword == "uc")
		{
			// Replace the skip count of the current group
			if(!m_uniEatStack.empty())
				m_uniEatStack.pop();
			m_uniEatStack.push(numPar);
		}

		// Otherwise we send the control word
		else
		{
			if(m_handler)
				sendControlWord(cx, controlword, flags, numPar);
		}
	}

	return true;
}

/*
 * Parses RTF from an open file, reporting groups, control words
 * and character data to the handler (when one is set). The file is
 * not closed.
 *
 * Returns true when no parse errors were recorded in m_parseErrors.
 */
bool RtfReader::parse(FILE* file)
{
	m_depth = 0;
	m_parseErrors = "";

	// Start every document with a fresh unicode skip state, so that
	// the stack always has a base entry for top() to look at
	m_uniEat = 0;
	while(!m_uniEatStack.empty())
		m_uniEatStack.pop();
	m_uniEatStack.push(0);

	RtfContext cx;
	cx.isData = false;
	cx.file = file;
	cx.data = L"";

	if(m_handler)
		m_handler->startDocument(this);

	bool finished = false;

	while(!finished)
	{
		int ch = fgetc(file);
		if(ch == EOF)
			break;

		// Type is undetermined so we figure it out here
		if(!cx.isData)
		{
			switch(ch)
			{
			case '\\':
				// Fails when the file ends inside the control word
				if(!parseControlWord(cx))
					finished = true;
				break;

			case '{':
				emptyData(cx);

				// A group inherits the skip count of its parent
				m_uniEatStack.push(m_uniEatStack.top());

				if(m_handler)
					m_handler->groupStart();

				m_depth++;
				break;

			case '}':
				emptyData(cx);

				if(m_handler)
					m_handler->groupEnd();

				// Only drop entries pushed by a matching '{'. An
				// unmatched '}' must leave the base entry in place
				if(m_depth > 0 && !m_uniEatStack.empty())
					m_uniEatStack.pop();

				m_depth--;
				break;

			default:
				cx.isData = true;
				break;
			}
		}

		if(cx.isData)
		{
			// We translate tabs into the appropriate control
			// word
			if(ch == '\t')
				sendControlWord(cx, "tab", 0, -1);

			// Don't need this code, the XML outputter
			// takes care of it for us
#if 0
			if(ch == '&')
				sendData(cx, L"&amp;");

			else if(ch == '\'')
				sendData(cx, L"&apos;");

			else if(ch == '"')
				sendData(cx, L"&quot;");

			else if(ch == '<')
				sendData(cx, L"&lt;");

			else if(ch == '>')
				sendData(cx, L"&gt;");
#endif

			// Line breaks in the file are not character data
			else if(!strchr("\r\n", ch))
				sendData(cx, static_cast<wchar_t>(ch));

			cx.isData = false;
		}
	}

	// Don't lose text that was still buffered when the file ended
	emptyData(cx);

	if(m_depth != 0)
		m_parseErrors.append("unmatched braces\n");

	if(m_handler)
		m_handler->endDocument();

	return m_parseErrors.empty();
}