/*
 * Copyright (c) 2004, Nate Nielsen
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *     * Redistributions of source code must retain the above
 *       copyright notice, this list of conditions and the
 *       following disclaimer.
 *     * Redistributions in binary form must reproduce the
 *       above copyright notice, this list of conditions and
 *       the following disclaimer in the documentation and/or
 *       other materials provided with the distribution.
 *     * The names of contributors to this software may not be
 *       used to endorse or promote products derived from this
 *       software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
 * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 *
 * CONTRIBUTORS
 *  Nate Nielsen
 *
 */

#include "usuals.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "rtfparser.h"
#include "internal.h"

/* Character data is handed to the handler once the buffer grows past this size */
const unsigned int MAX_CHUNK = 4096;

/*
 * Maps a single byte from an RTF \'hh escape to a Unicode code point.
 * An entry of 0x0000 means "drop this character" (see parseHexChar).
 * The table must have exactly 256 entries.
 */
const wchar_t kAnsiToUnicode[] =
{
    /* Mostly invalid, but used for weird things in RTF anyway :( */
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0009, 0x000A, 0x0000, 0x0000, 0x000D, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,

    /* Low 7 bit, same */
    0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
    0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
    0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
    0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
    0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,
    0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
    0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
    0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
    0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F,

    /* Changes in the middle here to random chars */
    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178,

    /* High stuff, all the same */
    0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
    0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
    0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
    0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
    0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
    0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
    0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
    0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF,
    0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
    0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
    0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
    0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF
};

/* Control words that are translated directly into one Unicode character */
struct RtfNamedChar
{
    const char* name;
    wchar_t ch;
};

static const RtfNamedChar kNamedChars[] =
{
    { "emdash",    0x2014 },
    { "endash",    0x2013 },
    { "emspace",   0x2003 },
    { "enspace",   0x2002 },
    { "bullet",    0x2022 },
    { "lquote",    0x2018 },
    { "rquote",    0x2019 },
    { "ldblquote", 0x201C },
    { "rdblquote", 0x201D }
};

/* Looks up a control word in kNamedChars; returns true and sets ch when found */
static bool findNamedChar(const string& word, wchar_t& ch)
{
    const size_t count = sizeof(kNamedChars) / sizeof(kNamedChars[0]);
    for(size_t i = 0; i < count; i++)
    {
        if(word == kNamedChars[i].name)
        {
            ch = kNamedChars[i].ch;
            return true;
        }
    }

    return false;
}

/* ----------------------------------------------------------------------------------
 * CONSTRUCTION
 */

/*
 * Creates a parser with no handler attached. Hex escapes are decoded
 * by default, \u unicode handling is off by default.
 */
RtfParser::RtfParser()
{
    m_handler = NULL;
    m_depth = 0;
    m_parseHex = true;
    m_parseUnicode = false;
    m_uniEat = 0;

    // Base entry: number of chars to skip after a \u at group depth zero
    m_uniEatStack.push(0);
}

RtfParser::~RtfParser()
{

}

/* ----------------------------------------------------------------------------------
 * PUBLIC METHODS
 */

/*
 * Opens the named file and parses it. Returns false when the file
 * cannot be opened, otherwise the result of parse(FILE*).
 */
bool RtfParser::parse(string fileName)
{
    FILE* file = fopen(fileName.c_str(), "r");
    if(!file)
        return false;

    bool ret = parse(file);
    fclose(file);

    return ret;
}

/*
 * Parses RTF from an already opened stream, reporting groups, control
 * words and character data to the handler (when one is set). The stream
 * is not closed. Returns true when no parse errors were recorded.
 */
bool RtfParser::parse(FILE* file)
{
    m_depth = 0;
    m_uniEat = 0;
    m_parseErrors = "";
    m_file = file;

    // Start from the same unicode skip state as a freshly built parser,
    // whatever a previous (possibly unbalanced) document left behind
    while(!m_uniEatStack.empty())
        m_uniEatStack.pop();
    m_uniEatStack.push(0);

    if(m_handler)
        m_handler->startDocument(this);

    for(;;)
    {
        int ch = fgetc(file);
        if(ch == EOF)
            break;

        // Starting a control word
        if(ch == '\\')
        {
            if(!parseControlWord())
                break;
        }

        // Starting an RTF group
        else if(ch == '{')
        {
            // Send all previous data
            flushData();

            // The new group inherits the unicode skip count of its parent.
            // The stack can only be empty here after unbalanced '}'.
            m_uniEatStack.push(m_uniEatStack.empty() ? 0 : m_uniEatStack.top());

            if(m_handler)
                m_handler->groupStart();

            m_depth++;
        }

        // Ending an RTF group
        else if(ch == '}')
        {
            // Send all previous data
            flushData();

            if(m_handler)
                m_handler->groupEnd();

            // Drop this group's unicode skip count
            if(!m_uniEatStack.empty())
                m_uniEatStack.pop();

            m_depth--;
        }

        // We translate tabs into the appropriate control word
        else if(ch == '\t')
        {
            sendControlWord("tab", 0, -1);
        }

        // Line endings aren't used. Note that strchr also matches the
        // string terminator, so NUL bytes are dropped here as well.
        else if(!strchr("\r\n", ch))
        {
            sendData(ch);
        }
    }

    if(m_depth != 0)
        m_parseErrors.append("unmatched braces\n");

    // Hand over text still buffered when the input ended outside a group
    flushData();

    if(m_handler)
        m_handler->endDocument();

    m_file = NULL;
    m_dataBuffer.resize(0);

    // If any parse errors return failure
    return m_parseErrors.empty();
}

/* ----------------------------------------------------------------------------------
 * HANDLER CALLS
 */

/* Passes any buffered character data to the handler and empties the buffer */
void RtfParser::flushData()
{
    if(m_dataBuffer.empty())
        return;

    if(m_handler)
        m_handler->charData(m_dataBuffer);

    m_dataBuffer.resize(0);
}

/*
 * Buffers one character, unless it is one of the characters that must
 * be skipped after a \u control word.
 */
void RtfParser::sendData(wchar_t ch)
{
    // Skip unicode chars we've been asked to
    if(m_uniEat > 0)
        m_uniEat--;
    else
        transcode16to8(ch, m_dataBuffer);

    if(m_dataBuffer.size() > MAX_CHUNK)
        flushData();
}

/*
 * Buffers a run of characters, first dropping as many leading
 * characters as are still to be skipped after a \u control word.
 */
void RtfParser::sendData(const wstring& data)
{
    // Skip any unicode chars we've been asked to
    if(m_uniEat > 0)
    {
        int len = data.size();
        if(len > m_uniEat)
            len = m_uniEat;

        transcode16to8(data.substr(len), m_dataBuffer);
        m_uniEat -= len;
    }
    else
    {
        transcode16to8(data, m_dataBuffer);
    }

    if(m_dataBuffer.size() > MAX_CHUNK)
        flushData();
}

/*
 * Reports a control word to the handler. Buffered character data is
 * flushed first so the handler sees events in document order.
 */
void RtfParser::sendControlWord(const string& cw, int flags, int param)
{
    flushData();

    if(m_handler)
        m_handler->controlWord(cw, flags, param);
}

/* ----------------------------------------------------------------------------------
 * PARSE HELPERS
 */

/*
 * Reads 'num' hex digits from the input and emits the character they
 * encode. When 'ansi' is set the value is mapped through kAnsiToUnicode.
 * Returns false only when the input ends before 'num' chars were read;
 * malformed digits are recorded in m_parseErrors.
 */
bool RtfParser::parseHexChar(int num, bool ansi)
{
    string data;

    // Ansi is only 256 chars long
    ASSERT(num == 2 || !ansi);

    // Get num chars and put them in the string
    for(int i = 0; i < num; i++)
    {
        // Must stay an int: a char cannot tell byte 0xFF apart from EOF
        int ch = fgetc(m_file);
        if(ch == EOF)
            return false;

        bool isHex = (ch >= 'A' && ch <= 'F') ||
                     (ch >= 'a' && ch <= 'f') ||
                     (ch >= '0' && ch <= '9');

        if(isHex)
            data.append(1, (char)ch);
        else
            m_parseErrors.append((string)"invalid hex char: " + (char)ch + "\n");
    }

    // TODO: Why would we ever want to do this?
    // When not parsing hex just send as a hex control word
    if(!m_parseHex)
    {
        sendControlWord(data, RtfHandler::kIsEncoded, -1);
        return true;
    }

    // Otherwise convert to the appropriate unicode
    char* end = NULL;
    int val = strtol(data.c_str(), &end, 16);

    if(end != data.c_str() + data.size())
    {
        m_parseErrors.append("invalid hex char: " + data + "\n");
    }
    else if(!ansi)
    {
        sendData(val);
    }
    else
    {
        ASSERT((sizeof(kAnsiToUnicode) / sizeof(kAnsiToUnicode[0])) == 256);

        if(val < 0 || val >= 256)
        {
            m_parseErrors.append("invalid ansi char: " + data + "\n");
        }
        else
        {
            // A zero entry marks a character that is dropped
            wchar_t ch = kAnsiToUnicode[val];
            if(ch)
                sendData(ch);
        }
    }

    return true;
}

/*
 * Parses whatever follows a backslash: a control word with an optional
 * numeric parameter, a \'hh hex escape, or a control symbol (\\ \{ \}
 * \~ \- \*). Returns false when the input ends in the middle of it.
 */
bool RtfParser::parseControlWord()
{
    bool isAsterisk = false;
    string controlword;
    string param;

    for(;;)
    {
        int ch = fgetc(m_file);
        if(ch == EOF)
            return false;

        bool empty = controlword.empty();

        // Part of the name of a control word
        // NOTE: Although the RTF specification prohibits uppercase
        // control words, MS Word uses them :-/
        if((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'))
        {
            controlword.append(1, (char)ch);
        }

        // Part of the parameter of a control word
        else if(ch >= '0' && ch <= '9')
        {
            param.append(1, (char)ch);
        }

        // Hyphens after the name are part of the parameter. A misplaced
        // hyphen is caught below, when the parameter fails to convert.
        else if(ch == '-' && !empty)
        {
            param.append(1, (char)ch);
        }

        // Now handle escapes and other special types of
        // control words. These are all only valid at beginning
        // of the "control word"

        // Hex spelled out character
        else if(empty && ch == '\'')
        {
            parseHexChar(2, true);
            break;
        }

        // Asterisk type destination
        else if(empty && ch == '*')
        {
            isAsterisk = true;

            // Skip line endings, then swallow the backslash of the
            // control word that the asterisk applies to
            ch = fgetc(m_file);
            while(strchr("\r\n", ch))
                ch = fgetc(m_file);

            if(ch != '\\')
                ungetc(ch, m_file);
        }

        // Escaped backslash
        else if(empty && ch == '\\')
        {
            sendData(L'\\');
            break;
        }

        // Escaped braces. These are complete control symbols, so stop
        // here rather than reading the following text as a control word.
        else if(empty && ch == '{')
        {
            sendData(L'{');
            break;
        }

        else if(empty && ch == '}')
        {
            sendData(L'}');
            break;
        }

        // Non breaking space
        else if(empty && ch == '~')
        {
            sendData(0x00A0);
            break;
        }

        // Optional hyphen
        else if(empty && ch == '-')
        {
            sendData(0x00AD);
            break;
        }

        // Space at end a rtf code (it gets eaten)
        else if(strchr(" ", ch))
        {
            break;
        }

        // Anything else (including a backslash ends a control word)
        else
        {
            ungetc(ch, m_file);
            break;
        }
    }

    // Nothing more to do for control symbols and escapes
    if(controlword.empty())
        return true;

    int flags = isAsterisk ? RtfHandler::kAsterisk : 0;
    int numPar = -1;

    if(!param.empty())
    {
        // Only flag the parameter when all of it is a valid number
        char* end = NULL;
        numPar = strtol(param.c_str(), &end, 10);
        if(end == param.c_str() + param.size())
            flags |= RtfHandler::kHasParam;
    }

    wchar_t named = 0;

    // Here we check for common characters
    if(findNamedChar(controlword, named))
    {
        sendData(named);
    }

    // Unicode values get sent through
    else if(m_parseUnicode && (flags & RtfHandler::kHasParam) && controlword == "u")
    {
        // RTF plays hokey and uses negative values in unicode
        sendData((wchar_t)(unsigned short)(short)numPar);

        // Skip the fallback characters that follow
        m_uniEat = m_uniEatStack.empty() ? 0 : m_uniEatStack.top();
    }

    // Skip value for unicode characters
    else if(m_parseUnicode && controlword == "uc")
    {
        // Replace the skip count of the current group
        if(!m_uniEatStack.empty())
            m_uniEatStack.pop();
        m_uniEatStack.push(numPar);
    }

    // Otherwise we send the control word
    else
    {
        if(m_handler)
            sendControlWord(controlword, flags, numPar);
    }

    return true;
}