From 507524b97ef3bedb42f6c15ec93eedff8ee4b150 Mon Sep 17 00:00:00 2001 From: "(no author)" <(no author)> Date: Wed, 26 Nov 2003 02:12:18 +0000 Subject: New repository initialized by cvs2svn. --- src/rtfparser.cpp | 442 ------------------------------------------------------ 1 file changed, 442 deletions(-) delete mode 100644 src/rtfparser.cpp (limited to 'src/rtfparser.cpp') diff --git a/src/rtfparser.cpp b/src/rtfparser.cpp deleted file mode 100644 index c136e95..0000000 --- a/src/rtfparser.cpp +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Copyright (c) 2004, Nate Nielsen - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above - * copyright notice, this list of conditions and the - * following disclaimer. - * * Redistributions in binary form must reproduce the - * above copyright notice, this list of conditions and - * the following disclaimer in the documentation and/or - * other materials provided with the distribution. - * * The names of contributors to this software may not be - * used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS - * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF - * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH - * DAMAGE. - * - * - * CONTRIBUTORS - * Nate Nielsen - * - */ - -#include "usuals.h" - -#include -#include -#include "rtfparser.h" - - -/* ---------------------------------------------------------------------------------- - * CONSTRUCTION - */ - -RtfParser::RtfParser() -{ - m_handler = NULL; - m_depth = 0; - m_parseHex = true; - m_parseUnicode = false; - m_uniEat = 0; - m_uniEatStack.push(0); -} - -RtfParser::~RtfParser() -{ - -} - - -/* ---------------------------------------------------------------------------------- - * PUBLIC METHODS - */ - -bool RtfParser::parse(string fileName) -{ - FILE* file = fopen(fileName.c_str(), "r"); - if(!file) - return false; - - bool ret = parse(file); - - fclose(file); - - return ret; -} - -bool RtfParser::parse(FILE* file) -{ - int ch = 0; - bool isData = false; - - m_depth = 0; - m_parseErrors = ""; - m_file = file; - - if(m_handler) - m_handler->startDocument(this); - - while(1) - { - ch = fgetc(file); - if(ch == EOF) - goto done; - - switch(ch) - { - - // Starting a control word - case '\\': - if(!parseControlWord()) - goto done; - break; - - // Starting an RTF group - case '{': - { - // Send all previous data - flushData(); - - // Handle any unicode destinations properly - m_uniEatStack.push(m_uniEatStack.top()); - - if(m_handler) - m_handler->groupStart(); - - m_depth++; - } - break; - - case '}': - { - // Send all previous data - flushData(); - - if(m_handler) - m_handler->groupEnd(); - - // Handle any unicode destinations properly - if(!m_uniEatStack.empty()) - m_uniEatStack.pop(); - - m_depth--; - } - break; - - default: - isData = true; - break; - } - - if(isData) - { - // We translate tabs into the appropriate control word - if(ch == '\t') - sendControlWord("tab", 0, -1); - - // line endings aren't used - else if(!strchr("\r\n", ch)) - sendData(ch); - - isData = false; - } - } - - -done: - - if(m_depth != 0) - m_parseErrors.append("unmatched braces\n"); - - if(m_handler) - m_handler->endDocument(); - - m_file = NULL; - m_dataBuffer.resize(0); - - // If any parse errors return failure - return m_parseErrors.empty(); -} - - -/* ---------------------------------------------------------------------------------- - * HANDLER CALLS - */ - -void RtfParser::flushData() -{ - if(!m_dataBuffer.empty()) - { - if(m_handler) - m_handler->charData(m_dataBuffer); - - m_dataBuffer.resize(0); - } -} - -void RtfParser::sendData(wchar_t ch) -{ - // Skip unicode chars we've been asked to - if(m_uniEat > 0) - m_uniEat--; - - else - m_dataBuffer.append(1, ch); -} - -void RtfParser::sendData(wstring data) -{ - // Skip any unicode chars we've been asked to - if(m_uniEat > 0) - { - int len = data.size(); - if(len > m_uniEat) - len = m_uniEat; - - m_dataBuffer.append(data.substr(len)); - m_uniEat -= len; - } - else - { - m_dataBuffer.append(data); - } -} - -void RtfParser::sendControlWord(string cw, int flags, int param) -{ - flushData(); - - if(m_handler) - m_handler->controlWord(cw, flags, param); -} - - -/* ---------------------------------------------------------------------------------- - * PARSE HELPERS - */ - -bool RtfParser::parseHexChar(int num) -{ - string data; - - // Get num chars and put them in the string - for(int i = 0; i < num; i++) - { - char ch = fgetc(m_file); - - if(ch == -1) - return false; - - if((ch >= 'A' && ch <= 'F') || - (ch >= 'a' && ch <= 'f') || - (ch >= '0' && ch <= '9')) - { - data.append(1, ch); - } - else - { - m_parseErrors.append((string)"invalid hex char: " + ch + "\n"); - } - } - - // If parsing hex, then convert to appropriate unicode - if(m_parseHex) - { - char* end = NULL; - int val = strtol(data.c_str(), &end, 16); - if(end == data.c_str() + data.size() && m_parseHex) - sendData(val); - else - m_parseErrors.append("invalid hex char: " + data + "\n"); - } - - // TODO: Why would we ever want to do this? - // Otherwise just send as a hex control word - else - { - sendControlWord(data, RtfHandler::kIsEncoded, -1); - } - - return true; -} - -bool RtfParser::parseControlWord() -{ - bool isAsterisk = false; - string controlword; - string param; - - while(1) - { - int ch = fgetc(m_file); - if(ch == WEOF) - return false; - - bool empty = controlword.empty(); - - // Part of the name of a control word - // NOTE: Although the RTF specification prohibits uppercase - // control words, MS Word uses them :-/ - if(ch >= 'a' && ch <= 'z' || ch >= 'A' && ch <= 'Z') - controlword.append(1, (char)ch); - - // Part of the parameter of a control word - else if(ch >= '0' && ch <= '9') - param.append(1, (char)ch); - - // Now handle escapes and other special types of - // control words. These are all only valid at beginning - // of the "control word" - - // hex spelled out character - else if(empty && ch == '\'') - { - parseHexChar(2); - break; - } - - // Asterisk type destination - else if(empty && ch == '*') - { - isAsterisk = true; - - ch = fgetc(m_file); - while(strchr("\r\n", ch)) - ch = fgetc(m_file); - - if(ch != '\\') - ungetc(ch, m_file); - } - - // Escaped backslash - else if(empty && ch == '\\') - { - sendData(L'\\'); - break; - } - - // Escaped braces - else if(empty && ch == '{') - { - sendData(L'{'); - } - - else if(empty && ch == '}') - { - sendData(L'}'); - } - - // Non breaking space - else if(empty && ch == '~') - { - sendData(0x00A0); - break; - } - - // Optional hyphen - else if(empty && ch == '-') - { - sendData(0x00AD); - break; - } - - // a hyphen right after control word is part of number - else if(!empty && param.empty() && ch == '-') - { - param.append(1, (char)ch); - } - - // Space at end a rtf code (it gets eaten) - else if(strchr(" ", ch)) - break; - - // Anything else (including a backslash ends a control word) - else - { - ungetc(ch, m_file); - break; - } - } - - // Empty out the control word buffers - if(!controlword.empty()) - { - int flags = isAsterisk ? RtfHandler::kAsterisk : 0; - int numPar = -1; - - if(!param.empty()) - { - char* end = NULL; - numPar = strtol(param.c_str(), &end, 10); - if(end == param.c_str() + param.size()) - flags += RtfHandler::kHasParam; - } - - // Here we check for common characters - if(controlword == "emdash") - sendData(0x2014); - else if(controlword == "endash") - sendData(0x2013); - else if(controlword == "emspace") - sendData(0x2003); - else if(controlword == "enspace") - sendData(0x2002); - else if(controlword == "bullet") - sendData(0x2022); - else if(controlword == "lquote") - sendData(0x2018); - else if(controlword == "rquote") - sendData(0x2019); - else if(controlword == "ldblquote") - sendData(0x201C); - else if(controlword == "rdblquote") - sendData(0x201D); - - // Unicode values get sent through - else if(m_parseUnicode && flags & RtfHandler::kHasParam && - controlword == "u" ) - { - sendData(numPar); - m_uniEat = m_uniEatStack.top(); - } - - // Unicode destination - else if(m_parseUnicode && controlword == "ud") - { - - } - - // Skip value for unicode characters - else if(m_parseUnicode && controlword == "uc") - { - m_uniEatStack.pop(); - m_uniEatStack.push(numPar); - } - - // Otherwise we send the control word - else - { - if(m_handler) - sendControlWord(controlword, flags, numPar); - } - } - - return true; -} - -- cgit v1.2.3