summaryrefslogtreecommitdiff
path: root/src/rtfparser.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/rtfparser.cpp')
-rw-r--r--src/rtfparser.cpp442
1 files changed, 0 insertions, 442 deletions
diff --git a/src/rtfparser.cpp b/src/rtfparser.cpp
deleted file mode 100644
index c136e95..0000000
--- a/src/rtfparser.cpp
+++ /dev/null
@@ -1,442 +0,0 @@
-/*
- * Copyright (c) 2004, Nate Nielsen
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the
- * following disclaimer.
- * * Redistributions in binary form must reproduce the
- * above copyright notice, this list of conditions and
- * the following disclaimer in the documentation and/or
- * other materials provided with the distribution.
- * * The names of contributors to this software may not be
- * used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
- * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
- * DAMAGE.
- *
- *
- * CONTRIBUTORS
- * Nate Nielsen <nielsen@memberwebs.com>
- *
- */
-
-#include "usuals.h"
-
-#include <stdlib.h>
-#include <stdio.h>
-#include "rtfparser.h"
-
-
-/* ----------------------------------------------------------------------------------
- * CONSTRUCTION
- */
-
-/*
- * Creates a parser with no handler attached. Hex escapes are decoded
- * by default (m_parseHex) while \u handling starts switched off
- * (m_parseUnicode).
- */
-RtfParser::RtfParser()
-{
-    // Unicode skip bookkeeping: nothing pending, plus one base entry of 0
-    // on the stack so there is a value to copy when a group is opened.
-    m_uniEatStack.push(0);
-    m_uniEat = 0;
-
-    // Option defaults
-    m_parseUnicode = false;
-    m_parseHex = true;
-
-    // No document in progress and no handler registered yet
-    m_depth = 0;
-    m_handler = NULL;
-}
-
-/* Destructor: the body is empty, no explicit cleanup is performed here. */
-RtfParser::~RtfParser()
-{
-}
-
-
-/* ----------------------------------------------------------------------------------
- * PUBLIC METHODS
- */
-
-/*
- * Opens fileName for reading and hands the stream to parse(FILE*).
- *
- * Returns false when the file cannot be opened; otherwise returns the
- * result of parse(FILE*). The stream is closed before returning.
- */
-bool RtfParser::parse(string fileName)
-{
-    FILE* input = fopen(fileName.c_str(), "r");
-    if(input == NULL)
-        return false;
-
-    const bool parsedOk = parse(input);
-    fclose(input);
-    return parsedOk;
-}
-
-/*
- * Parses an RTF stream, reporting groups, control words and character
- * data to the registered handler (if any).
- *
- * file: an open stream to read from. It is not closed here.
- *
- * Returns true when no parse errors were recorded in m_parseErrors,
- * false otherwise (for example on unbalanced braces).
- */
-bool RtfParser::parse(FILE* file)
-{
-    // Every document starts from the same state the constructor set up.
-    // Without this, skip counts or stack entries left behind by an
-    // earlier (possibly malformed) document would leak into this one.
-    m_depth = 0;
-    m_parseErrors = "";
-    m_file = file;
-    m_uniEat = 0;
-    while(!m_uniEatStack.empty())
-        m_uniEatStack.pop();
-    m_uniEatStack.push(0);
-
-    if(m_handler)
-        m_handler->startDocument(this);
-
-    for(;;)
-    {
-        int ch = fgetc(file);
-        if(ch == EOF)
-            break;
-
-        // Starting a control word
-        if(ch == '\\')
-        {
-            if(!parseControlWord())
-                break;
-        }
-
-        // Starting an RTF group
-        else if(ch == '{')
-        {
-            // Send all previous data
-            flushData();
-
-            // The new group inherits the enclosing group's unicode
-            // skip count. The stack always holds at least its base entry.
-            m_uniEatStack.push(m_uniEatStack.top());
-
-            if(m_handler)
-                m_handler->groupStart();
-
-            m_depth++;
-        }
-
-        // Ending an RTF group
-        else if(ch == '}')
-        {
-            // Send all previous data
-            flushData();
-
-            if(m_handler)
-                m_handler->groupEnd();
-
-            // Drop this group's skip count, but never the base entry:
-            // an unmatched '}' would otherwise empty the stack and make
-            // the next top()/pop() undefined behaviour.
-            if(m_uniEatStack.size() > 1)
-                m_uniEatStack.pop();
-
-            m_depth--;
-        }
-
-        // We translate tabs into the appropriate control word
-        else if(ch == '\t')
-        {
-            sendControlWord("tab", 0, -1);
-        }
-
-        // Line endings aren't used (a NUL byte also matches strchr's
-        // terminator and is dropped the same way)
-        else if(!strchr("\r\n", ch))
-        {
-            sendData(ch);
-        }
-    }
-
-    if(m_depth != 0)
-        m_parseErrors.append("unmatched braces\n");
-
-    if(m_handler)
-        m_handler->endDocument();
-
-    m_file = NULL;
-    m_dataBuffer.resize(0);
-
-    // Success means no parse errors were recorded
-    return m_parseErrors.empty();
-}
-
-
-/* ----------------------------------------------------------------------------------
- * HANDLER CALLS
- */
-
-/*
- * Delivers any buffered character data to the handler (when one is
- * set) and empties the buffer. Does nothing when the buffer is empty.
- */
-void RtfParser::flushData()
-{
-    if(m_dataBuffer.empty())
-        return;
-
-    if(m_handler)
-        m_handler->charData(m_dataBuffer);
-
-    // The buffer is emptied whether or not a handler consumed it
-    m_dataBuffer.resize(0);
-}
-
-/*
- * Queues one character of document text, unless a pending unicode
- * skip count says this character must be dropped instead.
- */
-void RtfParser::sendData(wchar_t ch)
-{
-    // Nothing left to skip: the character is real data
-    if(!(m_uniEat > 0))
-    {
-        m_dataBuffer.append(1, ch);
-        return;
-    }
-
-    // Otherwise this character is consumed by the skip count
-    m_uniEat--;
-}
-
-/*
- * Queues a run of document text. When a unicode skip count is pending,
- * that many leading characters (at most the whole string) are dropped
- * and the skip count is reduced accordingly.
- */
-void RtfParser::sendData(wstring data)
-{
-    // Common case: nothing to skip
-    if(!(m_uniEat > 0))
-    {
-        m_dataBuffer.append(data);
-        return;
-    }
-
-    // Drop the smaller of the string length and the skip count
-    int skipped = data.size();
-    if(m_uniEat < skipped)
-        skipped = m_uniEat;
-
-    m_uniEat -= skipped;
-    m_dataBuffer.append(data.substr(skipped));
-}
-
-/*
- * Reports a control word to the handler, if one is set.
- *
- * cw:    control word name
- * flags: RtfHandler flag bits describing the control word
- * param: numeric parameter (callers in this file pass -1 for none)
- */
-void RtfParser::sendControlWord(string cw, int flags, int param)
-{
-    // Buffered text must reach the handler ahead of the control word
-    flushData();
-
-    if(!m_handler)
-        return;
-
-    m_handler->controlWord(cw, flags, param);
-}
-
-
-/* ----------------------------------------------------------------------------------
- * PARSE HELPERS
- */
-
-/*
- * Reads `num` hexadecimal digits from the input (the digits following
- * a \' escape) and emits the value they encode.
- *
- * When m_parseHex is set the value is queued as character data;
- * otherwise the digit string is sent as a control word flagged
- * RtfHandler::kIsEncoded.
- *
- * A non-hex character is recorded in m_parseErrors and nothing is
- * emitted for the escape.
- *
- * Returns false only when the input ends before `num` characters were
- * read, true otherwise.
- */
-bool RtfParser::parseHexChar(int num)
-{
-    string data;
-    bool allHex = true;
-
-    // Get num chars and put them in the string
-    for(int i = 0; i < num; i++)
-    {
-        // Keep the result as an int: narrowing to char hides EOF where
-        // char is unsigned and mistakes byte 0xFF for it where signed.
-        int ch = fgetc(m_file);
-        if(ch == EOF)
-            return false;
-
-        if((ch >= 'A' && ch <= 'F') ||
-           (ch >= 'a' && ch <= 'f') ||
-           (ch >= '0' && ch <= '9'))
-        {
-            data.append(1, static_cast<char>(ch));
-        }
-        else
-        {
-            allHex = false;
-            m_parseErrors.append((string)"invalid hex char: " +
-                                 static_cast<char>(ch) + "\n");
-        }
-    }
-
-    // The error is already recorded. Converting a partial or empty
-    // digit string would emit a bogus character (NUL for an empty one).
-    if(!allHex || data.empty())
-        return true;
-
-    // If parsing hex, then convert to appropriate unicode
-    if(m_parseHex)
-    {
-        char* end = NULL;
-        int val = strtol(data.c_str(), &end, 16);
-        if(end == data.c_str() + data.size())
-            sendData(static_cast<wchar_t>(val));
-        else
-            m_parseErrors.append("invalid hex char: " + data + "\n");
-    }
-
-    // TODO: Why would we ever want to do this?
-    // Otherwise just send as a hex control word
-    else
-    {
-        sendControlWord(data, RtfHandler::kIsEncoded, -1);
-    }
-
-    return true;
-}
-
-/*
- * Parses whatever follows a backslash: a control word with optional
- * numeric parameter, a control symbol (\\ \{ \} \~ \-), a \'hh hex
- * escape, or a \* destination marker followed by a control word.
- *
- * Control words naming common typographic characters, and (when
- * m_parseUnicode is set) \u, \ud and \uc, are handled here. Everything
- * else is passed to the handler through sendControlWord().
- *
- * Returns false when the input ends inside the construct, true
- * otherwise.
- */
-bool RtfParser::parseControlWord()
-{
-    bool isAsterisk = false;
-    string controlword;
-    string param;
-
-    while(1)
-    {
-        // fgetc() signals end of input with EOF. WEOF belongs to the wide
-        // character functions and is not guaranteed to compare equal.
-        int ch = fgetc(m_file);
-        if(ch == EOF)
-            return false;
-
-        bool empty = controlword.empty();
-        bool isLetter = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
-
-        // Part of the name of a control word
-        // NOTE: Although the RTF specification prohibits uppercase
-        // control words, MS Word uses them :-/
-        // A letter that follows the numeric parameter is NOT part of
-        // the name: it ends the control word (e.g. "\u8212G").
-        if(isLetter && param.empty())
-            controlword.append(1, (char)ch);
-
-        // Part of the parameter of a control word
-        else if(ch >= '0' && ch <= '9')
-            param.append(1, (char)ch);
-
-        // Now handle escapes and other special types of
-        // control words. These are all only valid at beginning
-        // of the "control word"
-
-        // hex spelled out character
-        else if(empty && ch == '\'')
-        {
-            // Input ended inside the escape
-            if(!parseHexChar(2))
-                return false;
-            break;
-        }
-
-        // Asterisk type destination
-        else if(empty && ch == '*')
-        {
-            isAsterisk = true;
-
-            ch = fgetc(m_file);
-            while(strchr("\r\n", ch))
-                ch = fgetc(m_file);
-
-            if(ch == EOF)
-                return false;
-
-            // Swallow the backslash of the control word that follows;
-            // anything else goes back for normal processing
-            if(ch != '\\')
-                ungetc(ch, m_file);
-        }
-
-        // Escaped backslash
-        else if(empty && ch == '\\')
-        {
-            sendData(L'\\');
-            break;
-        }
-
-        // Escaped braces. These are complete control symbols, so stop
-        // here: otherwise the text after "\{" would be read as a
-        // control word name.
-        else if(empty && ch == '{')
-        {
-            sendData(L'{');
-            break;
-        }
-
-        else if(empty && ch == '}')
-        {
-            sendData(L'}');
-            break;
-        }
-
-        // Non breaking space
-        else if(empty && ch == '~')
-        {
-            sendData(0x00A0);
-            break;
-        }
-
-        // Optional hyphen
-        else if(empty && ch == '-')
-        {
-            sendData(0x00AD);
-            break;
-        }
-
-        // a hyphen right after control word is part of number
-        else if(!empty && param.empty() && ch == '-')
-        {
-            param.append(1, (char)ch);
-        }
-
-        // Space at the end of an rtf code (it gets eaten)
-        else if(strchr(" ", ch))
-            break;
-
-        // Anything else (including a backslash) ends a control word
-        else
-        {
-            ungetc(ch, m_file);
-            break;
-        }
-    }
-
-    // Empty out the control word buffers
-    if(!controlword.empty())
-    {
-        int flags = isAsterisk ? RtfHandler::kAsterisk : 0;
-        int numPar = -1;
-
-        if(!param.empty())
-        {
-            char* end = NULL;
-            numPar = strtol(param.c_str(), &end, 10);
-            if(end == param.c_str() + param.size())
-                flags += RtfHandler::kHasParam;
-        }
-
-        // Here we check for common characters
-        if(controlword == "emdash")
-            sendData(0x2014);
-        else if(controlword == "endash")
-            sendData(0x2013);
-        else if(controlword == "emspace")
-            sendData(0x2003);
-        else if(controlword == "enspace")
-            sendData(0x2002);
-        else if(controlword == "bullet")
-            sendData(0x2022);
-        else if(controlword == "lquote")
-            sendData(0x2018);
-        else if(controlword == "rquote")
-            sendData(0x2019);
-        else if(controlword == "ldblquote")
-            sendData(0x201C);
-        else if(controlword == "rdblquote")
-            sendData(0x201D);
-
-        // Unicode values get sent through
-        else if(m_parseUnicode && (flags & RtfHandler::kHasParam) &&
-                controlword == "u")
-        {
-            // RTF writes \u as a signed 16 bit number, so code units
-            // above 32767 arrive negative; map them back.
-            if(numPar < 0)
-                numPar += 65536;
-
-            sendData(static_cast<wchar_t>(numPar));
-
-            // Arrange for the fallback characters that follow to be skipped
-            m_uniEat = m_uniEatStack.empty() ? 0 : m_uniEatStack.top();
-        }
-
-        // Unicode destination
-        else if(m_parseUnicode && controlword == "ud")
-        {
-
-        }
-
-        // Skip value for unicode characters
-        else if(m_parseUnicode && controlword == "uc")
-        {
-            // A missing or negative count means nothing is skipped
-            int skip = 0;
-            if((flags & RtfHandler::kHasParam) && numPar > 0)
-                skip = numPar;
-
-            // Replace the current group's entry; never pop an empty stack
-            if(!m_uniEatStack.empty())
-                m_uniEatStack.pop();
-            m_uniEatStack.push(skip);
-        }
-
-        // Otherwise we send the control word
-        else
-        {
-            if(m_handler)
-                sendControlWord(controlword, flags, numPar);
-        }
-    }
-
-    return true;
-}
-