#include "usuals.h"
#include "rtfanalyser.h"

// ---------------------------------------------------------------------------
// Element and attribute names used while building the XML tree.
//
// Names prefixed with "i_" are intermediate markers: within this file
// i_dest, i_listtable and i_listdef are unwrapped by removeTags(), i_block
// is renamed by fixBlocks(), i_fix is consumed by fixBlocks() and i_cell by
// breakTables().

const char* kElDest = "i_dest";
const char* kElBlock = "i_block";
const char* kAtFix = "i_fix";
const char* kAtCell = "i_cell";
const char* kElListtable = "i_listtable";
const char* kElListdef = "i_listdef";

const char* kElPara = "para";
const char* kElDoc = "document";
const char* kElTab = "tab";
const char* kElSect = "sect";
const char* kElPage = "page";
const char* kElStyle = "style";
const char* kElLine = "line";
const char* kElList = "list";
const char* kElStylesheet = "stylesheet";
const char* kElInfo = "info";
const char* kElTitle = "title";
const char* kElAuthor = "author";
const char* kElOperator = "operator";
const char* kElB = "b";
const char* kElHide = "hide";
const char* kElI = "i";
const char* kElStrike = "strike";
const char* kElU = "u";
const char* kElColor = "color";
const char* kElCell = "cell";
const char* kElRow = "row";
const char* kElTable = "table";

const char* kAtList = "list";
const char* kAtName = "name";
const char* kAtBold = "bold";
const char* kAtHidden = "hide";
const char* kAtItalic = "italic";
const char* kAtStrike = "strike";
const char* kAtUnderline = "underline";
const char* kAtColor = "color";
const char* kAtType = "type";
const char* kAtOrdered = "ordered";
const char* kAtStart = "start";
const char* kAtId = "id";
// NOTE(review): same value as kAtId ("id"); confirm this is intended and
// was not meant to be "index".
const char* kAtIndex = "id";

// Attribute values
const wchar_t* kValDisc = L"disc";
const wchar_t* kValLowerAlpha = L"lower-alpha";
const wchar_t* kValUpperAlpha = L"upper-alpha";
const wchar_t* kValLowerRoman = L"lower-roman";
const wchar_t* kValUpperRoman = L"upper-roman";
const wchar_t* kValArabic = L"arabic";
const wchar_t* kValNull = L"";
const wchar_t* kValList = L"list";
const wchar_t* kValPara = L"para";
const wchar_t* kValTable = L"table";

// NULL terminated tables driving the clean-up passes.

// Elements whose adjacent equal siblings are merged by removeDuplicates().
// The table previously listed kElColor twice and never kElStrike.
const char* kNoDuplicates[] =
    { kElB, kElU, kElI, kElColor, kElHide, kElStrike, NULL };

// Elements unwrapped (children kept, element dropped) by removeTags().
const char* kRemoveTags[] =
    { kElDest, kElListdef, kElListtable, NULL };

// Elements that breakBlocks() treats as already being real blocks.
const char* kBlockTags[] =
    { kElTable, kElPara, NULL };

// Attributes fixLists() does not copy from a list definition to a list.
const char* kHideList[] =
    { kAtId, kAtList, NULL };

// Attributes whose name starts with this prefix are skipped by copyAttributes().
const char* kNSPrefix = "xmlns";

//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////

/**
 * Creates a parser with a copy of the given options and no document yet.
 */
RtfParser::RtfParser(const RtfParserOptions& options)
{
    m_document = NULL;

    // Plain assignment instead of memcpy, so the copy stays correct even
    // if the options type ever gains non-trivial members.
    m_options = options;
}

/**
 * Releases the document (via clear()) and then the DOM implementation.
 */
RtfParser::~RtfParser()
{
    clear();

    if(m_impl != NULL)
        m_impl.release();
}

/**
 * Releases the current document, if any, and clears the level handler.
 */
void RtfParser::clear()
{
    if(m_document != NULL)
    {
        try
        {
            m_document.release();
        }
        catch(...)
        {
            // Deliberately ignored: releasing is best effort here
        }

        m_document = NULL;
    }

    LevelHandler::clear();
}

/**
 * Starts a new parse: creates the result document, makes its document
 * element the current element and installs the Root analyser and the
 * Content destination with freshly reset formatting.
 */
void RtfParser::startDocument(RtfReader* reader)
{
    LevelHandler::startDocument(reader);

    // Create a new document
    m_document = m_impl.createDocument("", kElDoc, DOM::DocumentType());

    // TODO: Throw error if document is null
    ASSERT(m_document != NULL);

    ASSERT(m_curLevel != NULL);
    m_curLevel->setElement(m_document.getDocumentElement(), true);

    // Set the attributes on the top level
    setAnalyser(AnalyserPtr(new Root));
    setDestination(DestinationPtr(new Content));
    getTextFormatting().resetPara();
    getTextFormatting().resetText();
}

/**
 * Finishes the parse and runs the tree clean-up passes. The passes are
 * run in the order written below.
 */
void RtfParser::endDocument()
{
    LevelHandler::endDocument();

    // Cleanup the tree
    removeDuplicates(m_document);
    breakTables(m_document);
    breakTags(m_document, kElTable, kElRow);
    breakTags(m_document, kElRow, kElCell);
    wrapTags(m_document, kElCell, kElDest);
    breakBlocks(m_document);
    breakLists(m_document);
    fixLists(m_document);
    fixStyles(m_document);
    fixBlocks(m_document);
    removeTags(m_document);
    breakBreak(m_document, kElDoc, kElPage);
    breakBreak(m_document, kElDoc, kElSect);
    return;
}

// -----------------------------------------------------------------------
// Helper functions

/**
 * Creates a new (unattached) element with the given name in the
 * current document.
 */
DOM::Element RtfParser::createElement(const string& name)
{
    ASSERT(name.length() > 0);
    return m_document.createElement(name);

    // TODO: Throw exception here if necessary
}

/**
 * Makes 'element' the current element of the current level, passing
 * 'true' as the second setElement() argument (deep set).
 */
void RtfParser::replaceElement(const DOM::Element& element)
{
    ASSERT(m_curLevel != NULL);
    m_curLevel->setElement(element, true);
}

/**
 * Appends 'element' to the current element and makes it the current
 * element.
 */
void RtfParser::pushElement(const DOM::Element& element)
{
    ASSERT(m_curLevel != NULL);
    getElement().appendChild(element);
    m_curLevel->setElement(element);
}

/**
 * Makes the parent of the current element current again.
 *
 * @return the element that was current before the call
 */
DOM::Element RtfParser::popElement()
{
    DOM::Element element = getElement();
    ASSERT(m_curLevel != NULL);

    DOM::Node parent = element.getParentNode();
    ASSERT(parent.getNodeType() == DOM::Node::ELEMENT_NODE);

    // Set it deep so it replaces the current element
    m_curLevel->setElement((DOM::Element&)parent, true);
    return element;
}

/**
 * Sets an attribute on 'el', or on the current element when 'el' is null.
 */
void RtfParser::setAttribute(const string& name, const wstring& value,
                             DOM::Element el)
{
    ASSERT(name.length() > 0);

    if(el == NULL)
        el = getElement();

    el.setAttribute(name, value);
}

/**
 * Installs 'dest' as destination of the current level, binds it to this
 * parser and initializes it.
 */
void RtfParser::setDestination(DestinationPtr dest)
{
    ASSERT(m_curLevel);
    ASSERT(dest != NULL);

    m_curLevel->setDestination(dest);
    dest->m_parser = this;
    dest->initialize();
}

/**
 * Like setDestination(), but passes 'true' as the second argument of the
 * level's setDestination() and hands back the previous destination.
 *
 * @return the destination that was in effect before the call
 */
DestinationPtr RtfParser::replaceDestination(DestinationPtr dest)
{
    ASSERT(m_curLevel);
    ASSERT(dest != NULL);

    DestinationPtr old = m_curLevel->getDestination();
    m_curLevel->setDestination(dest, true);
    dest->m_parser = this;
    dest->initialize();

    return old;
}

/**
 * Binds 'analy' to this parser, installs it as analyser of the current
 * level and initializes it.
 */
void RtfParser::setAnalyser(AnalyserPtr analy)
{
    ASSERT(m_curLevel);
    ASSERT(analy != NULL);

    analy->m_parser = this;
    m_curLevel->setAnalyser(analy);
    analy->initialize();
}

/** Returns the analyser of the current level. */
AnalyserPtr RtfParser::getAnalyser()
{
    ASSERT(m_curLevel);
    return m_curLevel->getAnalyser();
}

/** Returns the destination of the current level. */
DestinationPtr RtfParser::getDestination()
{
    ASSERT(m_curLevel);
    return m_curLevel->getDestination();
}

/** Returns the formatting state of the current level. */
RtfFormatting& RtfParser::getTextFormatting()
{
    ASSERT(m_curLevel);
    return m_curLevel->getFormatting();
}

// ---------------------------------------------------------------------------------
// Pass this stuff on through to the appropriate analysers etc...
void RtfParser::charData(wstring data) { ASSERT(m_curLevel != NULL); DestinationPtr destination = m_curLevel->getDestination(); if(destination) { destination->charData(data); } else { destination = DestinationPtr(new Content); setDestination(destination); } } void RtfParser::controlWord(const string& cw, int flags, int param) { ASSERT(m_curLevel != NULL); AnalyserPtr analyser = m_curLevel->getAnalyser(); if(analyser) analyser->controlWord(cw, flags, param); } void RtfParser::groupStart() { LevelHandler::groupStart(); ASSERT(m_curLevel != NULL); AnalyserPtr analyser = m_curLevel->getAnalyser(); if(analyser) analyser->groupStart(); } void RtfParser::groupEnd() { ASSERT(m_curLevel != NULL); AnalyserPtr analyser = m_curLevel->getAnalyser(); if(analyser) analyser->groupEnd(); LevelHandler::groupEnd(); } #define ON_INITIALIZE(cls) \ void RtfParser::cls::initialize() #define ON_CONTROLWORD(cls) \ void RtfParser::cls::controlWord(const string& cw, int flags, int param) #define ON_CHARDATA(cls) \ void RtfParser::cls::charData(wstring data) #define ON_GROUPSTART(cls) \ void RtfParser::cls::groupStart() #define ON_GROUPEND(cls) \ void RtfParser::cls::groupEnd() #define ON_DONE(cls) \ void RtfParser::cls::done() #define AN_ELEMENT(name) \ m_parser->pushElement(m_parser->createElement(name)) #define AN_POP_ELEMENT() \ m_parser->popElement() #define AN_ATTRIBUTE(name, value) \ m_parser->setAttribute(name, value) #define AN_DESTINATION_ATTR(name) \ m_parser->setDestination(new Attribute(name)) #define AN_DESTINATION(cls) \ m_parser->setDestination(new cls) #define AN_ANALYSER(cls) \ m_parser->setAnalyser(AnalyserPtr(new cls)) #define AN_SET_ANALYSER(cls) \ m_parser->setAnalyser(AnalyserPtr(cls)) #define HAS_PARAM (flags & kHasParam) #define DEFAULT_CONTROLWORD processDefault(cw, flags, param) #define DUMMY 1 == 1 #define NUM_ATTR(n) m_parser->formatInt(n) bool RtfParser::ParseAnalyser::processDefault(const string& cw, int flags, int param) { if(cw == "upr") { AnalyserPtr analy = 
m_parser->getAnalyser(); ASSERT(analy != NULL); AN_SET_ANALYSER(new Upr(analy)); return true; } return false; } void RtfParser::ParseAnalyser::applyParaFormatting(RtfFormatting* format, DOM::Element& el) { if(format == NULL) format = &(m_parser->getTextFormatting()); wstring fix = kValPara; int list = format->paraList(); if(list != -1) { el.setAttribute(kAtList, NUM_ATTR(list)); } else { el.removeAttribute(kAtList); } if(format->paraInTable()) el.setAttribute(kAtCell, L"1"); else el.removeAttribute(kAtCell); int style = format->paraStyle(); if(style != -1) el.setAttribute(kElStyle, NUM_ATTR(style)); else el.removeAttribute(kElStyle); el.setAttribute(kAtFix, fix); } DOM::Element RtfParser::ParseAnalyser::getCurrentBlock() { DOM::Node node = m_parser->getElement(); if(node.hasChildNodes()) node = node.getLastChild(); return m_parser->getPriorElement(node, kElBlock); } bool RtfParser::ParseAnalyser::processTextContent(const string& cw, int flags, int param) { DOM::Element el; bool process = false; RtfFormatting& format = m_parser->getTextFormatting(); if(cw == "par") { el = getCurrentBlock(); if(el != NULL) applyParaFormatting(&format, el); el = m_parser->createElement(kElBlock); applyParaFormatting(&format, el); } else if(cw == "intbl") format.paraSetTable(true); else if(cw == "cell") { el = getCurrentBlock(); if(el != NULL) applyParaFormatting(&format, el); el = m_parser->createElement(kElCell); m_parser->pushElement(el); m_parser->popElement(); el = m_parser->createElement(kElBlock); applyParaFormatting(&format, el); } else if(cw == "trowd") el = m_parser->createElement(kElRow); else if(cw == "tab") el = m_parser->createElement(kElTab); else if(cw == "sect") el = m_parser->createElement(kElSect); else if(cw == "page") el = m_parser->createElement(kElPage); else if(cw == "s" && HAS_PARAM) format.paraSetStyle(param); else if(cw == "line") el = m_parser->createElement(kElLine); else if(cw == "header") AN_ANALYSER(Skip); else if(cw == "footer") AN_ANALYSER(Skip); else 
if(cw == "bkmkstart") AN_ANALYSER(Skip); else if(cw == "listtext") AN_ANALYSER(Skip); else if(cw == "ls" && HAS_PARAM) format.paraSetList(param); if(el != NULL) { // This ensures that our content destination is open and ready DestinationPtr dest = m_parser->getDestination(); ASSERT(dest != NULL); dest->charData(kValNull); m_parser->pushElement(el); m_parser->popElement(); } return (el != NULL) || process; /* TODO: cell, row, intbl, cellx, trowd*/ } bool RtfParser::ParseAnalyser::processTextFormatting(const string& cw, int flags, int param, RtfFormatting& format) { bool on = true; if(flags & HAS_PARAM && param == 0) on = false; if(cw == "pard") { format.resetPara(); // applyParaFormatting(); } else if(cw == "plain") format.resetText(); else if(cw == "b") format.textSetBold(on); else if(cw == "i") format.textSetItalic(on); else if(cw == "v") format.textSetHidden(on); else if(cw == "ul") format.textSetUnderline(on); else if(cw == "cf" && HAS_PARAM) format.textSetColor(param); else return false; return true; } bool RtfParser::ParseAnalyser::processTextFormatting(const string& cw, int flags, int param) { return processTextFormatting(cw, flags, param, m_parser->getTextFormatting()); } ON_INITIALIZE(Skip) { AN_DESTINATION(Null); } ON_GROUPSTART(Skip) { AN_ANALYSER(Skip); } RtfParser::Upr::Upr(AnalyserPtr prv) { ASSERT(prv); prev = prv; } ON_GROUPSTART(Upr) { AN_ANALYSER(Skip); } ON_GROUPEND(Upr) { ASSERT(prev); m_parser->setAnalyser(prev); prev = NULL; } ON_INITIALIZE(Stylesheet) { AN_ELEMENT(kElStylesheet); } ON_GROUPSTART(Stylesheet) { AN_ANALYSER(Style); AN_DESTINATION(Null); } ON_INITIALIZE(Style) { // Were not sure if this element is really something // so we can't always create haveStyle = false; } ON_CONTROLWORD(Style) { // Get the style id if(flags & kAsterisk) { AN_ANALYSER(Skip); return; } if(!haveStyle) { AN_ELEMENT(kElStyle); AN_DESTINATION_ATTR(kAtName); haveStyle = true; } if(cw == "s" && flags & kHasParam) { AN_ATTRIBUTE(kAtId, NUM_ATTR(param)); } // 
Otherwise get as much formatting out of the tag as possible else if(processTextFormatting(cw, flags, param)) DUMMY; else DEFAULT_CONTROLWORD; } ON_GROUPSTART(Style) { AN_ANALYSER(Skip); } ON_GROUPEND(Style) { RtfFormatting& props = m_parser->getTextFormatting(); if(props.textIsBold()) AN_ATTRIBUTE(kAtBold, L"1"); if(props.textIsHidden()) AN_ATTRIBUTE(kAtHidden, L"1"); if(props.textIsItalic()) AN_ATTRIBUTE(kAtItalic, L"1"); if(props.textIsStrike()) AN_ATTRIBUTE(kAtStrike, L"1"); if(props.textIsUnderline()) AN_ATTRIBUTE(kAtUnderline, L"1"); if(props.textColor() != -1 && m_parser->getOptions().doColors) AN_ATTRIBUTE(kAtColor, NUM_ATTR(props.textColor())); } ON_INITIALIZE(ListTable) { AN_ELEMENT(kElListtable); } ON_GROUPSTART(ListTable) { AN_ANALYSER(List); AN_DESTINATION(Null); } ON_INITIALIZE(List) { AN_ELEMENT(kElListdef); AN_ATTRIBUTE(kAtType, kValDisc); AN_ATTRIBUTE(kAtOrdered, L"0"); levelsSeen = 0; } ON_CONTROLWORD(List) { if(cw == "listname") AN_DESTINATION_ATTR(kAtName); else if(cw == "listid" && HAS_PARAM) AN_ATTRIBUTE(kAtId, NUM_ATTR(param)); // We let listlevel in here too else if(cw == "levelstartat" && HAS_PARAM) AN_ATTRIBUTE(kAtStart, NUM_ATTR(param)); else if(cw == "levelnfc" && HAS_PARAM) { switch(param) { case 0: // 1, 2, 3 case 5: // 1st, 2nd, 3rd case 6: // One, Two, Three case 7: // First, Second, Third case 22: // 01, 02, 03 AN_ATTRIBUTE(kAtType, kValArabic); break; case 1: // I, II, III AN_ATTRIBUTE(kAtType, kValUpperRoman); break; case 2: // i, ii, iii AN_ATTRIBUTE(kAtType, kValLowerRoman); break; case 3: // A, B, C AN_ATTRIBUTE(kAtType, kValUpperAlpha); break; case 4: // a, b, c AN_ATTRIBUTE(kAtType, kValLowerAlpha); break; default: AN_ATTRIBUTE(kAtType, kValDisc); break; } switch(param) { case 0: case 5: case 6: case 7: case 22: case 1: case 2: case 3: case 4: AN_ATTRIBUTE(kAtOrdered, L"1"); break; default: AN_ATTRIBUTE(kAtOrdered, L"0"); } } else DEFAULT_CONTROLWORD; } ON_GROUPSTART(List) { if(levelsSeen > 0) AN_ANALYSER(Skip); levelsSeen++; 
} ON_INITIALIZE(ListOverrideTable) { DOM::Document document = m_parser->getDocument(); lists = document.getElementsByTagName(kElListdef); curList = NULL; lsId = -1; } ON_GROUPSTART(ListOverrideTable) { AN_DESTINATION(Null); } ON_CONTROLWORD(ListOverrideTable) { // New list override clear if(cw == "listoverride") curList = NULL; // List id for current listoverride else if(cw == "listid" && HAS_PARAM) { wstring id = NUM_ATTR(param); if(lists != NULL) { for(int i = 0; i < lists->getLength(); i++) { DOM::Node node = lists->item(i); if(node != NULL && node.getNodeType() == DOM::Node::ELEMENT_NODE) { DOM::Element element = (DOM::Element&)node; if(element.getAttribute(kAtId) == id) { curList = element; break; } } } } } // The actual list code else if(cw == "ls" && HAS_PARAM) lsId = param; // Override the starting level for the node else if(cw == "levelstartat" && HAS_PARAM) { if(curList != NULL) curList.setAttribute(kAtStart, NUM_ATTR(param)); } else DEFAULT_CONTROLWORD; // Okay before any overrides take effect we need to duplicate // the list node for overriding, using the 'listid' and 'ls' we gathered if(curList != NULL && lsId != -1) { DOM::Element parent = (const DOM::Element&)curList.getParentNode(); if(parent != NULL) { curList = (const DOM::Element&)curList.cloneNode(true); if(curList != NULL) { parent.appendChild(curList); curList.setAttribute(kAtList, NUM_ATTR(lsId)); } } lsId = -1; } } ON_GROUPEND(ListOverrideTable) { } ON_INITIALIZE(Info) { // Create a new element AN_ELEMENT(kElInfo); AN_DESTINATION(Null); } ON_CONTROLWORD(Info) { // The title if(cw == "title") { AN_ELEMENT(kElTitle); AN_DESTINATION(Raw); } else if(cw == "author") { AN_ELEMENT(kElAuthor); AN_DESTINATION(Raw); } else if(cw == "operator") { AN_ELEMENT(kElOperator); AN_DESTINATION(Raw); } else if(flags & kAsterisk) AN_ANALYSER(Skip); else DEFAULT_CONTROLWORD; } ON_INITIALIZE(Root) { } ON_CONTROLWORD(Root) { if(cw == "stylesheet") AN_ANALYSER(Stylesheet); else if(cw == "listtable") 
AN_ANALYSER(ListTable); else if(cw == "listoverridetable") AN_ANALYSER(ListOverrideTable); else if(cw == "info") AN_ANALYSER(Info); else if(cw == "fonttbl") AN_ANALYSER(Skip); else if(cw == "colortbl") AN_ANALYSER(Skip); else if(cw == "pict") { AN_ANALYSER(Skip); AN_DESTINATION(Null); } else if(flags & kAsterisk) AN_ANALYSER(Skip); else if(processTextContent(cw, flags, param)) DUMMY; else if(processTextFormatting(cw, flags, param)) DUMMY; else DEFAULT_CONTROLWORD; } ON_INITIALIZE(Content) { parent = m_parser->getElement(); created = false; } ON_CHARDATA(Content) { // Create the first time we get content if(!created) { DOM::Element dest = m_parser->createElement(kElDest); parent.appendChild(dest); m_parser->replaceElement(dest); DOM::Element el = m_parser->createElement(kElBlock); m_parser->pushElement(el); m_parser->popElement(); created = true; } if(data.length() == 0) return; int elements = 0; RtfFormatting& format = m_parser->getTextFormatting(); // Now do text Properties if necessary if(format.textIsBold()) { AN_ELEMENT(kElB); elements++; } if(format.textIsHidden()) { AN_ELEMENT(kElHide); elements++; } if(format.textIsItalic()) { AN_ELEMENT(kElI); elements++; } if(format.textIsStrike()) { AN_ELEMENT(kElStrike); elements++; } if(format.textIsUnderline()) { AN_ELEMENT(kElU); elements++; } if(format.textColor() != -1 && m_parser->getOptions().doColors) { AN_ELEMENT(kElColor); AN_ATTRIBUTE(kAtIndex, NUM_ATTR(format.textColor())); elements++; } // Write the data to the element m_parser->getElement().appendChild( m_parser->getDocument().createTextNode(data)); // Now drop out of all the above formatting while(elements-- > 0) AN_POP_ELEMENT(); } #if 0 ON_INITIALIZE(Table) { stack = 0; level = m_parser->getLevel(); AN_ELEMENT(kElTable); AN_DESTINATION(Content); } ON_CONTROLWORD(Table) { ASSERT(stack >= 0); ASSERT(level != NULL); if(cw == "trowd") { stack++; } else if(cw == "row") { stack--; if(stack <= 0) m_parser->rewindLevel(level); } else if(processTextContent(cw, 
flags, param)) DUMMY; else if(processTextFormatting(cw, flags, param)) DUMMY; else DEFAULT_CONTROLWORD; if(!m_parser->getTextFormatting().paraInTable()) { m_parser->rewindLevel(level); } } #endif ON_CHARDATA(Raw) { // Write the data to the element m_parser->getElement().appendChild( m_parser->getDocument().createTextNode(data)); } ON_INITIALIZE(Attribute) { element = m_parser->getElement(); ASSERT(element != NULL); } ON_CHARDATA(Attribute) { // Get the current value wstring cur = element.getAttribute(name); if(data.at(data.size() - 1) == L';') data.resize(data.size() - 1); // Append data cur.append(data); // Write it back element.setAttribute(name, cur); } /** * A quick check to see if a node is an element of a certain * name */ bool RtfParser::isElement(const DOM::Node& node, const string& name) { return node != NULL && node.getNodeType() == DOM::Node::ELEMENT_NODE && node.getNodeName() == name; } bool RtfParser::isEqualElement(const DOM::Element& el1, const DOM::Element& el2) { if(el1.getNodeName() == el2.getNodeName()) return false; DOM::NamedNodeMap at1 = el1.getAttributes(); DOM::NamedNodeMap at2 = el2.getAttributes(); if(at1 == NULL && at2 == NULL) return true; if(at1 == NULL || at2 == NULL || at1->getLength() != at2->getLength()) return false; for(int i = 0; i < at1->getLength(); i++) { DOM::Attr attr1 = (const DOM::Attr&)at1->item(0); if(attr1 != NULL) return false; DOM::Attr attr2 = (const DOM::Attr&)at2->getNamedItem(attr1.getNodeName()); if(attr2 != NULL) return false; if(attr1.getNodeValue() == attr2.getNodeValue()) return false; } return true; } wstring RtfParser::formatInt(int num) { wchar_t buff[12]; // The Win32 version isn't secure #ifdef _WIN32 swprintf(buff, L"%d", num); #else swprintf(buff, 12, L"%d", num); #endif wstring n(buff); return n; } /** * Gets the pertinent ancestor of this node, or returns null * if not found. 
*/ DOM::Element RtfParser::getContainingElement(const DOM::Node& node, const string& name) { DOM::Node n = node; while(true) { n = n.getParentNode(); if(n == NULL) break; if(isElement(n, name)) return (DOM::Element&)n; } return DOM::Element(); } DOM::Element RtfParser::getPriorElement(const DOM::Node& node, const string& name) { DOM::Node n = node; while(n != NULL) { if(isElement(n, name)) return (DOM::Element&)n; n = n.getPreviousSibling(); } DOM::Node parent = node.getParentNode(); if(parent == NULL) return DOM::Element(); else return getPriorElement(parent, name); } bool isNsAttr(const string& name) { return strncmp(name.c_str(), kNSPrefix, strlen(kNSPrefix)) ? false : true; } void RtfParser::copyAttributes(const DOM::Element& src, DOM::Element& dest, const char** hideList) { // Now get both sets of attributes DOM::NamedNodeMap srcMap = src.getAttributes(); DOM::NamedNodeMap destMap = dest.getAttributes(); if(srcMap == NULL || destMap == NULL) return; // And copy them from one to the other for(int j = 0; j < srcMap->getLength(); j++) { DOM::Node attr = srcMap->item(j); if(attr != NULL) { // BUG: Sablotron seems to have a bug in it's // setAttributeNode implementation. It always // adds a blank namespace // attr = attr.cloneNode(false); // if(attr != NULL) // destMap.setNamedItem(attr); string name = attr.getNodeName(); if(hideList) { for(const char** t = hideList; *t != NULL; t++) { if(name == *t) name.erase(); } } if(name.length() > 0 && !isNsAttr(name)) dest.setAttribute(attr.getNodeName(), attr.getNodeValue()); } } } void RtfParser::breakBreak(DOM::Document& doc, const string& contain, const string& tag) { DOM::NodeList els = doc.getElementsByTagName(tag); if(els != NULL) { for(int i = 0; i < els->getLength(); i++) { DOM::Element el = (const DOM::Element&)els->item(i); #if 0 // See if parent node only has this break tag // in it. 
If so then replace parent with this DOM::Node parent = el.getParentNode(); if(parent != NULL) { DOM::Node grandparent = parent.getParentNode(); if(grandparent != NULL && el.getPreviousSibling() == NULL && el.getNextSibling() == NULL) { grandparent.replaceChild(parent.removeChild(el), parent); } } #endif breakElement(el, contain); } } } /** * Breaks a paragraph up through a previous level. Calls itself * recursively to break paragraphs totally free up to containing * destination. * * For example: * * * This is a * test of your concentration. * * * Becomes: * * * This is a * test of your concentration. * */ bool RtfParser::breakElement(const DOM::Element& el, const string& contain) { ASSERT(el != NULL); DOM::Element parent = (const DOM::Element&)el.getParentNode(); DOM::Element grandparent; string s = el.getNodeName(); s = parent.getNodeName(); // Get the parent node if(parent != NULL) grandparent = (const DOM::Element&)parent.getParentNode(); // Make sure we have something to work with before continuing if(grandparent == NULL || parent == NULL || isElement(parent, contain)) return true; DOM::Node e; // Check to see if this is the first node in the parent. // If so then just move out to before if(el.getPreviousSibling() == NULL) { e = grandparent.insertBefore(parent.removeChild(el), parent); } // Check to see if this is the last node in the parent. // If so then just move out to after the parent else if(el.getNextSibling() == NULL) { DOM::Node next = parent.getNextSibling(); if(next == NULL) e = grandparent.appendChild(parent.removeChild(el)); else e = grandparent.insertBefore(parent.removeChild(el), next); } // Otherwise it's in the middle so split the parent // element etc... else { // Clone it but not deep DOM::Element parent2 = (const DOM::Element&)parent.cloneNode(false); if(parent2 == NULL) return false; // Flag that tells us whether we moved anything up to parent bool moved = false; // Now move all nodes after this one to the second parent. 
while((e = el.getNextSibling()) != NULL) { parent2.appendChild(parent.removeChild(e)); moved = true; } // Remove the element from it's parent e = parent.removeChild(el); // Okay now we move the paragraph up to the parent insertAfter(grandparent, e, parent); if(moved) insertAfter(grandparent, parent2, e); } // Now call it again with the paragraph in the new position // untill everything's cut through! return breakElement((DOM::Element&)e, contain); } /** * Changes from a marker based paragraph system to a contained * paragraph system. Also applies paragraph attributes to the * appropriate paragraph. * * For example: * * * This is a * test of your concentration. * * * Becomes: * * This is a * test of your concentration. */ void RtfParser::breakBlocks(DOM::Document& document) { // First break out all the paragraphs to the destination level DOM::NodeList blocks = document.getElementsByTagName(kElBlock); if(blocks != NULL) { for(int i = 0; i < blocks->getLength(); i++) { DOM::Element block = (const DOM::Element&)blocks->item(i); // If it's the single closed style para then break it if(block != NULL && !block.hasChildNodes()) breakElement(block, kElDest); } } // Now group stuff in destinations into paras or other blocks DOM::NodeList destinations = document.getElementsByTagName(kElDest); if(destinations != NULL) { for(int i = 0; i < destinations->getLength(); i++) { DOM::Element dest = (const DOM::Element&)destinations->item(i); // Sanity Check if(dest == NULL || !dest.hasChildNodes()) continue; // Go through the children of this destination DOM::Node child = dest.getFirstChild(); DOM::Element block; while(child != NULL) { // If it's a block if(isElement(child, kElBlock)) { block = (DOM::Element&)child; child = child.getNextSibling(); continue; } // If it's already a real block element for(const char** t = kBlockTags; *t != NULL; t++) { if(isElement(child, *t)) { block = NULL; break; } } // If there's a block then add to it if(block != NULL) { 
block.appendChild(dest.removeChild(child)); child = block; } child = child.getNextSibling(); } } } } void RtfParser::wrapTags(DOM::Document& doc, const string& tagName, const string& wrapName) { DOM::NodeList tags = doc.getElementsByTagName(tagName); if(tags != NULL) { for(int i = 0; i < tags->getLength(); i++) { DOM::Element tag = (const DOM::Element&)tags->item(i); DOM::Element wrap = doc.createElement(wrapName); while(tag.hasChildNodes()) wrap.appendChild(tag.removeChild(tag.getFirstChild())); tag.appendChild(wrap); } } } void RtfParser::breakTags(DOM::Document& doc, const string& parentName, const string& tagName) { DOM::NodeList parents = doc.getElementsByTagName(parentName); if(parents != NULL) { for(int i = 0; i < parents->getLength(); i++) { DOM::Element parent = (const DOM::Element&)parents->item(i); if(!parent.hasChildNodes()) continue; DOM::NodeList tags = parent.getElementsByTagName(tagName); if(tags != NULL) { for(int i = 0; i < tags->getLength(); i++) breakElement((const DOM::Element&)tags->item(i), parentName); } DOM::Node tag = doc.createElement(tagName); parent.insertBefore(tag, parent.getFirstChild()); DOM::Node child = tag; while(child != NULL && (child = child.getNextSibling()) != NULL) { if(isElement(child, kElBlock)) { DOM::Node next = child.getNextSibling(); if(next == NULL) { parent.removeChild(child); continue; } if(isElement(next, tagName)) { DOM::Node twodown = next.getNextSibling(); if(!isElement(twodown, kElBlock)) { child = parent.insertBefore(parent.removeChild(next), child); } else { parent.removeChild(child); child = next; } } } if(isElement(child, tagName)) { if(!tag.hasChildNodes()) parent.removeChild(tag); tag = child; } else { tag.appendChild(parent.removeChild(child)); child = tag; } } if(!tag.hasChildNodes()) parent.removeChild(tag); } } DOM::NodeList tags = doc.getElementsByTagName(tagName); if(tags != NULL) { for(int i = 0; i < tags->getLength(); i++) { DOM::Element tag = (const DOM::Element&)tags->item(i); DOM::Node parent 
= tag.getParentNode(); if(parent != NULL && !isElement(parent, parentName)) parent.removeChild(tag); #if 0 else if(tag.hasChildNodes()) { DOM::NodeList children = tag.getChildNodes(); if(children != NULL && children->getLength() == 1) { DOM::Node child = children->item(0); if(child != NULL && !child.hasChildNodes() && isElement(child, kElBlock)) parent.removeChild(tag); } } #endif } } } void RtfParser::breakLists(DOM::Document& doc) { // Now group stuff in destinations into tables DOM::NodeList destinations = doc.getElementsByTagName(kElDest); if(destinations != NULL) { for(int i = 0; i < destinations->getLength(); i++) { DOM::Element dest = (const DOM::Element&)destinations->item(i); // Sanity Check if(dest == NULL) continue; // Go through the children of this destination DOM::Node child = dest.getFirstChild(); DOM::Element list; DOM::Element e; wstring previd; while(child != NULL) { // If it's a block and has a cell attribute if(isElement(child, kElBlock)) { e = (DOM::Element&)child; // if it has a cell attribute wstring listid = e.getAttribute(kAtList); if(listid.length() > 0) { e.removeAttribute(kAtList); if(list == NULL || previd != listid) { list = doc.createElement(kElList); list.setAttribute(kAtList, listid); dest.insertBefore(list, child); previd = listid; } } else { list = NULL; previd.erase(); } } // It's not a block if(list != NULL) { list.appendChild(dest.removeChild(child)); child = list; } child = child.getNextSibling(); } } } } void RtfParser::fixStyles(const DOM::Document doc) { DOM::NodeList styles = doc.getElementsByTagName(kElStyle); if(styles != NULL) { DOM::NodeList blocks = doc.getElementsByTagName(kElBlock); if(blocks != NULL) { for(int i = 0; i < blocks->getLength(); i++) { DOM::Element block = (const DOM::Element&)blocks->item(i); if(block == NULL || !block.hasAttribute(kElStyle)) continue; for(int j = 0; j < styles->getLength(); j++) { DOM::Element style = (const DOM::Element&)styles->item(j); if(style != NULL) { 
if(style.getAttribute(kAtId) == block.getAttribute(kElStyle)) { wstring name = style.getAttribute(kAtName); if(name.length() > 0) block.setAttribute(kElStyle, name); } } } } } for(int i = 0; i < styles->getLength(); i++) { DOM::Element style = (const DOM::Element&)styles->item(i); if(style != NULL) style.removeAttribute(kAtId); } } } void RtfParser::breakTables(DOM::Document& doc) { DOM::NodeList rows = doc.getElementsByTagName(kElRow); if(rows != NULL) { for(int i = 0; i < rows->getLength(); i++) { DOM::Element row = (const DOM::Element&)rows->item(i); DOM::Node parent = row.getParentNode(); if(parent == NULL) continue; if(isElement(parent, kElBlock)) { DOM::Node grandparent = parent.getParentNode(); if(grandparent != NULL && !row.hasChildNodes()) { if(row.getPreviousSibling() == NULL) grandparent.insertBefore(parent.removeChild(row), parent); else if(row.getNextSibling() == NULL) insertAfter(grandparent, parent.removeChild(row), parent); } } breakElement(row, kElDest); } } // Now group stuff in destinations into tables DOM::NodeList destinations = doc.getElementsByTagName(kElDest); if(destinations != NULL) { for(int i = 0; i < destinations->getLength(); i++) { DOM::Element dest = (const DOM::Element&)destinations->item(i); // Sanity Check if(dest == NULL) continue; // Go through the children of this destination DOM::Node child = dest.getFirstChild(); DOM::Element table; DOM::Element e; while(child != NULL) { // If it's a block and has a cell attribute if(isElement(child, kElBlock)) { e = (DOM::Element&)child; // if it has a cell attribute if(e.getAttribute(kAtCell).length() > 0) { e.removeAttribute(kAtCell); if(table == NULL) { table = doc.createElement(kElTable); dest.insertBefore(table, child); } } else { table = NULL; } } // It's not a block if(table != NULL) { table.appendChild(dest.removeChild(child)); child = table; } child = child.getNextSibling(); } } } } void RtfParser::insertAfter(DOM::Node& parent, const DOM::Node& node, const DOM::Node& ref) { 
DOM::Node sibling = ref.getNextSibling(); if(sibling == NULL) parent.appendChild(node); else parent.insertBefore(node, sibling); } void RtfParser::removeTags(const DOM::Document& doc) { // Go through the list of nodes for(const char** t = kRemoveTags; *t != NULL; t++) { DOM::NodeList elements = doc.getElementsByTagName(*t); if(elements != NULL) { for(int j = 0; j < elements->getLength(); j++) { DOM::Element el = (const DOM::Element&)elements->item(j); DOM::Node parent = el->getParentNode(); if(parent == NULL) continue; while(el.hasChildNodes()) parent.insertBefore(el.removeChild(el.getFirstChild()), el); parent.removeChild(el); } } } } void RtfParser::fixLists(const DOM::Document doc) { DOM::NodeList lists = doc.getElementsByTagName(kElList); if(lists != NULL) { DOM::NodeList listdefs = doc.getElementsByTagName(kElListdef); if(listdefs != NULL) { for(int i = 0; i < listdefs->getLength(); i++) { DOM::Element listdef = (const DOM::Element&)listdefs->item(i); if(listdef == NULL || !listdef.hasAttribute(kAtList)) continue; for(int j = 0; j < lists->getLength(); j++) { DOM::Element list = (const DOM::Element&)lists->item(j); if(list != NULL) { if(list.getAttribute(kAtList) == listdef.getAttribute(kAtList)) { copyAttributes(listdef, list, kHideList); list.removeAttribute(kAtList); } } } } } } } void RtfParser::fixBlocks(const DOM::Document doc) { // First break out all the paragraphs to the destination level DOM::NodeList blocks = doc.getElementsByTagName(kElBlock); if(blocks != NULL) { string fix; wstring val; for(int i = 0; i < blocks->getLength(); i++) { DOM::Element block = (const DOM::Element&)blocks->item(i); DOM::Node parent = block.getParentNode(); if(parent == NULL) continue; fix.resize(0); val.resize(0); val = block.getAttribute(kAtFix); if(val.length() > 0) block.removeAttribute(kAtFix); if(val.length() > 0) { val = block.getAttributeNS("", kAtFix); if(val.length() > 0) block.removeAttributeNS("", kAtFix); } if(val.length() > 0) DOM::transcode16to8(val, fix); 
if(fix.length() == 0) fix = kElPara; DOM::Element el = doc.createElement(fix); copyAttributes(block, el, NULL); while(block.hasChildNodes()) el.appendChild(block.removeChild(block.getFirstChild())); parent.replaceChild(el, block); } } } /** * Removes adjacent duplicate nodes of certain names */ void RtfParser::removeDuplicates(const DOM::Document& doc) { // Go through the list of nodes for(const char** t = kNoDuplicates; *t = NULL; t++) { DOM::NodeList elements = doc.getElementsByTagName(*t); if(elements != NULL) { int x = elements->getLength(); for(int j = 0; j < elements->getLength(); j++) { // Make sure it's a valid element DOM::Element element = (const DOM::Element&)elements->item(j); if(element == NULL) continue; // Get neighbors DOM::Node previous = element.getPreviousSibling(); DOM::Node next = element.getNextSibling(); // Make sure it's still in the document, as we may have // removed it on a previous loop DOM::Node parent = element.getParentNode(); if(parent == NULL) continue; // Combine previous if valid if(previous != NULL && previous.getNodeType() == DOM::Node::ELEMENT_NODE && isEqualElement((DOM::Element&)previous, element)) { while(previous.hasChildNodes()) { DOM::Node child = previous.removeChild(previous.getLastChild()); if(child != NULL) { if(element.hasChildNodes()) element.insertBefore(child, element.getFirstChild()); else element.appendChild(child); } } // Remove duplicate node parent.removeChild(previous); } // Combine next if valid if(next != NULL && next.getNodeType() == DOM::Node::ELEMENT_NODE && isEqualElement((DOM::Element&)next, element)) { while(next.hasChildNodes()) { DOM::Node child = next.removeChild(next.getFirstChild()); if(child != NULL) element.appendChild(child); } // Remove duplicate node parent.removeChild(next); } } } } }