diff options
author | Stef Walter <stef@memberwebs.com> | 2004-07-24 19:06:51 +0000 |
---|---|---|
committer | Stef Walter <stef@memberwebs.com> | 2004-07-24 19:06:51 +0000 |
commit | 8335fdb6b7e7afb57d096e0f3a453b662f7a23c0 (patch) | |
tree | ef3c3079f58b44cb9f1b2953f05e1628d6846e9b | |
parent | ff4568d01651afd615751f9fc683dbe30f2ced9b (diff) |
- Post processing code cleanup.
-rw-r--r-- | src/domhelpers.cpp | 153 | ||||
-rw-r--r-- | src/domhelpers.h | 85 | ||||
-rw-r--r-- | src/sablo.h | 18 | ||||
-rw-r--r-- | src/xmlcomposer.cpp | 42 | ||||
-rw-r--r-- | src/xmlfixups.cpp | 566 | ||||
-rw-r--r-- | src/xmlfixups.h | 55 | ||||
-rw-r--r-- | test-files/sample.xml | 2 |
7 files changed, 587 insertions, 334 deletions
diff --git a/src/domhelpers.cpp b/src/domhelpers.cpp index 6cf8052..ac93f10 100644 --- a/src/domhelpers.cpp +++ b/src/domhelpers.cpp @@ -40,6 +40,8 @@ #include "domhelpers.h" #include "tags.h" +using std::make_pair; + bool DOMHelpers::isElement(const DOM::Node& node, const string& name) { return node != NULL && node.getNodeType() == DOM::Node::ELEMENT_NODE && @@ -176,3 +178,154 @@ void DOMHelpers::insertAfter(DOM::Node& parent, const DOM::Node& node, else parent.insertBefore(node, sibling); } + +DOM::Element DOMHelpers::getChildElement(const DOM::Node& parent, const string& name) +{ + DOM::Node child = parent.getFirstChild(); + while(child != NULL) + { + if(isElement(child, name)) + return (DOM::Element&)child; + } + + return DOM::Element(); +} + +bool DOMHelpers::hasAncestor(const DOM::Node& ancestor, const DOM::Node& node) +{ + DOM::Node n = node; + + while(n != NULL) + { + if(n == ancestor) + return true; + + n = n.getParentNode(); + } + + return false; +} + + +/* ---------------------------------------------------------------------------------- + * ElementTable + */ + +void ElementTable::load(const DOM::Node& parent, const string& name) +{ + clear(); + + DOM::Node child = parent.getFirstChild(); + while(child != NULL) + { + if(DOMHelpers::isElement(child, name)) + { + DOM::Element& el = (DOM::Element&)child; + wstring id = el.getAttribute(kAtId); + + if(!id.empty()) + insert(make_pair(id, el)); + } + } +} + +DOM::Element ElementTable::get(const wstring& id) const +{ + const_iterator it = find(id); + return it == end() ? DOM::Element() : it->second; +} + +void ElementTable::removeIds() +{ + iterator it = begin(); + iterator e = end(); + + for( ; it != e; it++) + it->second.removeAttribute(kAtId); +} + +/* ---------------------------------------------------------------------------------- + * ElementIterator + */ + +void ElementIterator::next() +{ + if(m_current == NULL) + return; + + DOM::Node n; + + // Always descend into children first + if(m_current.hasChildNodes()) + { + m_current = nextel(m_current.getFirstChild()); + if(m_current != NULL) + return; + } + + // Look for siblings along the current level + m_current = nextel(m_current.getNextSibling()); + if(m_current != NULL) + return; + + // Go back up to parent + m_current = m_current.getParentNode(); + + // But check top against parent + if(m_current == NULL || m_current == m_top) + m_current = NULL; +} + +DOM::Element ElementIterator::nextel(DOM::Node node) +{ + while(node != NULL) + { + if(node.getNodeType() == DOM::Element::ELEMENT_NODE) + return (DOM::Element&)node; + + node = node.getNextSibling(); + } + + return DOM::Element(); +} + +void ElementIterator::prev() +{ + /* Allow backing into the iterator */ + if(m_current == NULL) + m_current = m_top; + + DOM::Node n; + + // Always descend into children first + if(m_current.hasChildNodes()) + { + m_current = prevel(m_current.getLastChild()); + if(m_current != NULL) + return; + } + + // Look for siblings along the current level + m_current = prevel(m_current.getPreviousSibling()); + if(m_current != NULL) + return; + + // Go back up to parent + DOM::Node parent = m_current.getParentNode(); + if(parent != m_top) + m_current = (DOM::Element&)parent; +} + + +DOM::Element ElementIterator::prevel(DOM::Node node) +{ + while(node != NULL) + { + if(node.getNodeType() == DOM::Element::ELEMENT_NODE) + return (DOM::Element&)node; + + node = node.getPreviousSibling(); + } + + return DOM::Element(); +} diff --git a/src/domhelpers.h b/src/domhelpers.h index 043ffd4..d125f80 100644 --- a/src/domhelpers.h +++ b/src/domhelpers.h @@ -40,6 +40,9 @@ #define __DOMHELPERS_H__ #include "sablo.h" +#include <map> +#include <stack> +#include <set> /* * DOMHelpers @@ -68,6 +71,88 @@ public: // Get previous element (in XML flow) of a given name static DOM::Element getPriorElement(const DOM::Node& node, const string& name); + + // Get first child element of a given name + static DOM::Element getChildElement(const DOM::Node& parent, const string& name); + + // Check if a given element is anothers ancestor + static bool hasAncestor(const DOM::Node& ancestor, const DOM::Node& node); +}; + +/* + * ElementTable + * + * A table of elements matched to their ids for quick access while applying + * things like fonts, styles, lists from their definitions. + */ +class ElementTable : + public std::map<wstring, DOM::Element> +{ +public: + void load(const DOM::Node& parent, const string& name); + + DOM::Element get(const wstring& id) const; + + bool has(const wstring& id) const + { return find(id) != end(); } + + void removeIds(); +}; + +// Some other handy types +typedef std::set<string> StringSet; +typedef std::stack<DOM::Node> NodeStack; + +/* + * ElementIterator + * + * For iterating through the elements in a document. + */ +class ElementIterator + : public std::iterator<std::input_iterator_tag, DOM::Element, ptrdiff_t> +{ +public: + ElementIterator() + { m_current = NULL; } + ElementIterator(const DOM::Element& top) + { m_top = top; m_current = top; next(); } + ElementIterator(const ElementIterator& x) + { m_top = x.m_top; m_current = x.m_current; } + + const DOM::Element& operator*() const + { return m_current; } + const DOM::Element* operator->() const + { return (&**this); } + const ElementIterator& operator++() + { next(); return (*this); } + const ElementIterator& operator--() + { prev(); return (*this); } + + // Friend comparision functions + friend bool operator==(const ElementIterator& x, const ElementIterator& y); + friend bool operator!=(const ElementIterator& x, const ElementIterator& y); + +// Implementation +protected: + + void next(); + DOM::Element nextel(DOM::Node node); + + void prev(); + DOM::Element prevel(DOM::Node node); + +// Data +protected: + + DOM::Element m_top; + DOM::Element m_current; + bool m_done; }; +// friend functions +inline bool operator==(const ElementIterator& x, const ElementIterator& y) + { return y.m_current == x.m_current && y.m_top == x.m_top; } +inline bool operator!=(const ElementIterator& x, const ElementIterator& y) + { return (!(x == y)); } + #endif // __DOMHELPERS_H__ diff --git a/src/sablo.h b/src/sablo.h index aecde18..196b70b 100644 --- a/src/sablo.h +++ b/src/sablo.h @@ -777,6 +777,8 @@ namespace DOM Element(const Element& node) : Node(node) {} + Element& operator=(const Node& other) + { Node::operator=(other); return *this; } Element& operator=(const Element& other) { Node::operator=(other); return *this; } Element& operator=(const void* null) @@ -946,6 +948,8 @@ namespace DOM CharacterData(const Node& node) : Node(node) { } + CharacterData& operator=(const Node& other) + { Node::operator=(other); return *this; } CharacterData& operator=(const CharacterData& other) { Node::operator=(other); return *this; } CharacterData& operator=(const void* null) @@ -1109,6 +1113,8 @@ namespace DOM Comment(const Comment& node) : CharacterData(node) { } + Comment& operator=(const Node& other) + { Node::operator=(other); return *this; } Comment& operator=(const Comment& other) { CharacterData::operator=(other); return *this; } Comment& operator=(void* null) @@ -1133,6 +1139,8 @@ namespace DOM ProcessingInstruction(const ProcessingInstruction& node) : Node(node) { } + ProcessingInstruction& operator=(const Node& other) + { Node::operator=(other); return *this; } ProcessingInstruction& operator=(const ProcessingInstruction& other) { Node::operator=(other); return *this; } ProcessingInstruction& operator=(void* null) @@ -1182,6 +1190,8 @@ namespace DOM DocumentFragment(const DocumentFragment& node) : Node(node) { } + DocumentFragment& operator=(const Node& other) + { Node::operator=(other); return *this; } DocumentFragment& operator=(const DocumentFragment& other) { Node::operator=(other); return *this; } DocumentFragment& operator=(void* null) @@ -1200,6 +1210,8 @@ namespace DOM Entity(const Entity& node) : Node(node) { } + Entity& operator=(const Node& other) + { Node::operator=(other); return *this; } Entity& operator=(const Entity& other) { Node::operator=(other); return *this; } Entity& operator=(void* null) @@ -1245,6 +1257,8 @@ namespace DOM EntityReference(const EntityReference& node) : Node(node) { } + EntityReference& operator=(const Node& other) + { Node::operator=(other); return *this; } EntityReference& operator=(const EntityReference& other) { Node::operator=(other); return *this; } EntityReference& operator=(void* null) @@ -1263,6 +1277,8 @@ namespace DOM Notation(const Notation& node) : Node(node) { } + Notation& operator=(const Node& other) + { Node::operator=(other); return *this; } Notation& operator=(const Notation& other) { Node::operator=(other); return *this; } Notation& operator=(void* null) @@ -1299,6 +1315,8 @@ namespace DOM DocumentType(const DocumentType& node) : Node(node) { } + DocumentType& operator=(const Node& other) + { Node::operator=(other); return *this; } DocumentType& operator=(const DocumentType& other) { Node::operator=(other); return *this; } DocumentType& operator=(void* null) diff --git a/src/xmlcomposer.cpp b/src/xmlcomposer.cpp index cb1bf16..dcd4c34 100644 --- a/src/xmlcomposer.cpp +++ b/src/xmlcomposer.cpp @@ -124,29 +124,33 @@ void XmlComposer::endDocument() { LevelHandler::endDocument(); - // Pass 0: Cleanup the tree - XmlFixups::combineDuplicates(m_document); - XmlFixups::consolidateStartTags(m_document); - XmlFixups::consolidateEndTags(m_document); + XmlFixups fix; + + // Pass 0: Cleanup the tree + // XmlFixups::combineDuplicates(m_document); + // XmlFixups::consolidateStartTags(m_document); + // XmlFixups::consolidateEndTags(m_document); // Pass 1: Block breakout - XmlFixups::breakTables(m_document); - XmlFixups::breakTags(m_document, kElTable, kElRow); - XmlFixups::breakTags(m_document, kElRow, kElCell); - XmlFixups::wrapTags(m_document, kElCell, kElDest); - XmlFixups::breakBlocks(m_document); - XmlFixups::breakLists(m_document); + fix.breakTables(m_document); + fix.breakTags(m_document, kElTable, kElRow); + fix.breakTags(m_document, kElRow, kElCell); + fix.wrapTags(m_document, kElCell, kElDest); + fix.breakBlocks(m_document); + fix.breakLists(m_document); // Pass 2: Fixups - XmlFixups::fixLists(m_document); - XmlFixups::fixStyles(m_document); - XmlFixups::fixBlocks(m_document); - XmlFixups::removeTags(m_document); - XmlFixups::breakBreak(m_document, kElDoc, kElPage); - XmlFixups::breakBreak(m_document, kElDoc, kElSect); + fix.runPassTwo(m_document); + + // XmlFixups::fixLists(m_document); + // XmlFixups::fixStyles(m_document); + // XmlFixups::fixBlocks(m_document); + // XmlFixups::removeTags(m_document); + // XmlFixups::breakBreak(m_document, kElDoc, kElPage); + // XmlFixups::breakBreak(m_document, kElDoc, kElSect); // Pass 3: Final cleanup - XmlFixups::combineDuplicates(m_document); + // XmlFixups::combineDuplicates(m_document); return; } @@ -330,7 +334,7 @@ void XmlComposer::incrementAutoCount(int type) #define DEFAULT_CONTROLWORD processDefault(cw, flags, param) #define DUMMY 1 == 1 #define NUM_ATTR(x) formatInt(x) -#define DO_EXTRAS() (m_composer->GetOptions().extras) +#define DO_EXTRAS() (m_composer->getOptions().extras) /* ---------------------------------------------------------------------------------- * BASE ANALYSER @@ -1049,7 +1053,7 @@ ON_CHARDATA(Content) AN_ELEMENT(kElFont); if(font != -1) - AN_ATTRIBUTE(kAtId, font); + AN_ATTRIBUTE(kAtName, font); if(fontsize != -1) AN_ATTRIBUTE(kAtSize, fontsize); diff --git a/src/xmlfixups.cpp b/src/xmlfixups.cpp index 707294d..b8c84f4 100644 --- a/src/xmlfixups.cpp +++ b/src/xmlfixups.cpp @@ -45,7 +45,7 @@ static const char* kNoDuplicates[] = { kElB, kElU, kElI, kElColor, kElHide, kElColor, kElSuper, kElSub, NULL }; static const char* kRemoveTags[] = - { kElDest, kElListdef, kElListtable, NULL }; + { kElDest, kElListdef, kElListtable, kElFontTable, NULL }; static const char* kBlockTags[] = { kElTable, kElPara, NULL }; @@ -59,19 +59,18 @@ static const char* kConsolidateEnd[] = static const char* kConsolidateStart[] = { kElStylesheet, kElInfo, NULL }; +void loadStringSet(StringSet& set, const char** strings) +{ + while(*strings) + set.insert(string(*strings)); +} -void XmlFixups::breakBreak(DOM::Document& doc, const string& contain, - const string& tag) +XmlFixups::XmlFixups() { - DOM::NodeList els = doc.getElementsByTagName(tag); - if(els != NULL) - { - for(int i = 0; i < els->getLength(); i++) - { - DOM::Element el = (const DOM::Element&)els->item(i); - breakElement(el, contain); - } - } + loadStringSet(m_duplicates, kNoDuplicates); + loadStringSet(m_removes, kRemoveTags); + loadStringSet(m_consolidateStart, kConsolidateStart); + loadStringSet(m_consolidateEnd, kConsolidateEnd); } bool XmlFixups::breakElement(const DOM::Element& el, const string& contain) @@ -315,19 +314,6 @@ void XmlFixups::breakTags(DOM::Document& doc, const string& parentName, if(parent != NULL && !DOMHelpers::isElement(parent, parentName)) parent.removeChild(tag); -#if 0 - else if(tag.hasChildNodes()) - { - DOM::NodeList children = tag.getChildNodes(); - if(children != NULL && children->getLength() == 1) - { - DOM::Node child = children->item(0); - if(child != NULL && !child.hasChildNodes() && - DOMHelpers::isElement(child, kElBlock)) - parent.removeChild(tag); - } - } -#endif } } } @@ -394,51 +380,255 @@ void XmlFixups::breakLists(DOM::Document& doc) } } -void XmlFixups::fixStyles(const DOM::Document doc) +void XmlFixups::runPassTwo(const DOM::Document& doc) { - // Get all stylesheet styles - DOM::NodeList styles = doc.getElementsByTagName(kElStyle); - if(styles != NULL) + /* + * Okay, this function is complicated and long. It was all broken up into + * shorter functions previously but that sucked for efficiency. Basically + * we want to iterate over the document as few times as possible and because + * of that we combine all of that here. + * + * In this pass: + * o Fix: + * - font names + * - style names + * - list attributes + * - block elements + * o Consolidate certain tags to end of doc + * o Consolidate certain tags to start of doc + * o Combine duplicates of certain tags + * o Remove certain tags + * o Break out pages and sections + */ + + bool haveStyles = false; + ElementTable styles; + + bool haveFonts = false; + ElementTable fonts; + + bool haveLists = false; + ElementTable lists; + + DOM::Element top = doc.getDocumentElement(); + + // Get stylesheet block + DOM::Element el = DOMHelpers::getChildElement(top, kElStylesheet); + if(el != NULL) + { + // Load the styles into a id mapped table + styles.load(el, kElStyle); + + if(!styles.empty()) + { + styles.removeIds(); + haveStyles = true; + } + } + + // Get the font block + el = DOMHelpers::getChildElement(top, kElFontTable); + if(el != NULL) + { + // Load the fonts into an id mapped table + fonts.load(el, kElFont); + + if(!fonts.empty()) + { + fonts.removeIds(); + haveFonts = true; + } + } + + // Get the list definition block + el = DOMHelpers::getChildElement(top, kElListtable); + if(el != NULL) { - // Get list of blocks in the document - DOM::NodeList blocks = doc.getElementsByTagName(kElBlock); - if(blocks != NULL) + // Load the lists into an id mapped table + lists.load(el, kElListdef); + + if(!lists.empty()) { - for(int i = 0; i < blocks->getLength(); i++) + lists.removeIds(); + haveLists = true; + } + } + + NodeStack toStart; // Nodes that get moved to beginning of document + NodeStack toEnd; // Nodes that get moved to the end of the document + + ElementIterator it(top); + ElementIterator end; + + for( ; it != end; ++it) + { + el = *it; + + // Mark each node as we've seen it so we don't + // do a given element twice + if((int)el.getUserData() == PASS_TWO) + continue; + + el.setUserData((void*)PASS_TWO); + string name = el.getNodeName(); + + if(name == kElBlock) + { + // Change style attribute on blocks to name + if(haveStyles && el.hasAttribute(kElStyle)) { - DOM::Element block = (const DOM::Element&)blocks->item(i); + DOM::Element style = styles.get(el.getAttribute(kElStyle)); + if(style != NULL) + el.setAttribute(kElStyle, style.getAttribute(kAtName)); + } - if(block == NULL || !block.hasAttribute(kElStyle)) - continue; + /* + * The below function call replaces the current element with another + * new element. The new element still needs to be processed, so we + * just backup one, and then short circuit the loop below. + */ - // Lookup block styles - for(int j = 0; j < styles->getLength(); j++) + // Now fix the block itself + fixBlock(doc, el); + + continue; // Current element no longer valid + } + + // Change id attribute on fonts to name + else if(haveFonts && name == kElFont) + { + if(el.hasAttribute(kAtId)) + { + DOM::Element font = fonts.get(el.getAttribute(kAtId)); + if(font != NULL) + el.setAttribute(kAtName, font.getAttribute(kAtName)); + } + } + + // Copy list attributes onto the lists + else if(haveLists && name == kElList) + { + if(el.hasAttribute(kAtList)) + { + DOM::Element list = lists.get(el.getAttribute(kAtList)); + if(list != NULL) { - DOM::Element style = (const DOM::Element&)styles->item(j); - if(style != NULL) - { - if(style.getAttribute(kAtId) == block.getAttribute(kElStyle)) - { - // And change to the name - wstring name = style.getAttribute(kAtName); - if(name.length() > 0) - block.setAttribute(kElStyle, name); - } - } + // And copy all the attributes from the list definition to the list + DOMHelpers::copyAttributes(list, el, kHideList); + el.removeAttribute(kAtList); } } } - // A little cleanup of the stylesheet styles - for(int i = 0; i < styles->getLength(); i++) + // Break out pages and sections all the way to document + if(name == kElPage || name == kElSect) { - DOM::Element style = (const DOM::Element&)styles->item(i); - if(style != NULL) - style.removeAttribute(kAtId); + breakElement(el, kElDoc); + + /* + * NOTE: The flow of the document is changed here. But the current + * element is still in a valid place for iterating over the document + * so we don't have to worry about it. + */ } - } -} + // Tags that just plain get removed + if(m_removes.find(name) != m_removes.end()) + { + DOM::Node parent = el->getParentNode(); + + if(parent != NULL) + { + /* + * After the element is removed, the current element is no longer + * valid for iterating over the document. In addition we insert + * all the child nodes of the current element before it. We need + * to be sure to iterate over these elements, and to do so we + * decrement the iterator. + */ + --it; + + while(el.hasChildNodes()) + parent.insertBefore(el.removeChild(el.getFirstChild()), el); + + parent.removeChild(el); + continue; /* Current element doesn't need any more processing */ + } + } + + + // Tags that need to get consolidated to start + if(m_consolidateStart.find(name) != m_consolidateStart.end()) + toStart.push(el); + + // Tags that need to get consolidated to end + else if(m_consolidateEnd.find(name) != m_consolidateEnd.end()) + toEnd.push(el); + + + // Tags for which duplicates need to be combined + if(m_duplicates.find(name) != m_duplicates.end()) + { + DOM::Element parent = (const DOM::Element&)el.getParentNode(); + if(parent != NULL) + { + // Loop till we find no more of the same + for(;;) + { + DOM::Node next = el.getNextSibling(); + + // If it's the same type of element ... + if(!DOMHelpers::isElement(next, name)) + break; + + // NOTE: Notice we do nothing with attributes. Currently + // all elements in the duplicates list don't need that. + + while(next.hasChildNodes()) + el.appendChild(next.removeChild(next.getFirstChild())); + + // Remove duplicate node + parent.removeChild(next); + } + } + } + } + + // Complete consolidation to front + while(!toStart.empty()) + { + DOM::Node node = toStart.top(); + DOM::Node parent = node.getParentNode(); + if(parent != NULL && DOMHelpers::hasAncestor(top, node)) + { + // Remove it from it's child + parent.removeChild(el); + + // And put at start of the document of the document + top.insertBefore(el, top.getFirstChild()); + } + + toStart.pop(); + } + + // Complete consolidation to end + while(!toEnd.empty()) + { + DOM::Node node = toEnd.top(); + DOM::Node parent = node.getParentNode(); + if(parent != NULL && DOMHelpers::hasAncestor(top, node)) + { + // Remove it from it's child + parent.removeChild(el); + + // And put at end of the document of the document + top.appendChild(el); + } + + toEnd.pop(); + } + +} void XmlFixups::breakTables(DOM::Document& doc) { @@ -526,258 +716,44 @@ void XmlFixups::breakTables(DOM::Document& doc) } } -void XmlFixups::removeTags(const DOM::Document& doc) -{ - // Go through the list of nodes - for(const char** t = kRemoveTags; *t != NULL; t++) - { - DOM::NodeList elements = doc.getElementsByTagName(*t); - if(elements != NULL) - { - for(int j = 0; j < elements->getLength(); j++) - { - DOM::Element el = (const DOM::Element&)elements->item(j); - DOM::Node parent = el->getParentNode(); - if(parent == NULL) - continue; - - while(el.hasChildNodes()) - parent.insertBefore(el.removeChild(el.getFirstChild()), el); - - parent.removeChild(el); - } - } - } -} - -void XmlFixups::fixLists(const DOM::Document doc) +void XmlFixups::fixBlock(const DOM::Document& doc, DOM::Element& block) { - // Get all the lists - DOM::NodeList lists = doc.getElementsByTagName(kElList); - if(lists != NULL) - { - // And all the list definitions - DOM::NodeList listdefs = doc.getElementsByTagName(kElListdef); - if(listdefs != NULL) - { - for(int i = 0; i < listdefs->getLength(); i++) - { - DOM::Element listdef = (const DOM::Element&)listdefs->item(i); + // Okay now change blocks to whatever element they're supposed to be + string fix; + wstring val; - if(listdef == NULL || !listdef.hasAttribute(kAtList)) - continue; + DOM::Node parent = block.getParentNode(); - for(int j = 0; j < lists->getLength(); j++) - { - DOM::Element list = (const DOM::Element&)lists->item(j); - if(list != NULL) - { - if(list.getAttribute(kAtList) == listdef.getAttribute(kAtList)) - { - // And copy all the attributes from the list definition to the list - DOMHelpers::copyAttributes(listdef, list, kHideList); - list.removeAttribute(kAtList); - } - } - } - } - } - } -} - -void XmlFixups::fixBlocks(const DOM::Document doc) -{ - // Get all the blocks - DOM::NodeList blocks = doc.getElementsByTagName(kElBlock); - if(blocks != NULL) + if(parent != NULL) { - string fix; - wstring val; + // Figure out what kind of element they want block fixed to + val = block.getAttribute(kAtFix); + if(val.length() > 0) + block.removeAttribute(kAtFix); - for(int i = 0; i < blocks->getLength(); i++) + // BUG: Sablotron bug work around + if(val.length() > 0) { - DOM::Element block = (const DOM::Element&)blocks->item(i); - DOM::Node parent = block.getParentNode(); - - if(parent == NULL) - continue; - - fix.resize(0); - val.resize(0); - - // Figure out what kind of element they want block fixed to - val = block.getAttribute(kAtFix); - if(val.length() > 0) - block.removeAttribute(kAtFix); - - // BUG: Sablotron bug work around - if(val.length() > 0) - { - val = block.getAttributeNS("", kAtFix); - if(val.length() > 0) - block.removeAttributeNS("", kAtFix); - } - + val = block.getAttributeNS("", kAtFix); if(val.length() > 0) - DOM::transcode16to8(val, fix); - - if(fix.length() == 0) - fix = kElPara; - - // Create duplicate of the 'fix' element - DOM::Element el = doc.createElement(fix); - DOMHelpers::copyAttributes(block, el, NULL); - - // Replace block with the given 'fix' element - while(block.hasChildNodes()) - el.appendChild(block.removeChild(block.getFirstChild())); - - parent.replaceChild(el, block); + block.removeAttributeNS("", kAtFix); } - } -} -void XmlFixups::consolidateEndTags(DOM::Document& doc) -{ - DOM::Element top = doc.getDocumentElement(); - ASSERT(top != NULL); + if(val.length() > 0) + DOM::transcode16to8(val, fix); - for(const char** t = kConsolidateEnd; *t != NULL; t++) - { - DOM::NodeList elements = doc.getElementsByTagName(*t); - if(elements != NULL) - { - int x = elements->getLength(); - for(int j = 0; j < x; j++) - { - // Make sure it's a valid element - DOM::Element element = (const DOM::Element&)elements->item(j); - if(element == NULL) - continue; - - DOM::Element parent = (const DOM::Element&)element.getParentNode(); - if(parent == NULL) - continue; - - // Remove it from it's child - parent.removeChild(element); - - // And append it to the end of the document - top.appendChild(element); - } - } - } -} - -void XmlFixups::consolidateStartTags(DOM::Document& doc) -{ - DOM::Element top = doc.getDocumentElement(); - ASSERT(top != NULL); - - DOM::Node first = top.getFirstChild(); + if(fix.length() == 0) + fix = kElPara; - for(const char** t = kConsolidateStart; *t != NULL; t++) - { - DOM::NodeList elements = doc.getElementsByTagName(*t); - if(elements != NULL) - { - int x = elements->getLength(); - for(int j = 0; j < x; j++) - { - // Make sure it's a valid element - DOM::Element element = (const DOM::Element&)elements->item(j); - if(element == NULL || element == first) - continue; - - DOM::Element parent = (const DOM::Element&)element.getParentNode(); - if(parent == NULL) - continue; - - // Remove it from it's child - parent.removeChild(element); + // Create duplicate of the 'fix' element + DOM::Element el = doc.createElement(fix); + DOMHelpers::copyAttributes(block, el, NULL); - // And put at start of the document of the document - ASSERT(first != NULL); - top.insertBefore(element, first); - } - } - } -} - -void XmlFixups::combineDuplicates(const DOM::Document& doc) -{ - bool found; - - do - { - found = false; - - // Go through the list of nodes - for(const char** t = kNoDuplicates; *t != NULL; t++) - { - DOM::NodeList elements = doc.getElementsByTagName(*t); - if(elements != NULL) - { - int x = elements->getLength(); - for(int j = 0; j < x; j++) - { - // Make sure it's a valid element - DOM::Element element = (const DOM::Element&)elements->item(j); - if(element == NULL) - continue; - - // Get neighbors - DOM::Node previous = element.getPreviousSibling(); - DOM::Node next = element.getNextSibling(); - - // Make sure it's still in the document, as we may have - // removed it on a previous loop - DOM::Node parent = element.getParentNode(); - if(parent == NULL) - continue; - - // Combine previous if valid - if(previous != NULL && previous.getNodeType() == DOM::Node::ELEMENT_NODE && - DOMHelpers::isEqualElement((DOM::Element&)previous, element)) - { - while(previous.hasChildNodes()) - { - DOM::Node child = previous.removeChild(previous.getLastChild()); - if(child != NULL) - { - if(element.hasChildNodes()) - element.insertBefore(child, element.getFirstChild()); - else - element.appendChild(child); - } - } - - // Remove duplicate node - parent.removeChild(previous); - found = true; - } - - // Combine next if valid - if(next != NULL && next.getNodeType() == DOM::Node::ELEMENT_NODE && - DOMHelpers::isEqualElement((DOM::Element&)next, element)) - { - while(next.hasChildNodes()) - { - DOM::Node child = next.removeChild(next.getFirstChild()); - if(child != NULL) - element.appendChild(child); - } - - // Remove duplicate node - parent.removeChild(next); - found = true; - } - } - } - } + // Replace block with the given 'fix' element + while(block.hasChildNodes()) + el.appendChild(block.removeChild(block.getFirstChild())); - // Keep looping until no more duplicates found + parent.replaceChild(el, block); } - while(found); } diff --git a/src/xmlfixups.h b/src/xmlfixups.h index f00bb66..01c2d67 100644 --- a/src/xmlfixups.h +++ b/src/xmlfixups.h @@ -40,6 +40,7 @@ #define __XMLFIXUPS_H__ #include "sablo.h" +#include "domhelpers.h" /* * XmlFixups @@ -55,14 +56,7 @@ class XmlFixups { public: - // Replace blocks with 'fix' elements like paragraphs - static void fixBlocks(DOM::Document doc); - - // Pass 2 list fixups - static void fixLists(const DOM::Document doc); - - // Pass 2 style fixups - static void fixStyles(const DOM::Document doc); + XmlFixups(); /* * Breaks a paragraph up through a previous level. Calls itself @@ -83,19 +77,19 @@ public: * <b>test of </b> your concentration. * </dest> */ - static bool breakElement(const DOM::Element& el, const string& contain); + bool breakElement(const DOM::Element& el, const string& contain); // Break all tags of a given type to a previous level (see above) - static void breakBreak(DOM::Document& doc, const string& contain, const string& tag); + void breakBreak(DOM::Document& doc, const string& contain, const string& tag); // Used to break tables cells and rows into blocks (but more complicated) - static void breakTags(DOM::Document& doc, const string& parentName, const string& tagName); + void breakTags(DOM::Document& doc, const string& parentName, const string& tagName); // Fixes and combines list elements with the same id - static void breakLists(DOM::Document& document); + void breakLists(DOM::Document& document); // Used to find and create tables and perform initial break out - static void breakTables(DOM::Document& document); + void breakTables(DOM::Document& document); /* @@ -115,22 +109,45 @@ public: * <para style="10"> This is <b> a </b></para> * <para><b>test of </b> your concentration.</para> */ - static void breakBlocks(DOM::Document& document); + void breakBlocks(DOM::Document& document); // Wrap certain tags in a wrapper tag of given name - static void wrapTags(DOM::Document& document, const string& tagName, const string& wrapName); + void wrapTags(DOM::Document& document, const string& tagName, const string& wrapName); // Remove certain tags from document - static void removeTags(const DOM::Document& doc); + void removeTags(const DOM::Document& doc); // Combines certain adjacent duplicate tags - static void combineDuplicates(const DOM::Document& doc); + void combineDuplicates(const DOM::Document& doc); // Consolidates a certain tag types at the beginning of the document - static void consolidateStartTags(DOM::Document& doc); + void consolidateStartTags(DOM::Document& doc); // Consolidates a certain tag types at the end of the document - static void consolidateEndTags(DOM::Document& doc); + void consolidateEndTags(DOM::Document& doc); + + + // The main pass 2 function + void runPassTwo(const DOM::Document& doc); + + // Replace blocks with 'fix' elements like paragraphs + void fixBlock(const DOM::Document& doc, DOM::Element& block); + + +protected: + + enum + { + PASS_0, + PASS_1, + PASS_TWO + }; + + // Our tables cached for efficiency + StringSet m_duplicates; + StringSet m_removes; + StringSet m_consolidateStart; + StringSet m_consolidateEnd; }; #endif // __XMLFIXUPS_H__ diff --git a/test-files/sample.xml b/test-files/sample.xml index 963387c..d1b1318 100644 --- a/test-files/sample.xml +++ b/test-files/sample.xml @@ -1 +1 @@ -<?xml version="1.0" encoding="UTF-8"?><document><stylesheet><style name="Normal"/><style name="heading 1"/><style name="footnote text"/></stylesheet><info><title>This is a test RTF</title><author>Nate</author><operator>Nate</operator></info><para style="heading 1"><i_fonttable><font id="0"/><font id="1"/><font id="2"/><font id="3"/><font id="10"/><font id="121"/><font id="122"/><font id="124"/><font id="125"/><font id="126"/><font id="127"/><font id="128"/><font id="129"/><font id="131"/><font id="132"/><font id="134"/><font id="135"/><font id="136"/><font id="137"/><font id="138"/><font id="139"/><font id="141"/><font id="142"/><font id="144"/><font id="145"/><font id="146"/><font id="147"/><font id="148"/><font id="149"/></i_fonttable><b>This is a test RTF</b></para><para>Hi! I’m a test file. This is some <b>bold</b> text, and some <i>italic</i> text, as well as some <u>underline</u> text. And a bit of <hide>hidden</hide> text. So we’re going to end this paragraph here and go on to a nice little list:</para><para/><list type="disc" ordered="0" start="1"><para>Item 1</para><para>Item 2</para><para>Item 3</para><para>Item 4</para></list><para/><para>And now comes a fun table:</para><para/><table><row><cell><para>Cell 1</para></cell><cell><para>Cell 2</para><para>More in cell 2</para></cell><cell><para>Cell 3</para></cell></row><row><cell><para>Next row</para></cell><cell><para>Next row </para></cell><cell><para>Next row</para></cell></row></table><para/><para>A page break:</para><page/><para>And here we’re on the next page. </para><para>This para has a <ref type="footnote" to="1"><super>1</super></ref>footnote.</para><para>And here’s yet another paragraph. </para><para/><footnote id="1"><para><super>1</super> This is the actual content of the footnote.</para></footnote></document>
\ No newline at end of file +<?xml version="1.0" encoding="UTF-8"?><document><stylesheet><style name="Normal"/><style name="heading 1"/><style name="footnote text"/></stylesheet><info><title>This is a test RTF</title><author>Nate</author><operator>Nate</operator></info><para style="heading 1"><i_fonttable><font id="0" name="Times New Roman"/><font id="1" name="Arial"/><font id="2" name="Courier New"/><font id="3" name="Symbol"/><font id="10" name="Wingdings"/><font id="121" name="Times New Roman CE"/><font id="122" name="Times New Roman Cyr"/><font id="124" name="Times New Roman Greek"/><font id="125" name="Times New Roman Tur"/><font id="126" name="Times New Roman (Hebrew)"/><font id="127" name="Times New Roman (Arabic)"/><font id="128" name="Times New Roman Baltic"/><font id="129" name="Times New Roman (Vietnamese)"/><font id="131" name="Arial CE"/><font id="132" name="Arial Cyr"/><font id="134" name="Arial Greek"/><font id="135" name="Arial Tur"/><font id="136" name="Arial (Hebrew)"/><font id="137" name="Arial (Arabic)"/><font id="138" name="Arial Baltic"/><font id="139" name="Arial (Vietnamese)"/><font id="141" name="Courier New CE"/><font id="142" name="Courier New Cyr"/><font id="144" name="Courier New Greek"/><font id="145" name="Courier New Tur"/><font id="146" name="Courier New (Hebrew)"/><font id="147" name="Courier New (Arabic)"/><font id="148" name="Courier New Baltic"/><font id="149" name="Courier New (Vietnamese)"/></i_fonttable><font id="1" size="32"><b>This is a test RTF</b></font></para><para><font size="24">Hi! I’m a test file. This is some </font><font size="24"><b>bold</b></font><font size="24"> text, and some </font><font size="24"><i>italic</i></font><font size="24"> text, as well as some </font><font size="24"><u>underline</u></font><font size="24"> text. And a bit of </font><font size="24"><hide>hidden</hide></font><font size="24"> text. So we’re going to end this paragraph here and go on to a nice little list:</font></para><para/><list type="disc" ordered="0" start="1"><para><font size="24">Item 1</font></para><para><font size="24">Item 2</font></para><para><font size="24">Item 3</font></para><para><font size="24">Item 4</font></para></list><para/><para><font size="24">And now comes a fun table:</font></para><para/><table><row><cell><para><font size="24">Cell 1</font></para></cell><cell><para><font size="24">Cell 2</font></para><para><font size="24">More in cell 2</font></para></cell><cell><para><font size="24">Cell 3</font></para></cell></row><row><cell><para><font size="24">Next row</font></para></cell><cell><para><font size="24">Next row </font></para></cell><cell><para><font size="24">Next row</font></para></cell></row></table><para/><para><font size="24">A page break:</font></para><page/><para><font size="24">And here we’re on the next page.</font><font size="24"> </font></para><para><font size="24">This para has a </font><ref type="footnote" to="1"><font size="24"><super>1</super></font></ref><font size="24">footnote.</font></para><para><font size="24">And here’s yet another paragraph. </font></para><para/><footnote id="1"><para><font size="20"><super>1</super></font><font size="20"> This is the actual content of the footnote.</font></para></footnote></document>
\ No newline at end of file |