summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStef Walter <stef@memberwebs.com>2004-07-24 19:06:51 +0000
committerStef Walter <stef@memberwebs.com>2004-07-24 19:06:51 +0000
commit8335fdb6b7e7afb57d096e0f3a453b662f7a23c0 (patch)
treeef3c3079f58b44cb9f1b2953f05e1628d6846e9b
parentff4568d01651afd615751f9fc683dbe30f2ced9b (diff)
- Post processing code cleanup.
-rw-r--r--src/domhelpers.cpp153
-rw-r--r--src/domhelpers.h85
-rw-r--r--src/sablo.h18
-rw-r--r--src/xmlcomposer.cpp42
-rw-r--r--src/xmlfixups.cpp566
-rw-r--r--src/xmlfixups.h55
-rw-r--r--test-files/sample.xml2
7 files changed, 587 insertions, 334 deletions
diff --git a/src/domhelpers.cpp b/src/domhelpers.cpp
index 6cf8052..ac93f10 100644
--- a/src/domhelpers.cpp
+++ b/src/domhelpers.cpp
@@ -40,6 +40,8 @@
#include "domhelpers.h"
#include "tags.h"
+using std::make_pair;
+
bool DOMHelpers::isElement(const DOM::Node& node, const string& name)
{
return node != NULL && node.getNodeType() == DOM::Node::ELEMENT_NODE &&
@@ -176,3 +178,154 @@ void DOMHelpers::insertAfter(DOM::Node& parent, const DOM::Node& node,
else
parent.insertBefore(node, sibling);
}
+
+/*
+ * Returns the first direct child of 'parent' for which isElement(child, name)
+ * holds, or a default-constructed DOM::Element when no child matches.
+ */
+DOM::Element DOMHelpers::getChildElement(const DOM::Node& parent, const string& name)
+{
+    DOM::Node child = parent.getFirstChild();
+    while(child != NULL)
+    {
+        if(isElement(child, name))
+            return (DOM::Element&)child;
+
+        // Step to the next sibling. Without this the loop never ends as soon
+        // as the first child is not the element being looked for.
+        child = child.getNextSibling();
+    }
+
+    return DOM::Element();
+}
+
+/*
+ * Walks from 'node' upwards through getParentNode() and reports whether
+ * 'ancestor' is met on the way. The walk starts at 'node' itself, so a node
+ * counts as its own ancestor here.
+ */
+bool DOMHelpers::hasAncestor(const DOM::Node& ancestor, const DOM::Node& node)
+{
+    for(DOM::Node cur = node; cur != NULL; cur = cur.getParentNode())
+    {
+        if(cur == ancestor)
+            return true;
+    }
+
+    // Ran off the top of the tree without meeting 'ancestor'
+    return false;
+}
+
+
+/* ----------------------------------------------------------------------------------
+ * ElementTable
+ */
+
+/*
+ * Clears the table, then indexes every direct child of 'parent' that
+ * isElement(child, name) accepts and that carries a non-empty kAtId
+ * attribute, keyed by that attribute's value.
+ */
+void ElementTable::load(const DOM::Node& parent, const string& name)
+{
+    clear();
+
+    DOM::Node child = parent.getFirstChild();
+    while(child != NULL)
+    {
+        if(DOMHelpers::isElement(child, name))
+        {
+            DOM::Element& el = (DOM::Element&)child;
+            wstring id = el.getAttribute(kAtId);
+
+            // insert() stores a copy, so advancing 'child' below is safe
+            if(!id.empty())
+                insert(make_pair(id, el));
+        }
+
+        // Step to the next sibling. Without this the loop never ends on any
+        // parent that has at least one child.
+        child = child.getNextSibling();
+    }
+}
+
+/*
+ * Looks up the element stored under 'id'. Returns a default-constructed
+ * DOM::Element when the id is not in the table.
+ */
+DOM::Element ElementTable::get(const wstring& id) const
+{
+    const_iterator found = find(id);
+    if(found != end())
+        return found->second;
+
+    return DOM::Element();
+}
+
+/*
+ * Removes the kAtId attribute from every element held in the table. The
+ * table entries themselves are left in place.
+ */
+void ElementTable::removeIds()
+{
+    for(iterator cur = begin(); cur != end(); ++cur)
+    {
+        DOM::Element& el = cur->second;
+        el.removeAttribute(kAtId);
+    }
+}
+
+/* ----------------------------------------------------------------------------------
+ * ElementIterator
+ */
+
+/*
+ * Advances to the next element below m_top in document (pre-order) order:
+ * first child element, else next sibling element, else the next sibling
+ * element of the closest ancestor that has one. Sets m_current to NULL when
+ * the subtree is exhausted. Does nothing when already exhausted.
+ *
+ * The lookups work on local copies so m_current is only overwritten once a
+ * real successor is known; the previous code assigned a null lookup result
+ * to m_current and then called methods on it, and never continued with an
+ * ancestor's siblings.
+ */
+void ElementIterator::next()
+{
+    if(m_current == NULL)
+        return;
+
+    DOM::Node node = m_current;
+    DOM::Element found;
+
+    // Always descend into children first
+    if(node.hasChildNodes())
+    {
+        found = nextel(node.getFirstChild());
+        if(found != NULL)
+        {
+            m_current = found;
+            return;
+        }
+    }
+
+    // Look for siblings along the current level, and when a level is
+    // exhausted go back up to the parent and continue with its siblings.
+    // Never look past the top element.
+    while(node != NULL && node != m_top)
+    {
+        found = nextel(node.getNextSibling());
+        if(found != NULL)
+        {
+            m_current = found;
+            return;
+        }
+
+        node = node.getParentNode();
+    }
+
+    // Nothing left under m_top
+    m_current = NULL;
+}
+
+/*
+ * Starting at 'node' and moving through getNextSibling(), returns the first
+ * node whose type is ELEMENT_NODE ('node' itself included). Returns a
+ * default-constructed DOM::Element when the sibling chain runs out.
+ */
+DOM::Element ElementIterator::nextel(DOM::Node node)
+{
+    for( ; node != NULL; node = node.getNextSibling())
+    {
+        if(node.getNodeType() == DOM::Element::ELEMENT_NODE)
+            return (DOM::Element&)node;
+    }
+
+    return DOM::Element();
+}
+
+/*
+ * Steps back to the element that precedes the current one in document
+ * (pre-order) order, i.e. the inverse of next():
+ *  - with an earlier sibling element: that sibling's deepest last element
+ *  - otherwise: the parent (which is m_top for top's first child element;
+ *    that is the same position the constructor starts from before its
+ *    initial next())
+ * An exhausted iterator (m_current == NULL) backs in to the last element
+ * under m_top.
+ *
+ * The previous code overwrote m_current with a null lookup result and then
+ * called methods on it, and stepped into the current element's own children
+ * rather than to its predecessor.
+ */
+void ElementIterator::prev()
+{
+    DOM::Element target;
+
+    if(m_current == NULL)
+    {
+        /* Allow backing into the iterator */
+        if(m_top == NULL)
+            return;
+
+        target = m_top;
+    }
+    else
+    {
+        // Already positioned on the top element: nothing comes before it
+        if(m_current == m_top)
+            return;
+
+        target = prevel(m_current.getPreviousSibling());
+        if(target == NULL)
+        {
+            // No earlier sibling element, so the parent is the predecessor
+            m_current = m_current.getParentNode();
+            return;
+        }
+    }
+
+    // Descend to the deepest last element below the chosen starting point
+    while(target.hasChildNodes())
+    {
+        DOM::Element last = prevel(target.getLastChild());
+        if(last == NULL)
+            break;
+
+        target = last;
+    }
+
+    m_current = target;
+}
+
+
+/*
+ * Starting at 'node' and moving through getPreviousSibling(), returns the
+ * first node whose type is ELEMENT_NODE ('node' itself included). Returns a
+ * default-constructed DOM::Element when the sibling chain runs out.
+ */
+DOM::Element ElementIterator::prevel(DOM::Node node)
+{
+    for( ; node != NULL; node = node.getPreviousSibling())
+    {
+        if(node.getNodeType() == DOM::Element::ELEMENT_NODE)
+            return (DOM::Element&)node;
+    }
+
+    return DOM::Element();
+}
diff --git a/src/domhelpers.h b/src/domhelpers.h
index 043ffd4..d125f80 100644
--- a/src/domhelpers.h
+++ b/src/domhelpers.h
@@ -40,6 +40,9 @@
#define __DOMHELPERS_H__
#include "sablo.h"
+#include <map>
+#include <stack>
+#include <set>
/*
* DOMHelpers
@@ -68,6 +71,88 @@ public:
// Get previous element (in XML flow) of a given name
static DOM::Element getPriorElement(const DOM::Node& node, const string& name);
+
+ // Get first child element of a given name
+ static DOM::Element getChildElement(const DOM::Node& parent, const string& name);
+
+ // Check whether 'ancestor' is 'node' itself or one of its ancestors
+ static bool hasAncestor(const DOM::Node& ancestor, const DOM::Node& node);
+};
+
+/*
+ * ElementTable
+ *
+ * A table of elements matched to their ids for quick access while applying
+ * things like fonts, styles, lists from their definitions.
+ */
+class ElementTable
+    : public std::map<wstring, DOM::Element>
+{
+public:
+    // Rebuild the table from the children of 'parent' that are elements
+    // called 'name'
+    void load(const DOM::Node& parent, const string& name);
+
+    // Element stored under 'id' (see the definition for the miss case)
+    DOM::Element get(const wstring& id) const;
+
+    // True when an element is stored under 'id'
+    bool has(const wstring& id) const
+    {
+        return count(id) != 0;
+    }
+
+    // Strip the id attribute from every stored element
+    void removeIds();
+};
+
+// Some other handy types
+typedef std::set<string> StringSet;
+typedef std::stack<DOM::Node> NodeStack;
+
+/*
+ * ElementIterator
+ *
+ * For iterating through the elements in a document.
+ */
+class ElementIterator
+    : public std::iterator<std::input_iterator_tag, DOM::Element, ptrdiff_t>
+{
+public:
+    // Exhausted iterator: no top element and no current element
+    ElementIterator()
+        : m_done(false)
+        { m_current = NULL; }
+
+    // Iterate over the elements below 'top'. Starts on 'top' and advances
+    // once, so 'top' itself is not produced.
+    ElementIterator(const DOM::Element& top)
+        : m_top(top), m_current(top), m_done(false)
+        { next(); }
+
+    ElementIterator(const ElementIterator& x)
+        : m_top(x.m_top), m_current(x.m_current), m_done(x.m_done)
+        { }
+
+    // Element the iterator currently stands on
+    const DOM::Element& operator*() const
+        { return m_current; }
+    const DOM::Element* operator->() const
+        { return (&**this); }
+
+    // Prefix step forwards / backwards (see next() and prev())
+    const ElementIterator& operator++()
+        { next(); return (*this); }
+    const ElementIterator& operator--()
+        { prev(); return (*this); }
+
+    // Friend comparison functions
+    friend bool operator==(const ElementIterator& x, const ElementIterator& y);
+    friend bool operator!=(const ElementIterator& x, const ElementIterator& y);
+
+// Implementation
+protected:
+
+    void next();
+    DOM::Element nextel(DOM::Node node);
+
+    void prev();
+    DOM::Element prevel(DOM::Node node);
+
+// Data
+protected:
+
+    DOM::Element m_top;         // Element the iteration is confined to
+    DOM::Element m_current;     // Current position, NULL once exhausted
+    bool m_done;                // Not read by the code shown here; now always
+                                // initialised instead of left indeterminate
};
+// friend functions
+
+// Two iterators are equal when they stand on the same element of the same
+// iteration. Exhausted iterators (m_current == NULL) compare equal whatever
+// their m_top is, so a default-constructed ElementIterator can serve as the
+// end sentinel in 'it != end' loops. Comparing m_top unconditionally meant
+// such a loop could never terminate.
+inline bool operator==(const ElementIterator& x, const ElementIterator& y)
+{
+    if(x.m_current == NULL || y.m_current == NULL)
+        return x.m_current == NULL && y.m_current == NULL;
+
+    return y.m_current == x.m_current && y.m_top == x.m_top;
+}
+inline bool operator!=(const ElementIterator& x, const ElementIterator& y)
+    { return (!(x == y)); }
+
#endif // __DOMHELPERS_H__
diff --git a/src/sablo.h b/src/sablo.h
index aecde18..196b70b 100644
--- a/src/sablo.h
+++ b/src/sablo.h
@@ -777,6 +777,8 @@ namespace DOM
Element(const Element& node) :
Node(node) {}
+ // Assignment from a generic Node: forwards to Node::operator= and
+ // returns this Element. No node-type check is made here.
+ Element& operator=(const Node& other)
+ {
+     Node::operator=(other);
+     return *this;
+ }
Element& operator=(const Element& other)
{ Node::operator=(other); return *this; }
Element& operator=(const void* null)
@@ -946,6 +948,8 @@ namespace DOM
CharacterData(const Node& node) :
Node(node) { }
+ // Assignment from a generic Node: forwards to Node::operator= and
+ // returns this CharacterData. No node-type check is made here.
+ CharacterData& operator=(const Node& other)
+ {
+     Node::operator=(other);
+     return *this;
+ }
CharacterData& operator=(const CharacterData& other)
{ Node::operator=(other); return *this; }
CharacterData& operator=(const void* null)
@@ -1109,6 +1113,8 @@ namespace DOM
Comment(const Comment& node) :
CharacterData(node) { }
+ // Assignment from a generic Node: forwards straight to Node::operator=
+ // and returns this Comment. No node-type check is made here.
+ Comment& operator=(const Node& other)
+ {
+     Node::operator=(other);
+     return *this;
+ }
Comment& operator=(const Comment& other)
{ CharacterData::operator=(other); return *this; }
Comment& operator=(void* null)
@@ -1133,6 +1139,8 @@ namespace DOM
ProcessingInstruction(const ProcessingInstruction& node) :
Node(node) { }
+ // Assignment from a generic Node: forwards to Node::operator= and
+ // returns this ProcessingInstruction. No node-type check is made here.
+ ProcessingInstruction& operator=(const Node& other)
+ {
+     Node::operator=(other);
+     return *this;
+ }
ProcessingInstruction& operator=(const ProcessingInstruction& other)
{ Node::operator=(other); return *this; }
ProcessingInstruction& operator=(void* null)
@@ -1182,6 +1190,8 @@ namespace DOM
DocumentFragment(const DocumentFragment& node) :
Node(node) { }
+ // Assignment from a generic Node: forwards to Node::operator= and
+ // returns this DocumentFragment. No node-type check is made here.
+ DocumentFragment& operator=(const Node& other)
+ {
+     Node::operator=(other);
+     return *this;
+ }
DocumentFragment& operator=(const DocumentFragment& other)
{ Node::operator=(other); return *this; }
DocumentFragment& operator=(void* null)
@@ -1200,6 +1210,8 @@ namespace DOM
Entity(const Entity& node) :
Node(node) { }
+ // Assignment from a generic Node: forwards to Node::operator= and
+ // returns this Entity. No node-type check is made here.
+ Entity& operator=(const Node& other)
+ {
+     Node::operator=(other);
+     return *this;
+ }
Entity& operator=(const Entity& other)
{ Node::operator=(other); return *this; }
Entity& operator=(void* null)
@@ -1245,6 +1257,8 @@ namespace DOM
EntityReference(const EntityReference& node) :
Node(node) { }
+ // Assignment from a generic Node: forwards to Node::operator= and
+ // returns this EntityReference. No node-type check is made here.
+ EntityReference& operator=(const Node& other)
+ {
+     Node::operator=(other);
+     return *this;
+ }
EntityReference& operator=(const EntityReference& other)
{ Node::operator=(other); return *this; }
EntityReference& operator=(void* null)
@@ -1263,6 +1277,8 @@ namespace DOM
Notation(const Notation& node) :
Node(node) { }
+ // Assignment from a generic Node: forwards to Node::operator= and
+ // returns this Notation. No node-type check is made here.
+ Notation& operator=(const Node& other)
+ {
+     Node::operator=(other);
+     return *this;
+ }
Notation& operator=(const Notation& other)
{ Node::operator=(other); return *this; }
Notation& operator=(void* null)
@@ -1299,6 +1315,8 @@ namespace DOM
DocumentType(const DocumentType& node) :
Node(node) { }
+ // Assignment from a generic Node: forwards to Node::operator= and
+ // returns this DocumentType. No node-type check is made here.
+ DocumentType& operator=(const Node& other)
+ {
+     Node::operator=(other);
+     return *this;
+ }
DocumentType& operator=(const DocumentType& other)
{ Node::operator=(other); return *this; }
DocumentType& operator=(void* null)
diff --git a/src/xmlcomposer.cpp b/src/xmlcomposer.cpp
index cb1bf16..dcd4c34 100644
--- a/src/xmlcomposer.cpp
+++ b/src/xmlcomposer.cpp
@@ -124,29 +124,33 @@ void XmlComposer::endDocument()
{
LevelHandler::endDocument();
- // Pass 0: Cleanup the tree
- XmlFixups::combineDuplicates(m_document);
- XmlFixups::consolidateStartTags(m_document);
- XmlFixups::consolidateEndTags(m_document);
+ XmlFixups fix;
+
+ // Pass 0: Cleanup the tree
+ // XmlFixups::combineDuplicates(m_document);
+ // XmlFixups::consolidateStartTags(m_document);
+ // XmlFixups::consolidateEndTags(m_document);
// Pass 1: Block breakout
- XmlFixups::breakTables(m_document);
- XmlFixups::breakTags(m_document, kElTable, kElRow);
- XmlFixups::breakTags(m_document, kElRow, kElCell);
- XmlFixups::wrapTags(m_document, kElCell, kElDest);
- XmlFixups::breakBlocks(m_document);
- XmlFixups::breakLists(m_document);
+ fix.breakTables(m_document);
+ fix.breakTags(m_document, kElTable, kElRow);
+ fix.breakTags(m_document, kElRow, kElCell);
+ fix.wrapTags(m_document, kElCell, kElDest);
+ fix.breakBlocks(m_document);
+ fix.breakLists(m_document);
// Pass 2: Fixups
- XmlFixups::fixLists(m_document);
- XmlFixups::fixStyles(m_document);
- XmlFixups::fixBlocks(m_document);
- XmlFixups::removeTags(m_document);
- XmlFixups::breakBreak(m_document, kElDoc, kElPage);
- XmlFixups::breakBreak(m_document, kElDoc, kElSect);
+ fix.runPassTwo(m_document);
+
+ // XmlFixups::fixLists(m_document);
+ // XmlFixups::fixStyles(m_document);
+ // XmlFixups::fixBlocks(m_document);
+ // XmlFixups::removeTags(m_document);
+ // XmlFixups::breakBreak(m_document, kElDoc, kElPage);
+ // XmlFixups::breakBreak(m_document, kElDoc, kElSect);
// Pass 3: Final cleanup
- XmlFixups::combineDuplicates(m_document);
+ // XmlFixups::combineDuplicates(m_document);
return;
}
@@ -330,7 +334,7 @@ void XmlComposer::incrementAutoCount(int type)
#define DEFAULT_CONTROLWORD processDefault(cw, flags, param)
#define DUMMY 1 == 1
#define NUM_ATTR(x) formatInt(x)
-#define DO_EXTRAS() (m_composer->GetOptions().extras)
+#define DO_EXTRAS() (m_composer->getOptions().extras)
/* ----------------------------------------------------------------------------------
* BASE ANALYSER
@@ -1049,7 +1053,7 @@ ON_CHARDATA(Content)
AN_ELEMENT(kElFont);
if(font != -1)
- AN_ATTRIBUTE(kAtId, font);
+ AN_ATTRIBUTE(kAtName, font);
if(fontsize != -1)
AN_ATTRIBUTE(kAtSize, fontsize);
diff --git a/src/xmlfixups.cpp b/src/xmlfixups.cpp
index 707294d..b8c84f4 100644
--- a/src/xmlfixups.cpp
+++ b/src/xmlfixups.cpp
@@ -45,7 +45,7 @@ static const char* kNoDuplicates[] =
{ kElB, kElU, kElI, kElColor, kElHide, kElColor, kElSuper, kElSub, NULL };
static const char* kRemoveTags[] =
- { kElDest, kElListdef, kElListtable, NULL };
+ { kElDest, kElListdef, kElListtable, kElFontTable, NULL };
static const char* kBlockTags[] =
{ kElTable, kElPara, NULL };
@@ -59,19 +59,18 @@ static const char* kConsolidateEnd[] =
static const char* kConsolidateStart[] =
{ kElStylesheet, kElInfo, NULL };
+/*
+ * Inserts every entry of a NULL-terminated array of C strings into 'set'.
+ * A NULL 'strings' pointer is treated as an empty array.
+ */
+void loadStringSet(StringSet& set, const char** strings)
+{
+    if(strings == NULL)
+        return;
+
+    // Advance through the array; the pointer was never incremented before,
+    // so the loop inserted the first entry forever.
+    for( ; *strings != NULL; ++strings)
+        set.insert(string(*strings));
+}
-void XmlFixups::breakBreak(DOM::Document& doc, const string& contain,
- const string& tag)
+XmlFixups::XmlFixups()
{
- DOM::NodeList els = doc.getElementsByTagName(tag);
- if(els != NULL)
- {
- for(int i = 0; i < els->getLength(); i++)
- {
- DOM::Element el = (const DOM::Element&)els->item(i);
- breakElement(el, contain);
- }
- }
+ // Cache each static tag-name array as a set so tags can be looked up by name
+ loadStringSet(m_consolidateEnd, kConsolidateEnd);
+ loadStringSet(m_consolidateStart, kConsolidateStart);
+ loadStringSet(m_removes, kRemoveTags);
+ loadStringSet(m_duplicates, kNoDuplicates);
}
bool XmlFixups::breakElement(const DOM::Element& el, const string& contain)
@@ -315,19 +314,6 @@ void XmlFixups::breakTags(DOM::Document& doc, const string& parentName,
if(parent != NULL && !DOMHelpers::isElement(parent, parentName))
parent.removeChild(tag);
-#if 0
- else if(tag.hasChildNodes())
- {
- DOM::NodeList children = tag.getChildNodes();
- if(children != NULL && children->getLength() == 1)
- {
- DOM::Node child = children->item(0);
- if(child != NULL && !child.hasChildNodes() &&
- DOMHelpers::isElement(child, kElBlock))
- parent.removeChild(tag);
- }
- }
-#endif
}
}
}
@@ -394,51 +380,255 @@ void XmlFixups::breakLists(DOM::Document& doc)
}
}
-void XmlFixups::fixStyles(const DOM::Document doc)
+void XmlFixups::runPassTwo(const DOM::Document& doc)
{
- // Get all stylesheet styles
- DOM::NodeList styles = doc.getElementsByTagName(kElStyle);
- if(styles != NULL)
+ /*
+ * Okay, this function is complicated and long. It was all broken up into
+ * shorter functions previously but that sucked for efficiency. Basically
+ * we want to iterate over the document as few times as possible and because
+ * of that we combine all of that here.
+ *
+ * In this pass:
+ * o Fix:
+ * - font names
+ * - style names
+ * - list attributes
+ * - block elements
+ * o Consolidate certain tags to end of doc
+ * o Consolidate certain tags to start of doc
+ * o Combine duplicates of certain tags
+ * o Remove certain tags
+ * o Break out pages and sections
+ *
+ * The definition tables (styles, fonts, lists) are read from the direct
+ * children of the document element before the walk starts. Elements to
+ * consolidate are only collected during the walk and moved afterwards.
+ */
+
+ bool haveStyles = false;
+ ElementTable styles;
+
+ bool haveFonts = false;
+ ElementTable fonts;
+
+ bool haveLists = false;
+ ElementTable lists;
+
+ DOM::Element top = doc.getDocumentElement();
+
+ // Get stylesheet block
+ DOM::Element el = DOMHelpers::getChildElement(top, kElStylesheet);
+ if(el != NULL)
+ {
+ // Load the styles into a id mapped table
+ styles.load(el, kElStyle);
+
+ if(!styles.empty())
+ {
+ styles.removeIds();
+ haveStyles = true;
+ }
+ }
+
+ // Get the font block
+ el = DOMHelpers::getChildElement(top, kElFontTable);
+ if(el != NULL)
+ {
+ // Load the fonts into an id mapped table
+ fonts.load(el, kElFont);
+
+ if(!fonts.empty())
+ {
+ fonts.removeIds();
+ haveFonts = true;
+ }
+ }
+
+ // Get the list definition block
+ el = DOMHelpers::getChildElement(top, kElListtable);
+ if(el != NULL)
{
- // Get list of blocks in the document
- DOM::NodeList blocks = doc.getElementsByTagName(kElBlock);
- if(blocks != NULL)
+ // Load the lists into an id mapped table
+ lists.load(el, kElListdef);
+
+ if(!lists.empty())
{
- for(int i = 0; i < blocks->getLength(); i++)
+ lists.removeIds();
+ haveLists = true;
+ }
+ }
+
+ NodeStack toStart; // Nodes that get moved to beginning of document
+ NodeStack toEnd; // Nodes that get moved to the end of the document
+
+ ElementIterator it(top);
+ ElementIterator end;
+
+ for( ; it != end; ++it)
+ {
+ el = *it;
+
+ // Mark each node as we've seen it so we don't
+ // do a given element twice
+ if((int)el.getUserData() == PASS_TWO)
+ continue;
+
+ el.setUserData((void*)PASS_TWO);
+ string name = el.getNodeName();
+
+ if(name == kElBlock)
+ {
+ // Change style attribute on blocks to name
+ if(haveStyles && el.hasAttribute(kElStyle))
{
- DOM::Element block = (const DOM::Element&)blocks->item(i);
+ DOM::Element style = styles.get(el.getAttribute(kElStyle));
+ if(style != NULL)
+ el.setAttribute(kElStyle, style.getAttribute(kAtName));
+ }
- if(block == NULL || !block.hasAttribute(kElStyle))
- continue;
+ /*
+ * The below function call replaces the current element with another
+ * new element. The new element still needs to be processed, so we
+ * back up one before the replacement and then short circuit the
+ * rest of the loop; the loop's ++it then arrives at the new element.
+ */
- // Lookup block styles
- for(int j = 0; j < styles->getLength(); j++)
+ // Back up while the block is still part of the document; this
+ // step was described above but missing from the code
+ --it;
+
+ // Now fix the block itself
+ fixBlock(doc, el);
+
+ continue; // Current element no longer valid
+ }
+
+ // Change id attribute on fonts to name
+ else if(haveFonts && name == kElFont)
+ {
+ if(el.hasAttribute(kAtId))
+ {
+ DOM::Element font = fonts.get(el.getAttribute(kAtId));
+ if(font != NULL)
+ el.setAttribute(kAtName, font.getAttribute(kAtName));
+ }
+ }
+
+ // Copy list attributes onto the lists
+ else if(haveLists && name == kElList)
+ {
+ if(el.hasAttribute(kAtList))
+ {
+ DOM::Element list = lists.get(el.getAttribute(kAtList));
+ if(list != NULL)
{
- DOM::Element style = (const DOM::Element&)styles->item(j);
- if(style != NULL)
- {
- if(style.getAttribute(kAtId) == block.getAttribute(kElStyle))
- {
- // And change to the name
- wstring name = style.getAttribute(kAtName);
- if(name.length() > 0)
- block.setAttribute(kElStyle, name);
- }
- }
+ // And copy all the attributes from the list definition to the list
+ DOMHelpers::copyAttributes(list, el, kHideList);
+ el.removeAttribute(kAtList);
}
}
}
- // A little cleanup of the stylesheet styles
- for(int i = 0; i < styles->getLength(); i++)
+ // Break out pages and sections all the way to document
+ if(name == kElPage || name == kElSect)
{
- DOM::Element style = (const DOM::Element&)styles->item(i);
- if(style != NULL)
- style.removeAttribute(kAtId);
+ breakElement(el, kElDoc);
+
+ /*
+ * NOTE: The flow of the document is changed here. But the current
+ * element is still in a valid place for iterating over the document
+ * so we don't have to worry about it.
+ */
}
- }
-}
+ // Tags that just plain get removed
+ if(m_removes.find(name) != m_removes.end())
+ {
+ DOM::Node parent = el->getParentNode();
+
+ if(parent != NULL)
+ {
+ /*
+ * After the element is removed, the current element is no longer
+ * valid for iterating over the document. In addition we insert
+ * all the child nodes of the current element before it. We need
+ * to be sure to iterate over these elements, and to do so we
+ * decrement the iterator.
+ */
+ --it;
+
+ while(el.hasChildNodes())
+ parent.insertBefore(el.removeChild(el.getFirstChild()), el);
+
+ parent.removeChild(el);
+ continue; /* Current element doesn't need any more processing */
+ }
+ }
+
+
+ // Tags that need to get consolidated to start
+ if(m_consolidateStart.find(name) != m_consolidateStart.end())
+ toStart.push(el);
+
+ // Tags that need to get consolidated to end
+ else if(m_consolidateEnd.find(name) != m_consolidateEnd.end())
+ toEnd.push(el);
+
+
+ // Tags for which duplicates need to be combined
+ if(m_duplicates.find(name) != m_duplicates.end())
+ {
+ DOM::Element parent = (const DOM::Element&)el.getParentNode();
+ if(parent != NULL)
+ {
+ // Loop till we find no more of the same
+ for(;;)
+ {
+ DOM::Node next = el.getNextSibling();
+
+ // Stop unless it's the same type of element ...
+ if(!DOMHelpers::isElement(next, name))
+ break;
+
+ // NOTE: Notice we do nothing with attributes. Currently
+ // all elements in the duplicates list don't need that.
+
+ while(next.hasChildNodes())
+ el.appendChild(next.removeChild(next.getFirstChild()));
+
+ // Remove duplicate node
+ parent.removeChild(next);
+ }
+ }
+ }
+ }
+
+ // Complete consolidation to front. The stack is unwound last-pushed
+ // first and each node goes in front of everything already there.
+ while(!toStart.empty())
+ {
+ DOM::Node node = toStart.top();
+ DOM::Node parent = node.getParentNode();
+ if(parent != NULL && DOMHelpers::hasAncestor(top, node))
+ {
+ // Remove it from its parent. This must be the stacked node, not
+ // 'el', which is just whatever the walk above looked at last.
+ parent.removeChild(node);
+
+ // And put it at the start of the document
+ top.insertBefore(node, top.getFirstChild());
+ }
+
+ toStart.pop();
+ }
+
+ // Complete consolidation to end
+ while(!toEnd.empty())
+ {
+ DOM::Node node = toEnd.top();
+ DOM::Node parent = node.getParentNode();
+ if(parent != NULL && DOMHelpers::hasAncestor(top, node))
+ {
+ // Remove it from its parent (the stacked node, not 'el')
+ parent.removeChild(node);
+
+ // And put it at the end of the document
+ top.appendChild(node);
+ }
+
+ toEnd.pop();
+ }
+
+}
void XmlFixups::breakTables(DOM::Document& doc)
{
@@ -526,258 +716,44 @@ void XmlFixups::breakTables(DOM::Document& doc)
}
}
-void XmlFixups::removeTags(const DOM::Document& doc)
-{
- // Go through the list of nodes
- for(const char** t = kRemoveTags; *t != NULL; t++)
- {
- DOM::NodeList elements = doc.getElementsByTagName(*t);
- if(elements != NULL)
- {
- for(int j = 0; j < elements->getLength(); j++)
- {
- DOM::Element el = (const DOM::Element&)elements->item(j);
- DOM::Node parent = el->getParentNode();
- if(parent == NULL)
- continue;
-
- while(el.hasChildNodes())
- parent.insertBefore(el.removeChild(el.getFirstChild()), el);
-
- parent.removeChild(el);
- }
- }
- }
-}
-
-void XmlFixups::fixLists(const DOM::Document doc)
+void XmlFixups::fixBlock(const DOM::Document& doc, DOM::Element& block)
{
- // Get all the lists
- DOM::NodeList lists = doc.getElementsByTagName(kElList);
- if(lists != NULL)
- {
- // And all the list definitions
- DOM::NodeList listdefs = doc.getElementsByTagName(kElListdef);
- if(listdefs != NULL)
- {
- for(int i = 0; i < listdefs->getLength(); i++)
- {
- DOM::Element listdef = (const DOM::Element&)listdefs->item(i);
+ // Replace 'block' in its parent with a new element named by block's kAtFix attribute (kElPara when that is empty), carrying attributes and children across
+ string fix;
+ wstring val;
- if(listdef == NULL || !listdef.hasAttribute(kAtList))
- continue;
+ DOM::Node parent = block.getParentNode();
- for(int j = 0; j < lists->getLength(); j++)
- {
- DOM::Element list = (const DOM::Element&)lists->item(j);
- if(list != NULL)
- {
- if(list.getAttribute(kAtList) == listdef.getAttribute(kAtList))
- {
- // And copy all the attributes from the list definition to the list
- DOMHelpers::copyAttributes(listdef, list, kHideList);
- list.removeAttribute(kAtList);
- }
- }
- }
- }
- }
- }
-}
-
-void XmlFixups::fixBlocks(const DOM::Document doc)
-{
- // Get all the blocks
- DOM::NodeList blocks = doc.getElementsByTagName(kElBlock);
- if(blocks != NULL)
+ if(parent != NULL)
{
- string fix;
- wstring val;
+ // Figure out what kind of element they want block fixed to; the attribute is removed once read
+ val = block.getAttribute(kAtFix);
+ if(val.length() > 0)
+ block.removeAttribute(kAtFix);
- for(int i = 0; i < blocks->getLength(); i++)
+ // BUG: Sablotron bug work around. NOTE(review): val is re-read through getAttributeNS only when the plain lookup already returned a value, and is overwritten by that result; confirm '> 0' rather than '== 0' is intended
+ if(val.length() > 0)
{
- DOM::Element block = (const DOM::Element&)blocks->item(i);
- DOM::Node parent = block.getParentNode();
-
- if(parent == NULL)
- continue;
-
- fix.resize(0);
- val.resize(0);
-
- // Figure out what kind of element they want block fixed to
- val = block.getAttribute(kAtFix);
- if(val.length() > 0)
- block.removeAttribute(kAtFix);
-
- // BUG: Sablotron bug work around
- if(val.length() > 0)
- {
- val = block.getAttributeNS("", kAtFix);
- if(val.length() > 0)
- block.removeAttributeNS("", kAtFix);
- }
-
+ val = block.getAttributeNS("", kAtFix);
if(val.length() > 0)
- DOM::transcode16to8(val, fix);
-
- if(fix.length() == 0)
- fix = kElPara;
-
- // Create duplicate of the 'fix' element
- DOM::Element el = doc.createElement(fix);
- DOMHelpers::copyAttributes(block, el, NULL);
-
- // Replace block with the given 'fix' element
- while(block.hasChildNodes())
- el.appendChild(block.removeChild(block.getFirstChild()));
-
- parent.replaceChild(el, block);
+ block.removeAttributeNS("", kAtFix);
}
- }
-}
-void XmlFixups::consolidateEndTags(DOM::Document& doc)
-{
- DOM::Element top = doc.getDocumentElement();
- ASSERT(top != NULL);
+ if(val.length() > 0)
+ DOM::transcode16to8(val, fix);
- for(const char** t = kConsolidateEnd; *t != NULL; t++)
- {
- DOM::NodeList elements = doc.getElementsByTagName(*t);
- if(elements != NULL)
- {
- int x = elements->getLength();
- for(int j = 0; j < x; j++)
- {
- // Make sure it's a valid element
- DOM::Element element = (const DOM::Element&)elements->item(j);
- if(element == NULL)
- continue;
-
- DOM::Element parent = (const DOM::Element&)element.getParentNode();
- if(parent == NULL)
- continue;
-
- // Remove it from it's child
- parent.removeChild(element);
-
- // And append it to the end of the document
- top.appendChild(element);
- }
- }
- }
-}
-
-void XmlFixups::consolidateStartTags(DOM::Document& doc)
-{
- DOM::Element top = doc.getDocumentElement();
- ASSERT(top != NULL);
-
- DOM::Node first = top.getFirstChild();
+ if(fix.length() == 0)
+ fix = kElPara;
- for(const char** t = kConsolidateStart; *t != NULL; t++)
- {
- DOM::NodeList elements = doc.getElementsByTagName(*t);
- if(elements != NULL)
- {
- int x = elements->getLength();
- for(int j = 0; j < x; j++)
- {
- // Make sure it's a valid element
- DOM::Element element = (const DOM::Element&)elements->item(j);
- if(element == NULL || element == first)
- continue;
-
- DOM::Element parent = (const DOM::Element&)element.getParentNode();
- if(parent == NULL)
- continue;
-
- // Remove it from it's child
- parent.removeChild(element);
+ // Create the replacement element named 'fix' and copy block's attributes onto it
+ DOM::Element el = doc.createElement(fix);
+ DOMHelpers::copyAttributes(block, el, NULL);
- // And put at start of the document of the document
- ASSERT(first != NULL);
- top.insertBefore(element, first);
- }
- }
- }
-}
-
-void XmlFixups::combineDuplicates(const DOM::Document& doc)
-{
- bool found;
-
- do
- {
- found = false;
-
- // Go through the list of nodes
- for(const char** t = kNoDuplicates; *t != NULL; t++)
- {
- DOM::NodeList elements = doc.getElementsByTagName(*t);
- if(elements != NULL)
- {
- int x = elements->getLength();
- for(int j = 0; j < x; j++)
- {
- // Make sure it's a valid element
- DOM::Element element = (const DOM::Element&)elements->item(j);
- if(element == NULL)
- continue;
-
- // Get neighbors
- DOM::Node previous = element.getPreviousSibling();
- DOM::Node next = element.getNextSibling();
-
- // Make sure it's still in the document, as we may have
- // removed it on a previous loop
- DOM::Node parent = element.getParentNode();
- if(parent == NULL)
- continue;
-
- // Combine previous if valid
- if(previous != NULL && previous.getNodeType() == DOM::Node::ELEMENT_NODE &&
- DOMHelpers::isEqualElement((DOM::Element&)previous, element))
- {
- while(previous.hasChildNodes())
- {
- DOM::Node child = previous.removeChild(previous.getLastChild());
- if(child != NULL)
- {
- if(element.hasChildNodes())
- element.insertBefore(child, element.getFirstChild());
- else
- element.appendChild(child);
- }
- }
-
- // Remove duplicate node
- parent.removeChild(previous);
- found = true;
- }
-
- // Combine next if valid
- if(next != NULL && next.getNodeType() == DOM::Node::ELEMENT_NODE &&
- DOMHelpers::isEqualElement((DOM::Element&)next, element))
- {
- while(next.hasChildNodes())
- {
- DOM::Node child = next.removeChild(next.getFirstChild());
- if(child != NULL)
- element.appendChild(child);
- }
-
- // Remove duplicate node
- parent.removeChild(next);
- found = true;
- }
- }
- }
- }
+ // Move every child of block across in order, then have parent replace block with the new element
+ while(block.hasChildNodes())
+ el.appendChild(block.removeChild(block.getFirstChild()));
- // Keep looping until no more duplicates found
+ parent.replaceChild(el, block);
}
- while(found);
}
diff --git a/src/xmlfixups.h b/src/xmlfixups.h
index f00bb66..01c2d67 100644
--- a/src/xmlfixups.h
+++ b/src/xmlfixups.h
@@ -40,6 +40,7 @@
#define __XMLFIXUPS_H__
#include "sablo.h"
+#include "domhelpers.h"
/*
* XmlFixups
@@ -55,14 +56,7 @@
class XmlFixups
{
public:
- // Replace blocks with 'fix' elements like paragraphs
- static void fixBlocks(DOM::Document doc);
-
- // Pass 2 list fixups
- static void fixLists(const DOM::Document doc);
-
- // Pass 2 style fixups
- static void fixStyles(const DOM::Document doc);
+ XmlFixups();
/*
* Breaks a paragraph up through a previous level. Calls itself
@@ -83,19 +77,19 @@ public:
* <b>test of </b> your concentration.
* </dest>
*/
- static bool breakElement(const DOM::Element& el, const string& contain);
+ bool breakElement(const DOM::Element& el, const string& contain);
// Break all tags of a given type to a previous level (see above)
- static void breakBreak(DOM::Document& doc, const string& contain, const string& tag);
+ void breakBreak(DOM::Document& doc, const string& contain, const string& tag);
// Used to break tables cells and rows into blocks (but more complicated)
- static void breakTags(DOM::Document& doc, const string& parentName, const string& tagName);
+ void breakTags(DOM::Document& doc, const string& parentName, const string& tagName);
// Fixes and combines list elements with the same id
- static void breakLists(DOM::Document& document);
+ void breakLists(DOM::Document& document);
// Used to find and create tables and perform initial break out
- static void breakTables(DOM::Document& document);
+ void breakTables(DOM::Document& document);
/*
@@ -115,22 +109,45 @@ public:
* <para style="10"> This is <b> a </b></para>
* <para><b>test of </b> your concentration.</para>
*/
- static void breakBlocks(DOM::Document& document);
+ void breakBlocks(DOM::Document& document);
// Wrap certain tags in a wrapper tag of given name
- static void wrapTags(DOM::Document& document, const string& tagName, const string& wrapName);
+ void wrapTags(DOM::Document& document, const string& tagName, const string& wrapName);
// Remove certain tags from document
- static void removeTags(const DOM::Document& doc);
+ void removeTags(const DOM::Document& doc);
// Combines certain adjacent duplicate tags
- static void combineDuplicates(const DOM::Document& doc);
+ void combineDuplicates(const DOM::Document& doc);
// Consolidates a certain tag types at the beginning of the document
- static void consolidateStartTags(DOM::Document& doc);
+ void consolidateStartTags(DOM::Document& doc);
// Consolidates a certain tag types at the end of the document
- static void consolidateEndTags(DOM::Document& doc);
+ void consolidateEndTags(DOM::Document& doc);
+
+
+ // The main pass 2 function
+ void runPassTwo(const DOM::Document& doc);
+
+ // Replace blocks with 'fix' elements like paragraphs
+ void fixBlock(const DOM::Document& doc, DOM::Element& block);
+
+
+protected:
+
+ enum
+ {
+ PASS_0,
+ PASS_1,
+ PASS_TWO
+ };
+
+ // Our tables cached for efficiency
+ StringSet m_duplicates;
+ StringSet m_removes;
+ StringSet m_consolidateStart;
+ StringSet m_consolidateEnd;
};
#endif // __XMLFIXUPS_H__
diff --git a/test-files/sample.xml b/test-files/sample.xml
index 963387c..d1b1318 100644
--- a/test-files/sample.xml
+++ b/test-files/sample.xml
@@ -1 +1 @@
-<?xml version="1.0" encoding="UTF-8"?><document><stylesheet><style name="Normal"/><style name="heading 1"/><style name="footnote text"/></stylesheet><info><title>This is a test RTF</title><author>Nate</author><operator>Nate</operator></info><para style="heading 1"><i_fonttable><font id="0"/><font id="1"/><font id="2"/><font id="3"/><font id="10"/><font id="121"/><font id="122"/><font id="124"/><font id="125"/><font id="126"/><font id="127"/><font id="128"/><font id="129"/><font id="131"/><font id="132"/><font id="134"/><font id="135"/><font id="136"/><font id="137"/><font id="138"/><font id="139"/><font id="141"/><font id="142"/><font id="144"/><font id="145"/><font id="146"/><font id="147"/><font id="148"/><font id="149"/></i_fonttable><b>This is a test RTF</b></para><para>Hi! I’m a test file. This is some <b>bold</b> text, and some <i>italic</i> text, as well as some <u>underline</u> text. And a bit of <hide>hidden</hide> text. So we’re going to end this paragraph here and go on to a nice little list:</para><para/><list type="disc" ordered="0" start="1"><para>Item 1</para><para>Item 2</para><para>Item 3</para><para>Item 4</para></list><para/><para>And now comes a fun table:</para><para/><table><row><cell><para>Cell 1</para></cell><cell><para>Cell 2</para><para>More in cell 2</para></cell><cell><para>Cell 3</para></cell></row><row><cell><para>Next row</para></cell><cell><para>Next row </para></cell><cell><para>Next row</para></cell></row></table><para/><para>A page break:</para><page/><para>And here we’re on the next page. </para><para>This para has a <ref type="footnote" to="1"><super>1</super></ref>footnote.</para><para>And here’s yet another paragraph. </para><para/><footnote id="1"><para><super>1</super> This is the actual content of the footnote.</para></footnote></document> \ No newline at end of file
+<?xml version="1.0" encoding="UTF-8"?><document><stylesheet><style name="Normal"/><style name="heading 1"/><style name="footnote text"/></stylesheet><info><title>This is a test RTF</title><author>Nate</author><operator>Nate</operator></info><para style="heading 1"><i_fonttable><font id="0" name="Times New Roman"/><font id="1" name="Arial"/><font id="2" name="Courier New"/><font id="3" name="Symbol"/><font id="10" name="Wingdings"/><font id="121" name="Times New Roman CE"/><font id="122" name="Times New Roman Cyr"/><font id="124" name="Times New Roman Greek"/><font id="125" name="Times New Roman Tur"/><font id="126" name="Times New Roman (Hebrew)"/><font id="127" name="Times New Roman (Arabic)"/><font id="128" name="Times New Roman Baltic"/><font id="129" name="Times New Roman (Vietnamese)"/><font id="131" name="Arial CE"/><font id="132" name="Arial Cyr"/><font id="134" name="Arial Greek"/><font id="135" name="Arial Tur"/><font id="136" name="Arial (Hebrew)"/><font id="137" name="Arial (Arabic)"/><font id="138" name="Arial Baltic"/><font id="139" name="Arial (Vietnamese)"/><font id="141" name="Courier New CE"/><font id="142" name="Courier New Cyr"/><font id="144" name="Courier New Greek"/><font id="145" name="Courier New Tur"/><font id="146" name="Courier New (Hebrew)"/><font id="147" name="Courier New (Arabic)"/><font id="148" name="Courier New Baltic"/><font id="149" name="Courier New (Vietnamese)"/></i_fonttable><font id="1" size="32"><b>This is a test RTF</b></font></para><para><font size="24">Hi! I’m a test file. This is some </font><font size="24"><b>bold</b></font><font size="24"> text, and some </font><font size="24"><i>italic</i></font><font size="24"> text, as well as some </font><font size="24"><u>underline</u></font><font size="24"> text. And a bit of </font><font size="24"><hide>hidden</hide></font><font size="24"> text. 
So we’re going to end this paragraph here and go on to a nice little list:</font></para><para/><list type="disc" ordered="0" start="1"><para><font size="24">Item 1</font></para><para><font size="24">Item 2</font></para><para><font size="24">Item 3</font></para><para><font size="24">Item 4</font></para></list><para/><para><font size="24">And now comes a fun table:</font></para><para/><table><row><cell><para><font size="24">Cell 1</font></para></cell><cell><para><font size="24">Cell 2</font></para><para><font size="24">More in cell 2</font></para></cell><cell><para><font size="24">Cell 3</font></para></cell></row><row><cell><para><font size="24">Next row</font></para></cell><cell><para><font size="24">Next row </font></para></cell><cell><para><font size="24">Next row</font></para></cell></row></table><para/><para><font size="24">A page break:</font></para><page/><para><font size="24">And here we’re on the next page.</font><font size="24"> </font></para><para><font size="24">This para has a </font><ref type="footnote" to="1"><font size="24"><super>1</super></font></ref><font size="24">footnote.</font></para><para><font size="24">And here’s yet another paragraph. </font></para><para/><footnote id="1"><para><font size="20"><super>1</super></font><font size="20"> This is the actual content of the footnote.</font></para></footnote></document> \ No newline at end of file