/*
 * Copyright (c) 2004, Nate Nielsen
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *     * Redistributions of source code must retain the above
 *       copyright notice, this list of conditions and the
 *       following disclaimer.
 *     * Redistributions in binary form must reproduce the
 *       above copyright notice, this list of conditions and
 *       the following disclaimer in the documentation and/or
 *       other materials provided with the distribution.
 *     * The names of contributors to this software may not be
 *       used to endorse or promote products derived from this
 *       software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
 * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 *
 * CONTRIBUTORS
 *  Nate Nielsen
 *
 */

#include "usuals.h"
#include "xmlfixups.h"
#include "domhelpers.h"
#include "tags.h"

// Tags whose adjacent duplicates are merged in runPassTwo.
// NOTE(review): kElColor is listed twice. That is harmless once loaded
// into a set, but confirm whether a different tag was intended.
static const char* kNoDuplicates[] =
    { kElB, kElU, kElI, kElColor, kElHide, kElColor, kElSuper, kElSub, NULL };

// Tags that runPassTwo removes, keeping their children in place
static const char* kRemoveTags[] =
    { kElDest, kElListdef, kElListtable, kElFontTable, NULL };

// Tags that breakBlocks treats as already being real block elements
static const char* kBlockTags[] =
    { kElTable, kElPara, NULL };

// Attribute list handed to DOMHelpers::copyAttributes when list
// definitions are copied onto lists in runPassTwo
static const char* kHideList[] =
    { kAtId, kAtList, NULL };

// Tags that runPassTwo moves to the end of the document
static const char* kConsolidateEnd[] =
    { kElFootNote, NULL };

// Tags that runPassTwo moves to the start of the document
static const char* kConsolidateStart[] =
    { kElStylesheet, kElInfo, NULL };

/*
 * Insert every entry of a NULL terminated array of C strings
 * into the given set.
 */
void loadStringSet(StringSet& set, const char** strings)
{
    // Advance through the array; stop at the terminating NULL
    for( ; *strings != NULL; ++strings)
        set.insert(string(*strings));
}

/*
 * Load the static tag tables into the lookup sets used by runPassTwo.
 */
XmlFixups::XmlFixups()
{
    loadStringSet(m_duplicates, kNoDuplicates);
    loadStringSet(m_removes, kRemoveTags);
    loadStringSet(m_consolidateStart, kConsolidateStart);
    loadStringSet(m_consolidateEnd, kConsolidateEnd);
}

/*
 * Move 'el' up the tree, splitting its ancestors where necessary,
 * until its parent is an element named 'contain' (or there is no
 * parent/grandparent left to move it into).
 *
 * Returns false if a parent needed to be split and could not be
 * cloned, true otherwise.
 */
bool XmlFixups::breakElement(const DOM::Element& el, const string& contain)
{
    ASSERT(el != NULL);

    DOM::Element parent = (const DOM::Element&)el.getParentNode();
    DOM::Element grandparent;

    // Get the grandparent, but only if there's a parent to ask
    if(parent != NULL)
        grandparent = (const DOM::Element&)parent.getParentNode();

    // Make sure we have something to work with before continuing
    if(grandparent == NULL || parent == NULL ||
       DOMHelpers::isElement(parent, contain))
        return true;

    DOM::Node e;

    // Check to see if this is the first node in the parent.
    // If so then just move out to before
    if(el.getPreviousSibling() == NULL)
    {
        e = grandparent.insertBefore(parent.removeChild(el), parent);
    }

    // Check to see if this is the last node in the parent.
    // If so then just move out to after the parent
    else if(el.getNextSibling() == NULL)
    {
        DOM::Node next = parent.getNextSibling();
        if(next == NULL)
            e = grandparent.appendChild(parent.removeChild(el));
        else
            e = grandparent.insertBefore(parent.removeChild(el), next);
    }

    // Otherwise it's in the middle so split the parent
    // element etc...
    else
    {
        // Clone it but not deep
        DOM::Element parent2 = (const DOM::Element&)parent.cloneNode(false);
        if(parent2 == NULL)
            return false;

        // Flag that tells us whether we moved anything into the clone
        bool moved = false;

        // Now move all nodes after this one to the second parent.
        while((e = el.getNextSibling()) != NULL)
        {
            parent2.appendChild(parent.removeChild(e));
            moved = true;
        }

        // Remove the element from its parent
        e = parent.removeChild(el);

        // Okay now we move the element up next to the parent
        DOMHelpers::insertAfter(grandparent, e, parent);

        // Only insert the clone if it actually received children
        if(moved)
            DOMHelpers::insertAfter(grandparent, parent2, e);
    }

    // Now call it again with the element in the new position
    // until everything's cut through!
    return breakElement((DOM::Element&)e, contain);
}

/*
 * Break empty block markers out to the destination level, and then
 * gather the nodes that follow each marker into that block.
 */
void XmlFixups::breakBlocks(DOM::Document& document)
{
    // First break out all the paragraphs to the destination level
    DOM::NodeList blocks = document.getElementsByTagName(kElBlock);
    if(blocks != NULL)
    {
        for(int i = 0; i < blocks->getLength(); i++)
        {
            DOM::Element block = (const DOM::Element&)blocks->item(i);

            // If it's the single closed style para then break it
            if(block != NULL && !block.hasChildNodes())
                breakElement(block, kElDest);
        }
    }

    // Now group stuff in destinations into paras or other blocks
    DOM::NodeList destinations = document.getElementsByTagName(kElDest);
    if(destinations != NULL)
    {
        for(int i = 0; i < destinations->getLength(); i++)
        {
            DOM::Element dest = (const DOM::Element&)destinations->item(i);

            // Sanity Check
            if(dest == NULL || !dest.hasChildNodes())
                continue;

            // Go through the children of this destination
            DOM::Node child = dest.getFirstChild();

            // The block that is currently collecting nodes (if any)
            DOM::Element block;

            while(child != NULL)
            {
                // If it's a block then it starts collecting from here on
                if(DOMHelpers::isElement(child, kElBlock))
                {
                    block = (DOM::Element&)child;
                    child = child.getNextSibling();
                    continue;
                }

                // If it's already a real block element, stop collecting
                for(const char** t = kBlockTags; *t != NULL; t++)
                {
                    if(DOMHelpers::isElement(child, *t))
                    {
                        block = NULL;
                        break;
                    }
                }

                // If there's a block then add to it, and carry on
                // from the block since child has moved inside it
                if(block != NULL)
                {
                    block.appendChild(dest.removeChild(child));
                    child = block;
                }

                child = child.getNextSibling();
            }
        }
    }
}

/*
 * For every element named 'tagName', move all of its children into a
 * newly created 'wrapName' element which becomes its only child.
 */
void XmlFixups::wrapTags(DOM::Document& doc, const string& tagName,
                         const string& wrapName)
{
    DOM::NodeList tags = doc.getElementsByTagName(tagName);
    if(tags != NULL)
    {
        for(int i = 0; i < tags->getLength(); i++)
        {
            DOM::Element tag = (const DOM::Element&)tags->item(i);

            // Sanity Check
            if(tag == NULL)
                continue;

            DOM::Element wrap = doc.createElement(wrapName);
            while(tag.hasChildNodes())
                wrap.appendChild(tag.removeChild(tag.getFirstChild()));

            tag.appendChild(wrap);
        }
    }
}

/*
 * Within each 'parentName' element, break the 'tagName' elements out
 * so they're direct children and then move the nodes following each
 * one inside it. Afterwards any 'tagName' element whose parent is not
 * a 'parentName' element is removed.
 */
void XmlFixups::breakTags(DOM::Document& doc, const string& parentName,
                          const string& tagName)
{
    DOM::NodeList parents = doc.getElementsByTagName(parentName);
    if(parents != NULL)
    {
        for(int i = 0; i < parents->getLength(); i++)
        {
            DOM::Element parent = (const DOM::Element&)parents->item(i);

            // Sanity Check
            if(parent == NULL || !parent.hasChildNodes())
                continue;

            // First perform the breaks
            DOM::NodeList tags = parent.getElementsByTagName(tagName);
            if(tags != NULL)
            {
                for(int j = 0; j < tags->getLength(); j++)
                    breakElement((const DOM::Element&)tags->item(j), parentName);
            }

            // Start with a fresh tag at the front, which collects
            // everything up to the first real tag
            DOM::Node tag = doc.createElement(tagName);
            parent.insertBefore(tag, parent.getFirstChild());

            DOM::Node child = tag;

            while(child != NULL && (child = child.getNextSibling()) != NULL)
            {
                if(DOMHelpers::isElement(child, kElBlock))
                {
                    DOM::Node next = child.getNextSibling();

                    // A trailing block gets dropped
                    if(next == NULL)
                    {
                        parent.removeChild(child);
                        continue;
                    }

                    if(DOMHelpers::isElement(next, tagName))
                    {
                        DOM::Node twodown = next.getNextSibling();

                        // Either swap the tag in front of the block, or
                        // when another block follows the tag drop this one
                        if(!DOMHelpers::isElement(twodown, kElBlock))
                        {
                            child = parent.insertBefore(parent.removeChild(next), child);
                        }
                        else
                        {
                            parent.removeChild(child);
                            child = next;
                        }
                    }
                }

                // A new tag starts collecting; discard the previous
                // one if it ended up empty
                if(DOMHelpers::isElement(child, tagName))
                {
                    if(!tag.hasChildNodes())
                        parent.removeChild(tag);
                    tag = child;
                }

                // Anything else goes into the current tag, and we
                // carry on from the tag since child moved inside it
                else
                {
                    tag.appendChild(parent.removeChild(child));
                    child = tag;
                }
            }

            if(!tag.hasChildNodes())
                parent.removeChild(tag);
        }
    }

    // Remove any tags that aren't directly inside the given parent
    DOM::NodeList tags = doc.getElementsByTagName(tagName);
    if(tags != NULL)
    {
        for(int i = 0; i < tags->getLength(); i++)
        {
            DOM::Element tag = (const DOM::Element&)tags->item(i);
            DOM::Node parent = tag.getParentNode();

            if(parent != NULL && !DOMHelpers::isElement(parent, parentName))
                parent.removeChild(tag);
        }
    }
}

/*
 * Group runs of blocks carrying the same list attribute (and the
 * nodes between them) into list elements inside each destination.
 */
void XmlFixups::breakLists(DOM::Document& doc)
{
    DOM::NodeList destinations = doc.getElementsByTagName(kElDest);
    if(destinations != NULL)
    {
        for(int i = 0; i < destinations->getLength(); i++)
        {
            DOM::Element dest = (const DOM::Element&)destinations->item(i);

            // Sanity Check
            if(dest == NULL)
                continue;

            // Go through the children of this destination
            DOM::Node child = dest.getFirstChild();

            DOM::Element list;      // The list currently being filled
            DOM::Element e;
            wstring previd;         // The list id of that list

            while(child != NULL)
            {
                // If it's a block ...
                if(DOMHelpers::isElement(child, kElBlock))
                {
                    e = (DOM::Element&)child;

                    // ... and has a list attribute
                    wstring listid = e.getAttribute(kAtList);
                    if(listid.length() > 0)
                    {
                        e.removeAttribute(kAtList);

                        // Start a new list when the id changes
                        if(list == NULL || previd != listid)
                        {
                            list = doc.createElement(kElList);
                            list.setAttribute(kAtList, listid);
                            dest.insertBefore(list, child);
                            previd = listid;
                        }
                    }

                    // A block without a list attribute ends the list
                    else
                    {
                        list = NULL;
                        previd.erase();
                    }
                }

                // While a list is open everything moves into it, and
                // we carry on from the list since child is now inside
                if(list != NULL)
                {
                    list.appendChild(dest.removeChild(child));
                    child = list;
                }

                child = child.getNextSibling();
            }
        }
    }
}

void XmlFixups::runPassTwo(const DOM::Document& doc)
{
    /*
     * Okay, this function is complicated and long. It was all broken up into
     * shorter functions previously but that sucked for efficiency. Basically
     * we want to iterate over the document as few times as possible and because
     * of that we combine all of that here.
     *
     * In this pass:
     *  o Fix:
     *    - font names
     *    - style names
     *    - list attributes
     *    - block elements
     *  o Consolidate certain tags to end of doc
     *  o Consolidate certain tags to start of doc
     *  o Combine duplicates of certain tags
     *  o Remove certain tags
     *  o Break out pages and sections
     */

    bool haveStyles = false;
    ElementTable styles;

    bool haveFonts = false;
    ElementTable fonts;

    bool haveLists = false;
    ElementTable lists;

    DOM::Element top = doc.getDocumentElement();

    // Get stylesheet block
    DOM::Element el = DOMHelpers::getChildElement(top, kElStylesheet);
    if(el != NULL)
    {
        // Load the styles into a id mapped table
        styles.load(el, kElStyle);

        if(!styles.empty())
        {
            styles.removeIds();
            haveStyles = true;
        }
    }

    // Get the font block
    el = DOMHelpers::getChildElement(top, kElFontTable);
    if(el != NULL)
    {
        // Load the fonts into an id mapped table
        fonts.load(el, kElFont);

        if(!fonts.empty())
        {
            fonts.removeIds();
            haveFonts = true;
        }
    }

    // Get the list definition block
    el = DOMHelpers::getChildElement(top, kElListtable);
    if(el != NULL)
    {
        // Load the lists into an id mapped table
        lists.load(el, kElListdef);

        if(!lists.empty())
        {
            lists.removeIds();
            haveLists = true;
        }
    }

    NodeStack toStart;      // Nodes that get moved to beginning of document
    NodeStack toEnd;        // Nodes that get moved to the end of the document

    ElementIterator it(top);
    ElementIterator end;

    for( ; it != end; ++it)
    {
        el = *it;

        // Mark each node as we've seen it so we don't do a given
        // element twice. The marker is compared as a pointer so that
        // nothing is truncated where pointers are wider than int.
        if(el.getUserData() == (void*)PASS_TWO)
            continue;

        el.setUserData((void*)PASS_TWO);

        string name = el.getNodeName();

        if(name == kElBlock)
        {
            // Change style attribute on blocks to name
            if(haveStyles && el.hasAttribute(kElStyle))
            {
                DOM::Element style = styles.get(el.getAttribute(kElStyle));
                if(style != NULL)
                    el.setAttribute(kElStyle, style.getAttribute(kAtName));
            }

            /*
             * The below function call replaces the current element with another
             * new element. The new element still needs to be processed, so we
             * just backup one, and then short circuit the loop below.
             */
            --it;

            // Now fix the block itself
            fixBlock(doc, el);
            continue; // Current element no longer valid
        }

        // Change id attribute on fonts to name
        else if(haveFonts && name == kElFont)
        {
            if(el.hasAttribute(kAtId))
            {
                DOM::Element font = fonts.get(el.getAttribute(kAtId));
                if(font != NULL)
                    el.setAttribute(kAtName, font.getAttribute(kAtName));
            }
        }

        // Copy list attributes onto the lists
        else if(haveLists && name == kElList)
        {
            if(el.hasAttribute(kAtList))
            {
                DOM::Element list = lists.get(el.getAttribute(kAtList));
                if(list != NULL)
                {
                    // And copy all the attributes from the list definition to the list
                    DOMHelpers::copyAttributes(list, el, kHideList);
                    el.removeAttribute(kAtList);
                }
            }
        }

        // Break out pages and sections all the way to document
        if(name == kElPage || name == kElSect)
        {
            breakElement(el, kElDoc);

            /*
             * NOTE: The flow of the document is changed here. But the current
             * element is still in a valid place for iterating over the document
             * so we don't have to worry about it.
             */
        }

        // Tags that just plain get removed
        if(m_removes.find(name) != m_removes.end())
        {
            DOM::Node parent = el.getParentNode();
            if(parent != NULL)
            {
                /*
                 * After the element is removed, the current element is no longer
                 * valid for iterating over the document. In addition we insert
                 * all the child nodes of the current element before it. We need
                 * to be sure to iterate over these elements, and to do so we
                 * decrement the iterator.
                 */
                --it;

                while(el.hasChildNodes())
                    parent.insertBefore(el.removeChild(el.getFirstChild()), el);

                parent.removeChild(el);
                continue; /* Current element doesn't need any more processing */
            }
        }

        // Tags that need to get consolidated to start
        if(m_consolidateStart.find(name) != m_consolidateStart.end())
            toStart.push(el);

        // Tags that need to get consolidated to end
        else if(m_consolidateEnd.find(name) != m_consolidateEnd.end())
            toEnd.push(el);

        // Tags for which duplicates need to be combined
        if(m_duplicates.find(name) != m_duplicates.end())
        {
            DOM::Element parent = (const DOM::Element&)el.getParentNode();
            if(parent != NULL)
            {
                // Loop till we find no more of the same
                for(;;)
                {
                    DOM::Node next = el.getNextSibling();

                    // If it's the same type of element ...
                    if(!DOMHelpers::isElement(next, name))
                        break;

                    // NOTE: Notice we do nothing with attributes. Currently
                    // all elements in the duplicates list don't need that.

                    // ... then pull its children into this one
                    while(next.hasChildNodes())
                        el.appendChild(next.removeChild(next.getFirstChild()));

                    // Remove duplicate node
                    parent.removeChild(next);
                }
            }
        }
    }

    // Complete consolidation to front
    while(!toStart.empty())
    {
        DOM::Node node = toStart.top();
        DOM::Node parent = node.getParentNode();

        // Only move nodes that are still attached to a parent
        if(parent != NULL && DOMHelpers::hasAncestor(top, node))
        {
            // Remove it from its parent
            parent.removeChild(node);

            // And put at start of the document
            top.insertBefore(node, top.getFirstChild());
        }

        toStart.pop();
    }

    // Complete consolidation to end
    while(!toEnd.empty())
    {
        DOM::Node node = toEnd.top();
        DOM::Node parent = node.getParentNode();

        // Only move nodes that are still attached to a parent
        if(parent != NULL && DOMHelpers::hasAncestor(top, node))
        {
            // Remove it from its parent
            parent.removeChild(node);

            // And put at end of the document
            top.appendChild(node);
        }

        toEnd.pop();
    }
}

/*
 * Break row markers out to the destination level, and then group
 * runs of blocks carrying a cell attribute (and the nodes between
 * them) into table elements inside each destination.
 */
void XmlFixups::breakTables(DOM::Document& doc)
{
    // Break rows out to destinations
    DOM::NodeList rows = doc.getElementsByTagName(kElRow);
    if(rows != NULL)
    {
        for(int i = 0; i < rows->getLength(); i++)
        {
            DOM::Element row = (const DOM::Element&)rows->item(i);

            // Sanity Check
            if(row == NULL)
                continue;

            DOM::Node parent = row.getParentNode();
            if(parent == NULL)
                continue;

            // An empty row at either edge of a block just moves
            // out next to the block
            if(DOMHelpers::isElement(parent, kElBlock))
            {
                DOM::Node grandparent = parent.getParentNode();

                if(grandparent != NULL && !row.hasChildNodes())
                {
                    if(row.getPreviousSibling() == NULL)
                        grandparent.insertBefore(parent.removeChild(row), parent);
                    else if(row.getNextSibling() == NULL)
                        DOMHelpers::insertAfter(grandparent, parent.removeChild(row), parent);
                }
            }

            breakElement(row, kElDest);
        }
    }

    // Now group stuff in destinations into tables
    DOM::NodeList destinations = doc.getElementsByTagName(kElDest);
    if(destinations != NULL)
    {
        for(int i = 0; i < destinations->getLength(); i++)
        {
            DOM::Element dest = (const DOM::Element&)destinations->item(i);

            // Sanity Check
            if(dest == NULL)
                continue;

            // Go through the children of this destination
            DOM::Node child = dest.getFirstChild();

            DOM::Element table;     // The table currently being filled
            DOM::Element e;

            while(child != NULL)
            {
                // If it's a block ...
                if(DOMHelpers::isElement(child, kElBlock))
                {
                    e = (DOM::Element&)child;

                    // ... and it has a cell attribute
                    if(e.getAttribute(kAtCell).length() > 0)
                    {
                        e.removeAttribute(kAtCell);

                        if(table == NULL)
                        {
                            table = doc.createElement(kElTable);
                            dest.insertBefore(table, child);
                        }
                    }

                    // A block without a cell attribute ends the table
                    else
                    {
                        table = NULL;
                    }
                }

                // While a table is open everything moves into it, and
                // we carry on from the table since child is now inside
                if(table != NULL)
                {
                    table.appendChild(dest.removeChild(child));
                    child = table;
                }

                child = child.getNextSibling();
            }
        }
    }
}

/*
 * Replace 'block' with a new element named by its 'fix' attribute
 * (or a paragraph when there's no such attribute). The attributes
 * and children of the block are carried over to the new element.
 * Does nothing when the block has no parent.
 */
void XmlFixups::fixBlock(const DOM::Document& doc, DOM::Element& block)
{
    // Okay now change blocks to whatever element they're supposed to be
    string fix;

    DOM::Node parent = block.getParentNode();

    if(parent != NULL)
    {
        // Figure out what kind of element they want block fixed to
        wstring val = block.getAttribute(kAtFix);
        if(val.length() > 0)
            block.removeAttribute(kAtFix);

        // BUG: Sablotron bug work around. Also try the namespaced
        // accessors, but never throw away a value we already have.
        wstring nsval = block.getAttributeNS("", kAtFix);
        if(nsval.length() > 0)
        {
            block.removeAttributeNS("", kAtFix);
            if(val.length() == 0)
                val = nsval;
        }

        if(val.length() > 0)
            DOM::transcode16to8(val, fix);

        // Blocks are paragraphs unless told otherwise
        if(fix.length() == 0)
            fix = kElPara;

        // Create duplicate of the 'fix' element
        DOM::Element el = doc.createElement(fix);
        DOMHelpers::copyAttributes(block, el, NULL);

        // Replace block with the given 'fix' element
        while(block.hasChildNodes())
            el.appendChild(block.removeChild(block.getFirstChild()));

        parent.replaceChild(el, block);
    }
}