/************************************************************************* * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * Copyright 2008 by Sun Microsystems, Inc. * * OpenOffice.org - a multi-platform office productivity suite * * $RCSfile: DocumentDeserializerImpl.java,v $ * $Revision: 1.4 $ * * This file is part of OpenOffice.org. * * OpenOffice.org is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License version 3 * only, as published by the Free Software Foundation. * * OpenOffice.org is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License version 3 for more details * (a copy is included in the LICENSE file that accompanied this code). * * You should have received a copy of the GNU Lesser General Public License * version 3 along with OpenOffice.org. If not, see * * for a copy of the LGPLv3 License. * ************************************************************************/ package org.openoffice.xmerge.converter.xml.sxw.wordsmith; import org.w3c.dom.*; import java.io.IOException; import java.util.Enumeration; import org.openoffice.xmerge.Document; import org.openoffice.xmerge.ConvertData; import org.openoffice.xmerge.ConvertException; import org.openoffice.xmerge.DocumentDeserializer; import org.openoffice.xmerge.converter.xml.OfficeConstants; import org.openoffice.xmerge.converter.palm.PalmDB; import org.openoffice.xmerge.converter.palm.Record; import org.openoffice.xmerge.converter.palm.PdbDecoder; import org.openoffice.xmerge.converter.palm.PalmDocument; import org.openoffice.xmerge.converter.xml.sxw.SxwDocument; import java.util.Vector; import java.io.ByteArrayInputStream; import org.openoffice.xmerge.converter.xml.*; import org.openoffice.xmerge.util.Debug; import org.openoffice.xmerge.util.XmlUtil; /** *

 * <p>WordSmith implementation of
 * <code>org.openoffice.xmerge.DocumentDeserializer</code>
 * for the {@link
 * org.openoffice.xmerge.converter.xml.sxw.wordsmith.PluginFactoryImpl
 * PluginFactoryImpl}.</p>

*
 * The <code>deserialize</code> method uses a <code>WSDecoder</code>
 * to parse the records of a WordSmith Palm database into an array of
 * <code>Wse</code> elements, then calls <code>buildDocument</code>
 * to create a <code>SxwDocument</code> object from them.
 *
 * @author      Herbie Ong, David Proulx
 */
public final class DocumentDeserializerImpl
    implements DOCConstants, OfficeConstants, DocumentDeserializer {

    /** A decoder for the WordSmith format; a new one is created per document. */
    private WSDecoder decoder = null;

    /** The last font table element seen by <code>buildDocument</code>. */
    WseFontTable fontTable = null;

    /** The last color table element seen by <code>buildDocument</code>. */
    WseColorTable colorTable = null;

    /** Catalog of the styles newly created while building a document. */
    StyleCatalog styleCat = null;

    /** Catalog of the styles read from the original document, if any. */
    StyleCatalog oldStyleCat = null;

    /** A <code>ConvertData</code> object assigned to this object. */
    private ConvertData cd = null;


    /**
     *  Constructor that assigns the given <code>ConvertData</code>
     *  to the object.
     *
     *  @param  cd  A <code>ConvertData</code> object to read data for
     *              the conversion process by the <code>deserialize</code>
     *              method.
     */
    public DocumentDeserializerImpl(ConvertData cd) {
        this.cd = cd;
    }


    /**
     *  Convert the <code>ConvertData</code> given at construction time
     *  into a <code>SxwDocument</code> object, without an original
     *  document to take styles from.
     *
     *  @return  Resulting <code>Document</code> object.
     *
     *  @throws  ConvertException  If any conversion error occurs.
     *  @throws  IOException       If any I/O error occurs.
     */
    public Document deserialize() throws ConvertException, IOException {
        return deserialize(null, cd);
    }


    /**
     *  Convert every Palm document in the given <code>ConvertData</code>.
     *
     *  @param  origDoc  The original <code>Document</code> whose styles
     *                   should be reused, or <code>null</code>.
     *  @param  cd       The <code>ConvertData</code> to read from.
     *
     *  @return  The document built from the last entry of the
     *           enumeration, or <code>null</code> if the enumeration
     *           was empty.
     *
     *  @throws  IOException  If any I/O error occurs.
     */
    public Document deserialize(Document origDoc, ConvertData cd)
        throws IOException {

        Document doc = null;
        Enumeration e = cd.getDocumentEnumeration();

        while (e.hasMoreElements()) {
            PalmDocument palmDoc = (PalmDocument) e.nextElement();
            PalmDB pdb = palmDoc.getPdb();
            Record[] recs = pdb.getRecords();

            decoder = new WSDecoder();
            Wse[] elements = decoder.parseDocument(recs);

            doc = buildDocument(palmDoc.getName(), elements, origDoc);
        }
        return doc;
    }


    /**
     *  Temporary method to read the existing <code>StyleCatalog</code>
     *  as a starting point.  The styles, automatic styles and master
     *  styles sections of the parent document are added to
     *  <code>oldStyleCat</code>.  A section that is absent from the
     *  document is skipped.  Any exception is logged and otherwise
     *  ignored, so style reuse is best-effort.
     *
     *  @param  parentDoc  The parent <code>Document</code>.
     */
    private void readStyleCatalog(Document parentDoc) {
        try {
            // Round-trip the document through a byte buffer to obtain
            // an SxwDocument whose content DOM can be inspected.
            java.io.ByteArrayOutputStream bos =
                new java.io.ByteArrayOutputStream();
            parentDoc.write(bos);
            SxwDocument sxwDoc = new SxwDocument("old");
            sxwDoc.read(new ByteArrayInputStream(bos.toByteArray()));
            org.w3c.dom.Document domDoc = sxwDoc.getContentDOM();

            // The third entry reads "paragraph" family styles as
            // TextStyle objects; matchParaByText looks for exactly
            // those entries.
            String families[] = { "text", "paragraph", "paragraph" };
            Class classes[] = {
                TextStyle.class, ParaStyle.class, TextStyle.class
            };

            String sections[] = {
                TAG_OFFICE_STYLES,
                TAG_OFFICE_AUTOMATIC_STYLES,
                TAG_OFFICE_MASTER_STYLES
            };
            for (int s = 0; s < sections.length; s++) {
                Node section =
                    domDoc.getElementsByTagName(sections[s]).item(0);
                if (section != null) {
                    oldStyleCat.add(section, families, classes, null, false);
                }
            }
        } catch (Exception e) {
            Debug.log(Debug.ERROR, "", e);
        }
    }


    /**
     *  Given an array of paragraph <code>Style</code> objects, see if
     *  there is exactly one which matches the text formatting
     *  <code>Style</code> of <code>tStyle</code>.  Candidates are the
     *  "paragraph" family entries of <code>oldStyleCat</code> that match
     *  <code>tStyle</code>; they are paired with <code>paraStyles</code>
     *  by style name.
     *
     *  @param  paraStyles  An array of paragraph <code>Style</code>
     *                      objects.
     *  @param  tStyle      Text <code>Style</code> to match.
     *
     *  @return  The paragraph <code>Style</code> that matches, or
     *           <code>null</code> if there is no match or more than one.
     */
    private ParaStyle matchParaByText(Style paraStyles[], TextStyle tStyle) {
        int matchIndex = -1;
        int matchCount = 0;

        Style txtMatches[] = (Style[]) oldStyleCat.getMatching(tStyle);
        for (int j = 0; j < txtMatches.length; j++) {
            TextStyle t = (TextStyle) txtMatches[j];
            if (!t.getFamily().equals("paragraph")) {
                continue;
            }
            for (int k = 0; k < paraStyles.length; k++) {
                if (t.getName().equals(paraStyles[k].getName())) {
                    matchCount++;
                    matchIndex = k;
                }
            }
        }

        if (matchCount == 1) {
            return (ParaStyle) paraStyles[matchIndex];
        }
        return null;
    }


    /**
     *  Take a <code>String</code> of text and turn it into a sequence
     *  of <code>Node</code> objects.  Each tab becomes a
     *  <code>TAG_TAB_STOP</code> element, each run of two or more
     *  spaces becomes a <code>TAG_SPACE</code> element carrying the run
     *  length in <code>ATTRIBUTE_SPACE_COUNT</code>, and everything in
     *  between becomes a text node.  Single spaces stay in the text.
     *
     *  @param  text       <code>String</code> of text.
     *  @param  parentDoc  Parent <code>Document</code> used to create
     *                     the nodes.
     *
     *  @return  Array of <code>Node</code> objects, in document order.
     */
    private Node[] parseText(String text, org.w3c.dom.Document parentDoc) {
        Vector nodeVec = new Vector();

        // Break up the text from the WordSmith text run into Open
        // Office text runs.  There may be more runs in OO because
        // runs of 2 or more spaces map to TAG_SPACE elements.
        for (;;) {

            // Find the indices of tabs and multiple spaces, and
            // figure out which of them occurs first in the string.
            int spaceIndex = text.indexOf("  ");
            int tabIndex = text.indexOf('\t');
            if ((spaceIndex == -1) && (tabIndex == -1)) {
                break;
            }

            int closerIndex;   // index of the first of these
            if (spaceIndex == -1) {
                closerIndex = tabIndex;
            } else if (tabIndex == -1) {
                closerIndex = spaceIndex;
            } else {
                closerIndex = Math.min(spaceIndex, tabIndex);
            }

            // If there is any text prior to the first occurrence of a
            // tab or spaces, create a text node from it, then chop it
            // off the string we're working with.
            if (closerIndex > 0) {
                String beginningText = text.substring(0, closerIndex);
                Text textNode = parentDoc.createTextNode(beginningText);
                nodeVec.add(textNode);
                log("");
                log(beginningText);
                log("");
            }
            text = text.substring(closerIndex);

            // Handle either tab character or space sequence by creating
            // an element for it, and then chopping out the text that
            // represented it in "text".
            if (closerIndex == tabIndex) {
                Element tabNode = parentDoc.createElement(TAG_TAB_STOP);
                nodeVec.add(tabNode);
                text = text.substring(1);  // tab is always a single character
                log("");
            } else {
                // Compute length of space sequence; the search above
                // guarantees it is at least two characters long.
                int nrSpaces = 2;
                while ((nrSpaces < text.length())
                       && (text.charAt(nrSpaces) == ' ')) {
                    nrSpaces++;
                }

                Element spaceNode = parentDoc.createElement(TAG_SPACE);
                spaceNode.setAttribute(ATTRIBUTE_SPACE_COUNT,
                                       Integer.toString(nrSpaces));
                nodeVec.add(spaceNode);
                text = text.substring(nrSpaces);
                log("");
            }
        }

        // No more tabs or space sequences.  If there's any remaining
        // text create a text node for it.
        if (text.length() > 0) {
            Text textNode = parentDoc.createTextNode(text);
            nodeVec.add(textNode);
            log("");
            log(text);
            log("");
        }

        // Now create and populate an array to return the nodes in.
        Node nodes[] = new Node[nodeVec.size()];
        nodeVec.copyInto(nodes);
        return nodes;
    }


    /**
     *  Return the first element of <code>candidates</code> that is an
     *  instance of <code>type</code>.
     *
     *  @param  candidates  Styles to search.
     *  @param  type        Required class of the result.
     *
     *  @return  The first matching style, or <code>null</code> if none.
     */
    private static Style firstOfType(Style candidates[], Class type) {
        for (int i = 0; i < candidates.length; i++) {
            if (type.isInstance(candidates[i])) {
                return candidates[i];
            }
        }
        return null;
    }


    /**
     *  Deep-copy the first <code>tag</code> element of <code>from</code>
     *  into <code>to</code>, inserting the copy under
     *  <code>rootNode</code> in front of <code>bodyNode</code>.  Nothing
     *  happens if <code>from</code> has no such element.
     *
     *  @param  xu        Utility used to perform the deep clone.
     *  @param  from      DOM of the original document.
     *  @param  tag       Tag name of the section to copy.
     *  @param  to        DOM of the document being built.
     *  @param  rootNode  Parent that receives the copy.
     *  @param  bodyNode  Child of <code>rootNode</code> to insert before.
     */
    private void copySection(XmlUtil xu, org.w3c.dom.Document from,
                             String tag, org.w3c.dom.Document to,
                             Node rootNode, Node bodyNode) {
        Node section = from.getElementsByTagName(tag).item(0);
        if (section == null) {
            return;
        }
        org.w3c.dom.DocumentFragment df = to.createDocumentFragment();
        Node newNode = xu.deepClone(df, section);
        rootNode.insertBefore(newNode, bodyNode);
    }


    /**
     *  Insert a font declarations element with entries for "Arial" and
     *  "Arioso" under <code>rootNode</code>, in front of
     *  <code>bodyNode</code>.
     *
     *  @param  doc       DOM of the document being built.
     *  @param  rootNode  Parent that receives the declarations.
     *  @param  bodyNode  Child of <code>rootNode</code> to insert before.
     */
    private void addDefaultFontDecls(org.w3c.dom.Document doc,
                                     Node rootNode, Node bodyNode) {
        log("");
        Node declNode = doc.createElement(TAG_OFFICE_FONT_DECLS);
        rootNode.insertBefore(declNode, bodyNode);

        String fontNames[] = { "Arial", "Arioso" };
        for (int f = 0; f < fontNames.length; f++) {
            Element fontNode = doc.createElement(TAG_STYLE_FONT_DECL);
            fontNode.setAttribute(ATTRIBUTE_STYLE_NAME, fontNames[f]);
            fontNode.setAttribute(ATTRIBUTE_FO_FONT_FAMILY, fontNames[f]);
            fontNode.setAttribute(ATTRIBUTE_STYLE_FONT_PITCH, "variable");
            declNode.appendChild(fontNode);
        }
    }


    /**
     *  Builds a <code>SxwDocument</code> from the parsed elements of a
     *  WordSmith document.
     *
     *  @param  docName  Document name.
     *  @param  data     Elements of the WordSmith document, in order.
     *  @param  origDoc  Original <code>Document</code> to reuse styles
     *                   and font declarations from, or <code>null</code>.
     *
     *  @return  Resulting <code>SxwDocument</code> object.
     *
     *  @throws  IOException  If any I/O error occurs.
     */
    private SxwDocument buildDocument(String docName, Wse[] data,
                                      Document origDoc)
        throws IOException {

        // create minimum office xml document.
        SxwDocument sxwDoc = new SxwDocument(docName);
        sxwDoc.initContentDOM();

        org.w3c.dom.Document doc = sxwDoc.getContentDOM();

        // Grab hold of the office:body tag,
        // Assume there should be one.
        // This is where top level paragraphs will append to.
        NodeList list = doc.getElementsByTagName(TAG_OFFICE_BODY);
        Node bodyNode = list.item(0);

        styleCat = new StyleCatalog(50);
        oldStyleCat = new StyleCatalog(50);
        if (origDoc != null) {
            readStyleCatalog(origDoc);
        }

        Element currPara = null;
        ParaStyle currParaStyle = null;
        int newTextStyleNr = 0;
        int newParaStyleNr = 0;

        // Now write out the document body by running through
        // the list of WordSmith elements and processing each one
        // in turn.
        for (int i = 0; i < data.length; i++) {

            if (data[i].getClass() == WsePara.class) {

                currPara = doc.createElement(TAG_PARAGRAPH);
                log("");
                log("");

                WsePara p = (WsePara) data[i];

                // Save info about the first text run, if there is one.
                WseTextRun firstTextRun = null;
                if ((i + 1 < data.length)
                    && (data[i + 1].getClass() == WseTextRun.class)) {
                    firstTextRun = (WseTextRun) data[i + 1];
                }

                Style matches[] = oldStyleCat.getMatching(p.makeStyle());

                // See if we can find a unique match in the catalog
                // of existing styles from the original document.
                ParaStyle pStyle = null;
                if (matches.length == 1) {
                    pStyle = (ParaStyle) matches[0];
                    log("using an existing style");
                } else if ((matches.length > 1) && (firstTextRun != null)) {
                    pStyle = matchParaByText(matches,
                                             firstTextRun.makeStyle());
                    log("resolved a para by looking @ text");
                }

                // If nothing found so far, try looking in the catalog
                // of newly-created styles.
                // DJP FIXME: if we need to add two para styles with the
                // same para formatting info but different default text
                // styles, this won't work!
                if (pStyle == null) {
                    log("had " + matches.length + " matches in old catalog");
                    matches = styleCat.getMatching(p.makeStyle());
                    if (matches.length == 0) {
                        pStyle = p.makeStyle();
                        pStyle.setName("PPP" + ++newParaStyleNr);
                        styleCat.add(pStyle);
                        // DJP: write in the text format info here
                        log("created a new style");
                    } else if (matches.length == 1) {
                        pStyle = (ParaStyle) matches[0];
                        log("re-using a new style");
                    } else if (firstTextRun != null) {
                        pStyle = matchParaByText(matches,
                                                 firstTextRun.makeStyle());
                        if (pStyle != null) {
                            log("resolved a (new) para by looking @ text");
                        } else {
                            log("Hey this shouldn't happen! - nr of matches is "
                                + matches.length);
                        }
                    }

                    if (pStyle == null) {
                        log("Unable to figure out a para style");
                        // Several new styles share this paragraph
                        // formatting and none could be singled out.
                        // Use the first one rather than failing on a
                        // null style below.
                        pStyle = (ParaStyle) firstOfType(matches,
                                                         ParaStyle.class);
                    }
                }

                // Specify the style in this paragraph's attributes,
                // if one could be determined.
                if (pStyle != null) {
                    currPara.setAttribute(ATTRIBUTE_TEXT_STYLE_NAME,
                                          pStyle.getName());
                }
                bodyNode.appendChild(currPara);
                currParaStyle = pStyle;

            } else if (data[i].getClass() == WseTextRun.class) {

                WseTextRun tr = (WseTextRun) data[i];
                TextStyle trStyle = null;
                Node trNodes[] = parseText(tr.getText(), doc);

                // A text run that is not preceded by any paragraph
                // element still needs a parent; give it an unstyled
                // paragraph.
                if (currPara == null) {
                    currPara = doc.createElement(TAG_PARAGRAPH);
                    bodyNode.appendChild(currPara);
                }

                // First see if the formatting of this text run matches
                // the default text formatting for this paragraph.  If
                // it does, then just make the text node(s) children of
                // the current paragraph.
                if (currParaStyle != null) {
                    Style[] cps = new Style[1];
                    cps[0] = currParaStyle;
                    if (matchParaByText(cps, tr.makeStyle()) != null) {
                        for (int ii = 0; ii < trNodes.length; ii++) {
                            currPara.appendChild(trNodes[ii]);
                        }
                        continue;
                    }
                }

                // Check for existing, matching styles in the old style
                // catalog.  If exactly one is found, use it.  Otherwise,
                // check the new style catalog, and either use the style
                // found or add this new one to it.
                Style matches[] = oldStyleCat.getMatching(tr.makeStyle());
                if (matches.length == 1) {
                    trStyle = (TextStyle) matches[0];
                } else {
                    matches = styleCat.getMatching(tr.makeStyle());
                    if (matches.length == 0) {
                        trStyle = tr.makeStyle();
                        trStyle.setName("TTT" + ++newTextStyleNr);
                        styleCat.add(trStyle);
                    } else if (matches.length == 1) {
                        trStyle = (TextStyle) matches[0];
                    } else {
                        log("multiple text style matches from new catalog");
                        // Use the first candidate rather than failing
                        // on a null style below.
                        trStyle = (TextStyle) firstOfType(matches,
                                                          TextStyle.class);
                    }
                }

                // Create a text span node, set the style attribute, make the
                // text node(s) its children, and append it to current
                // paragraph's list of children.
                Element textSpanNode = doc.createElement(TAG_SPAN);
                if (trStyle != null) {
                    textSpanNode.setAttribute(ATTRIBUTE_TEXT_STYLE_NAME,
                                              trStyle.getName());
                }
                for (int ii = 0; ii < trNodes.length; ii++) {
                    textSpanNode.appendChild(trNodes[ii]);
                }
                currPara.appendChild(textSpanNode);
                log("");

            } else if (data[i].getClass() == WseFontTable.class) {
                fontTable = (WseFontTable) data[i];

            } else if (data[i].getClass() == WseColorTable.class) {
                colorTable = (WseColorTable) data[i];
            }
        }

        NodeList r = doc.getElementsByTagName(TAG_OFFICE_DOCUMENT_CONTENT);
        Node rootNode = r.item(0);

        if (origDoc != null) {
            // read the original document
            java.io.ByteArrayOutputStream bos =
                new java.io.ByteArrayOutputStream();
            origDoc.write(bos);
            SxwDocument origSxwDoc = new SxwDocument("old");
            origSxwDoc.read(new ByteArrayInputStream(bos.toByteArray()));
            org.w3c.dom.Document origDomDoc = origSxwDoc.getContentDOM();

            XmlUtil xu = new XmlUtil();

            // copy font declarations from original document to the new
            // document
            copySection(xu, origDomDoc, TAG_OFFICE_FONT_DECLS,
                        doc, rootNode, bodyNode);

            // copy style catalog from original document to the new
            // document
            copySection(xu, origDomDoc, TAG_OFFICE_STYLES,
                        doc, rootNode, bodyNode);
            copySection(xu, origDomDoc, TAG_OFFICE_AUTOMATIC_STYLES,
                        doc, rootNode, bodyNode);
            copySection(xu, origDomDoc, TAG_OFFICE_MASTER_STYLES,
                        doc, rootNode, bodyNode);
        } else {
            // Original document not specified.  We need to add font
            // declarations.
            // DJP: this might just be for debugging.  Merger will
            // probably put the "real" ones in.
            // DJP: if really doing it this way, do it right: gather font
            // names from style catalog(s).
            addDefaultFontDecls(doc, rootNode, bodyNode);
        }

        // Now add any new styles we have created in this document.
        NodeList nl = doc.getElementsByTagName(TAG_OFFICE_AUTOMATIC_STYLES);
        Node autoStylesNode = nl.item(0);
        if (autoStylesNode == null) {
            autoStylesNode = doc.createElement(TAG_OFFICE_AUTOMATIC_STYLES);
            log("");
            rootNode.insertBefore(autoStylesNode, bodyNode);
        }

        // Appending a node removes it from its previous parent, so keep
        // taking the first child until the temporary node is empty.
        Node newStyleCatNode = styleCat.writeNode(doc, "dummy");
        Node child;
        while ((child = newStyleCatNode.getFirstChild()) != null) {
            autoStylesNode.appendChild(child);
        }

        oldStyleCat.dumpCSV(true);
        styleCat.dumpCSV(true);
        return sxwDoc;
    }


    /**
     *  Sends message to the log object.
     *
     *  @param  str  Debug message.
     */
    private void log(String str) {
        Debug.log(Debug.TRACE, str);
    }
}