diff options
Diffstat (limited to 'xmerge/source/wordsmith/java/org/openoffice/xmerge/converter/xml/sxw/wordsmith/WSDecoder.java')
-rw-r--r-- | xmerge/source/wordsmith/java/org/openoffice/xmerge/converter/xml/sxw/wordsmith/WSDecoder.java | 352 |
1 files changed, 352 insertions, 0 deletions
diff --git a/xmerge/source/wordsmith/java/org/openoffice/xmerge/converter/xml/sxw/wordsmith/WSDecoder.java b/xmerge/source/wordsmith/java/org/openoffice/xmerge/converter/xml/sxw/wordsmith/WSDecoder.java new file mode 100644 index 000000000000..5ac9bc01c725 --- /dev/null +++ b/xmerge/source/wordsmith/java/org/openoffice/xmerge/converter/xml/sxw/wordsmith/WSDecoder.java @@ -0,0 +1,352 @@ +/************************************************************************* + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * Copyright 2000, 2010 Oracle and/or its affiliates. + * + * OpenOffice.org - a multi-platform office productivity suite + * + * This file is part of OpenOffice.org. + * + * OpenOffice.org is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 3 + * only, as published by the Free Software Foundation. + * + * OpenOffice.org is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License version 3 for more details + * (a copy is included in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU Lesser General Public License + * version 3 along with OpenOffice.org. If not, see + * <http://www.openoffice.org/license.html> + * for a copy of the LGPLv3 License. + * + ************************************************************************/ + +package org.openoffice.xmerge.converter.xml.sxw.wordsmith; + +import java.io.ByteArrayInputStream; +import java.io.DataInputStream; +import java.io.IOException; +import java.io.FileInputStream; +import java.io.UnsupportedEncodingException; +import org.openoffice.xmerge.util.Debug; + +import org.openoffice.xmerge.converter.palm.*; +import org.openoffice.xmerge.util.Resources; + +/** + * This class is used by {@link + * org.openoffice.xmerge.converter.xml.sxw.wordsmith.DocumentDeserializerImpl + * DocumentDeserializerImpl} to decode a WordSmith format. It currently + * decodes the text content into a single <code>String</code> object. + * + * @author Herbie Ong, David Proulx + */ +final class WSDecoder implements DOCConstants { + + /** For decoding purposes. */ + private final static int COUNT_BITS = 3; + + /** Resources object for I18N. */ + private Resources res = null; + + /** + * Default constructor creates a header and + * a text buffer for holding all the text in + * the DOC db. + */ + WSDecoder() { + res = Resources.getInstance(); + } + + /** + * Decode the text records into a single <code>byte</code> array. + * + * @param Record <code>Record</code> array holding WordSmith + * contents. + * + * @throws IOException If any I/O error occurs. + */ + byte[] parseRecords(Record[] recs) throws IOException { + + // read the header record + HeaderInfo header = readHeader(recs[0].getBytes()); + dumpHeader(header); + byte[][] byteArrays = new byte[recs.length - 1][]; + for (int i = 0; i < recs.length - 1; i++) byteArrays[i] = null; + + switch (header.version & ~4) { // DJP: "4" indicates OOB data is present. + // Add a constant to handle this, might also need code to handle it. + + case COMPRESSED: + case 3: // DJP: determined this empirically. Are Herbie's constants wrong? + for (int i = 1; i < recs.length; i++) { + byteArrays[i-1] = decompress(recs[i].getBytes(), + header.textRecordSize); + Debug.log(Debug.INFO, "processing " + byteArrays[i-1].length + " bytes"); + } + + break; + + case UNCOMPRESSED: + for (int i = 1; i < recs.length; i++) { + byteArrays[i-1] = recs[i].getBytes(); + Debug.log(Debug.INFO, "processing " + byteArrays[i-1].length + " bytes"); + } + + break; + + default: + throw new IOException(res.getString("UNKNOWN_DOC_VERSION")); + + } + + // Concatenate byteArrays[][] into a single byte array. + int length = 0; + for (int i = 0; i < recs.length - 1; i++) + length += byteArrays[i].length; + byte bigArray[] = new byte[length]; + int offset = 0; + for (int i = 0; i < recs.length - 1; i++) { + System.arraycopy(byteArrays[i], 0, bigArray, offset, + byteArrays[i].length); + offset += byteArrays[i].length; + } + return bigArray; + } + + + /** + * Decode the text records into a <code>Wse</code> array. + * + * @param Record[] <code>Record</code> array holding DOC + * contents. + * + * @throws IOException If any I/O error occurs. + */ + Wse[] parseDocument(Record[] recs) throws IOException { + + java.util.Vector v = new java.util.Vector(20, 20); + WseFontTable fontTable = null; + WseColorTable colorTable = null; + + // rawData is the document data to be parsed. + byte rawData[] = parseRecords(recs); + + // beginning of document has some header information, including + // optional font and color tables. + // DJP: maybe should add a new WSelement (docHeader) to hold + // header info. + // DJP: finish code here to parse header + if (rawData[0] != 2) throw new IOException(); + int nParagraphs = util.intFrom4bytes(rawData, 2); + int nAtoms = util.intFrom4bytes(rawData, 6); + int nChars = util.intFrom4bytes(rawData, 10); + int miscSize = util.intFrom4bytes(rawData, 14); + int curIndex = 18; + + while (curIndex < rawData.length) { + if (WsePara.isValid(rawData, curIndex)) { + v.add(new WsePara(rawData, curIndex)); + curIndex = WsePara.computeNewIndex(rawData, curIndex); + } else if (WseTextRun.isValid(rawData, curIndex)) { + v.add(new WseTextRun(rawData, curIndex, fontTable, colorTable)); + curIndex = WseTextRun.computeNewIndex(rawData, curIndex); + } else if (WseFontTable.isValid(rawData, curIndex)) { + fontTable = new WseFontTable(rawData, curIndex); + v.add(fontTable); + curIndex = WseFontTable.computeNewIndex(rawData, curIndex); + } else if (WseColorTable.isValid(rawData, curIndex)) { + colorTable = new WseColorTable(rawData, curIndex); + v.add(colorTable); + curIndex = WseColorTable.computeNewIndex(rawData, curIndex); + } else { + Debug.log(Debug.ERROR, "Unknown code " + rawData[curIndex]); + throw new IOException(); + } + } + + return (Wse[])v.toArray(new Wse[2]); + } + + + /** + * <p>Decompress the <code>byte</code> array.</p> + * + * <p>The resulting uncompressed <code>byte</code> array + * should be within <code>textRecordSize</code> length, + * definitely within twice the size it claims, else treat + * it as a problem with the encoding of that PDB and + * throw <code>IOException</code>.</p> + * + * @param bytes Compressed <code>byte</code> array + * @param textRecordSize Size of uncompressed <code>byte</code> + * array + * + * @throws IOException If <code>textRecordSize</codeL < + * <code>cBytes.length</code>. + */ + private byte[] decompress(byte[] cBytes, int textRecordSize) + throws IOException { + + // create byte array for storing uncompressed bytes + // it should be within textRecordSize range, definitely + // within twice of textRecordSize! if not, then + // an ArrayIndexOutOfBoundsException will get thrown, + // and it should be converted into an IOException, and + // treat it as a conversion error. + byte[] uBytes = new byte[textRecordSize*2]; + + int up = 0; + int cp = 0; + + try { + + while (cp < cBytes.length) { + + int c = cBytes[cp++] & 0xff; + + // codes 1...8 mean copy that many bytes + if (c > 0 && c < 9) { + + while (c-- > 0) + uBytes[up++] = cBytes[cp++]; + } + + // codes 0, 9...0x7F represent themselves + else if (c < 0x80) { + uBytes[up++] = (byte) c; + } + + // codes 0xC0...0xFF represent "space + ascii char" + else if (c >= 0xC0) { + uBytes[up++] = (byte) ' '; + uBytes[up++] = (byte) (c ^ 0x80); + } + + // codes 0x80...0xBf represent sequences + else { + c <<= 8; + c += cBytes[cp++] & 0xff; + int m = (c & 0x3fff) >> COUNT_BITS; + int n = c & ((1 << COUNT_BITS) - 1); + n += COUNT_BITS; + while (n-- > 0) { + uBytes[up] = uBytes[up - m]; + up++; + } + } + } + + } catch (ArrayIndexOutOfBoundsException e) { + + throw new IOException( + res.getString("DOC_TEXT_RECORD_SIZE_EXCEEDED")); + } + + // note that ubytes may be larger that the amount of + // uncompressed bytes, so trim it to another byte array + // with the exact size. + byte[] textBytes = new byte[up]; + System.arraycopy(uBytes, 0, textBytes, 0, up); + + return textBytes; + } + + + /** + * Read the header <code>byte</code> array. + * + * @param bytes <code>byte</code> array containing header + * record data. + * + * @return <code>HeaderInfo</code> object. + * + * @throws IOException If any I/O error occurs. + */ + private HeaderInfo readHeader(byte[] bytes) throws IOException { + + HeaderInfo header = new HeaderInfo(); + + ByteArrayInputStream bis = new ByteArrayInputStream(bytes); + DataInputStream dis = new DataInputStream(bis); + + // Normally the first 2 bytes comprised of the version + // which should either be COMPRESSED or UNCOMPRESSED + // SmartDoc/Quickword would add a 0x01 to the first + // byte, thus their version would be 0x0101 for UNCOMPRESSED + // instead of 0x0001 and 0x0102 for UNCOMPRESSED instead of + // 0x0002. + + dis.readByte(); + header.version = dis.readByte(); + + // read extra 2 unused bytes + dis.readShort(); + + // Read the text length, this should be unsigned 4 bytes. + // We could store the read value into a long, but then + // our current buffer limit is the max positive of an int. + // That is a large enough limit, thus we shall stay with + // storing the value in an int. If it exceeds, then + // an IOException should be thrown. + header.textLen = dis.readInt(); + if (header.textLen < 0) { + throw new IOException(res.getString("DOC_TEXT_LENGTH_EXCEEDED")); + } + + // read the number of records - unsigned 2 bytes + header.textRecordCount = ((int) dis.readShort()) & 0x0000ffff; + + // read the record size - unsigned 2 bytes + header.textRecordSize = ((int) dis.readShort()) & 0x0000ffff; + + // read extra 4 unused bytes + dis.readInt(); + + return header; + } + + + /** + * Prints out header info into log. + * Used for debugging purposes only. + * + * @param header <code>HeaderInfo</code> structure. + */ + private void dumpHeader(HeaderInfo header) { + /* + log("<DOC_INFO "); + log("version=\"" + header.version + "\" "); + log("text-length=\"" + header.textLen + "\" "); + log("number-of-records=\"" + header.textRecordCount + "\" "); + log("record-size=\"" + header.textRecordSize + "\" />\n"); + */ + } + + + /** + * Inner class to store DOC header information. + */ + private class HeaderInfo { + + /** length of text section */ + int textLen = 0; + + /** number of text records */ + int textRecordCount = 0; + + /** + * size of a text record. This is normally the same as + * TEXT_RECORD_SIZE, but some applications may modify this. + */ + int textRecordSize = 0; + + /** compression type */ + int version = 0; + } +} + |