diff options
Diffstat (limited to 'xmerge/source/pocketword/java/org/openoffice/xmerge/converter/xml/sxw/pocketword/Paragraph.java')
-rw-r--r-- | xmerge/source/pocketword/java/org/openoffice/xmerge/converter/xml/sxw/pocketword/Paragraph.java | 858 |
1 files changed, 858 insertions, 0 deletions
diff --git a/xmerge/source/pocketword/java/org/openoffice/xmerge/converter/xml/sxw/pocketword/Paragraph.java b/xmerge/source/pocketword/java/org/openoffice/xmerge/converter/xml/sxw/pocketword/Paragraph.java new file mode 100644 index 000000000000..0302a5d6efba --- /dev/null +++ b/xmerge/source/pocketword/java/org/openoffice/xmerge/converter/xml/sxw/pocketword/Paragraph.java @@ -0,0 +1,858 @@ +/************************************************************************* + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * Copyright 2000, 2010 Oracle and/or its affiliates. + * + * OpenOffice.org - a multi-platform office productivity suite + * + * This file is part of OpenOffice.org. + * + * OpenOffice.org is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 3 + * only, as published by the Free Software Foundation. + * + * OpenOffice.org is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License version 3 for more details + * (a copy is included in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU Lesser General Public License + * version 3 along with OpenOffice.org. If not, see + * <http://www.openoffice.org/license.html> + * for a copy of the LGPLv3 License. + * + ************************************************************************/ + +package org.openoffice.xmerge.converter.xml.sxw.pocketword; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; + +import java.util.Vector; +import java.util.Enumeration; + +import java.awt.Color; + +import org.openoffice.xmerge.util.EndianConverter; +import org.openoffice.xmerge.util.ColourConverter; +import org.openoffice.xmerge.converter.xml.ParaStyle; +import org.openoffice.xmerge.converter.xml.TextStyle; + + +/** + * Represents a paragraph data structure within a Pocket Word document. + * + * @author Mark Murnane + * @version 1.1 + */ +class Paragraph implements PocketWordConstants { + /* + * The data elements of a Paragraph. + * + * As the 'unknown' values are not calculated they are declared static. + * They are not declared final because they do have a calcuable value. + */ + private static short unknown1 = 0x23; + private short dataWords = 0; + private short textLength = 0; + private short lengthWithFormatting = 0; + private short lines = 0; + + private static final short marker = (short)0xFFFF; + private static int unknown2 = 0x22; // May be two short values + + private short specialIndentation = 0; + private short leftIndentation = 0; + private short rightIndentation = 0; + + private byte bullets = 0; + private byte alignment = 0; + + private static int unknown3 = 0; + + // Will always have at least these formatting settings in each paragraph + private short defaultFont = 2; // Courier New for the time being + private short defaultSize = 10; + + + /* + * Remaining elements assist in calculating correct values for the paragraph + * representation. + */ + + private Vector textSegments = null; + + private Vector lineDescriptors = null; + + private ParaStyle pStyle = null; + + private boolean isLastParagraph = false; + + + /* + * Private class constructor used by all constructors. Ensures the proper + * initialisation of the Vector storing the paragraph's text. + */ + private Paragraph () { + textSegments = new Vector(0, 1); + } + + + /** + * <p>Constructor for use when converting from SXW format to Pocket Word + * format.</p> + * + * @param style Paragraph style object describing the formatting style + * of this paragraph. + */ + public Paragraph (ParaStyle style) { + this(); + + lineDescriptors = new Vector(0, 1); + pStyle = style; + } + + + /** + * <p>Constructor for use when converting from Pocket Word format to SXW + * format.</p> + * + * @param data Byte array containing byte data describing this paragraph + * from the Pocket Word file. + */ + public Paragraph (byte[] data) { + this(); + + /* + * Read in all fixed data from the array + * + * unknown1 appears at data[0] and data[1] + */ + dataWords = EndianConverter.readShort(new byte[] { data[2], data[3] } ); + textLength = EndianConverter.readShort(new byte[] { data[4], data [5] } ); + lengthWithFormatting = EndianConverter.readShort( + new byte[] { data[6], data[7] } ); + lines = EndianConverter.readShort(new byte[] { data[8], data [9] } ); + + /* + * The marker appears at data[10] and data[11]. + * + * The value of unknown2 is at data[12], data[13], data[14] and data[15]. + */ + + specialIndentation = EndianConverter.readShort(new byte[] { data[16], data[17] } ); + leftIndentation = EndianConverter.readShort(new byte[] { data[18], data [19] } ); + rightIndentation = EndianConverter.readShort(new byte[] { data[20], data [21] } ); + + bullets = data[22]; + alignment = data[23]; + + // The value of unknown3 is at data[24], data[25], data[26] and data[27]. + + /* + * The actual paragraph data is in the remainder of the byte sequence. + * + * Only the actual text seqence with the embedded formatting tags is + * relevant to the conversion from Pocket Word to SXW format. + */ + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + bos.write(data, 28, lengthWithFormatting); + parseText(bos.toByteArray()); + } + + + /* + * Processes the text portion of the raw paragraph data from the Pocket Word + * file. This data also includes formatting settings for the text in the + * paragraph. + * + * Formatting changes appear like XML/HTML tags. Formatted blocks are + * preceded by a sequence of bytes switching on a formatting change and + * followed by a sequence switching off that formatting change. + */ + private void parseText (byte[] data) { + + int totalLength = data.length; + + StringBuffer sb = new StringBuffer(""); + + // Setup text style information + int mask = TextStyle.BOLD | TextStyle.ITALIC | TextStyle.UNDERLINE + | TextStyle.STRIKETHRU; + + + String fontName = null; + int fontSize = 0; + Color textColour = null; + Color backColour = null; + int modifiers = 0; + + TextStyle ts = null; + + int attrsSet = 0; // If this is 0, we have no extra style + boolean inSequence = false; + boolean sawText = false; + + String s = new String(); // For debugging + + // Start from the very beginning + for (int i = 0; i < totalLength; i++) { + // Will encounter at least two codes first + if ((byte)(data[i] & 0xF0) == FORMATTING_TAG) { + if (sawText) { + // Style change so dump previous segment and style info + addTextSegment(sb.toString(), ts); + sb = new StringBuffer(""); + sawText = false; + } + + switch (data[i]) { + case FONT_TAG: + int index = EndianConverter.readShort( + new byte[] { data[i + 1], data[i + 2] } ); + + /* + * Standard font. + * + * Should really be one, but as the only supported font + * currently is Courier New, want to leave it at Courier + * New for round trip conversions. + * + * Also need to account for the fact that Tahoma is the + * correct standard font. + */ + if (fontName == null || fontName.equals("2")) { + if (index != 2 && index != 1) { + fontName = String.valueOf(index); + attrsSet++; + } + } + else { + // Font is set, but not the default + if (index == 2 || index == 1) { + fontName = "2"; + attrsSet--; + } + else { + fontName = String.valueOf(index); + } + } + i += 2; + break; + + + case FONT_SIZE_TAG: + int size = EndianConverter.readShort( + new byte[] { data[i + 1], data[i + 2] } ); + + if (size == 0) { + // Flags the end of the last paragraph + isLastParagraph = true; + i += 2; + break; + } + + // Standard size + if (fontSize == 0 || fontSize == 10) { + if (size != 10) { + fontSize = size; + attrsSet++; + } + } + else { + // Font size is set, but not to standard + if (size == 10) { + fontSize = 10; + attrsSet--; + } + else { + fontSize = size; + } + } + i += 2; + break; + + + case COLOUR_TAG: + if (data[i + 1] != 0) { + ColourConverter cc = new ColourConverter(); + textColour = cc.convertToRGB( + EndianConverter.readShort(new byte[] { data[i + 1], + data[i + 2] } )); + attrsSet++; + } + else { + textColour = null; + attrsSet--; + } + i += 2; + break; + + + case FONT_WEIGHT_TAG: + if (data[i + 1] == FONT_WEIGHT_BOLD + || data[i + 1] == FONT_WEIGHT_THICK) { + modifiers |= TextStyle.BOLD; + attrsSet++; + } + else { + // Its a bit field so subtracting should work okay. + modifiers ^= TextStyle.BOLD; + attrsSet--; + } + i += 2; + break; + + + case ITALIC_TAG: + if (data[i + 1] == (byte)0x01) { + modifiers |= TextStyle.ITALIC; + attrsSet++; + } + else { + modifiers ^= TextStyle.ITALIC; + attrsSet--; + } + i++; + break; + + + case UNDERLINE_TAG: + if (data[i + 1] == (byte)0x01) { + modifiers |= TextStyle.UNDERLINE; + attrsSet++; + } + else { + modifiers ^= TextStyle.UNDERLINE; + attrsSet--; + } + i++; + break; + + + case STRIKETHROUGH_TAG: + if (data[i + 1] == (byte)0x01) { + modifiers |= TextStyle.STRIKETHRU; + attrsSet++; + } + else { + modifiers ^= TextStyle.STRIKETHRU; + attrsSet--; + } + i++; + break; + + case HIGHLIGHT_TAG: + /* + * Highlighting is treated by OpenOffice as a + * background colour. + */ + if (data[i + 1] == (byte)0x01) { + backColour = Color.yellow; + attrsSet++; + } + else { + backColour = null; + attrsSet--; + } + i++; + break; + } + + inSequence = true; + continue; + } + + if (inSequence) { + // Style information has been changed. Create new style here + + inSequence = false; + if (attrsSet > 0) { + ts = new TextStyle(null, TEXT_STYLE_FAMILY, DEFAULT_STYLE, + mask, modifiers, fontSize, fontName, null); + ts.setColors(textColour, backColour); + } + else { + ts = null; + } + } + + /* + * C4 xx seems to indicate a control code. C4 00 indicates the end + * of a paragraph; C4 04 indicates a tab space. Only these two + * have been seen so far. + */ + if (data[i] == (byte)0xC4) { + /* + * Redundant nodes are sometimes added to the last paragraph + * because a new sequence is being processed when the flag is + * set. + * + * To avoid this, do nothing with the last paragraph unless no + * text has been added for it already. In that case, add the + * empty text segment being process to ensure that all + * paragraphs have at least one text segment. + */ + if (data[i + 1] == (byte)0x00) { + if (isLastParagraph && textSegments.size() > 0) { + return; + } + addTextSegment(sb.toString(), ts); + return; + } + sb.append("\t"); + sawText = true; + i++; + continue; + } + + sb.append((char)data[i]); + sawText = true; + s = sb.toString(); + } + } + + + /** + * <p>Adds details of a new text block to the <code>Paragraph</code> object. + * </p> + * + * @param text The text of the new block. + * @param style Text style object describing the formatting attached + * to this block of text. + */ + public void addTextSegment(String text, TextStyle style) { + textLength += text.length(); + textSegments.add(new ParagraphTextSegment(text, style)); + } + + + /** + * <p>This method alters the state of the <code>Paragraph</code> object to + * indicate whether or not it is the final paragraph in the document.</p> + * + * <p>It is used during conversion from SXW format to Pocket Word format. + * In Pocket Word files, the last paragraph finishes with a different byte + * sequence to other paragraphs.</p> + * + * @param isLast true if the Paragraph is the last in the document, + * false otherwise. + */ + public void setLastParagraph(boolean isLast) { + isLastParagraph = isLast; + } + + + /** + * <p>Complementary method to {@link #setLastParagraph(boolean) + * setLastParagraph}. Returns the terminal status of this + * <code>Paragraph</code> within the Pocket Word document.</p> + * + * @return true if the Paragraph is the last in the document; false otherwise. + */ + public boolean getLastParagraph () { + return isLastParagraph; + } + + + /** + * <p>This method returns the Pocket Word representation of this + * <code>Paragraph</code> in Little Endian byte order.</p> + * + * <p>Used when converting from SXW format to Pocket Word format.</p> + * + * @return <code>byte</code> array containing the formatted representation + * of this Paragraph. + */ + public byte[] getParagraphData() { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + + postProcessText(); + + /* + * Need information about the paragraph segments in two places + * so calculate them first. + * + * The stream contains the text wrapped in any formatting sequences that + * are necessary. + */ + ByteArrayOutputStream segs = new ByteArrayOutputStream(); + + try { + for (int i = 0; i < textSegments.size(); i++) { + ParagraphTextSegment pts = (ParagraphTextSegment)textSegments.elementAt(i); + segs.write(pts.getData()); + } + } + catch (IOException ioe) { + // Should never happen in a memory based stream + } + + /* + * Number of data words for this paragraph descriptor: + * + * 26 is the number of bytes prior to the start of the segment. + * 3 comes from the C4 00 00 termintating sequence. + */ + dataWords = (short)(26 + segs.size() + 3 + 4); + if (isLastParagraph) { + dataWords += 6; + } + if (dataWords % 4 != 0) { + dataWords += (4 - (dataWords % 4)); + } + dataWords /= 4; + + /* + * The 8 bytes are made up of E6 ?0 00 and E5 ?0 00 at the start of the + * text along with the C4 00 that terminates it. + * + * In the event that the paragraph is the last one E6 00 00 is also + * present at the end of the text. Also, as we currently use a font + * other than the first in the index (Tahoma) E5 01 00 is also present. + * + * Make sure this is accurate when font specifications change + */ + lengthWithFormatting = (short)(segs.size() + (isLastParagraph ? 14 : 8)); + + try { + bos.write(EndianConverter.writeShort(unknown1)); + bos.write(EndianConverter.writeShort(dataWords)); + bos.write(EndianConverter.writeShort((short)(textLength + 1))); + bos.write(EndianConverter.writeShort(lengthWithFormatting)); + bos.write(EndianConverter.writeShort(lines)); + + bos.write(EndianConverter.writeShort(marker)); + bos.write(EndianConverter.writeInt(unknown2)); + + bos.write(EndianConverter.writeShort(specialIndentation)); + bos.write(EndianConverter.writeShort(leftIndentation)); + bos.write(EndianConverter.writeShort(rightIndentation)); + + bos.write(bullets); + + if (pStyle != null && pStyle.isAttributeSet(ParaStyle.TEXT_ALIGN)) { + switch (pStyle.getAttribute(ParaStyle.TEXT_ALIGN)) { + + case ParaStyle.ALIGN_RIGHT: + bos.write(0x01); + break; + + case ParaStyle.ALIGN_CENTER: + bos.write(0x02); + break; + + default: + bos.write(0x00); // Left align in all other circumstances + break; + } + } + else { + bos.write(0x00); + } + + bos.write(EndianConverter.writeInt(unknown3)); + + + /* + * Write out font and size. + * + * If font support is added then this should change as the information + * will have to be calculated from a Font table. + */ + bos.write(FONT_TAG); + bos.write(EndianConverter.writeShort(defaultFont)); + bos.write(FONT_SIZE_TAG); + bos.write(EndianConverter.writeShort(defaultSize)); + + // Write out the text segments + bos.write(segs.toByteArray()); + + /* + * If this is the last paragraph in the document then we need to make + * sure that the paragraph text is terminated correctly with an E6 00 00 + * before the C4 00 00. + */ + if (isLastParagraph) { + if (defaultFont != 1) { + // Must always go back to the first font. + bos.write(FONT_TAG); + bos.write(EndianConverter.writeShort((short)0x01)); + } + bos.write(FONT_SIZE_TAG); + bos.write(EndianConverter.writeShort((short)0x00)); + } + + bos.write(new byte[] { (byte)0xC4, 0x00, 0x00 } ); + + int padding = 0; + if (bos.size() % 4 != 0) { + padding = 4 - (bos.size() % 4); + } + for (int i = 0; i < padding; i++) { + bos.write(0x00); + } + + // Third byte should match first byte after 0xFF 0xFF + bos.write(new byte[] { 0x42, 0x00, 0x22, 0x00} ); + + /* + * Meaning of last two bytes seems to be the number of words describing + * lines. This is calculated at 10 bytes per descriptor. + * + * May have two extra padding bytes that need to be accounted for too + * The division below may lose 2 bytes (integer result). + */ + int wordsRemaining = (lineDescriptors.size() * 10) / 4; + if ((lineDescriptors.size() * 10) % 4 != 0) { + wordsRemaining++; + } + bos.write(EndianConverter.writeShort((short)wordsRemaining)); + + + // Now write out the line descriptors + for (int i = 0; i < lineDescriptors.size(); i++) { + LineDescriptor ld = (LineDescriptor)lineDescriptors.elementAt(i); + + bos.write(ld.getDescriptorInfo()); + } + + + if (!isLastParagraph) { + /* + * There may be a need to pad this. Will be writing at + * either start of 4 byte block or 2 bytes into it. + */ + if (bos.size() % 4 != 2) { + bos.write(EndianConverter.writeShort((short)0)); + } + bos.write(EndianConverter.writeShort((short)0x41)); + } + } + catch (IOException ioe) { + // Should never occur for a memory based stream + } + + return bos.toByteArray(); + } + + + /* + * This method handles the calculation of correct values for line lengths + * in each individual descriptor and the number of lines in the document. + * + * TODO: Update to take account of different font metrics. + */ + private void postProcessText() { + /* + * The post-processing ... + * + * For each line, we need to add a line descriptor and increment + * the number of lines in the paragraph data structure. + * + * To do this, make sure that no sequence goes over the given screen + * width unless the last char is a whitespace character. + */ + + // In courier, can have no more than 29 chars per line + + int chunkStart = 0; + StringBuffer sb = new StringBuffer(""); + + // Line Descriptor info should be eliminated each time + lineDescriptors = new Vector(1, 1); + lines = 0; + + for (int i = 0; i < textSegments.size(); i++) { + ParagraphTextSegment pts = (ParagraphTextSegment)textSegments.elementAt(i); + sb.append(pts.getText()); + } + + if (sb.length() == 0) { + lines = 1; + lineDescriptors.add(new LineDescriptor((short)1, (short)0)); + return; + } + + while (chunkStart < sb.length()) { + String text = ""; + + try { + text = sb.substring(chunkStart, chunkStart + 30); + } + catch (StringIndexOutOfBoundsException sioobe) { + // We have less than one line left so just add it + text = sb.substring(chunkStart); + lineDescriptors.add(new LineDescriptor((short)(text.length() + 1), (short)(text.length() * 36))); + chunkStart += text.length(); + lines++; + continue; + } + + int lastWhitespace = -1; + + for (int i = 29; i >= 0; i--) { + if (Character.isWhitespace(text.charAt(i))) { + lastWhitespace = i; + break; + } + } + + if (lastWhitespace != -1) { + // The line can be split + lineDescriptors.add(new LineDescriptor((short)(lastWhitespace + 1), (short)(lastWhitespace * 36))); + chunkStart += lastWhitespace + 1; + lines++; + } + else { + // The line is completely occupied by a single word + lineDescriptors.add(new LineDescriptor((short)29, (short)(29 * 36))); + chunkStart += 29; + lines++; + } + } + } + + + /** + * <p>Returns the number of lines in the <code>Paragraph</code>.</p> + * + * @return The number of lines in the document. + */ + public short getLines() { + postProcessText(); + + return lines; + } + + + /** + * <p>Toggles the flag indicating that the <code>Paragraph</code> is a + * bulleted paragraph.</p> + * + * @param isBulleted true to enable bulleting for this paragraph, false + * otherwise. + */ + public void setBullets(boolean isBulleted) { + if (isBulleted) { + bullets = (byte)0xFF; + } + else { + bullets = 0; + } + } + + /** + * <p>Returns the bulleting status of the <code>Paragraph</code>.</p> + * + * @return true if the paragraph is bulleted, false otherwise. + */ + public boolean isBulleted() { + if (bullets != 0) { + return true; + } + return false; + } + + + /** + * <p>Returns the number of text characters in the <code>Paragraph</code>, + * excluding formatting.</p> + * + * @return The length of the paragraph. + */ + public int getTextLength () { + return textLength; + } + + + /** + * <p>Returns an <code>Enumeration</code> over the individual text segments + * of the <code>Paragraph</code>.</p> + * + * @return An <code>Enumeration</code> of the text segments. + */ + public Enumeration getSegmentsEnumerator () { + return textSegments.elements(); + } + + + /** + * <p>Returns a paragraph style object that describes any of the paragraph + * level formatting used by this <code>Paragraph</code>.</p> + * + * @return Paragraph style object describing the <code>Paragraph</code>. + */ + public ParaStyle makeStyle() { + int attrs[] = new int[] { ParaStyle.MARGIN_LEFT, ParaStyle.MARGIN_RIGHT, + ParaStyle.TEXT_ALIGN }; + String values[] = new String[attrs.length]; + + /* + * Not interested in left or right indents just yet. Don't know + * how to calculate them. + */ + + switch (alignment) { + case 2: + values[2] = "center"; + break; + + case 1: + values[2] = "right"; + break; + + case 0: + default: + values[2] = "left"; + return null; // Not interested if its the default. + } + + return new ParaStyle(null, PARAGRAPH_STYLE_FAMILY, null, attrs, + values, null); + } + + + /* + * Class describing the data structures which appear following the text + * of a Paragraph. For each line on screen that the Paragraph uses, a + * LineDescriptor details how many characters are on the line and how much + * screen space they occupy. + * + * The screen space and character breaks are calculated during post-processing + * of the paragraph. See postProcessText(). + * + * The unit of measurement used for screen space is currently unknown. + */ + private class LineDescriptor { + private short characters = 0; + private int filler = 0; + private short screen_space = 0; + private short marker = 0; + + private LineDescriptor(short chars, short space) { + characters = chars; + screen_space = space; + marker = (short)0x040C; // Not a constant. Depends on font used. + } + + + private byte[] getDescriptorInfo(){ + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + + try { + bos.write(EndianConverter.writeShort(characters)); + bos.write(EndianConverter.writeInt(filler)); + bos.write(EndianConverter.writeShort(screen_space)); + bos.write(EndianConverter.writeShort(marker)); + } + catch (IOException ioe) { + // Should never happen in a memory based stream. + } + + return bos.toByteArray(); + } + } +} |