summaryrefslogtreecommitdiff
path: root/svtools/source/svrtf/parrtf.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'svtools/source/svrtf/parrtf.cxx')
-rw-r--r--svtools/source/svrtf/parrtf.cxx710
1 files changed, 710 insertions, 0 deletions
diff --git a/svtools/source/svrtf/parrtf.cxx b/svtools/source/svrtf/parrtf.cxx
new file mode 100644
index 000000000000..1c578d160307
--- /dev/null
+++ b/svtools/source/svrtf/parrtf.cxx
@@ -0,0 +1,710 @@
+/*************************************************************************
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * Copyright 2000, 2010 Oracle and/or its affiliates.
+ *
+ * OpenOffice.org - a multi-platform office productivity suite
+ *
+ * This file is part of OpenOffice.org.
+ *
+ * OpenOffice.org is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 3
+ * only, as published by the Free Software Foundation.
+ *
+ * OpenOffice.org is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License version 3 for more details
+ * (a copy is included in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * version 3 along with OpenOffice.org. If not, see
+ * <http://www.openoffice.org/license.html>
+ * for a copy of the LGPLv3 License.
+ *
+ ************************************************************************/
+
+// MARKER(update_precomp.py): autogen include statement, do not remove
+#include "precompiled_svtools.hxx"
+
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil -*- */
+
+#include <stdio.h> // for EOF
+#include <rtl/tencinfo.h>
+#include <tools/stream.hxx>
+#include <tools/debug.hxx>
+#include "rtftoken.h"
+#include "rtfkeywd.hxx"
+#include <svtools/parrtf.hxx>
+
+const int MAX_STRING_LEN = 1024;
+const int MAX_TOKEN_LEN = 128;
+
+#define RTF_ISDIGIT( c ) (c >= '0' && c <= '9')
+#define RTF_ISALPHA( c ) ( (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') )
+
+SV_IMPL_VARARR( RtfParserStates_Impl, RtfParserState_Impl )
+
+SvRTFParser::SvRTFParser( SvStream& rIn, BYTE nStackSize )
+ : SvParser( rIn, nStackSize ),
+ eUNICodeSet( RTL_TEXTENCODING_MS_1252 ), // default ist ANSI-CodeSet
+ nUCharOverread( 1 )
+{
+ // default ist ANSI-CodeSet
+ SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
+ bRTF_InTextRead = false;
+}
+
+SvRTFParser::~SvRTFParser()
+{
+}
+
+
+
+
+int SvRTFParser::_GetNextToken()
+{
+ int nRet = 0;
+ do {
+ int bNextCh = true;
+ switch( nNextCh )
+ {
+ case '\\':
+ {
+ // Steuerzeichen
+ switch( nNextCh = GetNextChar() )
+ {
+ case '{':
+ case '}':
+ case '\\':
+ case '+': // habe ich in einem RTF-File gefunden
+ case '~': // nonbreaking space
+ case '-': // optional hyphen
+ case '_': // nonbreaking hyphen
+ case '\'': // HexValue
+ nNextCh = '\\';
+ rInput.SeekRel( -1 );
+ ScanText();
+ nRet = RTF_TEXTTOKEN;
+ bNextCh = 0 == nNextCh;
+ break;
+
+ case '*': // ignoreflag
+ nRet = RTF_IGNOREFLAG;
+ break;
+ case ':': // subentry in an index entry
+ nRet = RTF_SUBENTRYINDEX;
+ break;
+ case '|': // formula-charakter
+ nRet = RTF_FORMULA;
+ break;
+
+ case 0x0a:
+ case 0x0d:
+ nRet = RTF_PAR;
+ break;
+
+ default:
+ if( RTF_ISALPHA( nNextCh ) )
+ {
+ aToken = '\\';
+ {
+ String aStrBuffer;
+ sal_Unicode* pStr = aStrBuffer.AllocBuffer(
+ MAX_TOKEN_LEN );
+ xub_StrLen nStrLen = 0;
+ do {
+ *(pStr + nStrLen++) = nNextCh;
+ if( MAX_TOKEN_LEN == nStrLen )
+ {
+ aToken += aStrBuffer;
+ aToken.GetBufferAccess(); // make unique string!
+ nStrLen = 0;
+ }
+ nNextCh = GetNextChar();
+ } while( RTF_ISALPHA( nNextCh ) );
+ if( nStrLen )
+ {
+ aStrBuffer.ReleaseBufferAccess( nStrLen );
+ aToken += aStrBuffer;
+ }
+ }
+
+ // Minus fuer numerischen Parameter
+ int bNegValue = false;
+ if( '-' == nNextCh )
+ {
+ bNegValue = true;
+ nNextCh = GetNextChar();
+ }
+
+ // evt. Numerischer Parameter
+ if( RTF_ISDIGIT( nNextCh ) )
+ {
+ nTokenValue = 0;
+ do {
+ nTokenValue *= 10;
+ nTokenValue += nNextCh - '0';
+ nNextCh = GetNextChar();
+ } while( RTF_ISDIGIT( nNextCh ) );
+ if( bNegValue )
+ nTokenValue = -nTokenValue;
+ bTokenHasValue=true;
+ }
+ else if( bNegValue ) // das Minus wieder zurueck
+ {
+ nNextCh = '-';
+ rInput.SeekRel( -1 );
+ }
+ if( ' ' == nNextCh ) // Blank gehoert zum Token!
+ nNextCh = GetNextChar();
+
+ // suche das Token in der Tabelle:
+ if( 0 == (nRet = GetRTFToken( aToken )) )
+ // Unknown Control
+ nRet = RTF_UNKNOWNCONTROL;
+
+ // bug 76812 - unicode token handled as normal text
+ bNextCh = false;
+ switch( nRet )
+ {
+ case RTF_UC:
+ if( 0 <= nTokenValue )
+ {
+ nUCharOverread = (BYTE)nTokenValue;
+#if 1
+ //cmc: other ifdef breaks #i3584
+ aParserStates[ aParserStates.Count()-1].
+ nUCharOverread = nUCharOverread;
+#else
+ if( !nUCharOverread )
+ nUCharOverread = aParserStates[
+ aParserStates.Count()-1].nUCharOverread;
+ else
+ aParserStates[ aParserStates.Count()-1].
+ nUCharOverread = nUCharOverread;
+#endif
+ }
+ aToken.Erase(); // #i47831# erase token to prevent the token from beeing treated as text
+ // read next token
+ nRet = 0;
+ break;
+
+ case RTF_UPR:
+ if (!_inSkipGroup) {
+ // UPR - overread the group with the ansi
+ // informations
+ while( '{' != _GetNextToken() )
+ ;
+ SkipGroup();
+ _GetNextToken(); // overread the last bracket
+ nRet = 0;
+ }
+ break;
+
+ case RTF_U:
+ if( !bRTF_InTextRead )
+ {
+ nRet = RTF_TEXTTOKEN;
+ aToken = (sal_Unicode)nTokenValue;
+
+ // overread the next n "RTF" characters. This
+ // can be also \{, \}, \'88
+ for( BYTE m = 0; m < nUCharOverread; ++m )
+ {
+ sal_Unicode cAnsi = nNextCh;
+ while( 0xD == cAnsi )
+ cAnsi = GetNextChar();
+ while( 0xA == cAnsi )
+ cAnsi = GetNextChar();
+
+ if( '\\' == cAnsi &&
+ '\'' == ( cAnsi = GetNextChar() ))
+ // HexValue ueberlesen
+ cAnsi = GetHexValue();
+ nNextCh = GetNextChar();
+ }
+ ScanText();
+ bNextCh = 0 == nNextCh;
+ }
+ break;
+ }
+ }
+ else if( SVPAR_PENDING != eState )
+ {
+ // Bug 34631 - "\ " ueberlesen - Blank als Zeichen
+ // eState = SVPAR_ERROR;
+ bNextCh = false;
+ }
+ break;
+ }
+ }
+ break;
+
+ case sal_Unicode(EOF):
+ eState = SVPAR_ACCEPTED;
+ nRet = nNextCh;
+ break;
+
+ case '{':
+ {
+ if( 0 <= nOpenBrakets )
+ {
+ RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
+ aParserStates.Insert(
+ aState, sal::static_int_cast< USHORT >(nOpenBrakets) );
+ }
+ ++nOpenBrakets;
+ DBG_ASSERT( nOpenBrakets == aParserStates.Count(),
+ "ParserStateStack unequal to bracket count" );
+ nRet = nNextCh;
+ }
+ break;
+
+ case '}':
+ --nOpenBrakets;
+ if( 0 <= nOpenBrakets )
+ {
+ aParserStates.Remove(
+ sal::static_int_cast< USHORT >(nOpenBrakets) );
+ if( aParserStates.Count() )
+ {
+ const RtfParserState_Impl& rRPS =
+ aParserStates[ aParserStates.Count() - 1 ];
+ nUCharOverread = rRPS.nUCharOverread;
+ SetSrcEncoding( rRPS.eCodeSet );
+ }
+ else
+ {
+ nUCharOverread = 1;
+ SetSrcEncoding( GetCodeSet() );
+ }
+ }
+ DBG_ASSERT( nOpenBrakets == aParserStates.Count(),
+ "ParserStateStack unequal to bracket count" );
+ nRet = nNextCh;
+ break;
+
+ case 0x0d:
+ case 0x0a:
+ break;
+
+ default:
+ // es folgt normaler Text
+ ScanText();
+ nRet = RTF_TEXTTOKEN;
+ bNextCh = 0 == nNextCh;
+ break;
+ }
+
+ if( bNextCh )
+ nNextCh = GetNextChar();
+
+ } while( !nRet && SVPAR_WORKING == eState );
+ return nRet;
+}
+
+
+sal_Unicode SvRTFParser::GetHexValue()
+{
+ // Hex-Wert sammeln
+ register int n;
+ register sal_Unicode nHexVal = 0;
+
+ for( n = 0; n < 2; ++n )
+ {
+ nHexVal *= 16;
+ nNextCh = GetNextChar();
+ if( nNextCh >= '0' && nNextCh <= '9' )
+ nHexVal += (nNextCh - 48);
+ else if( nNextCh >= 'a' && nNextCh <= 'f' )
+ nHexVal += (nNextCh - 87);
+ else if( nNextCh >= 'A' && nNextCh <= 'F' )
+ nHexVal += (nNextCh - 55);
+ }
+ return nHexVal;
+}
+
+void SvRTFParser::ScanText( const sal_Unicode cBreak )
+{
+ String aStrBuffer;
+ int bWeiter = true;
+ while( bWeiter && IsParserWorking() && aStrBuffer.Len() < MAX_STRING_LEN)
+ {
+ int bNextCh = true;
+ switch( nNextCh )
+ {
+ case '\\':
+ {
+ switch (nNextCh = GetNextChar())
+ {
+ case '\'':
+ {
+
+#if 0
+ // #i35653 patch from cmc
+ ByteString aByteString(static_cast<char>(GetHexValue()));
+ if (aByteString.Len())
+ aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
+#else
+ ByteString aByteString;
+ while (1)
+ {
+ aByteString.Append((char)GetHexValue());
+
+ bool bBreak = false;
+ sal_Char nSlash = '\\';
+ while (!bBreak)
+ {
+ wchar_t __next=GetNextChar();
+ if (__next>0xFF) // fix for #i43933# and #i35653#
+ {
+ if (aByteString.Len())
+ aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
+ aStrBuffer.Append((sal_Unicode)__next);
+
+ aByteString.Erase();
+ continue;
+ }
+ nSlash = (sal_Char)__next;
+ while (nSlash == 0xD || nSlash == 0xA)
+ nSlash = (sal_Char)GetNextChar();
+
+ switch (nSlash)
+ {
+ case '{':
+ case '}':
+ case '\\':
+ bBreak = true;
+ break;
+ default:
+ aByteString.Append(nSlash);
+ break;
+ }
+ }
+
+ nNextCh = GetNextChar();
+
+ if (nSlash != '\\' || nNextCh != '\'')
+ {
+ rInput.SeekRel(-1);
+ nNextCh = nSlash;
+ break;
+ }
+ }
+
+ bNextCh = false;
+
+ if (aByteString.Len())
+ aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
+#endif
+ }
+ break;
+ case '\\':
+ case '}':
+ case '{':
+ case '+': // habe ich in einem RTF-File gefunden
+ aStrBuffer.Append(nNextCh);
+ break;
+ case '~': // nonbreaking space
+ aStrBuffer.Append(static_cast< sal_Unicode >(0xA0));
+ break;
+ case '-': // optional hyphen
+ aStrBuffer.Append(static_cast< sal_Unicode >(0xAD));
+ break;
+ case '_': // nonbreaking hyphen
+ aStrBuffer.Append(static_cast< sal_Unicode >(0x2011));
+ break;
+
+ case 'u':
+ // UNI-Code Zeichen lesen
+ {
+ nNextCh = GetNextChar();
+ rInput.SeekRel( -2 );
+
+ if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
+ {
+ bRTF_InTextRead = true;
+
+ String sSave( aToken );
+ nNextCh = '\\';
+ #ifdef DBG_UTIL
+ int nToken =
+ #endif
+ _GetNextToken();
+ DBG_ASSERT( RTF_U == nToken, "doch kein UNI-Code Zeichen" );
+ // dont convert symbol chars
+ aStrBuffer.Append(
+ static_cast< sal_Unicode >(nTokenValue));
+
+ // overread the next n "RTF" characters. This
+ // can be also \{, \}, \'88
+ for( BYTE m = 0; m < nUCharOverread; ++m )
+ {
+ sal_Unicode cAnsi = nNextCh;
+ while( 0xD == cAnsi )
+ cAnsi = GetNextChar();
+ while( 0xA == cAnsi )
+ cAnsi = GetNextChar();
+
+ if( '\\' == cAnsi &&
+ '\'' == ( cAnsi = GetNextChar() ))
+ // HexValue ueberlesen
+ cAnsi = GetHexValue();
+ nNextCh = GetNextChar();
+ }
+ bNextCh = false;
+ aToken = sSave;
+ bRTF_InTextRead = false;
+ }
+ else
+ {
+ nNextCh = '\\';
+ bWeiter = false; // Abbrechen, String zusammen
+ }
+ }
+ break;
+
+ default:
+ rInput.SeekRel( -1 );
+ nNextCh = '\\';
+ bWeiter = false; // Abbrechen, String zusammen
+ break;
+ }
+ }
+ break;
+
+ case sal_Unicode(EOF):
+ eState = SVPAR_ERROR;
+ // weiter
+ case '{':
+ case '}':
+ bWeiter = false;
+ break;
+
+ case 0x0a:
+ case 0x0d:
+ break;
+
+ default:
+ if( nNextCh == cBreak || aStrBuffer.Len() >= MAX_STRING_LEN)
+ bWeiter = false;
+ else
+ {
+ do {
+ // alle anderen Zeichen kommen in den Text
+ aStrBuffer.Append(nNextCh);
+
+ if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
+ {
+ if (aStrBuffer.Len())
+ aToken += aStrBuffer;
+ return;
+ }
+ } while
+ (
+ (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
+ (aStrBuffer.Len() < MAX_STRING_LEN)
+ );
+ bNextCh = false;
+ }
+ }
+
+ if( bWeiter && bNextCh )
+ nNextCh = GetNextChar();
+ }
+
+ if (aStrBuffer.Len())
+ aToken += aStrBuffer;
+}
+
+
+short SvRTFParser::_inSkipGroup=0;
+
+void SvRTFParser::SkipGroup()
+{
+short nBrackets=1;
+if (_inSkipGroup>0)
+ return;
+_inSkipGroup++;
+#if 1 //#i16185# fecking \bin keyword
+ do
+ {
+ switch (nNextCh)
+ {
+ case '{':
+ ++nBrackets;
+ break;
+ case '}':
+ if (!--nBrackets) {
+ _inSkipGroup--;
+ return;
+ }
+ break;
+ }
+ int nToken = _GetNextToken();
+ if (nToken == RTF_BIN)
+ {
+ rInput.SeekRel(-1);
+ rInput.SeekRel(nTokenValue);
+ nNextCh = GetNextChar();
+ }
+ while (nNextCh==0xa || nNextCh==0xd)
+ {
+ nNextCh = GetNextChar();
+ }
+ } while (sal_Unicode(EOF) != nNextCh && IsParserWorking());
+#else
+ sal_Unicode cPrev = 0;
+ do {
+ switch( nNextCh )
+ {
+ case '{':
+ if( '\\' != cPrev )
+ ++nBrackets;
+ break;
+
+ case '}':
+ if( '\\' != cPrev && !--nBrackets )
+ return;
+ break;
+
+ case '\\':
+ if( '\\' == cPrev )
+ nNextCh = 0;
+ break;
+ }
+ cPrev = nNextCh;
+ nNextCh = GetNextChar();
+ } while( sal_Unicode(EOF) != nNextCh && IsParserWorking() );
+#endif
+
+ if( SVPAR_PENDING != eState && '}' != nNextCh )
+ eState = SVPAR_ERROR;
+ _inSkipGroup--;
+}
+
+void SvRTFParser::ReadUnknownData() { SkipGroup(); }
+void SvRTFParser::ReadBitmapData() { SkipGroup(); }
+void SvRTFParser::ReadOLEData() { SkipGroup(); }
+
+
+SvParserState SvRTFParser::CallParser()
+{
+ sal_Char cFirstCh;
+ nNextChPos = rInput.Tell();
+ rInput >> cFirstCh; nNextCh = cFirstCh;
+ eState = SVPAR_WORKING;
+ nOpenBrakets = 0;
+ SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
+ eUNICodeSet = RTL_TEXTENCODING_MS_1252; // default ist ANSI-CodeSet
+
+ // die 1. beiden Token muessen '{' und \\rtf sein !!
+ if( '{' == GetNextToken() && RTF_RTF == GetNextToken() )
+ {
+ AddRef();
+ Continue( 0 );
+ if( SVPAR_PENDING != eState )
+ ReleaseRef(); // dann brauchen wir den Parser nicht mehr!
+ }
+ else
+ eState = SVPAR_ERROR;
+
+ return eState;
+}
+
+void SvRTFParser::Continue( int nToken )
+{
+// DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
+// "Zeichensatz wurde geaendert." );
+
+ if( !nToken )
+ nToken = GetNextToken();
+
+ while( IsParserWorking() )
+ {
+ SaveState( nToken );
+ switch( nToken )
+ {
+ case '}':
+ if( nOpenBrakets )
+ goto NEXTTOKEN;
+ eState = SVPAR_ACCEPTED;
+ break;
+
+ case '{':
+ // eine unbekannte Gruppe ?
+ {
+ if( RTF_IGNOREFLAG != GetNextToken() )
+ nToken = SkipToken( -1 );
+ else if( RTF_UNKNOWNCONTROL != GetNextToken() )
+ nToken = SkipToken( -2 );
+ else
+ {
+ // gleich herausfiltern
+ ReadUnknownData();
+ nToken = GetNextToken();
+ if( '}' != nToken )
+ eState = SVPAR_ERROR;
+ break; // auf zum naechsten Token!!
+ }
+ }
+ goto NEXTTOKEN;
+
+ case RTF_UNKNOWNCONTROL:
+ break; // unbekannte Token ueberspringen
+ case RTF_NEXTTYPE:
+ case RTF_ANSITYPE:
+ SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
+ break;
+ case RTF_MACTYPE:
+ SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN );
+ break;
+ case RTF_PCTYPE:
+ SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_IBM_437 );
+ break;
+ case RTF_PCATYPE:
+ SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_IBM_850 );
+ break;
+ case RTF_ANSICPG:
+ eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
+ SetSrcEncoding(eCodeSet);
+ break;
+ default:
+NEXTTOKEN:
+ NextToken( nToken );
+ break;
+ }
+ if( IsParserWorking() )
+ SaveState( 0 ); // bis hierhin abgearbeitet,
+ // weiter mit neuem Token!
+ nToken = GetNextToken();
+ }
+ if( SVPAR_ACCEPTED == eState && 0 < nOpenBrakets )
+ eState = SVPAR_ERROR;
+}
+
+void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
+{
+ if (eEnc == RTL_TEXTENCODING_DONTKNOW)
+ eEnc = GetCodeSet();
+
+ if (aParserStates.Count())
+ aParserStates[aParserStates.Count() - 1].eCodeSet = eEnc;
+ SetSrcEncoding(eEnc);
+}
+
+#ifdef USED
+void SvRTFParser::SaveState( int nToken )
+{
+ SvParser::SaveState( nToken );
+}
+
+void SvRTFParser::RestoreState()
+{
+ SvParser::RestoreState();
+}
+#endif
+
+/* vi:set tabstop=4 shiftwidth=4 expandtab: */