diff options
Diffstat (limited to 'svtools/source/svrtf/parrtf.cxx')
-rw-r--r-- | svtools/source/svrtf/parrtf.cxx | 710 |
1 files changed, 710 insertions, 0 deletions
diff --git a/svtools/source/svrtf/parrtf.cxx b/svtools/source/svrtf/parrtf.cxx new file mode 100644 index 000000000000..1c578d160307 --- /dev/null +++ b/svtools/source/svrtf/parrtf.cxx @@ -0,0 +1,710 @@ +/************************************************************************* + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * Copyright 2000, 2010 Oracle and/or its affiliates. + * + * OpenOffice.org - a multi-platform office productivity suite + * + * This file is part of OpenOffice.org. + * + * OpenOffice.org is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License version 3 + * only, as published by the Free Software Foundation. + * + * OpenOffice.org is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License version 3 for more details + * (a copy is included in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU Lesser General Public License + * version 3 along with OpenOffice.org. If not, see + * <http://www.openoffice.org/license.html> + * for a copy of the LGPLv3 License. + * + ************************************************************************/ + +// MARKER(update_precomp.py): autogen include statement, do not remove +#include "precompiled_svtools.hxx" + +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil -*- */ + +#include <stdio.h> // for EOF +#include <rtl/tencinfo.h> +#include <tools/stream.hxx> +#include <tools/debug.hxx> +#include "rtftoken.h" +#include "rtfkeywd.hxx" +#include <svtools/parrtf.hxx> + +const int MAX_STRING_LEN = 1024; +const int MAX_TOKEN_LEN = 128; + +#define RTF_ISDIGIT( c ) (c >= '0' && c <= '9') +#define RTF_ISALPHA( c ) ( (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') ) + +SV_IMPL_VARARR( RtfParserStates_Impl, RtfParserState_Impl ) + +SvRTFParser::SvRTFParser( SvStream& rIn, BYTE nStackSize ) + : SvParser( rIn, nStackSize ), + eUNICodeSet( RTL_TEXTENCODING_MS_1252 ), // default ist ANSI-CodeSet + nUCharOverread( 1 ) +{ + // default ist ANSI-CodeSet + SetSrcEncoding( RTL_TEXTENCODING_MS_1252 ); + bRTF_InTextRead = false; +} + +SvRTFParser::~SvRTFParser() +{ +} + + + + +int SvRTFParser::_GetNextToken() +{ + int nRet = 0; + do { + int bNextCh = true; + switch( nNextCh ) + { + case '\\': + { + // Steuerzeichen + switch( nNextCh = GetNextChar() ) + { + case '{': + case '}': + case '\\': + case '+': // habe ich in einem RTF-File gefunden + case '~': // nonbreaking space + case '-': // optional hyphen + case '_': // nonbreaking hyphen + case '\'': // HexValue + nNextCh = '\\'; + rInput.SeekRel( -1 ); + ScanText(); + nRet = RTF_TEXTTOKEN; + bNextCh = 0 == nNextCh; + break; + + case '*': // ignoreflag + nRet = RTF_IGNOREFLAG; + break; + case ':': // subentry in an index entry + nRet = RTF_SUBENTRYINDEX; + break; + case '|': // formula-charakter + nRet = RTF_FORMULA; + break; + + case 0x0a: + case 0x0d: + nRet = RTF_PAR; + break; + + default: + if( RTF_ISALPHA( nNextCh ) ) + { + aToken = '\\'; + { + String aStrBuffer; + sal_Unicode* pStr = aStrBuffer.AllocBuffer( + MAX_TOKEN_LEN ); + xub_StrLen nStrLen = 0; + do { + *(pStr + nStrLen++) = nNextCh; + if( MAX_TOKEN_LEN == nStrLen ) + { + aToken += aStrBuffer; + aToken.GetBufferAccess(); // make unique string! + nStrLen = 0; + } + nNextCh = GetNextChar(); + } while( RTF_ISALPHA( nNextCh ) ); + if( nStrLen ) + { + aStrBuffer.ReleaseBufferAccess( nStrLen ); + aToken += aStrBuffer; + } + } + + // Minus fuer numerischen Parameter + int bNegValue = false; + if( '-' == nNextCh ) + { + bNegValue = true; + nNextCh = GetNextChar(); + } + + // evt. Numerischer Parameter + if( RTF_ISDIGIT( nNextCh ) ) + { + nTokenValue = 0; + do { + nTokenValue *= 10; + nTokenValue += nNextCh - '0'; + nNextCh = GetNextChar(); + } while( RTF_ISDIGIT( nNextCh ) ); + if( bNegValue ) + nTokenValue = -nTokenValue; + bTokenHasValue=true; + } + else if( bNegValue ) // das Minus wieder zurueck + { + nNextCh = '-'; + rInput.SeekRel( -1 ); + } + if( ' ' == nNextCh ) // Blank gehoert zum Token! + nNextCh = GetNextChar(); + + // suche das Token in der Tabelle: + if( 0 == (nRet = GetRTFToken( aToken )) ) + // Unknown Control + nRet = RTF_UNKNOWNCONTROL; + + // bug 76812 - unicode token handled as normal text + bNextCh = false; + switch( nRet ) + { + case RTF_UC: + if( 0 <= nTokenValue ) + { + nUCharOverread = (BYTE)nTokenValue; +#if 1 + //cmc: other ifdef breaks #i3584 + aParserStates[ aParserStates.Count()-1]. + nUCharOverread = nUCharOverread; +#else + if( !nUCharOverread ) + nUCharOverread = aParserStates[ + aParserStates.Count()-1].nUCharOverread; + else + aParserStates[ aParserStates.Count()-1]. + nUCharOverread = nUCharOverread; +#endif + } + aToken.Erase(); // #i47831# erase token to prevent the token from beeing treated as text + // read next token + nRet = 0; + break; + + case RTF_UPR: + if (!_inSkipGroup) { + // UPR - overread the group with the ansi + // informations + while( '{' != _GetNextToken() ) + ; + SkipGroup(); + _GetNextToken(); // overread the last bracket + nRet = 0; + } + break; + + case RTF_U: + if( !bRTF_InTextRead ) + { + nRet = RTF_TEXTTOKEN; + aToken = (sal_Unicode)nTokenValue; + + // overread the next n "RTF" characters. This + // can be also \{, \}, \'88 + for( BYTE m = 0; m < nUCharOverread; ++m ) + { + sal_Unicode cAnsi = nNextCh; + while( 0xD == cAnsi ) + cAnsi = GetNextChar(); + while( 0xA == cAnsi ) + cAnsi = GetNextChar(); + + if( '\\' == cAnsi && + '\'' == ( cAnsi = GetNextChar() )) + // HexValue ueberlesen + cAnsi = GetHexValue(); + nNextCh = GetNextChar(); + } + ScanText(); + bNextCh = 0 == nNextCh; + } + break; + } + } + else if( SVPAR_PENDING != eState ) + { + // Bug 34631 - "\ " ueberlesen - Blank als Zeichen + // eState = SVPAR_ERROR; + bNextCh = false; + } + break; + } + } + break; + + case sal_Unicode(EOF): + eState = SVPAR_ACCEPTED; + nRet = nNextCh; + break; + + case '{': + { + if( 0 <= nOpenBrakets ) + { + RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() ); + aParserStates.Insert( + aState, sal::static_int_cast< USHORT >(nOpenBrakets) ); + } + ++nOpenBrakets; + DBG_ASSERT( nOpenBrakets == aParserStates.Count(), + "ParserStateStack unequal to bracket count" ); + nRet = nNextCh; + } + break; + + case '}': + --nOpenBrakets; + if( 0 <= nOpenBrakets ) + { + aParserStates.Remove( + sal::static_int_cast< USHORT >(nOpenBrakets) ); + if( aParserStates.Count() ) + { + const RtfParserState_Impl& rRPS = + aParserStates[ aParserStates.Count() - 1 ]; + nUCharOverread = rRPS.nUCharOverread; + SetSrcEncoding( rRPS.eCodeSet ); + } + else + { + nUCharOverread = 1; + SetSrcEncoding( GetCodeSet() ); + } + } + DBG_ASSERT( nOpenBrakets == aParserStates.Count(), + "ParserStateStack unequal to bracket count" ); + nRet = nNextCh; + break; + + case 0x0d: + case 0x0a: + break; + + default: + // es folgt normaler Text + ScanText(); + nRet = RTF_TEXTTOKEN; + bNextCh = 0 == nNextCh; + break; + } + + if( bNextCh ) + nNextCh = GetNextChar(); + + } while( !nRet && SVPAR_WORKING == eState ); + return nRet; +} + + +sal_Unicode SvRTFParser::GetHexValue() +{ + // Hex-Wert sammeln + register int n; + register sal_Unicode nHexVal = 0; + + for( n = 0; n < 2; ++n ) + { + nHexVal *= 16; + nNextCh = GetNextChar(); + if( nNextCh >= '0' && nNextCh <= '9' ) + nHexVal += (nNextCh - 48); + else if( nNextCh >= 'a' && nNextCh <= 'f' ) + nHexVal += (nNextCh - 87); + else if( nNextCh >= 'A' && nNextCh <= 'F' ) + nHexVal += (nNextCh - 55); + } + return nHexVal; +} + +void SvRTFParser::ScanText( const sal_Unicode cBreak ) +{ + String aStrBuffer; + int bWeiter = true; + while( bWeiter && IsParserWorking() && aStrBuffer.Len() < MAX_STRING_LEN) + { + int bNextCh = true; + switch( nNextCh ) + { + case '\\': + { + switch (nNextCh = GetNextChar()) + { + case '\'': + { + +#if 0 + // #i35653 patch from cmc + ByteString aByteString(static_cast<char>(GetHexValue())); + if (aByteString.Len()) + aStrBuffer.Append(String(aByteString, GetSrcEncoding())); +#else + ByteString aByteString; + while (1) + { + aByteString.Append((char)GetHexValue()); + + bool bBreak = false; + sal_Char nSlash = '\\'; + while (!bBreak) + { + wchar_t __next=GetNextChar(); + if (__next>0xFF) // fix for #i43933# and #i35653# + { + if (aByteString.Len()) + aStrBuffer.Append(String(aByteString, GetSrcEncoding())); + aStrBuffer.Append((sal_Unicode)__next); + + aByteString.Erase(); + continue; + } + nSlash = (sal_Char)__next; + while (nSlash == 0xD || nSlash == 0xA) + nSlash = (sal_Char)GetNextChar(); + + switch (nSlash) + { + case '{': + case '}': + case '\\': + bBreak = true; + break; + default: + aByteString.Append(nSlash); + break; + } + } + + nNextCh = GetNextChar(); + + if (nSlash != '\\' || nNextCh != '\'') + { + rInput.SeekRel(-1); + nNextCh = nSlash; + break; + } + } + + bNextCh = false; + + if (aByteString.Len()) + aStrBuffer.Append(String(aByteString, GetSrcEncoding())); +#endif + } + break; + case '\\': + case '}': + case '{': + case '+': // habe ich in einem RTF-File gefunden + aStrBuffer.Append(nNextCh); + break; + case '~': // nonbreaking space + aStrBuffer.Append(static_cast< sal_Unicode >(0xA0)); + break; + case '-': // optional hyphen + aStrBuffer.Append(static_cast< sal_Unicode >(0xAD)); + break; + case '_': // nonbreaking hyphen + aStrBuffer.Append(static_cast< sal_Unicode >(0x2011)); + break; + + case 'u': + // UNI-Code Zeichen lesen + { + nNextCh = GetNextChar(); + rInput.SeekRel( -2 ); + + if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) ) + { + bRTF_InTextRead = true; + + String sSave( aToken ); + nNextCh = '\\'; + #ifdef DBG_UTIL + int nToken = + #endif + _GetNextToken(); + DBG_ASSERT( RTF_U == nToken, "doch kein UNI-Code Zeichen" ); + // dont convert symbol chars + aStrBuffer.Append( + static_cast< sal_Unicode >(nTokenValue)); + + // overread the next n "RTF" characters. This + // can be also \{, \}, \'88 + for( BYTE m = 0; m < nUCharOverread; ++m ) + { + sal_Unicode cAnsi = nNextCh; + while( 0xD == cAnsi ) + cAnsi = GetNextChar(); + while( 0xA == cAnsi ) + cAnsi = GetNextChar(); + + if( '\\' == cAnsi && + '\'' == ( cAnsi = GetNextChar() )) + // HexValue ueberlesen + cAnsi = GetHexValue(); + nNextCh = GetNextChar(); + } + bNextCh = false; + aToken = sSave; + bRTF_InTextRead = false; + } + else + { + nNextCh = '\\'; + bWeiter = false; // Abbrechen, String zusammen + } + } + break; + + default: + rInput.SeekRel( -1 ); + nNextCh = '\\'; + bWeiter = false; // Abbrechen, String zusammen + break; + } + } + break; + + case sal_Unicode(EOF): + eState = SVPAR_ERROR; + // weiter + case '{': + case '}': + bWeiter = false; + break; + + case 0x0a: + case 0x0d: + break; + + default: + if( nNextCh == cBreak || aStrBuffer.Len() >= MAX_STRING_LEN) + bWeiter = false; + else + { + do { + // alle anderen Zeichen kommen in den Text + aStrBuffer.Append(nNextCh); + + if (sal_Unicode(EOF) == (nNextCh = GetNextChar())) + { + if (aStrBuffer.Len()) + aToken += aStrBuffer; + return; + } + } while + ( + (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) && + (aStrBuffer.Len() < MAX_STRING_LEN) + ); + bNextCh = false; + } + } + + if( bWeiter && bNextCh ) + nNextCh = GetNextChar(); + } + + if (aStrBuffer.Len()) + aToken += aStrBuffer; +} + + +short SvRTFParser::_inSkipGroup=0; + +void SvRTFParser::SkipGroup() +{ +short nBrackets=1; +if (_inSkipGroup>0) + return; +_inSkipGroup++; +#if 1 //#i16185# fecking \bin keyword + do + { + switch (nNextCh) + { + case '{': + ++nBrackets; + break; + case '}': + if (!--nBrackets) { + _inSkipGroup--; + return; + } + break; + } + int nToken = _GetNextToken(); + if (nToken == RTF_BIN) + { + rInput.SeekRel(-1); + rInput.SeekRel(nTokenValue); + nNextCh = GetNextChar(); + } + while (nNextCh==0xa || nNextCh==0xd) + { + nNextCh = GetNextChar(); + } + } while (sal_Unicode(EOF) != nNextCh && IsParserWorking()); +#else + sal_Unicode cPrev = 0; + do { + switch( nNextCh ) + { + case '{': + if( '\\' != cPrev ) + ++nBrackets; + break; + + case '}': + if( '\\' != cPrev && !--nBrackets ) + return; + break; + + case '\\': + if( '\\' == cPrev ) + nNextCh = 0; + break; + } + cPrev = nNextCh; + nNextCh = GetNextChar(); + } while( sal_Unicode(EOF) != nNextCh && IsParserWorking() ); +#endif + + if( SVPAR_PENDING != eState && '}' != nNextCh ) + eState = SVPAR_ERROR; + _inSkipGroup--; +} + +void SvRTFParser::ReadUnknownData() { SkipGroup(); } +void SvRTFParser::ReadBitmapData() { SkipGroup(); } +void SvRTFParser::ReadOLEData() { SkipGroup(); } + + +SvParserState SvRTFParser::CallParser() +{ + sal_Char cFirstCh; + nNextChPos = rInput.Tell(); + rInput >> cFirstCh; nNextCh = cFirstCh; + eState = SVPAR_WORKING; + nOpenBrakets = 0; + SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 ); + eUNICodeSet = RTL_TEXTENCODING_MS_1252; // default ist ANSI-CodeSet + + // die 1. beiden Token muessen '{' und \\rtf sein !! + if( '{' == GetNextToken() && RTF_RTF == GetNextToken() ) + { + AddRef(); + Continue( 0 ); + if( SVPAR_PENDING != eState ) + ReleaseRef(); // dann brauchen wir den Parser nicht mehr! + } + else + eState = SVPAR_ERROR; + + return eState; +} + +void SvRTFParser::Continue( int nToken ) +{ +// DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(), +// "Zeichensatz wurde geaendert." ); + + if( !nToken ) + nToken = GetNextToken(); + + while( IsParserWorking() ) + { + SaveState( nToken ); + switch( nToken ) + { + case '}': + if( nOpenBrakets ) + goto NEXTTOKEN; + eState = SVPAR_ACCEPTED; + break; + + case '{': + // eine unbekannte Gruppe ? + { + if( RTF_IGNOREFLAG != GetNextToken() ) + nToken = SkipToken( -1 ); + else if( RTF_UNKNOWNCONTROL != GetNextToken() ) + nToken = SkipToken( -2 ); + else + { + // gleich herausfiltern + ReadUnknownData(); + nToken = GetNextToken(); + if( '}' != nToken ) + eState = SVPAR_ERROR; + break; // auf zum naechsten Token!! + } + } + goto NEXTTOKEN; + + case RTF_UNKNOWNCONTROL: + break; // unbekannte Token ueberspringen + case RTF_NEXTTYPE: + case RTF_ANSITYPE: + SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 ); + break; + case RTF_MACTYPE: + SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN ); + break; + case RTF_PCTYPE: + SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_IBM_437 ); + break; + case RTF_PCATYPE: + SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_IBM_850 ); + break; + case RTF_ANSICPG: + eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue); + SetSrcEncoding(eCodeSet); + break; + default: +NEXTTOKEN: + NextToken( nToken ); + break; + } + if( IsParserWorking() ) + SaveState( 0 ); // bis hierhin abgearbeitet, + // weiter mit neuem Token! + nToken = GetNextToken(); + } + if( SVPAR_ACCEPTED == eState && 0 < nOpenBrakets ) + eState = SVPAR_ERROR; +} + +void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc ) +{ + if (eEnc == RTL_TEXTENCODING_DONTKNOW) + eEnc = GetCodeSet(); + + if (aParserStates.Count()) + aParserStates[aParserStates.Count() - 1].eCodeSet = eEnc; + SetSrcEncoding(eEnc); +} + +#ifdef USED +void SvRTFParser::SaveState( int nToken ) +{ + SvParser::SaveState( nToken ); +} + +void SvRTFParser::RestoreState() +{ + SvParser::RestoreState(); +} +#endif + +/* vi:set tabstop=4 shiftwidth=4 expandtab: */ |