1 files changed, 710 insertions, 0 deletions
diff --git a/svtools/source/svrtf/parrtf.cxx b/svtools/source/svrtf/parrtf.cxx
new file mode 100644
index 000000000000..1c578d160307
--- /dev/null
+++ b/svtools/source/svrtf/parrtf.cxx
@@ -0,0 +1,710 @@
+/*************************************************************************
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * Copyright 2000, 2010 Oracle and/or its affiliates.
+ *
+ * OpenOffice.org - a multi-platform office productivity suite
+ *
+ * This file is part of OpenOffice.org.
+ *
+ * OpenOffice.org is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 3
+ * only, as published by the Free Software Foundation.
+ *
+ * OpenOffice.org is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License version 3 for more details
+ * (a copy is included in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * version 3 along with OpenOffice.org.  If not, see
+ * <http://www.openoffice.org/license.html>
+ * for a copy of the LGPLv3 License.
+ *
+ ************************************************************************/
+
+// MARKER(update_precomp.py): autogen include statement, do not remove
+#include "precompiled_svtools.hxx"
+
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil -*- */
+
+#include <stdio.h>                      // for EOF
+#include <rtl/tencinfo.h>
+#include <tools/stream.hxx>
+#include <tools/debug.hxx>
+#include "rtftoken.h"
+#include "rtfkeywd.hxx"
+#include <svtools/parrtf.hxx>
+// Limits for the scanner's temporary string buffers.
+const int MAX_STRING_LEN = 1024;    // ScanText() compares its text buffer length against this
+const int MAX_TOKEN_LEN = 128;      // size of the buffer _GetNextToken() collects keyword names in
+// ASCII-only character tests. The argument is expanded without parentheses and more than once, so pass a plain variable.
+#define RTF_ISDIGIT( c ) (c >= '0' && c <= '9')
+#define RTF_ISALPHA( c ) ( (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') )
+// presumably emits the implementation of the RtfParserStates_Impl array (element type RtfParserState_Impl) - TODO confirm against the SV_IMPL_VARARR definition
+SV_IMPL_VARARR( RtfParserStates_Impl, RtfParserState_Impl )
+
+/** Creates an RTF parser that reads from rIn.
+ *
+ *  rIn and nStackSize are handed to the SvParser base class unchanged.
+ *  The parser starts with Windows-1252 as source encoding and with one
+ *  character to skip after each \u keyword.
+ */
+SvRTFParser::SvRTFParser( SvStream& rIn, BYTE nStackSize )
+    : SvParser( rIn, nStackSize )
+    , eUNICodeSet( RTL_TEXTENCODING_MS_1252 )   // ANSI code set is the default
+    , nUCharOverread( 1 )
+{
+    bRTF_InTextRead = false;
+
+    // ANSI code set is the default for the byte -> Unicode conversion as well
+    SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
+}
+// Destructor: nothing is released explicitly here.
+SvRTFParser::~SvRTFParser()
+{
+}
+
+
+
+
+/** Scans the input for the next RTF token.
+ *
+ *  Control words are looked up with GetRTFToken(); a numeric parameter is
+ *  stored in nTokenValue and bTokenHasValue is set. Plain text and escaped
+ *  characters are collected by ScanText() into aToken and reported as
+ *  RTF_TEXTTOKEN. Braces are returned as their character value and keep
+ *  nOpenBrakets and aParserStates up to date.
+ *
+ *  @return the token id, '{' or '}', or sal_Unicode(EOF) at the end of the
+ *          input; 0 if scanning stopped because eState left SVPAR_WORKING
+ *          before a token was complete.
+ */
+int SvRTFParser::_GetNextToken()
+{
+    int nRet = 0;
+    do {
+        int bNextCh = true;
+        switch( nNextCh )
+        {
+        case '\\':
+            {
+                // control sequence
+                switch( nNextCh = GetNextChar() )
+                {
+                case '{':
+                case '}':
+                case '\\':
+                case '+':       // found in an RTF file in the wild
+                case '~':       // nonbreaking space
+                case '-':       // optional hyphen
+                case '_':       // nonbreaking hyphen
+                case '\'':      // hex value
+                    // escaped characters are text: step back onto the
+                    // backslash and let ScanText() decode the sequence
+                    nNextCh = '\\';
+                    rInput.SeekRel( -1 );
+                    ScanText();
+                    nRet = RTF_TEXTTOKEN;
+                    bNextCh = 0 == nNextCh;
+                    break;
+
+                case '*':       // ignore flag
+                    nRet = RTF_IGNOREFLAG;
+                    break;
+                case ':':       // subentry in an index entry
+                    nRet = RTF_SUBENTRYINDEX;
+                    break;
+                case '|':       // formula character
+                    nRet = RTF_FORMULA;
+                    break;
+
+                case 0x0a:
+                case 0x0d:
+                    nRet = RTF_PAR;
+                    break;
+
+                default:
+                    if( RTF_ISALPHA( nNextCh ) )
+                    {
+                        aToken = '\\';
+                        {
+                            // collect the keyword name; the buffer is
+                            // flushed into aToken every MAX_TOKEN_LEN
+                            // characters
+                            String aStrBuffer;
+                            sal_Unicode* pStr = aStrBuffer.AllocBuffer(
+                                                            MAX_TOKEN_LEN );
+                            xub_StrLen nStrLen = 0;
+                            do {
+                                *(pStr + nStrLen++) = nNextCh;
+                                if( MAX_TOKEN_LEN == nStrLen )
+                                {
+                                    aToken += aStrBuffer;
+                                    aToken.GetBufferAccess();  // make unique string!
+                                    nStrLen = 0;
+                                }
+                                nNextCh = GetNextChar();
+                            } while( RTF_ISALPHA( nNextCh ) );
+                            if( nStrLen )
+                            {
+                                aStrBuffer.ReleaseBufferAccess( nStrLen );
+                                aToken += aStrBuffer;
+                            }
+                        }
+
+                        // minus sign of a numeric parameter
+                        int bNegValue = false;
+                        if( '-' == nNextCh )
+                        {
+                            bNegValue = true;
+                            nNextCh = GetNextChar();
+                        }
+
+                        // optional numeric parameter
+                        if( RTF_ISDIGIT( nNextCh ) )
+                        {
+                            nTokenValue = 0;
+                            do {
+                                nTokenValue *= 10;
+                                nTokenValue += nNextCh - '0';
+                                nNextCh = GetNextChar();
+                            } while( RTF_ISDIGIT( nNextCh ) );
+                            if( bNegValue )
+                                nTokenValue = -nTokenValue;
+                            bTokenHasValue=true;
+                        }
+                        else if( bNegValue )        // no digits: put the minus back
+                        {
+                            nNextCh = '-';
+                            rInput.SeekRel( -1 );
+                        }
+                        if( ' ' == nNextCh )        // one blank belongs to the token
+                            nNextCh = GetNextChar();
+
+                        // look the token up in the keyword table
+                        if( 0 == (nRet = GetRTFToken( aToken )) )
+                            // Unknown Control
+                            nRet = RTF_UNKNOWNCONTROL;
+
+                        // bug 76812 - unicode token handled as normal text
+                        bNextCh = false;
+                        switch( nRet )
+                        {
+                        case RTF_UC:
+                            if( 0 <= nTokenValue )
+                            {
+                                nUCharOverread = static_cast< BYTE >(nTokenValue);
+                                // cmc: the value is stored as is, 0 included;
+                                // treating 0 specially breaks #i3584.
+                                // Only touch the state stack if a group is
+                                // open - indexing Count()-1 on an empty
+                                // stack is out of bounds.
+                                if( aParserStates.Count() )
+                                    aParserStates[ aParserStates.Count()-1].
+                                        nUCharOverread = nUCharOverread;
+                            }
+                            aToken.Erase(); // #i47831# erase token to prevent the token from being treated as text
+                            // read next token
+                            nRet = 0;
+                            break;
+
+                        case RTF_UPR:
+                            if (!_inSkipGroup)
+                            {
+                                // UPR - skip the group with the ANSI
+                                //       information.
+                                // Stop looking for its opening bracket at
+                                // the end of the input or when the parser
+                                // stops working; otherwise a truncated
+                                // file keeps this loop spinning forever.
+                                int nSkipped;
+                                do {
+                                    nSkipped = _GetNextToken();
+                                } while( '{' != nSkipped &&
+                                         sal_Unicode(EOF) != nSkipped &&
+                                         IsParserWorking() );
+                                if( '{' == nSkipped )
+                                {
+                                    SkipGroup();
+                                    _GetNextToken();  // overread the last bracket
+                                }
+                                nRet = 0;
+                            }
+                            break;
+
+                        case RTF_U:
+                            if( !bRTF_InTextRead )
+                            {
+                                nRet = RTF_TEXTTOKEN;
+                                aToken = (sal_Unicode)nTokenValue;
+
+                                // overread the next n "RTF" characters. This
+                                // can be also \{, \}, \'88
+                                for( BYTE m = 0; m < nUCharOverread; ++m )
+                                {
+                                    sal_Unicode cAnsi = nNextCh;
+                                    while( 0xD == cAnsi )
+                                        cAnsi = GetNextChar();
+                                    while( 0xA == cAnsi )
+                                        cAnsi = GetNextChar();
+
+                                    if( '\\' == cAnsi &&
+                                        '\'' == ( cAnsi = GetNextChar() ))
+                                        // skip the hex value
+                                        cAnsi = GetHexValue();
+                                    nNextCh = GetNextChar();
+                                }
+                                ScanText();
+                                bNextCh = 0 == nNextCh;
+                            }
+                            break;
+                        }
+                    }
+                    else if( SVPAR_PENDING != eState )
+                    {
+                        // Bug 34631 - skip "\ ", the blank is kept as a
+                        // character
+                        // eState = SVPAR_ERROR;
+                        bNextCh = false;
+                    }
+                    break;
+                }
+            }
+            break;
+
+        case sal_Unicode(EOF):
+            eState = SVPAR_ACCEPTED;
+            nRet = nNextCh;
+            break;
+
+        case '{':
+            {
+                // a new group inherits the current skip count and encoding
+                if( 0 <= nOpenBrakets )
+                {
+                    RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
+                    aParserStates.Insert(
+                        aState, sal::static_int_cast< USHORT >(nOpenBrakets) );
+                }
+                ++nOpenBrakets;
+                DBG_ASSERT( nOpenBrakets == aParserStates.Count(),
+                            "ParserStateStack unequal to bracket count" );
+                nRet = nNextCh;
+            }
+            break;
+
+        case '}':
+            --nOpenBrakets;
+            if( 0 <= nOpenBrakets )
+            {
+                // drop the state of the closed group and restore the one
+                // of the enclosing group (or the defaults at top level)
+                aParserStates.Remove(
+                    sal::static_int_cast< USHORT >(nOpenBrakets) );
+                if( aParserStates.Count() )
+                {
+                    const RtfParserState_Impl& rRPS =
+                            aParserStates[ aParserStates.Count() - 1 ];
+                    nUCharOverread = rRPS.nUCharOverread;
+                    SetSrcEncoding( rRPS.eCodeSet );
+                }
+                else
+                {
+                    nUCharOverread = 1;
+                    SetSrcEncoding( GetCodeSet() );
+                }
+            }
+            DBG_ASSERT( nOpenBrakets == aParserStates.Count(),
+                        "ParserStateStack unequal to bracket count" );
+            nRet = nNextCh;
+            break;
+
+        case 0x0d:
+        case 0x0a:
+            // line ends between tokens carry no meaning
+            break;
+
+        default:
+            // plain text follows
+            ScanText();
+            nRet = RTF_TEXTTOKEN;
+            bNextCh = 0 == nNextCh;
+            break;
+        }
+
+        if( bNextCh )
+            nNextCh = GetNextChar();
+
+    } while( !nRet && SVPAR_WORKING == eState );
+    return nRet;
+}
+
+
+/** Reads the two hex digits of a \'hh escape.
+ *
+ *  Each digit is fetched with GetNextChar() and is also left in nNextCh.
+ *  A character that is not a hex digit contributes the value 0.
+ *
+ *  @return the value formed by the two digits
+ */
+sal_Unicode SvRTFParser::GetHexValue()
+{
+    sal_Unicode nResult = 0;
+    int nDigitsLeft = 2;
+    while( nDigitsLeft-- > 0 )
+    {
+        nNextCh = GetNextChar();
+
+        int nDigit = 0;
+        if( '0' <= nNextCh && nNextCh <= '9' )
+            nDigit = nNextCh - '0';
+        else if( 'a' <= nNextCh && nNextCh <= 'f' )
+            nDigit = nNextCh - 'a' + 10;
+        else if( 'A' <= nNextCh && nNextCh <= 'F' )
+            nDigit = nNextCh - 'A' + 10;
+
+        // shift the previous digit up by one nibble and add the new one
+        nResult = static_cast< sal_Unicode >( nResult * 16 + nDigit );
+    }
+    return nResult;
+}
+
+/** Collects a run of text and appends it to aToken.
+ *
+ *  Starting with nNextCh, plain characters, the escapes for backslash,
+ *  braces, '+', '~', '-' and '_', runs of \'hh bytes (converted with the
+ *  current source encoding) and \uN keywords are gathered in a local buffer.
+ *  Scanning ends at a brace, at any other control word, at cBreak, at the
+ *  end of the input or once the buffer holds MAX_STRING_LEN characters.
+ *
+ *  @param cBreak  additional character that ends the text run
+ */
+void SvRTFParser::ScanText( const sal_Unicode cBreak )
+{
+    String aStrBuffer;
+    int bWeiter = true;     // "continue": cleared to end the scan
+    while( bWeiter && IsParserWorking() && aStrBuffer.Len() < MAX_STRING_LEN)
+    {
+        int bNextCh = true;
+        switch( nNextCh )
+        {
+        case '\\':
+            {
+                switch (nNextCh = GetNextChar())
+                {
+                case '\'':
+                    {
+                        // Gather the whole run of \'hh bytes (plus raw
+                        // characters between them) before converting, so
+                        // that multi-byte sequences stay together
+                        // (#i35653#, patch from cmc).
+                        ByteString aByteString;
+                        bool bEOF = false;
+                        while (1)
+                        {
+                            aByteString.Append((char)GetHexValue());
+
+                            bool bBreak = false;
+                            sal_Char nSlash = '\\';
+                            while (!bBreak)
+                            {
+                                const sal_Unicode cNext = GetNextChar();
+                                if (sal_Unicode(EOF) == cNext)
+                                {
+                                    // The input ended inside the run. EOF
+                                    // is > 0xFF, so without this check it
+                                    // was appended as a character over and
+                                    // over and the loop never ended.
+                                    bEOF = true;
+                                    break;
+                                }
+                                if (cNext > 0xFF) // fix for #i43933# and #i35653#
+                                {
+                                    if (aByteString.Len())
+                                        aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
+                                    aStrBuffer.Append(cNext);
+
+                                    aByteString.Erase();
+                                    continue;
+                                }
+                                nSlash = (sal_Char)cNext;
+                                while (nSlash == 0xD || nSlash == 0xA)
+                                    nSlash = (sal_Char)GetNextChar();
+
+                                switch (nSlash)
+                                {
+                                    case '{':
+                                    case '}':
+                                    case '\\':
+                                        bBreak = true;
+                                        break;
+                                    default:
+                                        aByteString.Append(nSlash);
+                                        break;
+                                }
+                            }
+
+                            if (bEOF)
+                            {
+                                // hand EOF on to the caller and stop
+                                nNextCh = sal_Unicode(EOF);
+                                bWeiter = false;
+                                break;
+                            }
+
+                            nNextCh = GetNextChar();
+
+                            // the run only goes on with another \'
+                            if (nSlash != '\\' || nNextCh != '\'')
+                            {
+                                rInput.SeekRel(-1);
+                                nNextCh = nSlash;
+                                break;
+                            }
+                        }
+
+                        bNextCh = false;
+
+                        if (aByteString.Len())
+                            aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
+                    }
+                    break;
+                case '\\':
+                case '}':
+                case '{':
+                case '+':       // found in an RTF file in the wild
+                    aStrBuffer.Append(nNextCh);
+                    break;
+                case '~':       // nonbreaking space
+                    aStrBuffer.Append(static_cast< sal_Unicode >(0xA0));
+                    break;
+                case '-':       // optional hyphen
+                    aStrBuffer.Append(static_cast< sal_Unicode >(0xAD));
+                    break;
+                case '_':       // nonbreaking hyphen
+                    aStrBuffer.Append(static_cast< sal_Unicode >(0x2011));
+                    break;
+
+                case 'u':
+                    // read a Unicode character (\uN)
+                    {
+                        // peek at the character after the 'u', then step
+                        // back so that the stream is positioned on the 'u'
+                        nNextCh = GetNextChar();
+                        rInput.SeekRel( -2 );
+
+                        if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
+                        {
+                            bRTF_InTextRead = true;
+
+                            // _GetNextToken() overwrites aToken; keep the
+                            // text collected so far
+                            String sSave( aToken );
+                            nNextCh = '\\';
+                            #ifdef DBG_UTIL
+                            int nToken =
+                            #endif
+                                _GetNextToken();
+                            DBG_ASSERT( RTF_U == nToken, "doch kein UNI-Code Zeichen" );
+                            // dont convert symbol chars
+                            aStrBuffer.Append(
+                                static_cast< sal_Unicode >(nTokenValue));
+
+                            // overread the next n "RTF" characters. This
+                            // can be also \{, \}, \'88
+                            for( BYTE m = 0; m < nUCharOverread; ++m )
+                            {
+                                sal_Unicode cAnsi = nNextCh;
+                                while( 0xD == cAnsi )
+                                    cAnsi = GetNextChar();
+                                while( 0xA == cAnsi )
+                                    cAnsi = GetNextChar();
+
+                                if( '\\' == cAnsi &&
+                                    '\'' == ( cAnsi = GetNextChar() ))
+                                    // skip the hex value
+                                    cAnsi = GetHexValue();
+                                nNextCh = GetNextChar();
+                            }
+                            bNextCh = false;
+                            aToken = sSave;
+                            bRTF_InTextRead = false;
+                        }
+                        else
+                        {
+                            // some other keyword starting with 'u'
+                            nNextCh = '\\';
+                            bWeiter = false;        // stop, finish the string
+                        }
+                    }
+                    break;
+
+                default:
+                    // any other control word ends the text
+                    rInput.SeekRel( -1 );
+                    nNextCh = '\\';
+                    bWeiter = false;        // stop, finish the string
+                    break;
+                }
+            }
+            break;
+
+        case sal_Unicode(EOF):
+                eState = SVPAR_ERROR;
+                // fall through
+        case '{':
+        case '}':
+            bWeiter = false;
+            break;
+
+        case 0x0a:
+        case 0x0d:
+            break;
+
+        default:
+            if( nNextCh == cBreak || aStrBuffer.Len() >= MAX_STRING_LEN)
+                bWeiter = false;
+            else
+            {
+                do {
+                    // all other characters go into the text
+                    aStrBuffer.Append(nNextCh);
+
+                    if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
+                    {
+                        if (aStrBuffer.Len())
+                            aToken += aStrBuffer;
+                        return;
+                    }
+                } while
+                (
+                    (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
+                    (aStrBuffer.Len() < MAX_STRING_LEN)
+                );
+                bNextCh = false;
+            }
+        }
+
+        if( bWeiter && bNextCh )
+            nNextCh = GetNextChar();
+    }
+
+    if (aStrBuffer.Len())
+        aToken += aStrBuffer;
+}
+
+
+// Nesting guard for SkipGroup(); a static member, i.e. shared by all
+// parser instances.
+short SvRTFParser::_inSkipGroup=0;
+
+/** Skips the remainder of the group the parser is currently in.
+ *
+ *  Tokens are read with _GetNextToken() until the bracket that closes the
+ *  group is the current character (nNextCh); that bracket itself is left
+ *  for the caller. The binary payload of a \bin keyword is jumped over.
+ *  A call made while another SkipGroup() is active returns immediately.
+ *
+ *  If scanning ends without reaching the closing bracket and the parser is
+ *  not pending, eState is set to SVPAR_ERROR.
+ */
+void SvRTFParser::SkipGroup()
+{
+    short nBrackets=1;
+    if (_inSkipGroup>0)
+        return;
+    _inSkipGroup++;
+    // #i16185# go token by token so that the \bin keyword is recognised
+    do
+    {
+        switch (nNextCh)
+        {
+            case '{':
+                ++nBrackets;
+                break;
+            case '}':
+                if (!--nBrackets) {
+                    _inSkipGroup--;
+                    return;
+                }
+                break;
+        }
+        int nToken = _GetNextToken();
+        if (nToken == RTF_BIN)
+        {
+            rInput.SeekRel(-1);
+            // Only jump forward. The byte count comes from the file; a
+            // negative value would move the stream backwards and make the
+            // parser read the same data again.
+            if (nTokenValue > 0)
+                rInput.SeekRel(nTokenValue);
+            nNextCh = GetNextChar();
+        }
+        while (nNextCh==0xa || nNextCh==0xd)
+        {
+            nNextCh = GetNextChar();
+        }
+    } while (sal_Unicode(EOF) != nNextCh && IsParserWorking());
+
+    if( SVPAR_PENDING != eState && '}' != nNextCh )
+        eState = SVPAR_ERROR;
+    _inSkipGroup--;
+}
+
+// Handler for the data of an unknown group: the group is skipped.
+void SvRTFParser::ReadUnknownData()
+{
+    SkipGroup();
+}
+
+// Handler for bitmap data: the group is skipped.
+void SvRTFParser::ReadBitmapData()
+{
+    SkipGroup();
+}
+
+// Handler for OLE data: the group is skipped.
+void SvRTFParser::ReadOLEData()
+{
+    SkipGroup();
+}
+
+
+/** Starts parsing the input stream.
+ *
+ *  Resets the bracket count and the encodings to their defaults
+ *  (Windows-1252), reads the first character and checks that the document
+ *  begins with the tokens '{' and \rtf. If so, Continue() runs the main
+ *  loop; otherwise the state becomes SVPAR_ERROR.
+ *
+ *  @return the parser state after this call
+ */
+SvParserState SvRTFParser::CallParser()
+{
+    // Initialised so that a stream which delivers no byte yields a defined
+    // first character (0) instead of an indeterminate one.
+    sal_Char cFirstCh = 0;
+    nNextChPos = rInput.Tell();
+    rInput >> cFirstCh; nNextCh = cFirstCh;
+    eState = SVPAR_WORKING;
+    nOpenBrakets = 0;
+    SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
+    eUNICodeSet = RTL_TEXTENCODING_MS_1252;     // ANSI code set is the default
+
+    // the first two tokens must be '{' and \rtf
+    if( '{' == GetNextToken() && RTF_RTF == GetNextToken() )
+    {
+        // hold a reference on the parser while Continue() runs
+        AddRef();
+        Continue( 0 );
+        if( SVPAR_PENDING != eState )
+            ReleaseRef();       // the parser is no longer needed
+    }
+    else
+        eState = SVPAR_ERROR;
+
+    return eState;
+}
+/* Main parsing loop: processes tokens for as long as the parser is working.
+ *
+ * Closing and opening brackets, unknown control words and the character-set
+ * keywords are handled here; every other token is passed to NextToken().
+ *
+ * nToken is the first token to process; 0 means that one is fetched with
+ * GetNextToken().
+ */
+void SvRTFParser::Continue( int nToken )
+{
+//  DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
+//              "Zeichensatz wurde geaendert." );   // i.e. "character set was changed"
+
+    if( !nToken )
+        nToken = GetNextToken();
+
+    while( IsParserWorking() )
+    {
+        SaveState( nToken );
+        switch( nToken )
+        {
+        case '}':
+            if( nOpenBrakets )
+                goto NEXTTOKEN;         // an inner group ended: pass it on
+            eState = SVPAR_ACCEPTED;    // the outermost group is closed
+            break;
+
+        case '{':
+            // an unknown group?
+            {
+                // look ahead: only a group starting with the ignore flag
+                // followed by an unknown control word is filtered out here
+                if( RTF_IGNOREFLAG != GetNextToken() )
+                    nToken = SkipToken( -1 );   // presumably steps back one token - verify against SvParser::SkipToken
+                else if( RTF_UNKNOWNCONTROL != GetNextToken() )
+                    nToken = SkipToken( -2 );   // presumably steps back two tokens - verify against SvParser::SkipToken
+                else
+                {
+                    // filter it out right away
+                    ReadUnknownData();
+                    nToken = GetNextToken();
+                    if( '}' != nToken )
+                        eState = SVPAR_ERROR;
+                    break;      // on to the next token
+                }
+            }
+            goto NEXTTOKEN;
+
+        case RTF_UNKNOWNCONTROL:
+            break;      // skip unknown tokens
+        case RTF_NEXTTYPE:
+        case RTF_ANSITYPE:
+            SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
+            break;
+        case RTF_MACTYPE:
+            SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN );
+            break;
+        case RTF_PCTYPE:
+            SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_IBM_437 );
+            break;
+        case RTF_PCATYPE:
+            SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_IBM_850 );
+            break;
+        case RTF_ANSICPG:
+            // the keyword's parameter is a Windows code page number
+            eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
+            SetSrcEncoding(eCodeSet);
+            break;
+        default:
+NEXTTOKEN:
+            NextToken( nToken );
+            break;
+        }
+        if( IsParserWorking() )
+            SaveState( 0 );         // processed up to here,
+                                    // continue with a new token
+        nToken = GetNextToken();
+    }
+    // accepted although groups are still open: the input is incomplete
+    if( SVPAR_ACCEPTED == eState && 0 < nOpenBrakets )
+        eState = SVPAR_ERROR;
+}
+
+/** Sets the source encoding for the current group.
+ *
+ *  RTL_TEXTENCODING_DONTKNOW is replaced by the value of GetCodeSet().
+ *  The encoding is stored in the topmost entry of the parser state stack,
+ *  if there is one, and made the active source encoding.
+ *
+ *  @param eEnc  the encoding to use, or RTL_TEXTENCODING_DONTKNOW
+ */
+void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
+{
+    const rtl_TextEncoding eEffective =
+        ( RTL_TEXTENCODING_DONTKNOW == eEnc ) ? GetCodeSet() : eEnc;
+
+    const USHORT nStates = aParserStates.Count();
+    if( 0 != nStates )
+        aParserStates[ nStates - 1 ].eCodeSet = eEffective;
+
+    SetSrcEncoding( eEffective );
+}
+// Pass-through overrides; they are compiled only when USED is defined.
+#ifdef USED
+void SvRTFParser::SaveState( int nToken )
+{
+    SvParser::SaveState( nToken );      // forwards to the base class unchanged
+}
+
+void SvRTFParser::RestoreState()
+{
+    SvParser::RestoreState();           // forwards to the base class unchanged
+}
+#endif
+
+/* vi:set tabstop=4 shiftwidth=4 expandtab: */