summaryrefslogtreecommitdiff
path: root/svtools/source/svrtf/parrtf.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'svtools/source/svrtf/parrtf.cxx')
-rw-r--r--svtools/source/svrtf/parrtf.cxx591
1 files changed, 591 insertions, 0 deletions
diff --git a/svtools/source/svrtf/parrtf.cxx b/svtools/source/svrtf/parrtf.cxx
new file mode 100644
index 000000000000..c5da72a9cbf5
--- /dev/null
+++ b/svtools/source/svrtf/parrtf.cxx
@@ -0,0 +1,591 @@
+/*************************************************************************
+ *
+ * $RCSfile: parrtf.cxx,v $
+ *
+ * $Revision: 1.1.1.1 $
+ *
+ * last change: $Author: hr $ $Date: 2000-09-18 16:59:05 $
+ *
+ * The Contents of this file are made available subject to the terms of
+ * either of the following licenses
+ *
+ * - GNU Lesser General Public License Version 2.1
+ * - Sun Industry Standards Source License Version 1.1
+ *
+ * Sun Microsystems Inc., October, 2000
+ *
+ * GNU Lesser General Public License Version 2.1
+ * =============================================
+ * Copyright 2000 by Sun Microsystems, Inc.
+ * 901 San Antonio Road, Palo Alto, CA 94303, USA
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ * MA 02111-1307 USA
+ *
+ *
+ * Sun Industry Standards Source License Version 1.1
+ * =================================================
+ * The contents of this file are subject to the Sun Industry Standards
+ * Source License Version 1.1 (the "License"); You may not use this file
+ * except in compliance with the License. You may obtain a copy of the
+ * License at http://www.openoffice.org/license.html.
+ *
+ * Software provided under this License is provided on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
+ * WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
+ * MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
+ * See the License for the specific provisions governing your rights and
+ * obligations concerning the Software.
+ *
+ * The Initial Developer of the Original Code is: Sun Microsystems, Inc.
+ *
+ * Copyright: 2000 by Sun Microsystems, Inc.
+ *
+ * All Rights Reserved.
+ *
+ * Contributor(s): _______________________________________
+ *
+ *
+ ************************************************************************/
+
+#include <stdio.h> // for EOF
+
+#ifndef _STREAM_HXX
+#include <tools/stream.hxx>
+#endif
+#ifndef _TOOLS_DEBUG_HXX
+#include <tools/debug.hxx>
+#endif
+#include "rtftoken.h"
+#include "rtfkeywd.hxx"
+#include "parrtf.hxx"
+
+// Size (in characters) of the scratch buffer that ScanText() allocates.
+const int MAX_STRING_LEN = 1024;
+// Size (in characters) of the scratch buffer that _GetNextToken() allocates
+// for the name of a control word.
+const int MAX_TOKEN_LEN = 128;
+
+// ASCII-only character classification.
+// The parameter is parenthesised so that an argument which is itself an
+// expression (e.g. a conditional or an assignment) is compared as a whole.
+// NOTE: the argument is still evaluated more than once, so do not pass an
+// expression with side effects such as GetNextChar().
+#define RTF_ISDIGIT( c ) ((c) >= '0' && (c) <= '9')
+#define RTF_ISALPHA( c ) ( ((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') )
+
+// Constructs an RTF parser on top of the stream rIn. Both arguments are
+// passed on unchanged to the SvParser base class.
+// The unicode code set and the source encoding both start out as
+// RTL_TEXTENCODING_MS_1252 (the ANSI code set), and the parser is marked as
+// not being inside a text read.
+SvRTFParser::SvRTFParser( SvStream& rIn, BYTE nStackSize )
+    : SvParser( rIn, nStackSize ),
+      eUNICodeSet( RTL_TEXTENCODING_MS_1252 )   // ANSI code set is the default
+{
+    // the source encoding defaults to the ANSI code set as well
+    SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
+
+    // no ScanText() run is currently decoding a \u character
+    bRTF_InTextRead = FALSE;
+}
+
+// Destructor with an empty body: no cleanup is performed here.
+SvRTFParser::~SvRTFParser() {}
+
+// Reads the next RTF token, starting at the current character nNextCh, and
+// returns its token id.
+//
+// Returns:
+//  - '{' or '}' for a group delimiter (nOpenBrakets is adjusted accordingly),
+//  - RTF_TEXTTOKEN for a run of text; the text is collected in aToken,
+//  - RTF_IGNOREFLAG, RTF_SUBENTRYINDEX, RTF_FORMULA or RTF_PAR for the control
+//    symbols \*  \:  \|  and backslash + CR/LF,
+//  - the id delivered by GetRTFToken() for a control word, or
+//    RTF_UNKNOWNCONTROL if that lookup yields 0,
+//  - the EOF character value at end of input (eState becomes SVPAR_ACCEPTED).
+// Bare CR/LF characters between tokens are skipped. The loop also ends with a
+// return value of 0 as soon as eState is no longer SVPAR_WORKING.
+//
+// For a control word, aToken holds the backslash plus the letters, and
+// nTokenValue holds the numeric parameter. nTokenValue is only written when
+// digits follow the name; otherwise it keeps its previous value.
+// While bRTF_InTextRead is not set, a \u control word is turned into an
+// RTF_TEXTTOKEN whose text is the single character with code nTokenValue,
+// and the substitute character that follows it is skipped.
+int SvRTFParser::_GetNextToken()
+{
+ int nRet = 0;
+ do {
+ // TRUE: fetch a fresh character at the end of this iteration
+ int bNextCh = TRUE;
+ switch( nNextCh )
+ {
+ case '\\':
+ {
+ // control symbol or control word
+ switch( nNextCh = GetNextChar() )
+ {
+ case '{':
+ case '}':
+ case '\\':
+ aToken = nNextCh;
+ goto ISCHAR_SCANTEXT;
+ case '~': // nonbreaking space
+ aToken = 0xA0;
+ goto ISCHAR_SCANTEXT;
+ case '-': // optional hyphen
+ aToken = 0xAD;
+ goto ISCHAR_SCANTEXT;
+ case '_': // nonbreaking hyphen
+ aToken = 0x2011;
+ goto ISCHAR_SCANTEXT;
+
+ case '\'': // HexValue
+ // ordinary text follows
+ aToken = ByteString::ConvertToUnicode( GetHexValue(),
+ GetSrcEncoding() );
+ISCHAR_SCANTEXT:
+ // aToken already holds the first character; let ScanText append the
+ // rest of the text run
+ nNextCh = GetNextChar();
+ ScanText();
+ nRet = RTF_TEXTTOKEN;
+ // read a new character only if the current one is 0
+ bNextCh = 0 == nNextCh;
+ break;
+
+ case '*': // ignoreflag
+ nRet = RTF_IGNOREFLAG;
+ break;
+ case ':': // subentry in an index entry
+ nRet = RTF_SUBENTRYINDEX;
+ break;
+ case '|': // formula character
+ nRet = RTF_FORMULA;
+ break;
+
+ case 0x0a:
+ case 0x0d:
+ nRet = RTF_PAR;
+ break;
+
+ default:
+ if( RTF_ISALPHA( nNextCh ) )
+ {
+ aToken = '\\';
+ {
+ // collect the letters of the control word; the buffer is appended
+ // to aToken whenever it is full and once more at the end
+ String aStrBuffer;
+ sal_Unicode* pStr = aStrBuffer.AllocBuffer(
+ MAX_TOKEN_LEN );
+ int nStrLen = 0;
+ do {
+ *(pStr + nStrLen++) = ByteString::
+ ConvertToUnicode( nNextCh,
+ GetSrcEncoding() );
+ if( MAX_TOKEN_LEN == nStrLen )
+ {
+ aToken += aStrBuffer;
+ aToken.GetBufferAccess(); // make unique string!
+ nStrLen = 0;
+ }
+ nNextCh = GetNextChar();
+ } while( RTF_ISALPHA( nNextCh ) );
+ if( nStrLen )
+ {
+ aStrBuffer.ReleaseBufferAccess( nStrLen );
+ aToken += aStrBuffer;
+ }
+ }
+
+ // minus sign of a numeric parameter
+ int bNegValue = FALSE;
+ if( '-' == nNextCh )
+ {
+ bNegValue = TRUE;
+ nNextCh = GetNextChar();
+ }
+
+ // optional numeric parameter
+ if( RTF_ISDIGIT( nNextCh ) )
+ {
+ nTokenValue = 0;
+ do {
+ nTokenValue *= 10;
+ nTokenValue += nNextCh - '0';
+ nNextCh = GetNextChar();
+ } while( RTF_ISDIGIT( nNextCh ) );
+ if( bNegValue )
+ nTokenValue = -nTokenValue;
+ }
+ else if( bNegValue ) // no digits followed: put the minus back
+ {
+ nNextCh = '-';
+ rInput.SeekRel( -1 );
+ }
+ if( ' ' == nNextCh ) // a blank belongs to the token!
+ nNextCh = GetNextChar();
+
+ // look the token up in the table:
+ if( 0 == (nRet = GetRTFToken( aToken )) )
+ // Unknown Control
+ nRet = RTF_UNKNOWNCONTROL;
+
+ // bug 76812 - unicode token handled as normal text
+ if( !bRTF_InTextRead && RTF_U == nRet )
+ {
+ nRet = RTF_TEXTTOKEN;
+ aToken = (sal_Unicode)nTokenValue;
+
+ // also skip the next character
+ // JP 10.12.98: this may also be a \{, \}, \'88!
+ sal_Unicode cAnsi = nNextCh;
+ while( 0xD == cAnsi )
+ cAnsi = GetNextChar();
+ while( 0xA == cAnsi )
+ cAnsi = GetNextChar();
+
+ if( '\\' == cAnsi &&
+ '\'' == ( cAnsi = GetNextChar() ))
+ // skip the hex value
+ cAnsi = GetHexValue();
+
+ // the next char must be read
+ bNextCh = TRUE;
+ break;
+ }
+ }
+ else if( SVPAR_PENDING != eState )
+ {
+ // Bug 34631 - skip "\ " - blank as character
+ // eState = SVPAR_ERROR;
+ }
+
+ // nNextCh already holds the character after the token
+ bNextCh = FALSE;
+ break;
+ }
+ }
+ break;
+
+ case sal_Unicode(EOF):
+ eState = SVPAR_ACCEPTED;
+ nRet = nNextCh;
+ break;
+
+ case '{':
+ ++nOpenBrakets;
+ nRet = nNextCh;
+ break;
+
+ case '}':
+ --nOpenBrakets;
+ nRet = nNextCh;
+ break;
+
+ case 0x0d:
+ case 0x0a:
+ // bare line ends carry no meaning: nRet stays 0 and the loop goes on
+ break;
+
+ default:
+ // ordinary text follows
+ ScanText();
+ nRet = RTF_TEXTTOKEN;
+ bNextCh = 0 == nNextCh;
+ break;
+ }
+
+ if( bNextCh )
+ nNextCh = GetNextChar();
+
+ } while( !nRet && SVPAR_WORKING == eState );
+ return nRet;
+}
+
+
+// Reads the next two characters from the input and combines them into one
+// value, treating each as a hexadecimal digit (0-9, a-f, A-F); the first
+// character is the high nibble.
+// A character that is not a hex digit contributes 0 for its nibble.
+// On return nNextCh holds the second character that was read.
+sal_Unicode SvRTFParser::GetHexValue()
+{
+    // collect the hex value
+    sal_Unicode nHexVal = 0;
+
+    for( int n = 0; n < 2; ++n )
+    {
+        nHexVal *= 16;
+        nNextCh = GetNextChar();
+        if( nNextCh >= '0' && nNextCh <= '9' )
+            nHexVal += (nNextCh - '0');
+        else if( nNextCh >= 'a' && nNextCh <= 'f' )
+            nHexVal += (nNextCh - 'a' + 10);
+        else if( nNextCh >= 'A' && nNextCh <= 'F' )
+            nHexVal += (nNextCh - 'A' + 10);
+    }
+    return nHexVal;
+}
+
+// Appends a run of text to aToken, starting with the current character
+// nNextCh.
+//
+// Characters are gathered in a local buffer of MAX_STRING_LEN characters that
+// is appended to aToken whenever it is full and once more at the end.
+// Handled inside the text: \'hh hex values, the escaped characters
+// \\ \{ \} \+, the symbols \~ \- \_ and unicode characters written as \u
+// followed by a number. Bare CR/LF characters are dropped.
+//
+// Scanning stops
+//  - at an unescaped '{' or '}',
+//  - at any other backslash sequence; nNextCh is then reset to '\\' and the
+//    stream is seeked back so that the sequence can be read again,
+//  - at a character equal to cBreak,
+//  - when aToken has reached STRING_MAXLEN - MAX_STRING_LEN characters,
+//  - at EOF: seen at the top of the loop it also sets eState to SVPAR_ERROR;
+//    hit while copying ordinary characters the buffer is flushed and the
+//    function returns without changing eState,
+//  - when IsParserWorking() no longer holds.
+void SvRTFParser::ScanText( const sal_Unicode cBreak )
+{
+ String aStrBuffer;
+ sal_Unicode* pStr = aStrBuffer.AllocBuffer( MAX_STRING_LEN );
+ int nStrLen = 0;
+ // bWeiter ("continue"): keep scanning while TRUE
+ int bWeiter = TRUE;
+ while( bWeiter && IsParserWorking() )
+ {
+ // TRUE: fetch a fresh character at the end of this iteration
+ int bNextCh = TRUE;
+ switch( nNextCh )
+ {
+ case '\\':
+ {
+ switch( nNextCh = GetNextChar() )
+ {
+ case '\'':
+ *(pStr + nStrLen++) = ByteString::ConvertToUnicode(
+ GetHexValue(), GetSrcEncoding() );
+ break;
+
+ case '\\':
+ case '}':
+ case '{':
+ case '+': // found this in an RTF file
+ *(pStr + nStrLen++) = (sal_Char)nNextCh;
+ break;
+ case '~': // nonbreaking space
+ *(pStr + nStrLen++) = 0xA0;
+ break;
+ case '-': // optional hyphen
+ *(pStr + nStrLen++) = 0xAD;
+ break;
+ case '_': // nonbreaking hyphen
+ *(pStr + nStrLen++) = 0x2011;
+ break;
+
+ case 'u':
+ // read a unicode character
+ {
+ // peek at the character after the 'u', then seek back so that the
+ // stream is positioned at the 'u' again
+ nNextCh = GetNextChar();
+ rInput.SeekRel( -2 );
+
+ if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
+ {
+ // the flag keeps _GetNextToken() from converting \u to a text token
+ bRTF_InTextRead = TRUE;
+
+ // _GetNextToken() overwrites aToken, so save and restore it
+ String sSave( aToken );
+ nNextCh = '\\';
+ int nToken = _GetNextToken();
+ DBG_ASSERT( RTF_U == nToken, "doch kein UNI-Code Zeichen" );
+ // dont convert symbol chars
+ *(pStr + nStrLen++) = (sal_Unicode)nTokenValue;
+
+ // also skip the next character
+ // JP 10.12.98: this may also be a \{, \}, \'88!
+ sal_Unicode cAnsi = nNextCh;
+ while( 0xD == cAnsi )
+ cAnsi = GetNextChar();
+ while( 0xA == cAnsi )
+ cAnsi = GetNextChar();
+
+ if( '\\' == cAnsi &&
+ '\'' == ( cAnsi = GetNextChar() ))
+ // skip the hex value
+ cAnsi = GetHexValue();
+
+ aToken = sSave;
+ bRTF_InTextRead = FALSE;
+ }
+ else
+ {
+ nNextCh = '\\';
+ bWeiter = FALSE; // stop, the string is complete
+ }
+ }
+ break;
+
+ default:
+ // some other control sequence: step back so that it is read again
+ rInput.SeekRel( -1 );
+ nNextCh = '\\';
+ bWeiter = FALSE; // stop, the string is complete
+ break;
+ }
+ }
+ break;
+
+ case sal_Unicode(EOF):
+ eState = SVPAR_ERROR;
+ // fall through
+ case '{':
+ case '}':
+ bWeiter = FALSE;
+ break;
+
+ case 0x0a:
+ case 0x0d:
+ break;
+
+ default:
+ if( nNextCh == cBreak ||
+ aToken.Len() >= ( STRING_MAXLEN - MAX_STRING_LEN ))
+ bWeiter = FALSE;
+ else
+ {
+ // inner loop: copy the current character plus any directly
+ // following letters and digits
+ do {
+ // all other characters go into the text
+ *(pStr + nStrLen++) = ByteString::ConvertToUnicode(
+ nNextCh, GetSrcEncoding() );
+
+ if( MAX_STRING_LEN == nStrLen )
+ {
+ aToken += aStrBuffer;
+ aToken.GetBufferAccess(); // make unique string!
+ nStrLen = 0;
+ if( aToken.Len() >=
+ ( STRING_MAXLEN - MAX_STRING_LEN ))
+ {
+ nNextCh = GetNextChar();
+ return;
+ }
+ }
+ if( sal_Unicode(EOF) == (nNextCh = GetNextChar() ))
+ {
+ if( nStrLen )
+ {
+ aStrBuffer.ReleaseBufferAccess( nStrLen );
+ aToken += aStrBuffer;
+ }
+ return;
+ }
+ } while( RTF_ISALPHA( nNextCh ) || RTF_ISDIGIT( nNextCh ) );
+ // nNextCh already holds the next unprocessed character
+ bNextCh = FALSE;
+ }
+ }
+
+ // buffer full: append it to aToken and start filling it again
+ if( MAX_STRING_LEN == nStrLen )
+ {
+ aToken += aStrBuffer;
+ aToken.GetBufferAccess(); // make unique string!
+ nStrLen = 0;
+ }
+
+ if( bWeiter && bNextCh )
+ nNextCh = GetNextChar();
+ }
+ // append whatever is left in the buffer
+ if( nStrLen )
+ {
+ aStrBuffer.ReleaseBufferAccess( nStrLen );
+ aToken += aStrBuffer;
+ }
+}
+
+
+// Skips input until the '}' that closes the current group has been reached.
+// The nesting depth starts at 1, i.e. the caller is expected to be inside a
+// group already. A '{' or '}' directly preceded by a backslash is not
+// counted; a doubled backslash does not escape the character after it.
+// When the closing brace is found the function returns with nNextCh still
+// holding that '}'. If the loop ends any other way and the parser is not in
+// state SVPAR_PENDING, eState is set to SVPAR_ERROR.
+void SvRTFParser::SkipGroup()
+{
+    short nDepth = 1;
+    sal_Unicode cLast = 0;
+    do {
+        const int bEscaped = ( '\\' == cLast );
+
+        if( '{' == nNextCh )
+        {
+            if( !bEscaped )
+                ++nDepth;
+        }
+        else if( '}' == nNextCh )
+        {
+            if( !bEscaped )
+            {
+                --nDepth;
+                if( 0 == nDepth )
+                    return;
+            }
+        }
+        else if( '\\' == nNextCh && bEscaped )
+        {
+            // "\\" is a complete escape: forget the second backslash so that
+            // it cannot escape the following character
+            nNextCh = 0;
+        }
+
+        cLast = nNextCh;
+        nNextCh = GetNextChar();
+    } while( sal_Unicode(EOF) != nNextCh && IsParserWorking() );
+
+    if( SVPAR_PENDING != eState && '}' != nNextCh )
+        eState = SVPAR_ERROR;
+}
+
+// Unknown data is not interpreted here: the rest of the current group is
+// skipped via SkipGroup().
+void SvRTFParser::ReadUnknownData()
+{
+    SkipGroup();
+}
+// Bitmap data is not interpreted here: the rest of the current group is
+// skipped via SkipGroup().
+void SvRTFParser::ReadBitmapData()
+{
+    SkipGroup();
+}
+// OLE data is not interpreted here: the rest of the current group is
+// skipped via SkipGroup().
+void SvRTFParser::ReadOLEData()
+{
+    SkipGroup();
+}
+
+
+// Starts parsing the input stream and returns the resulting parser state.
+//
+// The first byte is read directly from rInput, the state is reset to
+// SVPAR_WORKING, the bracket counter to 0 and both encodings to
+// RTL_TEXTENCODING_MS_1252. The input is only accepted as RTF if its first
+// two tokens are '{' and RTF_RTF; otherwise SVPAR_ERROR is returned.
+// While Continue() runs the parser holds a reference on itself; that
+// reference is kept if parsing ends in state SVPAR_PENDING.
+SvParserState SvRTFParser::CallParser()
+{
+    // Zero-initialised so that nNextCh receives a defined value even if the
+    // stream fails to deliver a first byte (e.g. empty input); a 0 character
+    // can never satisfy the '{' check below, so such input ends in SVPAR_ERROR.
+    sal_Char cFirstCh = 0;
+    rInput >> cFirstCh;
+    nNextCh = cFirstCh;
+    eState = SVPAR_WORKING;
+    nOpenBrakets = 0;
+    SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
+    eUNICodeSet = RTL_TEXTENCODING_MS_1252;     // ANSI code set is the default
+
+    // the first two tokens must be '{' and \rtf !!
+    if( '{' == GetNextToken() && RTF_RTF == GetNextToken() )
+    {
+        AddRef();
+        Continue( 0 );
+        if( SVPAR_PENDING != eState )
+            ReleaseRef();       // then the parser is no longer needed!
+    }
+    else
+        eState = SVPAR_ERROR;
+
+    return eState;
+}
+
+// Main token loop of the parser.
+//
+// nToken is the token to start with; if it is 0 the first token is fetched
+// with GetNextToken(). As long as IsParserWorking() holds, each token is
+// handled as follows:
+//  - '}': passed to NextToken() while nOpenBrakets is non-zero; otherwise
+//    eState becomes SVPAR_ACCEPTED.
+//  - '{': the following tokens are inspected. If the group starts with
+//    RTF_IGNOREFLAG followed by RTF_UNKNOWNCONTROL it is skipped through
+//    ReadUnknownData() and must be followed by '}' (else SVPAR_ERROR). In all
+//    other cases SkipToken() is called with -1 or -2 and its result is passed
+//    to NextToken().
+//    NOTE(review): SkipToken() presumably steps back to the '{' token - confirm
+//    against SvParser.
+//  - RTF_UNKNOWNCONTROL: ignored.
+//  - RTF_NEXTTYPE / RTF_ANSITYPE / RTF_MACTYPE / RTF_PCTYPE / RTF_PCATYPE:
+//    select the source encoding.
+//  - anything else: passed to NextToken().
+// After the loop an accepted document that still has open brackets is turned
+// into SVPAR_ERROR.
+void SvRTFParser::Continue( int nToken )
+{
+// DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
+// "Zeichensatz wurde geaendert." );
+
+ if( !nToken )
+ nToken = GetNextToken();
+
+ while( IsParserWorking() )
+ {
+ SaveState( nToken );
+ switch( nToken )
+ {
+ case '}':
+ if( nOpenBrakets )
+ goto NEXTTOKEN;
+ eState = SVPAR_ACCEPTED;
+ break;
+
+ case '{':
+ // an unknown group?
+ {
+ if( RTF_IGNOREFLAG != GetNextToken() )
+ nToken = SkipToken( -1 );
+ else if( RTF_UNKNOWNCONTROL != GetNextToken() )
+ nToken = SkipToken( -2 );
+ else
+ {
+ // filter it out right away
+ ReadUnknownData();
+ nToken = GetNextToken();
+ if( '}' != nToken )
+ eState = SVPAR_ERROR;
+ break; // on to the next token!!
+ }
+ }
+ goto NEXTTOKEN;
+
+ case RTF_UNKNOWNCONTROL:
+ break; // skip unknown tokens
+
+ case RTF_NEXTTYPE:
+ case RTF_ANSITYPE: SetSrcEncoding( RTL_TEXTENCODING_MS_1252 ); break;
+ case RTF_MACTYPE: SetSrcEncoding( RTL_TEXTENCODING_APPLE_ROMAN ); break;
+ case RTF_PCTYPE: SetSrcEncoding( RTL_TEXTENCODING_IBM_437 ); break;
+ case RTF_PCATYPE: SetSrcEncoding( RTL_TEXTENCODING_IBM_850 ); break;
+
+/*
+ case RTF_ANSICPG:
+ switch( nTokenValue )
+ {
+??? case 1252: SetUNICodeSet( CHARSET_ANSI ); break;
+ }
+ break;
+*/
+ default:
+NEXTTOKEN:
+ // also reached by goto from the '}' and '{' cases above
+ NextToken( nToken );
+ break;
+ }
+ if( IsParserWorking() )
+ SaveState( 0 ); // processed up to here,
+ // continue with a new token!
+ nToken = GetNextToken();
+ }
+ if( SVPAR_ACCEPTED == eState && 0 < nOpenBrakets )
+ eState = SVPAR_ERROR;
+}
+
+
+#ifdef USED
+// Only compiled when USED is defined. Both functions simply forward to the
+// implementation in the SvParser base class.
+void SvRTFParser::SaveState( int nToken ) { SvParser::SaveState( nToken ); }
+
+void SvRTFParser::RestoreState() { SvParser::RestoreState(); }
+#endif
+
+