diff options
Diffstat (limited to 'svtools/source/svrtf/parrtf.cxx')
-rw-r--r-- | svtools/source/svrtf/parrtf.cxx | 591 |
1 files changed, 591 insertions, 0 deletions
diff --git a/svtools/source/svrtf/parrtf.cxx b/svtools/source/svrtf/parrtf.cxx new file mode 100644 index 000000000000..c5da72a9cbf5 --- /dev/null +++ b/svtools/source/svrtf/parrtf.cxx @@ -0,0 +1,591 @@ +/************************************************************************* + * + * $RCSfile: parrtf.cxx,v $ + * + * $Revision: 1.1.1.1 $ + * + * last change: $Author: hr $ $Date: 2000-09-18 16:59:05 $ + * + * The Contents of this file are made available subject to the terms of + * either of the following licenses + * + * - GNU Lesser General Public License Version 2.1 + * - Sun Industry Standards Source License Version 1.1 + * + * Sun Microsystems Inc., October, 2000 + * + * GNU Lesser General Public License Version 2.1 + * ============================================= + * Copyright 2000 by Sun Microsystems, Inc. + * 901 San Antonio Road, Palo Alto, CA 94303, USA + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + * + * Sun Industry Standards Source License Version 1.1 + * ================================================= + * The contents of this file are subject to the Sun Industry Standards + * Source License Version 1.1 (the "License"); You may not use this file + * except in compliance with the License. You may obtain a copy of the + * License at http://www.openoffice.org/license.html. + * + * Software provided under this License is provided on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, + * WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS, + * MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING. + * See the License for the specific provisions governing your rights and + * obligations concerning the Software. + * + * The Initial Developer of the Original Code is: Sun Microsystems, Inc. + * + * Copyright: 2000 by Sun Microsystems, Inc. + * + * All Rights Reserved. + * + * Contributor(s): _______________________________________ + * + * + ************************************************************************/ + +#include <stdio.h> // for EOF + +#ifndef _STREAM_HXX +#include <tools/stream.hxx> +#endif +#ifndef _TOOLS_DEBUG_HXX +#include <tools/debug.hxx> +#endif +#include "rtftoken.h" +#include "rtfkeywd.hxx" +#include "parrtf.hxx" + +const int MAX_STRING_LEN = 1024; +const int MAX_TOKEN_LEN = 128; + +#define RTF_ISDIGIT( c ) (c >= '0' && c <= '9') +#define RTF_ISALPHA( c ) ( (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') ) + +SvRTFParser::SvRTFParser( SvStream& rIn, BYTE nStackSize ) + : SvParser( rIn, nStackSize ), + eUNICodeSet( RTL_TEXTENCODING_MS_1252 ) // default ist ANSI-CodeSet +{ + // default ist ANSI-CodeSet + SetSrcEncoding( RTL_TEXTENCODING_MS_1252 ); + bRTF_InTextRead = FALSE; +} + +SvRTFParser::~SvRTFParser() +{ +} + +int SvRTFParser::_GetNextToken() +{ + int nRet = 0; + do { + int bNextCh = TRUE; + switch( nNextCh ) + { + case '\\': + { + // Steuerzeichen + switch( nNextCh = GetNextChar() ) + { + case '{': + case '}': + case '\\': + aToken = nNextCh; + goto ISCHAR_SCANTEXT; + case '~': // nonbreaking space + aToken = 0xA0; + goto ISCHAR_SCANTEXT; + case '-': // optional hyphen + aToken = 0xAD; + goto ISCHAR_SCANTEXT; + case '_': // nonbreaking hyphen + aToken = 0x2011; + goto ISCHAR_SCANTEXT; + + case '\'': // HexValue + // es folgt normaler Text + aToken = ByteString::ConvertToUnicode( GetHexValue(), + GetSrcEncoding() ); +ISCHAR_SCANTEXT: + nNextCh = GetNextChar(); + ScanText(); + nRet = RTF_TEXTTOKEN; + bNextCh = 0 == nNextCh; + break; + + case '*': // ignoreflag + nRet = RTF_IGNOREFLAG; + break; + case ':': // subentry in an index entry + nRet = RTF_SUBENTRYINDEX; + break; + case '|': // formula-charakter + nRet = RTF_FORMULA; + break; + + case 0x0a: + case 0x0d: + nRet = RTF_PAR; + break; + + default: + if( RTF_ISALPHA( nNextCh ) ) + { + aToken = '\\'; + { + String aStrBuffer; + sal_Unicode* pStr = aStrBuffer.AllocBuffer( + MAX_TOKEN_LEN ); + int nStrLen = 0; + do { + *(pStr + nStrLen++) = ByteString:: + ConvertToUnicode( nNextCh, + GetSrcEncoding() ); + if( MAX_TOKEN_LEN == nStrLen ) + { + aToken += aStrBuffer; + aToken.GetBufferAccess(); // make unique string! + nStrLen = 0; + } + nNextCh = GetNextChar(); + } while( RTF_ISALPHA( nNextCh ) ); + if( nStrLen ) + { + aStrBuffer.ReleaseBufferAccess( nStrLen ); + aToken += aStrBuffer; + } + } + + // Minus fuer numerischen Parameter + int bNegValue = FALSE; + if( '-' == nNextCh ) + { + bNegValue = TRUE; + nNextCh = GetNextChar(); + } + + // evt. Numerischer Parameter + if( RTF_ISDIGIT( nNextCh ) ) + { + nTokenValue = 0; + do { + nTokenValue *= 10; + nTokenValue += nNextCh - '0'; + nNextCh = GetNextChar(); + } while( RTF_ISDIGIT( nNextCh ) ); + if( bNegValue ) + nTokenValue = -nTokenValue; + } + else if( bNegValue ) // das Minus wieder zurueck + { + nNextCh = '-'; + rInput.SeekRel( -1 ); + } + if( ' ' == nNextCh ) // Blank gehoert zum Token! + nNextCh = GetNextChar(); + + // suche das Token in der Tabelle: + if( 0 == (nRet = GetRTFToken( aToken )) ) + // Unknown Control + nRet = RTF_UNKNOWNCONTROL; + + // bug 76812 - unicode token handled as normal text + if( !bRTF_InTextRead && RTF_U == nRet ) + { + nRet = RTF_TEXTTOKEN; + aToken = (sal_Unicode)nTokenValue; + + // das naechste Zeichen noch ueberlesen + // JP 10.12.98: das kann auch ein \{, \}, \'88 sein! + sal_Unicode cAnsi = nNextCh; + while( 0xD == cAnsi ) + cAnsi = GetNextChar(); + while( 0xA == cAnsi ) + cAnsi = GetNextChar(); + + if( '\\' == cAnsi && + '\'' == ( cAnsi = GetNextChar() )) + // HexValue ueberlesen + cAnsi = GetHexValue(); + + // the next char must be read + bNextCh = TRUE; + break; + } + } + else if( SVPAR_PENDING != eState ) + { + // Bug 34631 - "\ " ueberlesen - Blank als Zeichen + // eState = SVPAR_ERROR; + } + + bNextCh = FALSE; + break; + } + } + break; + + case sal_Unicode(EOF): + eState = SVPAR_ACCEPTED; + nRet = nNextCh; + break; + + case '{': + ++nOpenBrakets; + nRet = nNextCh; + break; + + case '}': + --nOpenBrakets; + nRet = nNextCh; + break; + + case 0x0d: + case 0x0a: + break; + + default: + // es folgt normaler Text + ScanText(); + nRet = RTF_TEXTTOKEN; + bNextCh = 0 == nNextCh; + break; + } + + if( bNextCh ) + nNextCh = GetNextChar(); + + } while( !nRet && SVPAR_WORKING == eState ); + return nRet; +} + + +sal_Unicode SvRTFParser::GetHexValue() +{ + // Hex-Wert sammeln + register int n; + register sal_Unicode nHexVal = 0; + + for( n = 0; n < 2; ++n ) + { + nHexVal *= 16; + nNextCh = GetNextChar(); + if( nNextCh >= '0' && nNextCh <= '9' ) + nHexVal += (nNextCh - 48); + else if( nNextCh >= 'a' && nNextCh <= 'f' ) + nHexVal += (nNextCh - 87); + else if( nNextCh >= 'A' && nNextCh <= 'F' ) + nHexVal += (nNextCh - 55); + } + return nHexVal; +} + +void SvRTFParser::ScanText( const sal_Unicode cBreak ) +{ + String aStrBuffer; + sal_Unicode* pStr = aStrBuffer.AllocBuffer( MAX_STRING_LEN ); + int nStrLen = 0; + int bWeiter = TRUE; + while( bWeiter && IsParserWorking() ) + { + int bNextCh = TRUE; + switch( nNextCh ) + { + case '\\': + { + switch( nNextCh = GetNextChar() ) + { + case '\'': + *(pStr + nStrLen++) = ByteString::ConvertToUnicode( + GetHexValue(), GetSrcEncoding() ); + break; + + case '\\': + case '}': + case '{': + case '+': // habe ich in einem RTF-File gefunden + *(pStr + nStrLen++) = (sal_Char)nNextCh; + break; + case '~': // nonbreaking space + *(pStr + nStrLen++) = 0xA0; + break; + case '-': // optional hyphen + *(pStr + nStrLen++) = 0xAD; + break; + case '_': // nonbreaking hyphen + *(pStr + nStrLen++) = 0x2011; + break; + + case 'u': + // UNI-Code Zeichen lesen + { + nNextCh = GetNextChar(); + rInput.SeekRel( -2 ); + + if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) ) + { + bRTF_InTextRead = TRUE; + + String sSave( aToken ); + nNextCh = '\\'; + int nToken = _GetNextToken(); + DBG_ASSERT( RTF_U == nToken, "doch kein UNI-Code Zeichen" ); + // dont convert symbol chars + *(pStr + nStrLen++) = (sal_Unicode)nTokenValue; + + // das naechste Zeichen noch ueberlesen + // JP 10.12.98: das kann auch ein \{, \}, \'88 sein! + sal_Unicode cAnsi = nNextCh; + while( 0xD == cAnsi ) + cAnsi = GetNextChar(); + while( 0xA == cAnsi ) + cAnsi = GetNextChar(); + + if( '\\' == cAnsi && + '\'' == ( cAnsi = GetNextChar() )) + // HexValue ueberlesen + cAnsi = GetHexValue(); + + aToken = sSave; + bRTF_InTextRead = FALSE; + } + else + { + nNextCh = '\\'; + bWeiter = FALSE; // Abbrechen, String zusammen + } + } + break; + + default: + rInput.SeekRel( -1 ); + nNextCh = '\\'; + bWeiter = FALSE; // Abbrechen, String zusammen + break; + } + } + break; + + case sal_Unicode(EOF): + eState = SVPAR_ERROR; + // weiter + case '{': + case '}': + bWeiter = FALSE; + break; + + case 0x0a: + case 0x0d: + break; + + default: + if( nNextCh == cBreak || + aToken.Len() >= ( STRING_MAXLEN - MAX_STRING_LEN )) + bWeiter = FALSE; + else + { + do { + // alle anderen Zeichen kommen in den Text + *(pStr + nStrLen++) = ByteString::ConvertToUnicode( + nNextCh, GetSrcEncoding() ); + + if( MAX_STRING_LEN == nStrLen ) + { + aToken += aStrBuffer; + aToken.GetBufferAccess(); // make unique string! + nStrLen = 0; + if( aToken.Len() >= + ( STRING_MAXLEN - MAX_STRING_LEN )) + { + nNextCh = GetNextChar(); + return; + } + } + if( sal_Unicode(EOF) == (nNextCh = GetNextChar() )) + { + if( nStrLen ) + { + aStrBuffer.ReleaseBufferAccess( nStrLen ); + aToken += aStrBuffer; + } + return; + } + } while( RTF_ISALPHA( nNextCh ) || RTF_ISDIGIT( nNextCh ) ); + bNextCh = FALSE; + } + } + + if( MAX_STRING_LEN == nStrLen ) + { + aToken += aStrBuffer; + aToken.GetBufferAccess(); // make unique string! + nStrLen = 0; + } + + if( bWeiter && bNextCh ) + nNextCh = GetNextChar(); + } + if( nStrLen ) + { + aStrBuffer.ReleaseBufferAccess( nStrLen ); + aToken += aStrBuffer; + } +} + + +void SvRTFParser::SkipGroup() +{ + short nBrackets = 1; + sal_Unicode cPrev = 0; + do { + switch( nNextCh ) + { + case '{': + if( '\\' != cPrev ) + ++nBrackets; + break; + + case '}': + if( '\\' != cPrev && !--nBrackets ) + return; + break; + + case '\\': + if( '\\' == cPrev ) + nNextCh = 0; + break; + } + cPrev = nNextCh; + nNextCh = GetNextChar(); + } while( sal_Unicode(EOF) != nNextCh && IsParserWorking() ); + + if( SVPAR_PENDING != eState && '}' != nNextCh ) + eState = SVPAR_ERROR; +} + +void SvRTFParser::ReadUnknownData() { SkipGroup(); } +void SvRTFParser::ReadBitmapData() { SkipGroup(); } +void SvRTFParser::ReadOLEData() { SkipGroup(); } + + +SvParserState SvRTFParser::CallParser() +{ + sal_Char cFirstCh; + rInput >> cFirstCh; nNextCh = cFirstCh; + eState = SVPAR_WORKING; + nOpenBrakets = 0; + SetSrcEncoding( RTL_TEXTENCODING_MS_1252 ); + eUNICodeSet = RTL_TEXTENCODING_MS_1252; // default ist ANSI-CodeSet + + // die 1. beiden Token muessen '{' und \\rtf sein !! + if( '{' == GetNextToken() && RTF_RTF == GetNextToken() ) + { + AddRef(); + Continue( 0 ); + if( SVPAR_PENDING != eState ) + ReleaseRef(); // dann brauchen wir den Parser nicht mehr! + } + else + eState = SVPAR_ERROR; + + return eState; +} + +void SvRTFParser::Continue( int nToken ) +{ +// DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(), +// "Zeichensatz wurde geaendert." ); + + if( !nToken ) + nToken = GetNextToken(); + + while( IsParserWorking() ) + { + SaveState( nToken ); + switch( nToken ) + { + case '}': + if( nOpenBrakets ) + goto NEXTTOKEN; + eState = SVPAR_ACCEPTED; + break; + + case '{': + // eine unbekannte Gruppe ? + { + if( RTF_IGNOREFLAG != GetNextToken() ) + nToken = SkipToken( -1 ); + else if( RTF_UNKNOWNCONTROL != GetNextToken() ) + nToken = SkipToken( -2 ); + else + { + // gleich herausfiltern + ReadUnknownData(); + nToken = GetNextToken(); + if( '}' != nToken ) + eState = SVPAR_ERROR; + break; // auf zum naechsten Token!! + } + } + goto NEXTTOKEN; + + case RTF_UNKNOWNCONTROL: + break; // unbekannte Token ueberspringen + + case RTF_NEXTTYPE: + case RTF_ANSITYPE: SetSrcEncoding( RTL_TEXTENCODING_MS_1252 ); break; + case RTF_MACTYPE: SetSrcEncoding( RTL_TEXTENCODING_APPLE_ROMAN ); break; + case RTF_PCTYPE: SetSrcEncoding( RTL_TEXTENCODING_IBM_437 ); break; + case RTF_PCATYPE: SetSrcEncoding( RTL_TEXTENCODING_IBM_850 ); break; + +/* + case RTF_ANSICPG: + switch( nTokenValue ) + { +??? case 1252: SetUNICodeSet( CHARSET_ANSI ); break; + } + break; +*/ + default: +NEXTTOKEN: + NextToken( nToken ); + break; + } + if( IsParserWorking() ) + SaveState( 0 ); // bis hierhin abgearbeitet, + // weiter mit neuem Token! + nToken = GetNextToken(); + } + if( SVPAR_ACCEPTED == eState && 0 < nOpenBrakets ) + eState = SVPAR_ERROR; +} + + +#ifdef USED +void SvRTFParser::SaveState( int nToken ) +{ + SvParser::SaveState( nToken ); +} + +void SvRTFParser::RestoreState() +{ + SvParser::RestoreState(); +} +#endif + + |