summaryrefslogtreecommitdiff
path: root/i18npool/source/characterclassification
diff options
context:
space:
mode:
Diffstat (limited to 'i18npool/source/characterclassification')
-rw-r--r--i18npool/source/characterclassification/cclass_unicode.cxx288
-rw-r--r--i18npool/source/characterclassification/cclass_unicode_parser.cxx1067
-rw-r--r--i18npool/source/characterclassification/characterclassificationImpl.cxx236
-rw-r--r--i18npool/source/characterclassification/makefile.mk52
-rw-r--r--i18npool/source/characterclassification/scripttypedetector.cxx182
5 files changed, 1825 insertions, 0 deletions
diff --git a/i18npool/source/characterclassification/cclass_unicode.cxx b/i18npool/source/characterclassification/cclass_unicode.cxx
new file mode 100644
index 000000000000..045512e61258
--- /dev/null
+++ b/i18npool/source/characterclassification/cclass_unicode.cxx
@@ -0,0 +1,288 @@
+/*************************************************************************
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * Copyright 2000, 2010 Oracle and/or its affiliates.
+ *
+ * OpenOffice.org - a multi-platform office productivity suite
+ *
+ * This file is part of OpenOffice.org.
+ *
+ * OpenOffice.org is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 3
+ * only, as published by the Free Software Foundation.
+ *
+ * OpenOffice.org is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License version 3 for more details
+ * (a copy is included in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * version 3 along with OpenOffice.org. If not, see
+ * <http://www.openoffice.org/license.html>
+ * for a copy of the LGPLv3 License.
+ *
+ ************************************************************************/
+
+// MARKER(update_precomp.py): autogen include statement, do not remove
+#include "precompiled_i18npool.hxx"
+
+#include <cclass_unicode.hxx>
+#include <com/sun/star/i18n/UnicodeScript.hpp>
+#include <com/sun/star/i18n/UnicodeType.hpp>
+#include <com/sun/star/i18n/KCharacterType.hpp>
+#include <unicode/uchar.h>
+#include <i18nutil/x_rtl_ustring.h>
+#include <breakiteratorImpl.hxx>
+
+using namespace ::com::sun::star::uno;
+using namespace ::com::sun::star::lang;
+using namespace ::rtl;
+
+namespace com { namespace sun { namespace star { namespace i18n {
+// ----------------------------------------------------
+// class cclass_Unicode
+// ----------------------------------------------------;
+
+// Creates the classifier with an empty parser state: no parser tables are
+// allocated yet, no start/continuation token types are selected, and ',' and
+// '.' serve as group and decimal separators until locale data replaces them.
+cclass_Unicode::cclass_Unicode( uno::Reference < XMultiServiceFactory > xSMgr )
+    : xMSF( xSMgr )
+    , pTable( NULL )
+    , pStart( NULL )
+    , pCont( NULL )
+    , nStartTypes( 0 )
+    , nContTypes( 0 )
+    , eState( ssGetChar )
+    , cGroupSep( ',' )
+    , cDecimalSep( '.' )
+{
+    // Name handed out by getImplementationName() and compared in supportsService().
+    cClass = "com.sun.star.i18n.CharacterClassification_Unicode";
+    // Case-mapping helper used by toUpper()/toLower()/toTitle(); freed in the destructor.
+    trans = new Transliteration_casemapping();
+}
+
+// Releases the case-mapping helper and the parser tables owned by this object.
+cclass_Unicode::~cclass_Unicode()
+{
+    delete trans;
+    destroyParserTable();
+}
+
+
+// Returns the upper-case mapping of the nCount UTF-16 units of Text that start
+// at nPos, using the case-mapping rules selected for rLocale.
+// An nPos outside the string yields an empty string; nCount is clamped to the
+// number of units remaining after nPos.
+OUString SAL_CALL
+cclass_Unicode::toUpper( const OUString& Text, sal_Int32 nPos, sal_Int32 nCount, const Locale& rLocale ) throw(RuntimeException) {
+    const sal_Int32 len = Text.getLength();
+    // Reject negative positions as well, like getType()/getScript() do;
+    // otherwise an out-of-range offset is handed to the transliteration.
+    if (nPos < 0 || nPos >= len)
+        return OUString();
+    // Compare against the remaining length instead of computing nCount + nPos,
+    // which can overflow for very large counts.
+    if (nCount > len - nPos)
+        nCount = len - nPos;
+
+    trans->setMappingType(MappingTypeToUpper, rLocale);
+    return trans->transliterateString2String(Text, nPos, nCount);
+}
+
+// Returns the lower-case mapping of the nCount UTF-16 units of Text that start
+// at nPos, using the case-mapping rules selected for rLocale.
+// An nPos outside the string yields an empty string; nCount is clamped to the
+// number of units remaining after nPos.
+OUString SAL_CALL
+cclass_Unicode::toLower( const OUString& Text, sal_Int32 nPos, sal_Int32 nCount, const Locale& rLocale ) throw(RuntimeException) {
+    const sal_Int32 len = Text.getLength();
+    // Reject negative positions as well, like getType()/getScript() do;
+    // otherwise an out-of-range offset is handed to the transliteration.
+    if (nPos < 0 || nPos >= len)
+        return OUString();
+    // Compare against the remaining length instead of computing nCount + nPos,
+    // which can overflow for very large counts.
+    if (nCount > len - nPos)
+        nCount = len - nPos;
+
+    trans->setMappingType(MappingTypeToLower, rLocale);
+    return trans->transliterateString2String(Text, nPos, nCount);
+}
+
+// Returns a copy of the nCount UTF-16 units of Text starting at nPos in which
+// the first unit of every word (as reported by the break iterator for
+// rLocale) is replaced by its title-case mapping; all other units are copied
+// unchanged.
+// An nPos outside the string or a negative nCount yields an empty string;
+// nCount is clamped to the number of units remaining after nPos.
+OUString SAL_CALL
+cclass_Unicode::toTitle( const OUString& Text, sal_Int32 nPos, sal_Int32 nCount, const Locale& rLocale ) throw(RuntimeException) {
+    const sal_Int32 len = Text.getLength();
+    // A negative nPos would index before the buffer, and a negative nCount
+    // would be passed on as the length of the result buffer below.
+    if (nPos < 0 || nPos >= len || nCount < 0)
+        return OUString();
+    // Compare against the remaining length instead of computing nCount + nPos,
+    // which can overflow for very large counts.
+    if (nCount > len - nPos)
+        nCount = len - nPos;
+    const sal_Int32 nEnd = nPos + nCount;   // cannot overflow after the clamp
+
+    trans->setMappingType(MappingTypeToTitle, rLocale);
+    rtl_uString* pStr = x_rtl_uString_new_WithLength( nCount, 1 );
+    sal_Unicode* out = pStr->buffer;
+    BreakIteratorImpl brk(xMSF);
+    Boundary bdy = brk.getWordBoundary(Text, nPos, rLocale,
+                WordType::ANYWORD_IGNOREWHITESPACES, sal_True);
+    for (sal_Int32 i = nPos; i < nEnd; i++, out++) {
+        // Current word exhausted: move on to the next one.
+        if (i >= bdy.endPos)
+            bdy = brk.nextWord(Text, bdy.endPos, rLocale,
+                        WordType::ANYWORD_IGNOREWHITESPACES);
+        // Only the unit at the start of a word is case-mapped.
+        *out = (i == bdy.startPos) ?
+            trans->transliterateChar2Char(Text[i]) : Text[i];
+    }
+    *out = 0;
+    // The OUString takes over the reference held by pStr.
+    return OUString( pStr, SAL_NO_ACQUIRE );
+}
+
+// Returns the ICU general category (u_charType) of the code point at nPos,
+// or 0 when nPos lies outside Text.
+sal_Int16 SAL_CALL
+cclass_Unicode::getType( const OUString& Text, sal_Int32 nPos ) throw(RuntimeException) {
+    const bool bInRange = (nPos >= 0) && (nPos < Text.getLength());
+    if ( !bInRange )
+        return 0;
+    // Zero increment: read the code point without moving nPos.
+    const sal_uInt32 nCodePoint = Text.iterateCodePoints(&nPos, 0);
+    return (sal_Int16) u_charType(nCodePoint);
+}
+
+// Returns the ICU bidirectional category (u_charDirection) of the code point
+// at nPos, or 0 when nPos lies outside Text.
+sal_Int16 SAL_CALL
+cclass_Unicode::getCharacterDirection( const OUString& Text, sal_Int32 nPos ) throw(RuntimeException) {
+    const bool bInRange = (nPos >= 0) && (nPos < Text.getLength());
+    if ( !bInRange )
+        return 0;
+    // Zero increment: read the code point without moving nPos.
+    const sal_uInt32 nCodePoint = Text.iterateCodePoints(&nPos, 0);
+    return (sal_Int16) u_charDirection(nCodePoint);
+}
+
+
+// Returns the Unicode block of the code point at nPos as an OO.o
+// UnicodeScript value, or 0 when nPos lies outside Text.
+sal_Int16 SAL_CALL
+cclass_Unicode::getScript( const OUString& Text, sal_Int32 nPos ) throw(RuntimeException) {
+    const bool bInRange = (nPos >= 0) && (nPos < Text.getLength());
+    if ( !bInRange )
+        return 0;
+    // ICU's UBlockCode starts at 1 for Basic Latin, while the OO.o enum
+    // UnicodeScript starts at 0, so the ICU value is shifted down by one.
+    const sal_uInt32 nCodePoint = Text.iterateCodePoints(&nPos, 0);
+    const sal_Int16 nBlock = (sal_Int16) ublock_getCode(nCodePoint);
+    return nBlock - 1;
+}
+
+
+// Classifies one code point of Text as a bit set of KCharacterType flags.
+// The function first calls Text.iterateCodePoints(nPos, increment); when
+// increment is positive the value that gets classified is re-read at the
+// updated *nPos with a zero increment. Categories not listed below yield
+// U_GENERAL_OTHER_TYPES.
+sal_Int32 SAL_CALL
+cclass_Unicode::getCharType( const OUString& Text, sal_Int32* nPos, sal_Int32 increment) {
+    using namespace ::com::sun::star::i18n::KCharacterType;
+
+    sal_uInt32 nCodePoint = Text.iterateCodePoints(nPos, increment);
+    if (increment > 0)
+        nCodePoint = Text.iterateCodePoints(nPos, 0);
+
+    sal_Int32 nFlags;
+    switch ( u_charType(nCodePoint) ) {
+        // cased letters
+        case U_UPPERCASE_LETTER :
+            nFlags = UPPER|LETTER|PRINTABLE|BASE_FORM;
+            break;
+        case U_LOWERCASE_LETTER :
+            nFlags = LOWER|LETTER|PRINTABLE|BASE_FORM;
+            break;
+        case U_TITLECASE_LETTER :
+            nFlags = TITLE_CASE|LETTER|PRINTABLE|BASE_FORM;
+            break;
+
+        // letters without case
+        case U_MODIFIER_LETTER :
+        case U_OTHER_LETTER :
+            nFlags = LETTER|PRINTABLE|BASE_FORM;
+            break;
+
+        // all number categories count as digits
+        case U_DECIMAL_DIGIT_NUMBER:
+        case U_LETTER_NUMBER:
+        case U_OTHER_NUMBER:
+            nFlags = DIGIT|PRINTABLE|BASE_FORM;
+            break;
+
+        // marks
+        case U_NON_SPACING_MARK:
+        case U_ENCLOSING_MARK:
+        case U_COMBINING_SPACING_MARK:
+            nFlags = BASE_FORM|PRINTABLE;
+            break;
+
+        // space, punctuation and symbols are merely printable
+        case U_SPACE_SEPARATOR:
+        case U_DASH_PUNCTUATION:
+        case U_INITIAL_PUNCTUATION:
+        case U_FINAL_PUNCTUATION:
+        case U_CONNECTOR_PUNCTUATION:
+        case U_OTHER_PUNCTUATION:
+        case U_MATH_SYMBOL:
+        case U_CURRENCY_SYMBOL:
+        case U_MODIFIER_SYMBOL:
+        case U_OTHER_SYMBOL:
+            nFlags = PRINTABLE;
+            break;
+
+        // control and format characters
+        case U_CONTROL_CHAR:
+        case U_FORMAT_CHAR:
+            nFlags = CONTROL;
+            break;
+
+        // line and paragraph separators
+        case U_LINE_SEPARATOR:
+        case U_PARAGRAPH_SEPARATOR:
+            nFlags = CONTROL|PRINTABLE;
+            break;
+
+        // everything else
+        default:
+            nFlags = U_GENERAL_OTHER_TYPES;
+            break;
+    }
+    return nFlags;
+}
+
+// Returns the KCharacterType flags of the code point at nPos, or 0 when nPos
+// lies outside Text. The locale argument is not used.
+sal_Int32 SAL_CALL
+cclass_Unicode::getCharacterType( const OUString& Text, sal_Int32 nPos, const Locale& /*rLocale*/ ) throw(RuntimeException) {
+    const bool bInRange = (nPos >= 0) && (nPos < Text.getLength());
+    if ( !bInRange )
+        return 0;
+    // Zero increment: classify in place.
+    return getCharType(Text, &nPos, 0);
+}
+
+// Returns the union (bitwise OR) of the KCharacterType flags of up to nCount
+// code points of Text, starting with the one at nPos. Returns 0 when nPos
+// lies outside Text. The code point at nPos is always classified, even for
+// nCount <= 1. The locale argument is not used.
+sal_Int32 SAL_CALL
+cclass_Unicode::getStringType( const OUString& Text, sal_Int32 nPos, sal_Int32 nCount, const Locale& /*rLocale*/ ) throw(RuntimeException) {
+    const sal_Int32 nLen = Text.getLength();
+    if ( nPos < 0 || nLen <= nPos ) return 0;
+
+    sal_Int32 result = getCharType(Text, &nPos, 0);
+    for (sal_Int32 i = 1; i < nCount; i++) {
+        // Step over the code point that was just classified.
+        Text.iterateCodePoints(&nPos, 1);
+        // Stop at the end of the text. Classifying after the last code point
+        // would read the string terminator and OR CONTROL into the result.
+        if (nPos >= nLen)
+            break;
+        result |= getCharType(Text, &nPos, 0);
+    }
+    return result;
+}
+
+// Parses one token of any type from Text starting at nPos. The parser table
+// is (re)built on demand for the given locale, start/continuation token types
+// and user-defined characters. When nPos is at or beyond the end of Text a
+// default-constructed ParseResult is returned.
+ParseResult SAL_CALL cclass_Unicode::parseAnyToken(
+            const OUString& Text,
+            sal_Int32 nPos,
+            const Locale& rLocale,
+            sal_Int32 startCharTokenType,
+            const OUString& userDefinedCharactersStart,
+            sal_Int32 contCharTokenType,
+            const OUString& userDefinedCharactersCont )
+                throw(RuntimeException)
+{
+    ParseResult aResult;
+    if ( nPos < Text.getLength() )
+    {
+        setupParserTable( rLocale,
+            startCharTokenType, userDefinedCharactersStart,
+            contCharTokenType, userDefinedCharactersCont );
+        parseText( aResult, Text, nPos );
+    }
+    return aResult;
+}
+
+
+// Parses one token from Text starting at nPos, passing nTokenType on to
+// parseText(). The parser table is (re)built on demand for the given locale,
+// start/continuation token types and user-defined characters. When nPos is at
+// or beyond the end of Text a default-constructed ParseResult is returned.
+ParseResult SAL_CALL cclass_Unicode::parsePredefinedToken(
+            sal_Int32 nTokenType,
+            const OUString& Text,
+            sal_Int32 nPos,
+            const Locale& rLocale,
+            sal_Int32 startCharTokenType,
+            const OUString& userDefinedCharactersStart,
+            sal_Int32 contCharTokenType,
+            const OUString& userDefinedCharactersCont )
+                throw(RuntimeException)
+{
+    ParseResult aResult;
+    if ( nPos < Text.getLength() )
+    {
+        setupParserTable( rLocale,
+            startCharTokenType, userDefinedCharactersStart,
+            contCharTokenType, userDefinedCharactersCont );
+        parseText( aResult, Text, nPos, nTokenType );
+    }
+    return aResult;
+}
+
+// Returns the implementation name stored in cClass by the constructor.
+OUString SAL_CALL cclass_Unicode::getImplementationName() throw( RuntimeException )
+{
+    const OUString aName( OUString::createFromAscii(cClass) );
+    return aName;
+}
+
+
+// True exactly when rServiceName equals the name stored in cClass.
+sal_Bool SAL_CALL cclass_Unicode::supportsService(const OUString& rServiceName) throw( RuntimeException )
+{
+    // compareToAscii() yields 0 for equal strings.
+    return rServiceName.compareToAscii(cClass) == 0;
+}
+
+// Returns a one-element sequence holding the name stored in cClass.
+Sequence< OUString > SAL_CALL cclass_Unicode::getSupportedServiceNames() throw( RuntimeException )
+{
+    const OUString aName( OUString::createFromAscii(cClass) );
+    Sequence< OUString > aNames(1);
+    aNames[0] = aName;
+    return aNames;
+}
+
+} } } }
+
diff --git a/i18npool/source/characterclassification/cclass_unicode_parser.cxx b/i18npool/source/characterclassification/cclass_unicode_parser.cxx
new file mode 100644
index 000000000000..2abd7eb90539
--- /dev/null
+++ b/i18npool/source/characterclassification/cclass_unicode_parser.cxx
@@ -0,0 +1,1067 @@
+/*************************************************************************
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * Copyright 2000, 2010 Oracle and/or its affiliates.
+ *
+ * OpenOffice.org - a multi-platform office productivity suite
+ *
+ * This file is part of OpenOffice.org.
+ *
+ * OpenOffice.org is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 3
+ * only, as published by the Free Software Foundation.
+ *
+ * OpenOffice.org is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License version 3 for more details
+ * (a copy is included in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * version 3 along with OpenOffice.org. If not, see
+ * <http://www.openoffice.org/license.html>
+ * for a copy of the LGPLv3 License.
+ *
+ ************************************************************************/
+
+// MARKER(update_precomp.py): autogen include statement, do not remove
+#include "precompiled_i18npool.hxx"
+
+#include <cclass_unicode.hxx>
+#include <unicode/uchar.h>
+#include <rtl/math.hxx>
+#include <rtl/ustring.hxx>
+#include <com/sun/star/i18n/KParseTokens.hpp>
+#include <com/sun/star/i18n/KParseType.hpp>
+#include <com/sun/star/i18n/UnicodeType.hpp>
+#include <com/sun/star/i18n/XLocaleData.hpp>
+#include <com/sun/star/i18n/NativeNumberMode.hpp>
+
+#include <string.h> // memcpy()
+
+using namespace ::com::sun::star::uno;
+using namespace ::com::sun::star::lang;
+using namespace ::rtl;
+
+namespace com { namespace sun { namespace star { namespace i18n {
+
+const UPT_FLAG_TYPE cclass_Unicode::TOKEN_ILLEGAL = 0x00000000; // no flag bit set
+const UPT_FLAG_TYPE cclass_Unicode::TOKEN_CHAR = 0x00000001;
+const UPT_FLAG_TYPE cclass_Unicode::TOKEN_CHAR_BOOL = 0x00000002;
+const UPT_FLAG_TYPE cclass_Unicode::TOKEN_CHAR_WORD = 0x00000004; // character may start a word (see initParserTable)
+const UPT_FLAG_TYPE cclass_Unicode::TOKEN_CHAR_VALUE = 0x00000008; // set for digits and the decimal separator
+const UPT_FLAG_TYPE cclass_Unicode::TOKEN_CHAR_STRING = 0x00000010;
+const UPT_FLAG_TYPE cclass_Unicode::TOKEN_CHAR_DONTCARE= 0x00000020; // set for tab, vertical tab and space in the default table
+const UPT_FLAG_TYPE cclass_Unicode::TOKEN_BOOL = 0x00000040;
+const UPT_FLAG_TYPE cclass_Unicode::TOKEN_WORD = 0x00000080; // character may continue a word (see initParserTable)
+const UPT_FLAG_TYPE cclass_Unicode::TOKEN_WORD_SEP = 0x00000100;
+const UPT_FLAG_TYPE cclass_Unicode::TOKEN_VALUE = 0x00000200; // set for digits and the group/decimal separators
+const UPT_FLAG_TYPE cclass_Unicode::TOKEN_VALUE_SEP = 0x00000400;
+const UPT_FLAG_TYPE cclass_Unicode::TOKEN_VALUE_EXP = 0x00000800;
+const UPT_FLAG_TYPE cclass_Unicode::TOKEN_VALUE_SIGN = 0x00001000; // set for '+' and '-' in the default table
+const UPT_FLAG_TYPE cclass_Unicode::TOKEN_VALUE_EXP_VALUE = 0x00002000;
+const UPT_FLAG_TYPE cclass_Unicode::TOKEN_VALUE_DIGIT = 0x00004000;
+const UPT_FLAG_TYPE cclass_Unicode::TOKEN_NAME_SEP = 0x20000000; // set for the apostrophe in the default table
+const UPT_FLAG_TYPE cclass_Unicode::TOKEN_STRING_SEP = 0x40000000; // set for the double quote in the default table
+const UPT_FLAG_TYPE cclass_Unicode::TOKEN_EXCLUDED = 0x80000000; // set for NUL; getFlags() clears it when a user-defined start/cont character matches
+
+#define TOKEN_DIGIT_FLAGS (TOKEN_CHAR_VALUE | TOKEN_VALUE | TOKEN_VALUE_EXP | TOKEN_VALUE_EXP_VALUE | TOKEN_VALUE_DIGIT) // flag set shared by the ASCII digits 0-9 in pDefaultParserTable
+
+// Default identifier/name specification is [A-Za-z_][A-Za-z0-9_]*
+
+const sal_uInt8 cclass_Unicode::nDefCnt = 128;
+const UPT_FLAG_TYPE cclass_Unicode::pDefaultParserTable[ nDefCnt ] =
+{
+// (...) == Calc formula compiler specific, commented out and modified
+
+ /* \0 */ TOKEN_EXCLUDED,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ /* 9 \t */ TOKEN_CHAR_DONTCARE | TOKEN_WORD_SEP | TOKEN_VALUE_SEP, // (TOKEN_ILLEGAL)
+ TOKEN_ILLEGAL,
+ /* 11 \v */ TOKEN_CHAR_DONTCARE | TOKEN_WORD_SEP | TOKEN_VALUE_SEP, // (TOKEN_ILLEGAL)
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ TOKEN_ILLEGAL,
+ /* 32 */ TOKEN_CHAR_DONTCARE | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
+ /* 33 ! */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
+ /* 34 " */ TOKEN_CHAR_STRING | TOKEN_STRING_SEP,
+ /* 35 # */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP, // (TOKEN_WORD_SEP)
+ /* 36 $ */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP, // (TOKEN_CHAR_WORD | TOKEN_WORD)
+ /* 37 % */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP, // (TOKEN_VALUE)
+ /* 38 & */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
+ /* 39 ' */ TOKEN_NAME_SEP,
+ /* 40 ( */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
+ /* 41 ) */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
+ /* 42 * */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
+ /* 43 + */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP | TOKEN_VALUE_EXP | TOKEN_VALUE_SIGN,
+ /* 44 , */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP, // (TOKEN_CHAR_VALUE | TOKEN_VALUE)
+ /* 45 - */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP | TOKEN_VALUE_EXP | TOKEN_VALUE_SIGN,
+ /* 46 . */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP, // (TOKEN_WORD | TOKEN_CHAR_VALUE | TOKEN_VALUE)
+ /* 47 / */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
+ //for ( i = 48; i < 58; i++ )
+ /* 48 0 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
+ /* 49 1 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
+ /* 50 2 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
+ /* 51 3 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
+ /* 52 4 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
+ /* 53 5 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
+ /* 54 6 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
+ /* 55 7 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
+ /* 56 8 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
+ /* 57 9 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
+ /* 58 : */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP, // (TOKEN_WORD)
+ /* 59 ; */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
+ /* 60 < */ TOKEN_CHAR_BOOL | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
+ /* 61 = */ TOKEN_CHAR | TOKEN_BOOL | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
+ /* 62 > */ TOKEN_CHAR_BOOL | TOKEN_BOOL | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
+ /* 63 ? */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP, // (TOKEN_CHAR_WORD | TOKEN_WORD)
+ /* 64 @ */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP, // (TOKEN_ILLEGAL // UNUSED)
+ //for ( i = 65; i < 91; i++ )
+ /* 65 A */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 66 B */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 67 C */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 68 D */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 69 E */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 70 F */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 71 G */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 72 H */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 73 I */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 74 J */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 75 K */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 76 L */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 77 M */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 78 N */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 79 O */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 80 P */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 81 Q */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 82 R */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 83 S */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 84 T */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 85 U */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 86 V */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 87 W */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 88 X */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 89 Y */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 90 Z */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 91 [ */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP, // (TOKEN_ILLEGAL // UNUSED)
+ /* 92 \ */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP, // (TOKEN_ILLEGAL // UNUSED)
+ /* 93 ] */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP, // (TOKEN_ILLEGAL // UNUSED)
+ /* 94 ^ */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
+ /* 95 _ */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 96 ` */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP, // (TOKEN_ILLEGAL // UNUSED)
+ //for ( i = 97; i < 123; i++ )
+ /* 97 a */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 98 b */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 99 c */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 100 d */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 101 e */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 102 f */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 103 g */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 104 h */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 105 i */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 106 j */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 107 k */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 108 l */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 109 m */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 110 n */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 111 o */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 112 p */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 113 q */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 114 r */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 115 s */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 116 t */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 117 u */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 118 v */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 119 w */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 120 x */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 121 y */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 122 z */ TOKEN_CHAR_WORD | TOKEN_WORD,
+ /* 123 { */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP, // (TOKEN_ILLEGAL // UNUSED)
+ /* 124 | */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP, // (TOKEN_ILLEGAL // UNUSED)
+ /* 125 } */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP, // (TOKEN_ILLEGAL // UNUSED)
+ /* 126 ~ */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP, // (TOKEN_ILLEGAL // UNUSED)
+ /* 127 */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP // (TOKEN_ILLEGAL // UNUSED)
+};
+
+
+const sal_Int32 cclass_Unicode::pParseTokensType[ nDefCnt ] = // KParseTokens::ASC_* class per character code below nDefCnt; read by getParseTokensType()
+{
+ /* \0 */ KParseTokens::ASC_OTHER,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ /* 9 \t */ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ /* 11 \v */ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ KParseTokens::ASC_CONTROL,
+ /* 32 */ KParseTokens::ASC_OTHER,
+ /* 33 ! */ KParseTokens::ASC_OTHER,
+ /* 34 " */ KParseTokens::ASC_OTHER,
+ /* 35 # */ KParseTokens::ASC_OTHER,
+ /* 36 $ */ KParseTokens::ASC_DOLLAR,
+ /* 37 % */ KParseTokens::ASC_OTHER,
+ /* 38 & */ KParseTokens::ASC_OTHER,
+ /* 39 ' */ KParseTokens::ASC_OTHER,
+ /* 40 ( */ KParseTokens::ASC_OTHER,
+ /* 41 ) */ KParseTokens::ASC_OTHER,
+ /* 42 * */ KParseTokens::ASC_OTHER,
+ /* 43 + */ KParseTokens::ASC_OTHER,
+ /* 44 , */ KParseTokens::ASC_OTHER,
+ /* 45 - */ KParseTokens::ASC_OTHER,
+ /* 46 . */ KParseTokens::ASC_DOT,
+ /* 47 / */ KParseTokens::ASC_OTHER,
+ //for ( i = 48; i < 58; i++ )
+ /* 48 0 */ KParseTokens::ASC_DIGIT,
+ /* 49 1 */ KParseTokens::ASC_DIGIT,
+ /* 50 2 */ KParseTokens::ASC_DIGIT,
+ /* 51 3 */ KParseTokens::ASC_DIGIT,
+ /* 52 4 */ KParseTokens::ASC_DIGIT,
+ /* 53 5 */ KParseTokens::ASC_DIGIT,
+ /* 54 6 */ KParseTokens::ASC_DIGIT,
+ /* 55 7 */ KParseTokens::ASC_DIGIT,
+ /* 56 8 */ KParseTokens::ASC_DIGIT,
+ /* 57 9 */ KParseTokens::ASC_DIGIT,
+ /* 58 : */ KParseTokens::ASC_COLON,
+ /* 59 ; */ KParseTokens::ASC_OTHER,
+ /* 60 < */ KParseTokens::ASC_OTHER,
+ /* 61 = */ KParseTokens::ASC_OTHER,
+ /* 62 > */ KParseTokens::ASC_OTHER,
+ /* 63 ? */ KParseTokens::ASC_OTHER,
+ /* 64 @ */ KParseTokens::ASC_OTHER,
+ //for ( i = 65; i < 91; i++ )
+ /* 65 A */ KParseTokens::ASC_UPALPHA,
+ /* 66 B */ KParseTokens::ASC_UPALPHA,
+ /* 67 C */ KParseTokens::ASC_UPALPHA,
+ /* 68 D */ KParseTokens::ASC_UPALPHA,
+ /* 69 E */ KParseTokens::ASC_UPALPHA,
+ /* 70 F */ KParseTokens::ASC_UPALPHA,
+ /* 71 G */ KParseTokens::ASC_UPALPHA,
+ /* 72 H */ KParseTokens::ASC_UPALPHA,
+ /* 73 I */ KParseTokens::ASC_UPALPHA,
+ /* 74 J */ KParseTokens::ASC_UPALPHA,
+ /* 75 K */ KParseTokens::ASC_UPALPHA,
+ /* 76 L */ KParseTokens::ASC_UPALPHA,
+ /* 77 M */ KParseTokens::ASC_UPALPHA,
+ /* 78 N */ KParseTokens::ASC_UPALPHA,
+ /* 79 O */ KParseTokens::ASC_UPALPHA,
+ /* 80 P */ KParseTokens::ASC_UPALPHA,
+ /* 81 Q */ KParseTokens::ASC_UPALPHA,
+ /* 82 R */ KParseTokens::ASC_UPALPHA,
+ /* 83 S */ KParseTokens::ASC_UPALPHA,
+ /* 84 T */ KParseTokens::ASC_UPALPHA,
+ /* 85 U */ KParseTokens::ASC_UPALPHA,
+ /* 86 V */ KParseTokens::ASC_UPALPHA,
+ /* 87 W */ KParseTokens::ASC_UPALPHA,
+ /* 88 X */ KParseTokens::ASC_UPALPHA,
+ /* 89 Y */ KParseTokens::ASC_UPALPHA,
+ /* 90 Z */ KParseTokens::ASC_UPALPHA,
+ /* 91 [ */ KParseTokens::ASC_OTHER,
+ /* 92 \ */ KParseTokens::ASC_OTHER,
+ /* 93 ] */ KParseTokens::ASC_OTHER,
+ /* 94 ^ */ KParseTokens::ASC_OTHER,
+ /* 95 _ */ KParseTokens::ASC_UNDERSCORE,
+ /* 96 ` */ KParseTokens::ASC_OTHER,
+ //for ( i = 97; i < 123; i++ )
+ /* 97 a */ KParseTokens::ASC_LOALPHA,
+ /* 98 b */ KParseTokens::ASC_LOALPHA,
+ /* 99 c */ KParseTokens::ASC_LOALPHA,
+ /* 100 d */ KParseTokens::ASC_LOALPHA,
+ /* 101 e */ KParseTokens::ASC_LOALPHA,
+ /* 102 f */ KParseTokens::ASC_LOALPHA,
+ /* 103 g */ KParseTokens::ASC_LOALPHA,
+ /* 104 h */ KParseTokens::ASC_LOALPHA,
+ /* 105 i */ KParseTokens::ASC_LOALPHA,
+ /* 106 j */ KParseTokens::ASC_LOALPHA,
+ /* 107 k */ KParseTokens::ASC_LOALPHA,
+ /* 108 l */ KParseTokens::ASC_LOALPHA,
+ /* 109 m */ KParseTokens::ASC_LOALPHA,
+ /* 110 n */ KParseTokens::ASC_LOALPHA,
+ /* 111 o */ KParseTokens::ASC_LOALPHA,
+ /* 112 p */ KParseTokens::ASC_LOALPHA,
+ /* 113 q */ KParseTokens::ASC_LOALPHA,
+ /* 114 r */ KParseTokens::ASC_LOALPHA,
+ /* 115 s */ KParseTokens::ASC_LOALPHA,
+ /* 116 t */ KParseTokens::ASC_LOALPHA,
+ /* 117 u */ KParseTokens::ASC_LOALPHA,
+ /* 118 v */ KParseTokens::ASC_LOALPHA,
+ /* 119 w */ KParseTokens::ASC_LOALPHA,
+ /* 120 x */ KParseTokens::ASC_LOALPHA,
+ /* 121 y */ KParseTokens::ASC_LOALPHA,
+ /* 122 z */ KParseTokens::ASC_LOALPHA,
+ /* 123 { */ KParseTokens::ASC_OTHER,
+ /* 124 | */ KParseTokens::ASC_OTHER,
+ /* 125 } */ KParseTokens::ASC_OTHER,
+ /* 126 ~ */ KParseTokens::ASC_OTHER,
+ /* 127 */ KParseTokens::ASC_OTHER
+};
+
+
+// static
+// Returns a pointer to the first occurrence of c in the zero-terminated
+// string pStr, or NULL when pStr is NULL or does not contain c. The
+// terminator itself is never matched.
+const sal_Unicode* cclass_Unicode::StrChr( const sal_Unicode* pStr, sal_Unicode c )
+{
+    if ( pStr )
+    {
+        for ( const sal_Unicode* pCur = pStr; *pCur; ++pCur )
+        {
+            if ( *pCur == c )
+                return pCur;
+        }
+    }
+    return NULL;
+}
+
+
+// Maps the UTF-16 unit aStr[nPos] to its KParseTokens class.
+// Codes below nDefCnt are looked up in pParseTokensType; all others are
+// classified by their ICU general category. Categories without a dedicated
+// UNI_* constant yield KParseTokens::UNI_OTHER.
+sal_Int32 cclass_Unicode::getParseTokensType( const sal_Unicode* aStr, sal_Int32 nPos )
+{
+    sal_Unicode c = aStr[nPos];
+    if ( c < nDefCnt )
+        return pParseTokensType[ sal_uInt8(c) ];
+    else
+    {
+
+        //! all KParseTokens::UNI_... must be matched
+        switch ( u_charType( (sal_uInt32) c ) )
+        {
+            case U_UPPERCASE_LETTER :
+                return KParseTokens::UNI_UPALPHA;
+            case U_LOWERCASE_LETTER :
+                return KParseTokens::UNI_LOALPHA;
+            case U_TITLECASE_LETTER :
+                return KParseTokens::UNI_TITLE_ALPHA;
+            case U_MODIFIER_LETTER :
+                return KParseTokens::UNI_MODIFIER_LETTER;
+            case U_NON_SPACING_MARK :
+            case U_COMBINING_SPACING_MARK :
+                // A combining mark cannot be the leading character; this is
+                // the same rule getFlagsExtended() applies. The guard must
+                // sit on the mark categories, not on U_OTHER_LETTER, or a
+                // leading letter would be reported as UNI_OTHER.
+                if (nPos == 0) break;
+                // fall through, treat it as Other_Letter.
+            case U_OTHER_LETTER :
+                return KParseTokens::UNI_OTHER_LETTER;
+            case U_DECIMAL_DIGIT_NUMBER :
+                return KParseTokens::UNI_DIGIT;
+            case U_LETTER_NUMBER :
+                return KParseTokens::UNI_LETTER_NUMBER;
+            case U_OTHER_NUMBER :
+                return KParseTokens::UNI_OTHER_NUMBER;
+        }
+
+        return KParseTokens::UNI_OTHER;
+    }
+}
+
+// Remembers rLocale as the parser locale and makes sure the LocaleData
+// service has been requested from the service factory (only attempted while
+// no service is held and a factory is available).
+// Returns whether rLocale differs from the previously remembered locale.
+sal_Bool cclass_Unicode::setupInternational( const Locale& rLocale )
+{
+    const bool bSameLocale = (aParserLocale.Language == rLocale.Language
+                           && aParserLocale.Country == rLocale.Country
+                           && aParserLocale.Variant == rLocale.Variant);
+    sal_Bool bChanged = !bSameLocale;
+    if ( bChanged )
+    {
+        aParserLocale.Language = rLocale.Language;
+        aParserLocale.Country = rLocale.Country;
+        aParserLocale.Variant = rLocale.Variant;
+    }
+
+    const bool bNeedService = !xLocaleData.is() && xMSF.is();
+    if ( bNeedService )
+    {
+        const OUString aServiceName(
+            RTL_CONSTASCII_USTRINGPARAM( "com.sun.star.i18n.LocaleData" ) );
+        Reference < XInterface > xInstance = xMSF->createInstance( aServiceName );
+        if ( xInstance.is() )
+        {
+            // Ask the new instance for its XLocaleData interface.
+            Any aIface = xInstance->queryInterface(
+                getCppuType((const Reference< XLocaleData>*)0) );
+            aIface >>= xLocaleData;
+        }
+    }
+    return bChanged;
+}
+
+
+// Rebuilds the parser table via initParserTable() unless a table already
+// exists for exactly this locale, these start/continuation token types and
+// these user-defined character sets.
+void cclass_Unicode::setupParserTable( const Locale& rLocale, sal_Int32 startCharTokenType,
+            const OUString& userDefinedCharactersStart, sal_Int32 contCharTokenType,
+            const OUString& userDefinedCharactersCont )
+{
+    const bool bSameLocale = (rLocale.Language == aParserLocale.Language &&
+                              rLocale.Country == aParserLocale.Country &&
+                              rLocale.Variant == aParserLocale.Variant);
+    const bool bSameTypes = (startCharTokenType == nStartTypes &&
+                             contCharTokenType == nContTypes);
+    const bool bSameUserChars = (userDefinedCharactersStart == aStartChars &&
+                                 userDefinedCharactersCont == aContChars);
+
+    // Everything still matches the cached table: nothing to do.
+    if ( pTable && bSameLocale && bSameTypes && bSameUserChars )
+        return;
+
+    initParserTable( rLocale, startCharTokenType, userDefinedCharactersStart,
+            contCharTokenType, userDefinedCharactersCont );
+}
+
+
+// (Re)builds the per-character parser tables.
+//
+// pTable starts as a copy of pDefaultParserTable and is then adjusted for
+//  - the group and decimal separator of the locale (when locale data is
+//    available; otherwise the separators currently stored are used),
+//  - the KParseTokens::ASC_* bits in startCharTokenType / contCharTokenType,
+//  - the user-defined start and continuation characters.
+// pStart / pCont receive one flag entry per user-defined character. The
+// arguments are remembered in nStartTypes, nContTypes, aStartChars and
+// aContChars so that setupParserTable() can detect changes.
+void cclass_Unicode::initParserTable( const Locale& rLocale, sal_Int32 startCharTokenType,
+            const OUString& userDefinedCharactersStart, sal_Int32 contCharTokenType,
+            const OUString& userDefinedCharactersCont )
+{
+    // (Re)Init
+    setupInternational( rLocale );
+    // Memory of pTable is reused.
+    if ( !pTable )
+        pTable = new UPT_FLAG_TYPE[nDefCnt];
+    memcpy( pTable, pDefaultParserTable, sizeof(UPT_FLAG_TYPE) * nDefCnt );
+    // Start and cont tables only need reallocation if different length.
+    if ( pStart && userDefinedCharactersStart.getLength() != aStartChars.getLength() )
+    {
+        delete [] pStart;
+        pStart = NULL;
+    }
+    if ( pCont && userDefinedCharactersCont.getLength() != aContChars.getLength() )
+    {
+        delete [] pCont;
+        pCont = NULL;
+    }
+    nStartTypes = startCharTokenType;
+    nContTypes = contCharTokenType;
+    aStartChars = userDefinedCharactersStart;
+    aContChars = userDefinedCharactersCont;
+
+    // specials
+    if( xLocaleData.is() )
+    {
+        LocaleDataItem aItem =
+            xLocaleData->getLocaleItem( aParserLocale );
+//!TODO: theoretically separators may be a string, adjustment would have to be
+//! done here and in parsing and in ::rtl::math::stringToDouble()
+        cGroupSep = aItem.thousandSeparator.getStr()[0];
+        cDecimalSep = aItem.decimalSeparator.getStr()[0];
+    }
+
+    // Separators outside the table range are handled in getFlagsExtended().
+    if ( cGroupSep < nDefCnt )
+        pTable[cGroupSep] |= TOKEN_VALUE;
+    if ( cDecimalSep < nDefCnt )
+        pTable[cDecimalSep] |= TOKEN_CHAR_VALUE | TOKEN_VALUE;
+
+    // Modify characters according to KParseTokens definitions.
+    {
+        using namespace KParseTokens;
+        sal_uInt8 i;
+
+        // 'A'..'Z'
+        if ( !(nStartTypes & ASC_UPALPHA) )
+            for ( i = 65; i < 91; i++ )
+                pTable[i] &= ~TOKEN_CHAR_WORD;  // not allowed as start character
+        if ( !(nContTypes & ASC_UPALPHA) )
+            for ( i = 65; i < 91; i++ )
+                pTable[i] &= ~TOKEN_WORD;       // not allowed as cont character
+
+        // 'a'..'z'
+        if ( !(nStartTypes & ASC_LOALPHA) )
+            for ( i = 97; i < 123; i++ )
+                pTable[i] &= ~TOKEN_CHAR_WORD;  // not allowed as start character
+        if ( !(nContTypes & ASC_LOALPHA) )
+            for ( i = 97; i < 123; i++ )
+                pTable[i] &= ~TOKEN_WORD;       // not allowed as cont character
+
+        // '0'..'9': by default continuation only, so the start flag is added
+        // on request while the continuation flag is removed on request.
+        if ( nStartTypes & ASC_DIGIT )
+            for ( i = 48; i < 58; i++ )
+                pTable[i] |= TOKEN_CHAR_WORD;   // allowed as start character
+        if ( !(nContTypes & ASC_DIGIT) )
+            for ( i = 48; i < 58; i++ )
+                pTable[i] &= ~TOKEN_WORD;       // not allowed as cont character
+
+        // '_'
+        if ( !(nStartTypes & ASC_UNDERSCORE) )
+            pTable[95] &= ~TOKEN_CHAR_WORD;     // not allowed as start character
+        if ( !(nContTypes & ASC_UNDERSCORE) )
+            pTable[95] &= ~TOKEN_WORD;          // not allowed as cont character
+
+        // '$'
+        if ( nStartTypes & ASC_DOLLAR )
+            pTable[36] |= TOKEN_CHAR_WORD;      // allowed as start character
+        if ( nContTypes & ASC_DOLLAR )
+            pTable[36] |= TOKEN_WORD;           // allowed as cont character
+
+        // '.'
+        if ( nStartTypes & ASC_DOT )
+            pTable[46] |= TOKEN_CHAR_WORD;      // allowed as start character
+        if ( nContTypes & ASC_DOT )
+            pTable[46] |= TOKEN_WORD;           // allowed as cont character
+
+        // ':'
+        if ( nStartTypes & ASC_COLON )
+            pTable[58] |= TOKEN_CHAR_WORD;      // allowed as start character
+        if ( nContTypes & ASC_COLON )
+            pTable[58] |= TOKEN_WORD;           // allowed as cont character
+
+        // control characters 1..31 (NUL stays untouched)
+        if ( nStartTypes & ASC_CONTROL )
+            for ( i = 1; i < 32; i++ )
+                pTable[i] |= TOKEN_CHAR_WORD;   // allowed as start character
+        if ( nContTypes & ASC_CONTROL )
+            for ( i = 1; i < 32; i++ )
+                pTable[i] |= TOKEN_WORD;        // allowed as cont character
+
+        // everything from space up to the end of the table
+        if ( nStartTypes & ASC_ANY_BUT_CONTROL )
+            for ( i = 32; i < nDefCnt; i++ )
+                pTable[i] |= TOKEN_CHAR_WORD;   // allowed as start character
+        if ( nContTypes & ASC_ANY_BUT_CONTROL )
+            for ( i = 32; i < nDefCnt; i++ )
+                pTable[i] |= TOKEN_WORD;        // allowed as cont character
+
+    }
+
+    // Merge in (positively override with) user defined characters.
+    // StartChars
+    sal_Int32 nLen = aStartChars.getLength();
+    if ( nLen )
+    {
+        if ( !pStart )
+            pStart = new UPT_FLAG_TYPE[ nLen ];
+        const sal_Unicode* pChars = aStartChars.getStr();
+        for ( sal_Int32 j=0; j<nLen; j++ )
+        {
+            const sal_Unicode c = pChars[j];
+            pStart[j] = TOKEN_CHAR_WORD;
+            if ( c < nDefCnt )
+                pTable[c] |= TOKEN_CHAR_WORD;
+        }
+    }
+    // ContChars
+    nLen = aContChars.getLength();
+    if ( nLen )
+    {
+        if ( !pCont )
+            pCont = new UPT_FLAG_TYPE[ nLen ];
+        const sal_Unicode* pChars = aContChars.getStr();
+        for ( sal_Int32 j=0; j<nLen; j++ )
+        {
+            // Index by j so that every continuation character is merged into
+            // pTable; a pointer that is never advanced would only ever look
+            // at the first one.
+            const sal_Unicode c = pChars[j];
+            pCont[j] = TOKEN_WORD;
+            if ( c < nDefCnt )
+                pTable[c] |= TOKEN_WORD;
+        }
+    }
+}
+
+
+// Frees the parser tables and resets the pointers, so that a later
+// setupParserTable() sees "no table" instead of dangling pointers and a
+// repeated call cannot free the same memory twice.
+void cclass_Unicode::destroyParserTable()
+{
+    // delete [] on a null pointer is a no-op, so no checks are needed.
+    delete [] pCont;
+    pCont = NULL;
+    delete [] pStart;
+    pStart = NULL;
+    delete [] pTable;
+    pTable = NULL;
+}
+
+
+// Returns the parser flags for the UTF-16 unit aStr[nPos] in the current
+// parser state. Codes below nDefCnt come from pTable, all others from
+// getFlagsExtended(). If the flag required by the state (TOKEN_CHAR_WORD in
+// the start states, TOKEN_WORD in the value/word states) is missing, the
+// user-defined start or continuation characters are consulted; when they
+// supply that flag, TOKEN_EXCLUDED is cleared.
+UPT_FLAG_TYPE cclass_Unicode::getFlags( const sal_Unicode* aStr, sal_Int32 nPos )
+{
+    const sal_Unicode c = aStr[nPos];
+    UPT_FLAG_TYPE nMask = ( c < nDefCnt )
+        ? pTable[ sal_uInt8(c) ]
+        : getFlagsExtended( aStr, nPos );
+
+    const bool bStartState = ( eState == ssGetChar
+                            || eState == ssRewindFromValue
+                            || eState == ssIgnoreLeadingInRewind
+                            || eState == ssGetWordFirstChar );
+    const bool bContState = ( eState == ssGetValue || eState == ssGetWord );
+
+    if ( bStartState )
+    {
+        if ( !(nMask & TOKEN_CHAR_WORD) )
+        {
+            nMask |= getStartCharsFlags( c );
+            if ( nMask & TOKEN_CHAR_WORD )
+                nMask &= ~TOKEN_EXCLUDED;
+        }
+    }
+    else if ( bContState )
+    {
+        if ( !(nMask & TOKEN_WORD) )
+        {
+            nMask |= getContCharsFlags( c );
+            if ( nMask & TOKEN_WORD )
+                nMask &= ~TOKEN_EXCLUDED;
+        }
+    }
+    // All other states use the table flags unchanged.
+    return nMask;
+}
+
+
+// Classifies the character at aStr[nPos] for the parser when it is not
+// covered by the pTable lookup.  The group and decimal separators are handled
+// first; otherwise the general category reported by u_charType() is matched
+// against nStartTypes or nContTypes, depending on whether the parser is
+// currently in one of its "start" states.
+UPT_FLAG_TYPE cclass_Unicode::getFlagsExtended( const sal_Unicode* aStr, sal_Int32 nPos )
+{
+    using namespace i18n;
+
+    const sal_Unicode cChar = aStr[nPos];
+    if ( cChar == cGroupSep )
+        return TOKEN_VALUE;
+    if ( cChar == cDecimalSep )
+        return TOKEN_CHAR_VALUE | TOKEN_VALUE;
+
+    const bool bStart = (eState == ssGetChar || eState == ssGetWordFirstChar ||
+            eState == ssRewindFromValue || eState == ssIgnoreLeadingInRewind);
+    const sal_Int32 nTypes = (bStart ? nStartTypes : nContTypes);
+    // Flag handed out when the category is enabled in nTypes.
+    const UPT_FLAG_TYPE nAccepted = (bStart ? TOKEN_CHAR_WORD : TOKEN_WORD);
+
+    sal_Int32 nRequired = 0;    // KParseTokens bit that must be set in nTypes
+    UPT_FLAG_TYPE nExtra = 0;   // flags added regardless of nTypes
+
+    //! all KParseTokens::UNI_... must be matched
+    switch ( u_charType( (sal_uInt32) cChar ) )
+    {
+        case U_UPPERCASE_LETTER :
+            nRequired = KParseTokens::UNI_UPALPHA;
+        break;
+        case U_LOWERCASE_LETTER :
+            nRequired = KParseTokens::UNI_LOALPHA;
+        break;
+        case U_TITLECASE_LETTER :
+            nRequired = KParseTokens::UNI_TITLE_ALPHA;
+        break;
+        case U_MODIFIER_LETTER :
+            nRequired = KParseTokens::UNI_MODIFIER_LETTER;
+        break;
+        case U_NON_SPACING_MARK :
+        case U_COMBINING_SPACING_MARK :
+            // Neither a non-spacing mark nor a spacing combining mark may
+            // lead; as continuation they count as Other_Letter.
+            if ( bStart )
+                return TOKEN_ILLEGAL;
+            nRequired = KParseTokens::UNI_OTHER_LETTER;
+        break;
+        case U_OTHER_LETTER :
+            nRequired = KParseTokens::UNI_OTHER_LETTER;
+        break;
+        case U_DECIMAL_DIGIT_NUMBER :
+            nRequired = KParseTokens::UNI_DIGIT;
+            nExtra = TOKEN_DIGIT_FLAGS;
+        break;
+        case U_LETTER_NUMBER :
+            nRequired = KParseTokens::UNI_LETTER_NUMBER;
+            nExtra = TOKEN_DIGIT_FLAGS;
+        break;
+        case U_OTHER_NUMBER :
+            nRequired = KParseTokens::UNI_OTHER_NUMBER;
+            nExtra = TOKEN_DIGIT_FLAGS;
+        break;
+        case U_SPACE_SEPARATOR :
+            if ( nTypes & KParseTokens::IGNORE_LEADING_WS )
+                return TOKEN_CHAR_DONTCARE;
+            if ( bStart )
+                return TOKEN_CHAR_WORD;
+            return TOKEN_CHAR_DONTCARE | TOKEN_WORD_SEP | TOKEN_VALUE_SEP;
+        default:
+            return TOKEN_ILLEGAL;
+    }
+
+    return ((nTypes & nRequired) ? nAccepted : TOKEN_ILLEGAL) | nExtra;
+}
+
+
+// Looks c up in the user defined start characters (aStartChars) and returns
+// the matching entry of pStart, or TOKEN_ILLEGAL if there is no pStart table
+// or c is not found.
+UPT_FLAG_TYPE cclass_Unicode::getStartCharsFlags( sal_Unicode c )
+{
+    if ( !pStart )
+        return TOKEN_ILLEGAL;
+
+    const sal_Unicode* const pChars = aStartChars.getStr();
+    const sal_Unicode* const pHit = StrChr( pChars, c );
+    if ( !pHit )
+        return TOKEN_ILLEGAL;
+
+    // pStart is indexed like aStartChars
+    return pStart[ pHit - pChars ];
+}
+
+
+// Looks c up in the user defined continuation characters (aContChars) and
+// returns the matching entry of pCont, or TOKEN_ILLEGAL if there is no pCont
+// table or c is not found.
+UPT_FLAG_TYPE cclass_Unicode::getContCharsFlags( sal_Unicode c )
+{
+    if ( !pCont )
+        return TOKEN_ILLEGAL;
+
+    const sal_Unicode* const pChars = aContChars.getStr();
+    const sal_Unicode* const pHit = StrChr( pChars, c );
+    if ( !pHit )
+        return TOKEN_ILLEGAL;
+
+    // pCont is indexed like aContChars
+    return pCont[ pHit - pChars ];
+}
+
+
+// Scans one token of rText, starting at index nPos, and stores the outcome
+// in r.
+//
+// The scan is a state machine driven by the member eState.  For every
+// character the flag mask from getFlags() decides the transition; the loop
+// ends at the terminating 0 character or when the state becomes ssStop.
+// nTokenType is a mask of KParseType values the caller accepts; a token of
+// any other type is bounced (r.TokenType reset to 0), with the exceptions
+// listed in the loop.
+//
+// On return r.LeadingWhiteSpace, r.CharLen, r.EndPos, r.TokenType,
+// r.StartFlags and r.ContFlags are filled; numbers additionally set r.Value,
+// quoted names and strings set r.DequotedNameOrString.
+//
+// Throws RuntimeException if a UNI_NUMBER was found but the
+// NativeNumberSupplier service can not be instantiated.
+void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 nPos, sal_Int32 nTokenType )
+{
+    using namespace i18n;
+    const sal_Unicode* const pTextStart = rText.getStr() + nPos;
+    eState = ssGetChar;
+
+    //! All the variables below (plus ParseResult) have to be reset on ssRewindFromValue!
+    const sal_Unicode* pSym = pTextStart;   // start of the not yet copied symbol part
+    const sal_Unicode* pSrc = pSym;         // read position
+    OUString aSymbol;                       // dequoted name or string collected so far
+    sal_Unicode c = *pSrc;
+    sal_Unicode cLast = 0;
+    int nDecSeps = 0;
+    bool bQuote = false;
+    bool bMightBeWord = true;
+    bool bMightBeWordLast = true;
+    //! All the variables above (plus ParseResult) have to be reset on ssRewindFromValue!
+
+    while ( (c != 0) && (eState != ssStop) )
+    {
+        UPT_FLAG_TYPE nMask = getFlags( pTextStart, pSrc - pTextStart );
+        if ( nMask & TOKEN_EXCLUDED )
+            eState = ssBounce;
+        if ( bMightBeWord )
+        {   // only relevant for ssGetValue fall back
+            if ( eState == ssGetChar || eState == ssRewindFromValue ||
+                    eState == ssIgnoreLeadingInRewind )
+                bMightBeWord = ((nMask & TOKEN_CHAR_WORD) != 0);
+            else
+                bMightBeWord = ((nMask & TOKEN_WORD) != 0);
+        }
+        sal_Int32 nParseTokensType = getParseTokensType( pTextStart, pSrc - pTextStart );
+        pSrc++;     // from here on pSrc points behind the current character c
+        switch (eState)
+        {
+            case ssGetChar :
+            case ssRewindFromValue :
+            case ssIgnoreLeadingInRewind :
+            {
+                if ( (nMask & TOKEN_CHAR_VALUE) && eState != ssRewindFromValue
+                        && eState != ssIgnoreLeadingInRewind )
+                {   //! must be first, may fall back to ssGetWord via bMightBeWord
+                    eState = ssGetValue;
+                    if ( nMask & TOKEN_VALUE_DIGIT )
+                    {
+                        if ( 128 <= c )
+                            r.TokenType = KParseType::UNI_NUMBER;
+                        else
+                            r.TokenType = KParseType::ASC_NUMBER;
+                    }
+                    else if ( c == cDecimalSep )
+                    {
+                        if ( *pSrc )
+                            ++nDecSeps;
+                        else
+                            eState = ssRewindFromValue;
+                            // retry for ONE_SINGLE_CHAR or others
+                    }
+                }
+                else if ( nMask & TOKEN_CHAR_WORD )
+                {
+                    eState = ssGetWord;
+                    r.TokenType = KParseType::IDENTNAME;
+                }
+                else if ( nMask & TOKEN_NAME_SEP )
+                {
+                    eState = ssGetWordFirstChar;
+                    bQuote = true;
+                    pSym++;
+                    nParseTokensType = 0;   // will be taken of first real character
+                    r.TokenType = KParseType::SINGLE_QUOTE_NAME;
+                }
+                else if ( nMask & TOKEN_CHAR_STRING )
+                {
+                    eState = ssGetString;
+                    pSym++;
+                    nParseTokensType = 0;   // will be taken of first real character
+                    r.TokenType = KParseType::DOUBLE_QUOTE_STRING;
+                }
+                else if ( nMask & TOKEN_CHAR_DONTCARE )
+                {
+                    if ( nStartTypes & KParseTokens::IGNORE_LEADING_WS )
+                    {
+                        if (eState == ssRewindFromValue)
+                            eState = ssIgnoreLeadingInRewind;
+                        r.LeadingWhiteSpace++;
+                        pSym++;
+                        nParseTokensType = 0;   // wait until real character
+                        bMightBeWord = true;
+                    }
+                    else
+                        eState = ssBounce;
+                }
+                else if ( nMask & TOKEN_CHAR_BOOL )
+                {
+                    eState = ssGetBool;
+                    r.TokenType = KParseType::BOOLEAN;
+                }
+                else if ( nMask & TOKEN_CHAR )
+                {   //! must be last
+                    eState = ssStop;
+                    r.TokenType = KParseType::ONE_SINGLE_CHAR;
+                }
+                else
+                    eState = ssBounce;      // not known
+            }
+            break;
+            case ssGetValue :
+            {
+                if ( nMask & TOKEN_VALUE_DIGIT )
+                {
+                    if ( 128 <= c )
+                        r.TokenType = KParseType::UNI_NUMBER;
+                    else if ( r.TokenType != KParseType::UNI_NUMBER )
+                        r.TokenType = KParseType::ASC_NUMBER;
+                }
+                if ( nMask & TOKEN_VALUE )
+                {
+                    if ( c == cDecimalSep && ++nDecSeps > 1 )
+                    {
+                        if ( pSrc - pTextStart == 2 )
+                            eState = ssRewindFromValue;
+                            // consecutive separators
+                        else
+                            eState = ssStopBack;
+                    }
+                    // else keep it going
+                }
+                else if ( c == 'E' || c == 'e' )
+                {
+                    UPT_FLAG_TYPE nNext = getFlags( pTextStart, pSrc - pTextStart );
+                    if ( nNext & TOKEN_VALUE_EXP )
+                        ;   // keep it going
+                    else if ( bMightBeWord && ((nNext & TOKEN_WORD) || !*pSrc) )
+                    {   // might be a numerical name (1.2efg)
+                        eState = ssGetWord;
+                        r.TokenType = KParseType::IDENTNAME;
+                    }
+                    else
+                        eState = ssStopBack;
+                }
+                else if ( nMask & TOKEN_VALUE_SIGN )
+                {
+                    if ( (cLast == 'E') || (cLast == 'e') )
+                    {
+                        UPT_FLAG_TYPE nNext = getFlags( pTextStart, pSrc - pTextStart );
+                        if ( nNext & TOKEN_VALUE_EXP_VALUE )
+                            ;   // keep it going
+                        else if ( bMightBeWord && ((nNext & TOKEN_WORD) || !*pSrc) )
+                        {   // might be a numerical name (1.2e+fg)
+                            eState = ssGetWord;
+                            r.TokenType = KParseType::IDENTNAME;
+                        }
+                        else
+                            eState = ssStopBack;
+                    }
+                    else if ( bMightBeWord )
+                    {   // might be a numerical name (1.2+fg)
+                        eState = ssGetWord;
+                        r.TokenType = KParseType::IDENTNAME;
+                    }
+                    else
+                        eState = ssStopBack;
+                }
+                else if ( bMightBeWord && (nMask & TOKEN_WORD) )
+                {   // might be a numerical name (1995.A1)
+                    eState = ssGetWord;
+                    r.TokenType = KParseType::IDENTNAME;
+                }
+                else
+                    eState = ssStopBack;
+            }
+            break;
+            case ssGetWordFirstChar :
+                eState = ssGetWord;
+                // fall thru
+            case ssGetWord :
+            {
+                if ( nMask & TOKEN_WORD )
+                    ;   // keep it going
+                else if ( nMask & TOKEN_NAME_SEP )
+                {
+                    if ( bQuote )
+                    {
+                        if ( cLast == '\\' )
+                        {   // escaped: drop the backslash, keep the separator
+                            aSymbol += OUString( pSym, pSrc - pSym - 2 );
+                            aSymbol += OUString( &c, 1);
+                        }
+                        else
+                        {   // closing quote
+                            eState = ssStop;
+                            aSymbol += OUString( pSym, pSrc - pSym - 1 );
+                        }
+                        pSym = pSrc;
+                    }
+                    else
+                        eState = ssStopBack;
+                }
+                else if ( bQuote )
+                    ;   // keep it going
+                else
+                    eState = ssStopBack;
+            }
+            break;
+            case ssGetString :
+            {
+                if ( nMask & TOKEN_STRING_SEP )
+                {
+                    if ( cLast == '\\' )
+                    {   // escaped: drop the backslash, keep the separator
+                        aSymbol += OUString( pSym, pSrc - pSym - 2 );
+                        aSymbol += OUString( &c, 1);
+                    }
+                    else if ( c == *pSrc &&
+                            !(nContTypes & KParseTokens::TWO_DOUBLE_QUOTES_BREAK_STRING) )
+                    {   // "" => literal " escaped
+                        aSymbol += OUString( pSym, pSrc - pSym );
+                        pSrc++;
+                    }
+                    else
+                    {   // closing quote
+                        eState = ssStop;
+                        aSymbol += OUString( pSym, pSrc - pSym - 1 );
+                    }
+                    pSym = pSrc;
+                }
+            }
+            break;
+            case ssGetBool :
+            {
+                if ( (nMask & TOKEN_BOOL) )
+                    eState = ssStop;    // maximum 2: <, >, <>, <=, >=
+                else
+                    eState = ssStopBack;
+            }
+            break;
+            case ssStopBack :
+            case ssBounce :
+            case ssStop :
+                ;   // nothing, no compiler warning
+            break;
+        }
+        if ( eState == ssRewindFromValue )
+        {   // start over from the first character, this time not as a value
+            r = ParseResult();
+            pSym = pTextStart;
+            pSrc = pSym;
+            aSymbol = OUString();
+            c = *pSrc;
+            cLast = 0;
+            nDecSeps = 0;
+            bQuote = false;
+            bMightBeWord = true;
+            bMightBeWordLast = true;
+        }
+        else
+        {
+            if ( !(r.TokenType & nTokenType) )
+            {
+                if ( (r.TokenType & (KParseType::ASC_NUMBER | KParseType::UNI_NUMBER))
+                        && (nTokenType & KParseType::IDENTNAME) && bMightBeWord )
+                    ;   // keep a number that might be a word
+                else if ( r.LeadingWhiteSpace == (pSrc - pTextStart) )
+                    ;   // keep ignored white space
+                else if ( !r.TokenType && eState == ssGetValue && (nMask & TOKEN_VALUE_SEP) )
+                    ;   // keep uncertain value
+                else
+                    eState = ssBounce;
+            }
+            if ( eState == ssBounce )
+            {
+                r.TokenType = 0;
+                eState = ssStopBack;
+            }
+            if ( eState == ssStopBack )
+            {   // put back
+                pSrc--;
+                bMightBeWord = bMightBeWordLast;
+                eState = ssStop;
+            }
+            if ( eState != ssStop )
+            {
+                if ( !r.StartFlags )
+                    r.StartFlags |= nParseTokensType;
+                else
+                    r.ContFlags |= nParseTokensType;
+            }
+            bMightBeWordLast = bMightBeWord;
+            cLast = c;
+            c = *pSrc;
+        }
+    }
+    // r.CharLen is the length in characters (not code points) of the parsed
+    // token not including any leading white space, change this calculation if
+    // multi-code-point Unicode characters are to be supported.
+    r.CharLen = pSrc - pTextStart - r.LeadingWhiteSpace;
+    r.EndPos = nPos + (pSrc - pTextStart);
+    if ( r.TokenType & KParseType::ASC_NUMBER )
+    {
+        // r.EndPos is an index into rText, so the end pointer has to be
+        // based on the start of rText and not on pTextStart, which already
+        // includes nPos.
+        r.Value = rtl_math_uStringToDouble( pTextStart + r.LeadingWhiteSpace,
+                rText.getStr() + r.EndPos, cDecimalSep, cGroupSep, NULL, NULL );
+        if ( bMightBeWord )
+            r.TokenType |= KParseType::IDENTNAME;
+    }
+    else if ( r.TokenType & KParseType::UNI_NUMBER )
+    {
+        if ( !xNatNumSup.is() )
+        {
+#define NATIVENUMBERSUPPLIER_SERVICENAME "com.sun.star.i18n.NativeNumberSupplier"
+            if ( xMSF.is() )
+            {
+                xNatNumSup = Reference< XNativeNumberSupplier > (
+                        xMSF->createInstance( OUString(
+                                RTL_CONSTASCII_USTRINGPARAM(
+                                    NATIVENUMBERSUPPLIER_SERVICENAME ) ) ),
+                        UNO_QUERY );
+            }
+            if ( !xNatNumSup.is() )
+            {
+                throw RuntimeException( OUString(
+#ifdef DBG_UTIL
+                    RTL_CONSTASCII_USTRINGPARAM(
+                        "cclass_Unicode::parseText: can't instanciate "
+                        NATIVENUMBERSUPPLIER_SERVICENAME )
+#endif
+                    ), *this );
+            }
+#undef NATIVENUMBERSUPPLIER_SERVICENAME
+        }
+        // Token without the leading white space: the length is the distance
+        // from nPos to r.EndPos minus the skipped white space.
+        OUString aTmp( pTextStart + r.LeadingWhiteSpace, r.EndPos - nPos -
+                r.LeadingWhiteSpace );
+        // transliterate to ASCII
+        aTmp = xNatNumSup->getNativeNumberString( aTmp, aParserLocale,
+                NativeNumberMode::NATNUM0 );
+        r.Value = ::rtl::math::stringToDouble( aTmp, cDecimalSep, cGroupSep, NULL, NULL );
+        if ( bMightBeWord )
+            r.TokenType |= KParseType::IDENTNAME;
+    }
+    else if ( r.TokenType & (KParseType::SINGLE_QUOTE_NAME | KParseType::DOUBLE_QUOTE_STRING) )
+    {
+        if ( pSym < pSrc )
+        {   //! open quote
+            aSymbol += OUString( pSym, pSrc - pSym );
+            r.TokenType |= KParseType::MISSING_QUOTE;
+        }
+        r.DequotedNameOrString = aSymbol;
+    }
+}
+
+} } } }
diff --git a/i18npool/source/characterclassification/characterclassificationImpl.cxx b/i18npool/source/characterclassification/characterclassificationImpl.cxx
new file mode 100644
index 000000000000..5665bf654194
--- /dev/null
+++ b/i18npool/source/characterclassification/characterclassificationImpl.cxx
@@ -0,0 +1,236 @@
+/*************************************************************************
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * Copyright 2000, 2010 Oracle and/or its affiliates.
+ *
+ * OpenOffice.org - a multi-platform office productivity suite
+ *
+ * This file is part of OpenOffice.org.
+ *
+ * OpenOffice.org is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 3
+ * only, as published by the Free Software Foundation.
+ *
+ * OpenOffice.org is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License version 3 for more details
+ * (a copy is included in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * version 3 along with OpenOffice.org. If not, see
+ * <http://www.openoffice.org/license.html>
+ * for a copy of the LGPLv3 License.
+ *
+ ************************************************************************/
+
+// MARKER(update_precomp.py): autogen include statement, do not remove
+#include "precompiled_i18npool.hxx"
+
+#include <characterclassificationImpl.hxx>
+#include <rtl/ustrbuf.hxx>
+
+using namespace com::sun::star::uno;
+using namespace com::sun::star::lang;
+using namespace rtl;
+
+namespace com { namespace sun { namespace star { namespace i18n {
+
+// Stores the service factory and tries to create the generic "Unicode"
+// implementation, which is kept in xUCI when creation succeeds.
+CharacterClassificationImpl::CharacterClassificationImpl(
+        const Reference < lang::XMultiServiceFactory >& rxMSF ) : xMSF( rxMSF )
+{
+    // Start from a defined "nothing cached" state.  Without this the pointer
+    // stays uninitialised whenever the creation below fails, although
+    // getLocaleSpecificCharacterClassification() tests it.
+    cachedItem = 0;
+    if (createLocaleSpecificCharacterClassification(OUString::createFromAscii("Unicode"), Locale()))
+        xUCI = cachedItem->xCI;
+}
+
+// Deletes every entry of lookupTable and empties the table.
+CharacterClassificationImpl::~CharacterClassificationImpl()
+{
+    const size_t nEntries = lookupTable.size();
+    for (size_t nIdx = 0; nIdx != nEntries; ++nIdx)
+    {
+        delete lookupTable[nIdx];
+    }
+    lookupTable.clear();
+}
+
+
+// Forwards toUpper() to the implementation selected for rLocale.
+OUString SAL_CALL
+CharacterClassificationImpl::toUpper( const OUString& Text, sal_Int32 nPos,
+        sal_Int32 nCount, const Locale& rLocale ) throw(RuntimeException)
+{
+    const Reference < XCharacterClassification > xImpl(
+            getLocaleSpecificCharacterClassification(rLocale) );
+    return xImpl->toUpper(Text, nPos, nCount, rLocale);
+}
+
+// Forwards toLower() to the implementation selected for rLocale.
+OUString SAL_CALL
+CharacterClassificationImpl::toLower( const OUString& Text, sal_Int32 nPos,
+        sal_Int32 nCount, const Locale& rLocale ) throw(RuntimeException)
+{
+    const Reference < XCharacterClassification > xImpl(
+            getLocaleSpecificCharacterClassification(rLocale) );
+    return xImpl->toLower(Text, nPos, nCount, rLocale);
+}
+
+// Forwards toTitle() to the implementation selected for rLocale.
+OUString SAL_CALL
+CharacterClassificationImpl::toTitle( const OUString& Text, sal_Int32 nPos,
+        sal_Int32 nCount, const Locale& rLocale ) throw(RuntimeException)
+{
+    const Reference < XCharacterClassification > xImpl(
+            getLocaleSpecificCharacterClassification(rLocale) );
+    return xImpl->toTitle(Text, nPos, nCount, rLocale);
+}
+
+// Forwards getType() to the generic Unicode implementation (xUCI).
+// Throws RuntimeException if that implementation is not available.
+sal_Int16 SAL_CALL
+CharacterClassificationImpl::getType( const OUString& Text, sal_Int32 nPos )
+        throw(RuntimeException)
+{
+    if (!xUCI.is())
+        throw RuntimeException();
+    return xUCI->getType(Text, nPos);
+}
+
+// Forwards getCharacterDirection() to the generic Unicode implementation
+// (xUCI).  Throws RuntimeException if that implementation is not available.
+sal_Int16 SAL_CALL
+CharacterClassificationImpl::getCharacterDirection( const OUString& Text, sal_Int32 nPos )
+        throw(RuntimeException)
+{
+    if (!xUCI.is())
+        throw RuntimeException();
+    return xUCI->getCharacterDirection(Text, nPos);
+}
+
+// Forwards getScript() to the generic Unicode implementation (xUCI).
+// Throws RuntimeException if that implementation is not available.
+sal_Int16 SAL_CALL
+CharacterClassificationImpl::getScript( const OUString& Text, sal_Int32 nPos )
+        throw(RuntimeException)
+{
+    if (!xUCI.is())
+        throw RuntimeException();
+    return xUCI->getScript(Text, nPos);
+}
+
+// Forwards getCharacterType() to the implementation selected for rLocale.
+sal_Int32 SAL_CALL
+CharacterClassificationImpl::getCharacterType( const OUString& Text, sal_Int32 nPos,
+        const Locale& rLocale ) throw(RuntimeException)
+{
+    const Reference < XCharacterClassification > xImpl(
+            getLocaleSpecificCharacterClassification(rLocale) );
+    return xImpl->getCharacterType(Text, nPos, rLocale);
+}
+
+// Forwards getStringType() to the implementation selected for rLocale.
+sal_Int32 SAL_CALL
+CharacterClassificationImpl::getStringType( const OUString& Text, sal_Int32 nPos,
+        sal_Int32 nCount, const Locale& rLocale ) throw(RuntimeException)
+{
+    const Reference < XCharacterClassification > xImpl(
+            getLocaleSpecificCharacterClassification(rLocale) );
+    return xImpl->getStringType(Text, nPos, nCount, rLocale);
+}
+
+// Forwards parseAnyToken() with unchanged arguments to the implementation
+// selected for rLocale.
+ParseResult SAL_CALL CharacterClassificationImpl::parseAnyToken(
+        const OUString& Text, sal_Int32 nPos, const Locale& rLocale,
+        sal_Int32 startCharTokenType, const OUString& userDefinedCharactersStart,
+        sal_Int32 contCharTokenType, const OUString& userDefinedCharactersCont )
+        throw(RuntimeException)
+{
+    const Reference < XCharacterClassification > xImpl(
+            getLocaleSpecificCharacterClassification(rLocale) );
+    return xImpl->parseAnyToken(
+            Text, nPos, rLocale,
+            startCharTokenType, userDefinedCharactersStart,
+            contCharTokenType, userDefinedCharactersCont);
+}
+
+
+// Forwards parsePredefinedToken() with unchanged arguments to the
+// implementation selected for rLocale.
+ParseResult SAL_CALL CharacterClassificationImpl::parsePredefinedToken(
+        sal_Int32 nTokenType, const OUString& Text, sal_Int32 nPos,
+        const Locale& rLocale, sal_Int32 startCharTokenType,
+        const OUString& userDefinedCharactersStart, sal_Int32 contCharTokenType,
+        const OUString& userDefinedCharactersCont ) throw(RuntimeException)
+{
+    const Reference < XCharacterClassification > xImpl(
+            getLocaleSpecificCharacterClassification(rLocale) );
+    return xImpl->parsePredefinedToken(
+            nTokenType, Text, nPos, rLocale,
+            startCharTokenType, userDefinedCharactersStart,
+            contCharTokenType, userDefinedCharactersCont);
+}
+
+// Registers an implementation for rLocale under the given service name
+// suffix.
+//
+// If lookupTable already holds an entry created for serviceName, its
+// implementation is shared; otherwise the service
+// "com.sun.star.i18n.CharacterClassification_<serviceName>" is instantiated
+// through xMSF.  On success a new entry for rLocale is appended to
+// lookupTable, cachedItem points to it and sal_True is returned.  On failure
+// sal_False is returned and cachedItem is left untouched.
+sal_Bool SAL_CALL CharacterClassificationImpl::createLocaleSpecificCharacterClassification(const OUString& serviceName, const Locale& rLocale)
+{
+    // to share service between same Language but different Country code, like zh_CN and zh_SG
+    for (size_t l = 0; l < lookupTable.size(); l++) {
+        // Use a local pointer so that an unsuccessful search does not leave
+        // cachedItem pointing at an unrelated entry.
+        lookupTableItem* pItem = lookupTable[l];
+        if (serviceName == pItem->aName) {
+            lookupTable.push_back( cachedItem = new lookupTableItem(rLocale, serviceName, pItem->xCI) );
+            return sal_True;
+        }
+    }
+
+    // Without a service factory nothing can be instantiated; the constructor
+    // calls this function without checking xMSF itself.
+    if ( !xMSF.is() )
+        return sal_False;
+
+    Reference < XInterface > xI = xMSF->createInstance(
+        OUString::createFromAscii("com.sun.star.i18n.CharacterClassification_") + serviceName);
+
+    Reference < XCharacterClassification > xCI;
+    if ( xI.is() ) {
+        xI->queryInterface(::getCppuType((const Reference< XCharacterClassification>*)0) ) >>= xCI;
+        if (xCI.is()) {
+            lookupTable.push_back( cachedItem = new lookupTableItem(rLocale, serviceName, xCI) );
+            return sal_True;
+        }
+    }
+    return sal_False;
+}
+
+// Returns the implementation to use for rLocale.
+//
+// Order of attempts: the cached entry, the entries already in lookupTable,
+// then newly created services named <lang>_<country>_<variant>,
+// <lang>_<country>, zh_TW (only for zh_HK and zh_MO) and <lang>.  If none of
+// these can be created, the generic Unicode implementation (xUCI) is
+// registered for rLocale.  Throws RuntimeException if there is no service
+// factory or no implementation at all.
+Reference < XCharacterClassification > SAL_CALL
+CharacterClassificationImpl::getLocaleSpecificCharacterClassification(const Locale& rLocale)
+        throw(RuntimeException)
+{
+    // reuse instance if locale didn't change
+    if (cachedItem && cachedItem->equals(rLocale))
+        return cachedItem->xCI;
+
+    if (!xMSF.is())
+        throw RuntimeException();
+
+    // an entry created earlier for this locale?
+    for (size_t nIdx = 0; nIdx < lookupTable.size(); ++nIdx) {
+        cachedItem = lookupTable[nIdx];
+        if (cachedItem->equals(rLocale))
+            return cachedItem->xCI;
+    }
+
+    static sal_Unicode under = (sal_Unicode)'_';
+    static OUString tw(OUString::createFromAscii("TW"));
+    const sal_Int32 nLangLen = rLocale.Language.getLength();
+    const sal_Int32 nCountryLen = rLocale.Country.getLength();
+    const sal_Int32 nVariantLen = rLocale.Variant.getLength();
+    OUStringBuffer aBuf(nLangLen+nCountryLen+nVariantLen+3);
+
+    const bool bLang = nLangLen > 0;
+    const bool bLangCountry = bLang && nCountryLen > 0;
+    sal_Bool bCreated = sal_False;
+
+    // load service with name <base>_<lang>_<country>_<variant>
+    if (bLangCountry && nVariantLen > 0) {
+        aBuf.append(rLocale.Language).append(under).append(rLocale.Country);
+        aBuf.append(under).append(rLocale.Variant);
+        bCreated = createLocaleSpecificCharacterClassification(aBuf.makeStringAndClear(), rLocale);
+    }
+
+    // load service with name <base>_<lang>_<country>
+    if (!bCreated && bLangCountry) {
+        aBuf.append(rLocale.Language).append(under).append(rLocale.Country);
+        bCreated = createLocaleSpecificCharacterClassification(aBuf.makeStringAndClear(), rLocale);
+    }
+
+    // if the country code is HK or MO, one more step to try TW.
+    if (!bCreated && bLangCountry && rLocale.Language.compareToAscii("zh") == 0 &&
+            (rLocale.Country.compareToAscii("HK") == 0 ||
+             rLocale.Country.compareToAscii("MO") == 0)) {
+        aBuf.append(rLocale.Language).append(under).append(tw);
+        bCreated = createLocaleSpecificCharacterClassification(aBuf.makeStringAndClear(), rLocale);
+    }
+
+    // load service with name <base>_<lang>
+    if (!bCreated && bLang)
+        bCreated = createLocaleSpecificCharacterClassification(rLocale.Language, rLocale);
+
+    if (bCreated)
+        return cachedItem->xCI;
+
+    // last resort: the generic Unicode implementation
+    if (xUCI.is()) {
+        lookupTable.push_back( cachedItem = new lookupTableItem(rLocale, OUString::createFromAscii("Unicode"), xUCI) );
+        return cachedItem->xCI;
+    }
+
+    throw RuntimeException();
+}
+
+// Name reported by getImplementationName() and getSupportedServiceNames(),
+// and the name supportsService() compares against.
+const sal_Char cClass[] = "com.sun.star.i18n.CharacterClassification";
+
+// Returns cClass as the implementation name.
+OUString SAL_CALL
+CharacterClassificationImpl::getImplementationName(void)
+        throw( RuntimeException )
+{
+    const OUString aImplName( OUString::createFromAscii(cClass) );
+    return aImplName;
+}
+
+// Returns true exactly when rServiceName equals cClass.
+sal_Bool SAL_CALL
+CharacterClassificationImpl::supportsService(const rtl::OUString& rServiceName)
+        throw( RuntimeException )
+{
+    // compareToAscii() yields 0 for equal strings
+    return rServiceName.compareToAscii(cClass) == 0;
+}
+
+// Returns a one-element sequence holding cClass.
+Sequence< OUString > SAL_CALL
+CharacterClassificationImpl::getSupportedServiceNames(void) throw( RuntimeException )
+{
+    Sequence< OUString > aServices(1);
+    OUString* const pNames = aServices.getArray();
+    pNames[0] = OUString::createFromAscii(cClass);
+    return aServices;
+}
+
+} } } }
diff --git a/i18npool/source/characterclassification/makefile.mk b/i18npool/source/characterclassification/makefile.mk
new file mode 100644
index 000000000000..7b8bfe98253a
--- /dev/null
+++ b/i18npool/source/characterclassification/makefile.mk
@@ -0,0 +1,52 @@
+#*************************************************************************
+#*
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# Copyright 2000, 2010 Oracle and/or its affiliates.
+#
+# OpenOffice.org - a multi-platform office productivity suite
+#
+# This file is part of OpenOffice.org.
+#
+# OpenOffice.org is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License version 3
+# only, as published by the Free Software Foundation.
+#
+# OpenOffice.org is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License version 3 for more details
+# (a copy is included in the LICENSE file that accompanied this code).
+#
+# You should have received a copy of the GNU Lesser General Public License
+# version 3 along with OpenOffice.org. If not, see
+# <http://www.openoffice.org/license.html>
+# for a copy of the LGPLv3 License.
+#
+#************************************************************************/
+
+PRJ=..$/..
+
+PRJNAME=i18npool
+TARGET=characterclassification
+
+ENABLE_EXCEPTIONS=TRUE
+
+# --- Settings -----------------------------------------------------
+
+.INCLUDE : settings.mk
+
+# --- Files --------------------------------------------------------
+
+SLOFILES= \
+ $(SLO)$/characterclassificationImpl.obj \
+ $(SLO)$/cclass_unicode.obj \
+ $(SLO)$/cclass_unicode_parser.obj \
+ $(SLO)$/scripttypedetector.obj
+
+# --- Targets ------------------------------------------------------
+
+.INCLUDE : target.mk
+
+
+
diff --git a/i18npool/source/characterclassification/scripttypedetector.cxx b/i18npool/source/characterclassification/scripttypedetector.cxx
new file mode 100644
index 000000000000..cb90e99075e1
--- /dev/null
+++ b/i18npool/source/characterclassification/scripttypedetector.cxx
@@ -0,0 +1,182 @@
+/*************************************************************************
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * Copyright 2000, 2010 Oracle and/or its affiliates.
+ *
+ * OpenOffice.org - a multi-platform office productivity suite
+ *
+ * This file is part of OpenOffice.org.
+ *
+ * OpenOffice.org is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 3
+ * only, as published by the Free Software Foundation.
+ *
+ * OpenOffice.org is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License version 3 for more details
+ * (a copy is included in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * version 3 along with OpenOffice.org. If not, see
+ * <http://www.openoffice.org/license.html>
+ * for a copy of the LGPLv3 License.
+ *
+ ************************************************************************/
+
+// MARKER(update_precomp.py): autogen include statement, do not remove
+#include "precompiled_i18npool.hxx"
+
+#include <com/sun/star/i18n/CTLScriptType.hpp>
+#include <com/sun/star/i18n/ScriptDirection.hpp>
+#include <com/sun/star/i18n/UnicodeScript.hpp>
+#include <scripttypedetector.hxx>
+#include <i18nutil/unicode.hxx>
+
+// ----------------------------------------------------
+// class ScriptTypeDetector
+// ----------------------------------------------------;
+
+using namespace com::sun::star::i18n;
+
+// Default constructor; the body is intentionally empty.
+ScriptTypeDetector::ScriptTypeDetector()
+{
+}
+
+// Destructor; the body is intentionally empty.
+ScriptTypeDetector::~ScriptTypeDetector()
+{
+}
+
+// Maps the direction class returned by unicode::getUnicodeDirection() (used
+// as index, see the value noted on each row) to a ScriptDirection constant.
+// The table is only ever read, hence const.
+static const sal_Int16 scriptDirection[] = {
+    ScriptDirection::LEFT_TO_RIGHT,     // DirectionProperty_LEFT_TO_RIGHT = 0,
+    ScriptDirection::RIGHT_TO_LEFT,     // DirectionProperty_RIGHT_TO_LEFT = 1,
+    ScriptDirection::LEFT_TO_RIGHT,     // DirectionProperty_EUROPEAN_NUMBER = 2,
+    ScriptDirection::LEFT_TO_RIGHT,     // DirectionProperty_EUROPEAN_NUMBER_SEPARATOR = 3,
+    ScriptDirection::LEFT_TO_RIGHT,     // DirectionProperty_EUROPEAN_NUMBER_TERMINATOR = 4,
+    ScriptDirection::RIGHT_TO_LEFT,     // DirectionProperty_ARABIC_NUMBER = 5,
+    ScriptDirection::NEUTRAL,           // DirectionProperty_COMMON_NUMBER_SEPARATOR = 6,
+    ScriptDirection::NEUTRAL,           // DirectionProperty_BLOCK_SEPARATOR = 7,
+    ScriptDirection::NEUTRAL,           // DirectionProperty_SEGMENT_SEPARATOR = 8,
+    ScriptDirection::NEUTRAL,           // DirectionProperty_WHITE_SPACE_NEUTRAL = 9,
+    ScriptDirection::NEUTRAL,           // DirectionProperty_OTHER_NEUTRAL = 10,
+    ScriptDirection::LEFT_TO_RIGHT,     // DirectionProperty_LEFT_TO_RIGHT_EMBEDDING = 11,
+    ScriptDirection::LEFT_TO_RIGHT,     // DirectionProperty_LEFT_TO_RIGHT_OVERRIDE = 12,
+    ScriptDirection::RIGHT_TO_LEFT,     // DirectionProperty_RIGHT_TO_LEFT_ARABIC = 13,
+    ScriptDirection::RIGHT_TO_LEFT,     // DirectionProperty_RIGHT_TO_LEFT_EMBEDDING = 14,
+    ScriptDirection::RIGHT_TO_LEFT,     // DirectionProperty_RIGHT_TO_LEFT_OVERRIDE = 15,
+    ScriptDirection::NEUTRAL,           // DirectionProperty_POP_DIRECTIONAL_FORMAT = 16,
+    ScriptDirection::NEUTRAL,           // DirectionProperty_DIR_NON_SPACING_MARK = 17,
+    ScriptDirection::NEUTRAL,           // DirectionProperty_BOUNDARY_NEUTRAL = 18,
+};
+
+// Returns the script direction of the character at Text[nPos].
+//
+// The direction class of the character is translated through the
+// scriptDirection table; a NEUTRAL result is replaced by
+// defaultScriptDirection.  defaultScriptDirection is also returned when nPos
+// lies outside of Text or when the direction class has no entry in the
+// table, so that neither the string nor the table is read out of bounds.
+sal_Int16 SAL_CALL
+ScriptTypeDetector::getScriptDirection( const ::rtl::OUString& Text, sal_Int32 nPos, sal_Int16 defaultScriptDirection ) throw (::com::sun::star::uno::RuntimeException)
+{
+    if (nPos < 0 || nPos >= Text.getLength())
+        return defaultScriptDirection;
+
+    const sal_Int32 nClass = unicode::getUnicodeDirection(Text[nPos]);
+    const sal_Int32 nKnownClasses =
+        static_cast< sal_Int32 >(sizeof(scriptDirection) / sizeof(scriptDirection[0]));
+    if (nClass < 0 || nClass >= nKnownClasses)
+        return defaultScriptDirection;  // class unknown to the table
+
+    sal_Int16 dir = scriptDirection[nClass];
+    return (dir == ScriptDirection::NEUTRAL) ? defaultScriptDirection : dir;
+}
+
+// Walks backwards from nPos while the characters have the given direction
+// (neutral characters count as matching) and returns the index of the first
+// character of that run.
+// Returns -1 if the character at nPos does not have the given direction or
+// if nPos is out of range.
+sal_Int32 SAL_CALL
+ScriptTypeDetector::beginOfScriptDirection( const ::rtl::OUString& Text, sal_Int32 nPos, sal_Int16 direction ) throw (::com::sun::star::uno::RuntimeException)
+{
+    if (nPos >= Text.getLength())
+        return -1;
+
+    sal_Int32 nIdx = nPos;
+    while (nIdx >= 0 && getScriptDirection(Text, nIdx, direction) == direction)
+        --nIdx;
+
+    // nIdx now is one before the run, or unchanged if there is no run
+    if (nIdx == nPos)
+        return -1;
+    return nIdx + 1;
+}
+
+// Walks forward from nPos while the characters have the given direction
+// (neutral characters count as matching) and returns the index following
+// that run.
+// Returns -1 if the character at nPos does not have the given direction or
+// if nPos is out of range.
+sal_Int32 SAL_CALL
+ScriptTypeDetector::endOfScriptDirection( const ::rtl::OUString& Text, sal_Int32 nPos, sal_Int16 direction ) throw (::com::sun::star::uno::RuntimeException)
+{
+    if (nPos < 0)
+        return -1;
+
+    const sal_Int32 nLen = Text.getLength();
+    sal_Int32 nIdx = nPos;
+    while (nIdx < nLen && getScriptDirection(Text, nIdx, direction) == direction)
+        ++nIdx;
+
+    // nIdx now is behind the run, or unchanged if there is no run
+    if (nIdx == nPos)
+        return -1;
+    return nIdx;
+}
+
+// Returns the CTLScriptType of the character at Text[nPos] by passing it,
+// together with the list below, to unicode::getUnicodeScriptType();
+// CTL_UNKNOWN is handed over as the default type.
+sal_Int16 SAL_CALL
+ScriptTypeDetector::getCTLScriptType( const ::rtl::OUString& Text, sal_Int32 nPos ) throw (::com::sun::star::uno::RuntimeException)
+{
+    static ScriptTypeList typeList[] = {
+        { UnicodeScript_kHebrew, UnicodeScript_kHebrew, CTLScriptType::CTL_HEBREW },    // 10
+        { UnicodeScript_kArabic, UnicodeScript_kArabic, CTLScriptType::CTL_ARABIC },    // 11
+        { UnicodeScript_kDevanagari, UnicodeScript_kDevanagari, CTLScriptType::CTL_INDIC }, // 14
+        { UnicodeScript_kThai, UnicodeScript_kThai, CTLScriptType::CTL_THAI },          // 24
+        { UnicodeScript_kScriptCount, UnicodeScript_kScriptCount, CTLScriptType::CTL_UNKNOWN }  // 88
+    };
+
+    const sal_Unicode cChar = Text[nPos];
+    return unicode::getUnicodeScriptType(cChar, typeList, CTLScriptType::CTL_UNKNOWN);
+}
+
+// Returns the index of the first character of the run of equal CTL script
+// type that contains nPos (the begin is inclusive).
+// A negative nPos yields 0, an nPos at or behind the end yields the length
+// of Text.
+sal_Int32 SAL_CALL
+ScriptTypeDetector::beginOfCTLScriptType( const ::rtl::OUString& Text, sal_Int32 nPos ) throw (::com::sun::star::uno::RuntimeException)
+{
+    if (nPos < 0)
+        return 0;
+    if (nPos >= Text.getLength())
+        return Text.getLength();
+
+    const sal_Int16 nType = getCTLScriptType(Text, nPos);
+    sal_Int32 nBegin = nPos;
+    // extend the run to the left as long as the preceding character matches
+    while (nBegin > 0 && getCTLScriptType(Text, nBegin - 1) == nType)
+        --nBegin;
+    return nBegin;
+}
+
+// Returns the index behind the run of equal CTL script type that contains
+// nPos (the end is exclusive, i.e. it is the begin of the next script type).
+// A negative nPos yields 0, an nPos at or behind the end yields the length
+// of Text.
+sal_Int32 SAL_CALL
+ScriptTypeDetector::endOfCTLScriptType( const ::rtl::OUString& Text, sal_Int32 nPos ) throw (::com::sun::star::uno::RuntimeException)
+{
+    if (nPos < 0)
+        return 0;
+    if (nPos >= Text.getLength())
+        return Text.getLength();
+
+    const sal_Int16 nType = getCTLScriptType(Text, nPos);
+    const sal_Int32 nLen = Text.getLength();
+    sal_Int32 nEnd = nPos + 1;
+    // extend the run to the right as long as the next character matches
+    while (nEnd < nLen && getCTLScriptType(Text, nEnd) == nType)
+        ++nEnd;
+    return nEnd;
+}
+
+// Name reported by getImplementationName() and getSupportedServiceNames(),
+// and the name supportsService() compares against.
+const sal_Char sDetector[] = "draft.com.sun.star.i18n.ScriptTypeDetector";
+
+// Returns sDetector as the implementation name.
+rtl::OUString SAL_CALL
+ScriptTypeDetector::getImplementationName() throw( ::com::sun::star::uno::RuntimeException )
+{
+    const ::rtl::OUString aImplName( ::rtl::OUString::createFromAscii(sDetector) );
+    return aImplName;
+}
+
+// Returns true exactly when ServiceName equals sDetector.
+sal_Bool SAL_CALL
+ScriptTypeDetector::supportsService(const rtl::OUString& ServiceName) throw( ::com::sun::star::uno::RuntimeException )
+{
+    // compareToAscii() yields 0 for equal strings
+    return ServiceName.compareToAscii(sDetector) == 0;
+}
+
+// Returns a one-element sequence holding sDetector.
+::com::sun::star::uno::Sequence< rtl::OUString > SAL_CALL
+ScriptTypeDetector::getSupportedServiceNames() throw( ::com::sun::star::uno::RuntimeException )
+{
+    ::com::sun::star::uno::Sequence< ::rtl::OUString > aServices(1);
+    ::rtl::OUString* const pNames = aServices.getArray();
+    pNames[0] = ::rtl::OUString::createFromAscii(sDetector);
+    return aServices;
+}
+