diff options
Diffstat (limited to 'i18npool/source/breakiterator')
-rw-r--r-- | i18npool/source/breakiterator/breakiteratorImpl.cxx | 109 | ||||
-rw-r--r-- | i18npool/source/breakiterator/breakiterator_cjk.cxx | 3 | ||||
-rw-r--r-- | i18npool/source/breakiterator/breakiterator_ctl.cxx | 3 | ||||
-rw-r--r-- | i18npool/source/breakiterator/breakiterator_th.cxx | 3 | ||||
-rw-r--r-- | i18npool/source/breakiterator/breakiterator_unicode.cxx | 3 | ||||
-rw-r--r-- | i18npool/source/breakiterator/data/char.txt | 118 | ||||
-rw-r--r-- | i18npool/source/breakiterator/data/makefile.mk | 22 | ||||
-rw-r--r-- | i18npool/source/breakiterator/gendict.cxx | 305 | ||||
-rw-r--r-- | i18npool/source/breakiterator/makefile.mk | 63 | ||||
-rw-r--r-- | i18npool/source/breakiterator/xdictionary.cxx | 160 |
10 files changed, 463 insertions, 326 deletions
diff --git a/i18npool/source/breakiterator/breakiteratorImpl.cxx b/i18npool/source/breakiterator/breakiteratorImpl.cxx index 19b175d4bd83..3cc974870c3d 100644 --- a/i18npool/source/breakiterator/breakiteratorImpl.cxx +++ b/i18npool/source/breakiterator/breakiteratorImpl.cxx @@ -1,3 +1,4 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /************************************************************************* * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. @@ -132,7 +133,7 @@ Boundary SAL_CALL BreakIteratorImpl::nextWord( const OUString& Text, sal_Int32 n } static inline sal_Bool SAL_CALL isCJK( const Locale& rLocale ) { - return rLocale.Language.equalsAscii("zh") || rLocale.Language.equalsAscii("ja") || rLocale.Language.equalsAscii("ko"); + return rLocale.Language.equalsAsciiL(RTL_CONSTASCII_STRINGPARAM("zh")) || rLocale.Language.equalsAsciiL(RTL_CONSTASCII_STRINGPARAM("ja")) || rLocale.Language.equalsAsciiL(RTL_CONSTASCII_STRINGPARAM("ko")); } Boundary SAL_CALL BreakIteratorImpl::previousWord( const OUString& Text, sal_Int32 nStartPos, @@ -442,38 +443,49 @@ sal_Int16 SAL_CALL BreakIteratorImpl::getWordType( const OUString& /*Text*/, return 0; } -typedef struct { - UBlockCode from; - UBlockCode to; - sal_Int16 script; -} UBlock2Script; - -// for a list of the UBLOCK_... values see: -// http://icu-project.org/apiref/icu4c/uchar_8h.html -// where enum UBlockCode is defined. 
-// See also http://www.unicode.org/charts/ for general reference -static UBlock2Script scriptList[] = { - {UBLOCK_NO_BLOCK, UBLOCK_NO_BLOCK, ScriptType::WEAK}, - {UBLOCK_BASIC_LATIN, UBLOCK_ARMENIAN, ScriptType::LATIN}, - {UBLOCK_HEBREW, UBLOCK_MYANMAR, ScriptType::COMPLEX}, - {UBLOCK_GEORGIAN, UBLOCK_GEORGIAN, ScriptType::LATIN}, - {UBLOCK_HANGUL_JAMO, UBLOCK_HANGUL_JAMO, ScriptType::ASIAN}, - {UBLOCK_ETHIOPIC, UBLOCK_ETHIOPIC, ScriptType::COMPLEX}, - {UBLOCK_CHEROKEE, UBLOCK_RUNIC, ScriptType::LATIN}, - {UBLOCK_KHMER, UBLOCK_MONGOLIAN, ScriptType::COMPLEX}, - {UBLOCK_LATIN_EXTENDED_ADDITIONAL, UBLOCK_GREEK_EXTENDED, ScriptType::LATIN}, - {UBLOCK_CJK_RADICALS_SUPPLEMENT, UBLOCK_HANGUL_SYLLABLES, ScriptType::ASIAN}, - {UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, ScriptType::ASIAN}, - {UBLOCK_ARABIC_PRESENTATION_FORMS_A, UBLOCK_ARABIC_PRESENTATION_FORMS_A, ScriptType::COMPLEX}, - {UBLOCK_CJK_COMPATIBILITY_FORMS, UBLOCK_CJK_COMPATIBILITY_FORMS, ScriptType::ASIAN}, - {UBLOCK_ARABIC_PRESENTATION_FORMS_B, UBLOCK_ARABIC_PRESENTATION_FORMS_B, ScriptType::COMPLEX}, - {UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, ScriptType::ASIAN}, - {UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, ScriptType::ASIAN}, - {UBLOCK_CJK_STROKES, UBLOCK_CJK_STROKES, ScriptType::ASIAN}, - {UBLOCK_LATIN_EXTENDED_C, UBLOCK_LATIN_EXTENDED_D, ScriptType::LATIN} -}; - -#define scriptListCount sizeof (scriptList) / sizeof (UBlock2Script) +static sal_Int16 scriptTypes[] = { + ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, + ScriptType::ASIAN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, +// 15 + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, ScriptType::COMPLEX, + ScriptType::ASIAN, 
ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, +// 30 + ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::LATIN, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, +// 45 + ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, + ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, +// 60 + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, +// 75 + ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, +// 90 + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, +// 105 + ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, 
ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, +// 120 + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, +// 135 + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, + ScriptType::COMPLEX, + ScriptType::WEAK}; + +#define scriptListCount SAL_N_ELEMENTS(scriptTypes) sal_Int16 BreakIteratorImpl::getScriptClass(sal_uInt32 currentChar) { @@ -483,27 +495,13 @@ sal_Int16 BreakIteratorImpl::getScriptClass(sal_uInt32 currentChar) if (currentChar != lastChar) { lastChar = currentChar; - //JP 21.9.2001: handle specific characters - always as weak - // definition of 1 - this breaks a word - // 2 - this can be inside a word - // 0x20 & 0xA0 - Bug 102975, declare western space and non-break space as WEAK char. - if( 1 == currentChar || 2 == currentChar || 0x20 == currentChar || 0xA0 == currentChar) + int32_t script = u_getIntPropertyValue(currentChar, UCHAR_SCRIPT); + if (script < 0) nRet = ScriptType::WEAK; - // workaround for Coptic - else if ( 0x2C80 <= currentChar && 0x2CE3 >= currentChar) - nRet = ScriptType::LATIN; - // work-around for ligatures (see http://www.unicode.org/charts/PDF/UFB00.pdf) - else if ((0xFB00 <= currentChar && currentChar <= 0xFB06) || - (0xFB13 <= currentChar && currentChar <= 0xFB17)) - nRet = ScriptType::LATIN; - else { - UBlockCode block=ublock_getCode(currentChar); - sal_uInt16 i; - for ( i = 0; i < scriptListCount; i++) { - if (block <= scriptList[i].to) break; - } - nRet=(i < scriptListCount && block >= scriptList[i].from) ? 
scriptList[i].script : ScriptType::WEAK; - } + else if (static_cast<size_t>(script) >= SAL_N_ELEMENTS(scriptTypes)) + nRet = ScriptType::COMPLEX; // anything new is going to be pretty wild + else + nRet = scriptTypes[script]; } return nRet; } @@ -524,7 +522,7 @@ sal_Bool SAL_CALL BreakIteratorImpl::createLocaleSpecificBreakIterator(const OUS } Reference < uno::XInterface > xI = xMSF->createInstance( - OUString::createFromAscii("com.sun.star.i18n.BreakIterator_") + aLocaleName); + OUString(RTL_CONSTASCII_USTRINGPARAM("com.sun.star.i18n.BreakIterator_")) + aLocaleName); if ( xI.is() ) { xI->queryInterface( getCppuType((const Reference< XBreakIterator>*)0) ) >>= xBI; @@ -575,7 +573,7 @@ BreakIteratorImpl::getLocaleSpecificBreakIterator(const Locale& rLocale) throw ( // load service with name <base>_<lang> createLocaleSpecificBreakIterator(rLocale.Language)) || // load default service with name <base>_Unicode - createLocaleSpecificBreakIterator(OUString::createFromAscii("Unicode"))) { + createLocaleSpecificBreakIterator(OUString(RTL_CONSTASCII_USTRINGPARAM("Unicode")))) { lookupTable.push_back( new lookupTableItem(aLocale, xBI) ); return xBI; } @@ -607,3 +605,4 @@ BreakIteratorImpl::getSupportedServiceNames(void) throw( RuntimeException ) } } } } +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/i18npool/source/breakiterator/breakiterator_cjk.cxx b/i18npool/source/breakiterator/breakiterator_cjk.cxx index 16d7d1337538..ce7170c2a411 100644 --- a/i18npool/source/breakiterator/breakiterator_cjk.cxx +++ b/i18npool/source/breakiterator/breakiterator_cjk.cxx @@ -1,3 +1,4 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /************************************************************************* * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
@@ -184,3 +185,5 @@ BreakIterator_ko::~BreakIterator_ko() } } } } } + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/i18npool/source/breakiterator/breakiterator_ctl.cxx b/i18npool/source/breakiterator/breakiterator_ctl.cxx index cc174084198e..6d6b3f9b9cd7 100644 --- a/i18npool/source/breakiterator/breakiterator_ctl.cxx +++ b/i18npool/source/breakiterator/breakiterator_ctl.cxx @@ -1,3 +1,4 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /************************************************************************* * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. @@ -143,3 +144,5 @@ LineBreakResults SAL_CALL BreakIterator_CTL::getLineBreak( } } } } } + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/i18npool/source/breakiterator/breakiterator_th.cxx b/i18npool/source/breakiterator/breakiterator_th.cxx index cbbcd510379b..ad3c619d0979 100644 --- a/i18npool/source/breakiterator/breakiterator_th.cxx +++ b/i18npool/source/breakiterator/breakiterator_th.cxx @@ -1,3 +1,4 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /************************************************************************* * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. @@ -155,3 +156,5 @@ void SAL_CALL BreakIterator_th::makeIndex(const OUString& Text, sal_Int32 nStart } } } } } + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/i18npool/source/breakiterator/breakiterator_unicode.cxx b/i18npool/source/breakiterator/breakiterator_unicode.cxx index d7242d180d85..ad934db2db11 100644 --- a/i18npool/source/breakiterator/breakiterator_unicode.cxx +++ b/i18npool/source/breakiterator/breakiterator_unicode.cxx @@ -1,3 +1,4 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /************************************************************************* * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
@@ -443,3 +444,5 @@ BreakIterator_Unicode::getSupportedServiceNames(void) throw( uno::RuntimeExcepti } } } } } + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/i18npool/source/breakiterator/data/char.txt b/i18npool/source/breakiterator/data/char.txt new file mode 100644 index 000000000000..8e49a565ed8c --- /dev/null +++ b/i18npool/source/breakiterator/data/char.txt @@ -0,0 +1,118 @@ +# +# Copyright (C) 2002-2009, International Business Machines Corporation and others. +# All Rights Reserved. +# +# file: char.txt +# +# ICU Character Break Rules, also known as Grapheme Cluster Boundaries +# See Unicode Standard Annex #29. +# These rules are based on TR29 Revision 13, for Unicode Version 5.1 +# Modifications to SpacingMark and Prepend by M. Hosken. +# + +# +# Character Class Definitions. +# +$CR = [\p{Grapheme_Cluster_Break = CR}]; +$LF = [\p{Grapheme_Cluster_Break = LF}]; +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$Prepend = [\p{Grapheme_Cluster_Break = Prepend}]; +$Extend = [\p{Grapheme_Cluster_Break = Extend}]; +$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}]; +# True Indic wants to move by syllables. Break up SpacingMark. 
This based on Unicode 6.0 data +# In effect it is [\p{Grapheme_Cluster_Break = SpacingMark} - \u0E30 \u0E32 \u0E45 \u0EB0 \u0EB2 \u102B \u102C \u1038 +# \u1062-\u1064 \u1067-\u106D \u1083 \u1087-\u108C \u108F \u109A-\u109C \u19B0-\u19B4 \u19B8-\u19C0 \u19C8 \u19C9 +# \u1A61 \u1A63 \u1A64 \u1BE7 \u1BEA-\u1BEC \u1BEE \u1BF2 \u1BF3 \uAA7B +$IndicSpacing = [\u0903 \u093B \u093E-\u0940 \u0949-\u094C \u094E \u094F \u0982 \u0983 \u09BF \u09C0 \u09C7 \u09C8 \u09CB \u09CC \u0A03 \u0A3E-\u0A40 \u0A83 \u0ABE-\u0AC0 \u0AC9 \u0ACB \u0ACC \u0B02 \u0B03 \u0B40 \u0B47 \u0B48 \u0B4B-\u0B4C \u0BBF \u0BC1 \u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0C01-\u0C03 \u0C41-\u0C44 \u0C82 \u0C83 \u0CBE \u0CC0 \u0CC1 \u0CC3 \u0CC4 \u0CC7 \u0CC8 \u0CCA \u0CCB \u0D02 \u0D03 \u0D3F \u0D40 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D82 \u0D83 \u0DD0 \u0DD1 \u0DD8-\u0DDE \u0F3E \u0F3F \u0F7F \u1923-\u1926 \u1929-\u192B \u1930 \u1931 \u1933-\u1938 \u1A19-\u1A1B \u1B04 \u1B35 \u1B3B \u1B3D-\u1B41 \u1B43 \u1B44 \u1B82 \u1BA1 \u1BA6 \u1BA7 \u1BAA \u1C24-\u1C2B \u1C34 \u1C35 \u1CE1 \u1CF2 \uA880 \uA881 \uA8B4-\uA8C3 \uA952 \uA953 \uA983 \uA9B4 \uA9B5 \uA9BA \uA9BB \uA9BD-\uA9C0 \uAA2F \uAA30 \uAA33 \uAA34 \uABE3 \uABE4 \uABE6 \uABE7 \uABE9 \uABEA \uABEC \U00011000 \U00011002 \U00011082 \U000110B0-\U000110B2 \U000110B7 \U000100B8 \U0001D166 \U0001D16D]; +# SEAsian (Thai, Lao, Burmese, Tai Lue, Tai Tham, Batak) are cluster based not syllable based +$SEASpacing = [\u0E33 \u0EB3 \u1031 \u103B \u103C \u1056 \u1057 \u1084 \u17B6 \u17BE-\u17C5 \u17C7 \u17C8 \u19B5-\u19B7 \u19BA \u1A55 \u1A57 \u1A6D-\u1A72 \uA823 \uA824 \uA827 \uAA4D]; +$BengaliLetter = [\u0985-\u09B9 \u09CE \u09DC-\u09E1 \u09F0-\u09F1]; +$BengaliSignVirama = \u09CD; +$GujaratiLetter = [\u0A85-\u0A8C \u0A8F-\u0A90 \u0A93-\u0AB9 \u0AE0-\u0AE1]; +$GujaratiSignVirama = \u0ACD; +$DevanagariLetter = [\u0904-\u0939 \u0958-\u0961 \u0972-\u097F]; +$DevanagariSignVirama = \u094D; +$KannadaLetter = [\u0C85-\u0CB9 \u0CDE-\u0CE1]; +$KannadaSignVirama = \u0CCD; 
+$MalayalamLetter = [\u0D05-\u0D39 \u0D60-\u0D61 \u0D7A-\u0D7F]; +$MalayalamSignVirama = \u0D4D; +$OriyaLetter = [\u0B05-\u0B39 \u0B5C-\u0B61 \u0B71]; +$OriyaSignVirama = \u0B4D; +$GurmukhiLetter = [\u0A05-\u0A39 \u0A59-\u0A5E]; +$GurmukhiSignVirama = \u0A4D; +$TamilLetter = [\u0B85-\u0BB9]; +$TamilSignVirama = \u0BCD; +$TeluguLetter = [\u0C05-\u0C39 \u0C58-\u0C61]; +$TeluguSignVirama = \u0C4D; + +# +# Korean Syllable Definitions +# +$L = [\p{Grapheme_Cluster_Break = L}]; +$V = [\p{Grapheme_Cluster_Break = V}]; +$T = [\p{Grapheme_Cluster_Break = T}]; + +$LV = [\p{Grapheme_Cluster_Break = LV}]; +$LVT = [\p{Grapheme_Cluster_Break = LVT}]; + + +## ------------------------------------------------- +!!chain; + +!!forward; + +$CR $LF; + +$BengaliLetter ($BengaliSignVirama $BengaliLetter?)+; +$GujaratiLetter ($GujaratiSignVirama $GujaratiLetter?)+; +$DevanagariLetter ($DevanagariSignVirama $DevanagariLetter?)+; +$KannadaLetter ($KannadaSignVirama $KannadaLetter?)+; +$MalayalamLetter ($MalayalamSignVirama $MalayalamLetter?)+; +$OriyaLetter ($OriyaSignVirama $OriyaLetter?)+; +$GurmukhiLetter ($GurmukhiSignVirama $GurmukhiLetter?)+; +$TamilLetter ($TamilSignVirama $TamilLetter?)+; +$TeluguLetter ($TeluguSignVirama $TeluguLetter?)+; + +$L ($L | $V | $LV | $LVT); +($LV | $V) ($V | $T); +($LVT | $T) $T; + +[^$Control $CR $LF] $Extend; + +[^$Control $CR $LF] ($IndicSpacing | $SEASpacing); +#[^$Control $CR $LF] $SpacingMark; +# $Prepend [^$Control $CR $LF]; + + +## ------------------------------------------------- + +!!reverse; +$LF $CR; +($BengaliLetter? $BengaliSignVirama)+ $BengaliLetter; +($GujaratiLetter? $GujaratiSignVirama)+ $GujaratiLetter; +($DevanagariLetter? $DevanagariSignVirama)+ $DevanagariLetter; +($KannadaLetter? $KannadaSignVirama)+ $KannadaLetter; +($MalayalamLetter? $MalayalamSignVirama)+ $MalayalamLetter; +($OriyaLetter? $OriyaSignVirama)+ $OriyaLetter; +($GurmukhiLetter? $GurmukhiSignVirama)+ $GurmukhiLetter; +($TamilLetter? 
$TamilSignVirama)+ $TamilLetter; +($TeluguLetter? $TeluguSignVirama)+ $TeluguLetter; +($L | $V | $LV | $LVT) $L; +($V | $T) ($LV | $V); +$T ($LVT | $T); + +$Extend [^$Control $CR $LF]; +($IndicSpacing | $SEASpacing) [^$Control $CR $LF]; +#$SpacingMark [^$Control $CR $LF]; +# [^$Control $CR $LF] $Prepend; + + +## ------------------------------------------------- + +!!safe_reverse; + + +## ------------------------------------------------- + +!!safe_forward; + diff --git a/i18npool/source/breakiterator/data/makefile.mk b/i18npool/source/breakiterator/data/makefile.mk index cb37c5132f67..81bbbbd280e1 100644 --- a/i18npool/source/breakiterator/data/makefile.mk +++ b/i18npool/source/breakiterator/data/makefile.mk @@ -24,7 +24,7 @@ # for a copy of the LGPLv3 License. # #************************************************************************ -PRJ=..$/..$/.. +PRJ=../../.. PRJNAME=i18npool TARGET=dict @@ -46,13 +46,13 @@ SHL1TARGET=dict_ja SHL1IMPLIB=i$(SHL1TARGET) SHL1VERSIONMAP=$(TARGET).map -SHL1DEF=$(MISC)$/$(SHL1TARGET).def +SHL1DEF=$(MISC)/$(SHL1TARGET).def DEF1NAME=$(SHL1TARGET) SHL1OBJS= \ - $(SLO)$/dict_ja.obj + $(SLO)/dict_ja.obj -LIB1TARGET= $(SLB)$/$(SHL1TARGET).lib +LIB1TARGET= $(SLB)/$(SHL1TARGET).lib LIB1OBJFILES=$(SHL1OBJS) # Chinese dictionary @@ -60,13 +60,13 @@ SHL2TARGET=dict_zh SHL2IMPLIB=i$(SHL2TARGET) SHL2VERSIONMAP=$(TARGET).map -SHL2DEF=$(MISC)$/$(SHL2TARGET).def +SHL2DEF=$(MISC)/$(SHL2TARGET).def DEF2NAME=$(SHL2TARGET) SHL2OBJS= \ - $(SLO)$/dict_zh.obj + $(SLO)/dict_zh.obj -LIB2TARGET= $(SLB)$/$(SHL2TARGET).lib +LIB2TARGET= $(SLB)/$(SHL2TARGET).lib LIB2OBJFILES=$(SHL2OBJS) DEPOBJFILES= \ @@ -76,9 +76,5 @@ DEPOBJFILES= \ # --- Targets ------------------------------------------------------ .INCLUDE : target.mk -$(MISC)$/dict_%.cxx : %.dic - $(AUGMENT_LIBRARY_PATH) $(BIN)$/gendict $< $@ - -# ugly - is this dependency really required here? -$(foreach,i,$(shell @$(FIND) . 
-name "*.dic") $(MISC)$/dict_$(i:b).cxx) : $(BIN)$/gendict$(EXECPOST) - +$(MISC)/dict_%.cxx : %.dic + $(AUGMENT_LIBRARY_PATH) $(OUT_FOR_BUILD)/bin/gendict $< $@ diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx index fe2758602ee4..ab181be73836 100644 --- a/i18npool/source/breakiterator/gendict.cxx +++ b/i18npool/source/breakiterator/gendict.cxx @@ -1,3 +1,4 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /************************************************************************* * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. @@ -35,171 +36,213 @@ #include <sal/types.h> #include <rtl/strbuf.hxx> #include <rtl/ustring.hxx> +#include <osl/diagnose.h> +#include <vector> +using std::vector; using namespace ::rtl; -/* Main Procedure */ +/* Utility gendict: -SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv) -{ - FILE *sfp, *cfp; + "BreakIterator_CJK provides input string caching and dictionary searching for + longest matching. You can provide a sorted dictionary (the encoding must be + UTF-8) by creating the following file: + i18npool/source/breakiterator/data/<language>.dict. - if (argc < 3) exit(-1); + The utility gendict will convert the file to C code, which will be compiled + into a shared library for dynamic loading. - sfp = fopen(argv[1], "rb"); // open the source file for read; - if (sfp == NULL) - { - printf("Open the dictionary source file failed."); - return -1; - } + All dictionary searching and loading is performed in the xdictionary class. + The only thing you need to do is to derive your class from BreakIterator_CJK + and create an instance of the xdictionary with the language name and + pass it to the parent class." 
(from http://wiki.services.openoffice.org/wiki/ + /Documentation/DevGuide/OfficeDev/Implementing_a_New_Locale - 27/01/2011) +*/ - // create the C source file to write - cfp = fopen(argv[2], "wb"); - if (cfp == NULL) { - fclose(sfp); - printf("Can't create the C source file."); - return -1; - } +// C-standard garantees that static variables are automatically initialized to 0 +static sal_uInt8 exists[0x2000]; +static sal_uInt32 charArray[0x10000]; - fprintf(cfp, "/*\n"); - fprintf(cfp, " * Copyright(c) 1999 - 2000, Sun Microsystems, Inc.\n"); - fprintf(cfp, " * All Rights Reserved.\n"); - fprintf(cfp, " */\n\n"); - fprintf(cfp, "/* !!!The file is generated automatically. DONOT edit the file manually!!! */\n\n"); - fprintf(cfp, "#include <sal/types.h>\n\n"); - fprintf(cfp, "extern \"C\" {\n"); - - sal_Int32 count, i, j; - sal_Int32 lenArrayCurr = 0, lenArrayCount = 0, lenArrayLen = 0, *lenArray = NULL, charArray[0x10000]; - sal_Bool exist[0x10000]; - for (i = 0; i < 0x10000; i++) { - exist[i] = sal_False; - charArray[i] = 0; - } +static inline void set_exists(sal_uInt32 index) +{ + exists[index>>3] |= 1 << (index & 0x07); +} + +static inline void printIncludes(FILE* source_fp) +{ + fputs("/* !!!The file is generated automatically. DO NOT edit the file manually!!! */\n\n", source_fp); + fputs("#include <sal/types.h>\n\n", source_fp); +} +static inline void printFunctions(FILE* source_fp) +{ + fputs ("\tconst sal_uInt8* getExistMark() { return existMark; }\n", source_fp); + fputs ("\tconst sal_Int16* getIndex1() { return index1; }\n", source_fp); + fputs ("\tconst sal_Int32* getIndex2() { return index2; }\n", source_fp); + fputs ("\tconst sal_Int32* getLenArray() { return lenArray; }\n", source_fp); + fputs ("\tconst sal_Unicode* getDataArea() { return dataArea; }\n", source_fp); +} + +static inline void printDataArea(FILE *dictionary_fp, FILE *source_fp, vector<sal_uInt32>& lenArray) +{ // generate main dict. 
data array - fprintf(cfp, "static const sal_Unicode dataArea[] = {"); + fputs("static const sal_Unicode dataArea[] = {\n\t", source_fp); sal_Char str[1024]; + sal_uInt32 lenArrayCurr = 0; sal_Unicode current = 0; - count = 0; - while (fgets(str, 1024, sfp)) { + + while (fgets(str, 1024, dictionary_fp)) { // input file is in UTF-8 encoding // don't convert last new line character to Ostr. OUString Ostr((const sal_Char *)str, strlen(str) - 1, RTL_TEXTENCODING_UTF8); const sal_Unicode *u = Ostr.getStr(); - sal_Int32 len = Ostr.getLength(); + const sal_Int32 len = Ostr.getLength(); - i=0; + sal_Int32 i=0; Ostr.iterateCodePoints(&i, 1); - if (len == i) continue; // skip one character word - - if (*u != current) { - if (*u < current) - printf("u %x, current %x, count %d, lenArrayCount %d\n", *u, current, - sal::static_int_cast<int>(count), sal::static_int_cast<int>(lenArrayCount)); - current = *u; - charArray[current] = lenArrayCount; + if (len == i) + continue; // skip one character word + + if (u[0] != current) { + OSL_ENSURE( (u[0] > current), "Dictionary file should be sorted"); + current = u[0]; + charArray[current] = lenArray.size(); } - if (lenArrayLen <= lenArrayCount+1) - lenArray = (sal_Int32*) realloc(lenArray, (lenArrayLen += 1000) * sizeof(sal_Int32)); - lenArray[lenArrayCount++] = lenArrayCurr; - - exist[u[0]] = sal_True; - for (i = 1; i < len; i++) { // start from second character, - exist[u[i]] = sal_True; // since the first character is captured in charArray. 
- lenArrayCurr++; - if ((count++) % 0x10 == 0) - fprintf(cfp, "\n\t"); - fprintf(cfp, "0x%04x, ", u[i]); + lenArray.push_back(lenArrayCurr); + + set_exists(u[0]); + // first character is stored in charArray, so start from second + for (i = 1; i < len; i++, lenArrayCurr++) { + set_exists(u[i]); + fprintf(source_fp, "0x%04x, ", u[i]); + if ((lenArrayCurr & 0x0f) == 0x0f) + fputs("\n\t", source_fp); } } - lenArray[lenArrayCount++] = lenArrayCurr; // store last ending pointer - charArray[current+1] = lenArrayCount; - fprintf(cfp, "\n};\n"); - - // generate lenArray - fprintf(cfp, "static const sal_Int32 lenArray[] = {\n\t"); - count = 1; - fprintf(cfp, "0x%x, ", 0); // insert one slat for skipping 0 in index2 array. - for (i = 0; i < lenArrayCount; i++) { - fprintf(cfp, "0x%lx, ", static_cast<long unsigned int>(lenArray[i])); - if (count == 0xf) { - count = 0; - fprintf(cfp, "\n\t"); - } else count++; - } - fprintf(cfp, "\n};\n"); + lenArray.push_back( lenArrayCurr ); // store last ending pointer + charArray[current+1] = lenArray.size(); + fputs("\n};\n", source_fp); +} - free(lenArray); +static inline void printLenArray(FILE* source_fp, const vector<sal_uInt32>& lenArray) +{ + fprintf(source_fp, "static const sal_Int32 lenArray[] = {\n\t"); + fprintf(source_fp, "0x%x, ", 0); // insert one slat for skipping 0 in index2 array. + for (size_t k = 0; k < lenArray.size(); k++) + { + if( !(k & 0xf) ) + fputs("\n\t", source_fp); - // generate index1 array - fprintf (cfp, "static const sal_Int16 index1[] = {\n\t"); - sal_Int16 set[0x100]; - count = 0; - for (i = 0; i < 0x100; i++) { - for (j = 0; j < 0x100; j++) - if (charArray[(i*0x100) + j] != 0) - break; - - fprintf(cfp, "0x%02x, ", set[i] = (j < 0x100 ? 
sal::static_int_cast<sal_Int16>(count++) : 0xff)); - if ((i+1) % 0x10 == 0) - fprintf (cfp, "\n\t"); + fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(lenArray[k])); + } + fputs("\n};\n", source_fp ); +} + +/* FIXME?: what happens if in every range i there is at least one charArray != 0 + => this will make index1[] = {0x00, 0x01, 0x02,... 0xfe, 0xff } + => then in index2, the last range will be ignored incorrectly */ +static inline void printIndex1(FILE *source_fp, sal_Int16 *set) +{ + fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t"); + sal_Int16 count = 0; + for (sal_Int32 i = 0; i < 0x100; i++) { + sal_Int32 j = 0; + while( j < 0x100 && charArray[(i<<8) + j] == 0) + j++; + + fprintf(source_fp, "0x%02x, ", set[i] = (j < 0x100 ? count++ : 0xff)); + if ((i & 0x0f) == 0x0f) + fputs ("\n\t", source_fp); } - fprintf (cfp, "};\n"); + fputs("};\n", source_fp); +} - // generate index2 array - fprintf (cfp, "static const sal_Int32 index2[] = {\n\t"); +static inline void printIndex2(FILE *source_fp, sal_Int16 *set) +{ + fputs ("static const sal_Int32 index2[] = {\n\t", source_fp); sal_Int32 prev = 0; - for (i = 0; i < 0x100; i++) { + for (sal_Int32 i = 0; i < 0x100; i++) { if (set[i] != 0xff) { - for (j = 0; j < 0x100; j++) { - sal_Int32 k = (i*0x100) + j; - if (prev != 0 && charArray[k] == 0) { - for (k++; k < 0x10000; k++) - if (charArray[k] != 0) - break; + for (sal_Int32 j = 0; j < 0x100; j++) { + sal_Int32 k = (i<<8) + j; + if (prev != 0 ) + while( charArray[k] == 0 && k < 0x10000 ) + k++; + + prev = charArray[(i<<8) + j]; + fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(k < 0x10000 ? charArray[k] + 1 : 0)); + if ((j & 0x0f) == 0x0f) + fputs ("\n\t", source_fp); } - prev = charArray[(i*0x100) + j]; - fprintf( - cfp, "0x%lx, ", - sal::static_int_cast< unsigned long >( - k < 0x10000 ? 
charArray[k] + 1 : 0)); - if ((j+1) % 0x10 == 0) - fprintf (cfp, "\n\t"); - } - fprintf (cfp, "\n\t"); + fputs ("\n\t", source_fp); } } - fprintf (cfp, "\n};\n"); - - // generate existMark array - count = 0; - fprintf (cfp, "static const sal_uInt8 existMark[] = {\n\t"); - for (i = 0; i < 0x1FFF; i++) { - sal_uInt8 bit = 0; - for (j = 0; j < 8; j++) - if (exist[i * 8 + j]) - bit |= 1 << j; - fprintf(cfp, "0x%02x, ", bit); - if (count == 0xf) { - count = 0; - fprintf(cfp, "\n\t"); - } else count++; + fputs ("\n};\n", source_fp); +} + +/* Generates a bitmask for the existance of sal_Unicode values in dictionary; + it packs 8 sal_Bool values in 1 sal_uInt8 */ +static inline void printExistsMask(FILE *source_fp) +{ + fprintf (source_fp, "static const sal_uInt8 existMark[] = {\n\t"); + for (unsigned int i = 0; i < 0x2000; i++) + { + fprintf(source_fp, "0x%02x, ", exists[i]); + if ( (i & 0xf) == 0xf ) + fputs("\n\t", source_fp); } - fprintf (cfp, "\n};\n"); + fputs("\n};\n", source_fp); +} - // create function to return arrays - fprintf (cfp, "\tconst sal_uInt8* getExistMark() { return existMark; }\n"); - fprintf (cfp, "\tconst sal_Int16* getIndex1() { return index1; }\n"); - fprintf (cfp, "\tconst sal_Int32* getIndex2() { return index2; }\n"); - fprintf (cfp, "\tconst sal_Int32* getLenArray() { return lenArray; }\n"); - fprintf (cfp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n"); - fprintf (cfp, "}\n"); +SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv) +{ + FILE *dictionary_fp, *source_fp; - fclose(sfp); - fclose(cfp); + if (argc == 1 || argc > 3) + { + fputs("2 arguments required: dictionary_file_name source_file_name", stderr); + exit(-1); + } + + dictionary_fp = fopen(argv[1], "rb"); // open the source file for read; + if (dictionary_fp == NULL) + { + printf("Open the dictionary source file failed."); + return -1; + } + + if(argc == 2) + source_fp = stdout; + else + { + // create the C source file to write + source_fp = fopen(argv[2], "wb"); + if (source_fp == 
NULL) { + fclose(dictionary_fp); + printf("Can't create the C source file."); + return -1; + } + } + + vector<sal_uInt32> lenArray; // stores the word boundaries in DataArea + sal_Int16 set[0x100]; + + printIncludes(source_fp); + fputs("extern \"C\" {\n", source_fp); + printDataArea(dictionary_fp, source_fp, lenArray); + printLenArray(source_fp, lenArray); + printIndex1(source_fp, set); + printIndex2(source_fp, set); + printExistsMask(source_fp); + printFunctions(source_fp); + fputs("}\n", source_fp); + + fclose(dictionary_fp); + fclose(source_fp); return 0; -} // End of main +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/i18npool/source/breakiterator/makefile.mk b/i18npool/source/breakiterator/makefile.mk index fc6561c3e0aa..434fdc9f5d88 100644 --- a/i18npool/source/breakiterator/makefile.mk +++ b/i18npool/source/breakiterator/makefile.mk @@ -24,7 +24,7 @@ # for a copy of the LGPLv3 License. # #************************************************************************/ -PRJ=..$/.. +PRJ=../.. 
PRJNAME=i18npool TARGET=breakiterator @@ -41,28 +41,28 @@ ENABLE_EXCEPTIONS=TRUE MY_BRK_TXTFILES:=$(shell @ls data/*.txt) # insert "OpenOffice" as icu package name in front of the name of each rule file for searching on application provided data -MY_BRK_BRKFILES:=$(subst,data/,$(MISC)$/ $(MY_BRK_TXTFILES:s/.txt/.brk/)) +MY_BRK_BRKFILES:=$(subst,data/,$(MISC)/ $(MY_BRK_TXTFILES:s/.txt/.brk/)) # OpenOffice_dat.c is a generated file from the rule file list by gencmn MY_MISC_CXXFILES := \ - $(MISC)$/OpenOffice_dat.c \ + $(MISC)/OpenOffice_dat.c \ $(MY_BRK_BRKFILES:s/.brk/_brk.c/) SLOFILES= \ - $(SLO)$/breakiteratorImpl.obj \ - $(SLO)$/breakiterator_cjk.obj \ - $(SLO)$/breakiterator_ctl.obj \ - $(SLO)$/breakiterator_th.obj \ - $(SLO)$/breakiterator_unicode.obj \ - $(SLO)$/xdictionary.obj \ - $(subst,$(MISC)$/,$(SLO)$/ $(MY_MISC_CXXFILES:s/.c/.obj/)) + $(SLO)/breakiteratorImpl.obj \ + $(SLO)/breakiterator_cjk.obj \ + $(SLO)/breakiterator_ctl.obj \ + $(SLO)/breakiterator_th.obj \ + $(SLO)/breakiterator_unicode.obj \ + $(SLO)/xdictionary.obj \ + $(subst,$(MISC)/,$(SLO)/ $(MY_MISC_CXXFILES:s/.c/.obj/)) -OBJFILES = $(OBJ)$/gendict.obj +OBJFILES = $(OBJ)/gendict.obj APP1TARGET = gendict APP1RPATH = NONE -DEPOBJFILES = $(OBJ)$/gendict.obj +DEPOBJFILES = $(OBJ)/gendict.obj APP1OBJS = $(DEPOBJFILES) APP1STDLIBS = $(SALLIB) @@ -74,37 +74,36 @@ GENCMN:=$(SYSTEM_GENCMN) GENBRK:=$(SYSTEM_GENBRK) GENCCODE:=$(SYSTEM_GENCCODE) .ELSE -GENCMN:=$(AUGMENT_LIBRARY_PATH) $(SOLARBINDIR)$/gencmn -GENBRK:=$(AUGMENT_LIBRARY_PATH) $(SOLARBINDIR)$/genbrk -GENCCODE:=$(AUGMENT_LIBRARY_PATH) $(SOLARBINDIR)$/genccode +GENCMN:=$(AUGMENT_LIBRARY_PATH) $(SOLARBINDIR)/gencmn +GENBRK:=$(AUGMENT_LIBRARY_PATH) $(SOLARBINDIR)/genbrk +GENCCODE:=$(AUGMENT_LIBRARY_PATH) $(SOLARBINDIR)/genccode .ENDIF .INCLUDE .IGNORE : icuversion.mk -$(MISC)$/%.txt : data/%.txt +.INCLUDE : target.mk + +$(MISC)/%.txt : data/%.txt # fdo#31271 ")" reclassified in more recent ICU/Unicode Standards .IF "$(ICU_MAJOR)" >= "5" || 
("$(ICU_MAJOR)" == "4" && "$(ICU_MINOR)" >= "4") - $(SED) "s#\[:LineBreak = Close_Punctuation:\]#\[\[:LineBreak = Close_Punctuation:\] \[:LineBreak = Close_Parenthesis:\]\]#" $< > $@ + $(SED) "s#\[:LineBreak = Close_Punctuation:\]#\[\[:LineBreak = Close_Punctuation:\] \[:LineBreak = Close_Parenthesis:\]\]#" $< > $@ .ELSE - $(COPY) $< $@ + $(COPY) $< $@ .ENDIF -$(MISC)$/%.brk : $(MISC)/%.txt - $(WRAPCMD) $(GENBRK) -r $< -o $(MISC)$/$*.brk +$(MISC)/%.brk : $(MISC)/%.txt + $(GENBRK) -r $< -o $(MISC)/$*.brk -$(MISC)$/%_brk.c : $(MISC)$/%.brk - $(WRAPCMD) $(GENCCODE) -n OpenOffice -d $(MISC)$ $(MISC)$/$*.brk +$(MISC)/%_brk.c : $(MISC)/%.brk + $(GENCCODE) -n OpenOffice -d $(MISC)$ $(MISC)/$*.brk # 'gencmn', 'genbrk' and 'genccode' are tools generated and delivered by icu project to process icu breakiterator rules. # The output of gencmn generates warnings under Windows. We want to minimize the patches to external tools, -# so the output (OpenOffice_icu_dat.c) is changed here to include a pragma to disable the warnings. +# so the output (OpenOffice_dat.c) is changed here to include a pragma to disable the warnings. # Output of gencmn is redirected to OpenOffice_icu_tmp.c with the -t switch. 
-$(MISC)$/OpenOffice_%.c : $(MY_BRK_BRKFILES:s/.brk/_brk.c/) - $(WRAPCMD) $(GENCMN) -n OpenOffice -t tmp -S -d $(MISC) O $(mktmp $(subst,$(MISC)$/, $(MY_BRK_BRKFILES:t"\n"))) - echo $(USQ)#ifdef _MSC_VER$(USQ) > $@ - echo $(USQ)#pragma warning( disable : 4229 4668 )$(USQ) >> $@ - echo $(USQ)#endif$(USQ) >> $@ - $(TYPE) $(@:s/_dat/_tmp/) >> $@ - -.INCLUDE : target.mk - +$(MISC)/OpenOffice_dat.c : $(MY_BRK_BRKFILES:s/.brk/_brk.c/) + $(GENCMN) -n OpenOffice -t tmp -S -d $(MISC) O $(mktmp $(subst,$(MISC)/, $(MY_BRK_BRKFILES:t"\n"))) + echo $(USQ)#ifdef _MSC_VER$(USQ) > $@ + echo $(USQ)#pragma warning( disable : 4229 4668 )$(USQ) >> $@ + echo $(USQ)#endif$(USQ) >> $@ + $(TYPE) $(@:s/_dat/_tmp/) >> $@ diff --git a/i18npool/source/breakiterator/xdictionary.cxx b/i18npool/source/breakiterator/xdictionary.cxx index aba69b5e9a21..0bff3d0b174c 100644 --- a/i18npool/source/breakiterator/xdictionary.cxx +++ b/i18npool/source/breakiterator/xdictionary.cxx @@ -1,3 +1,4 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /************************************************************************* * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. @@ -45,7 +46,8 @@ // Construction/Destruction ////////////////////////////////////////////////////////////////////// -using namespace rtl; +using ::rtl::OUString; +using ::rtl::OUStringBuffer; namespace com { namespace sun { namespace star { namespace i18n { @@ -60,12 +62,6 @@ xdictionary::xdictionary(const sal_Char *lang) : hModule( NULL ), boundary(), japaneseWordBreak( sal_False ) -#if USE_CELL_BOUNDARY_CODE - // For CTL breakiterator, where the word boundary should not be inside cell. 
- , - useCellBoundary( sal_False ), - cellBoundary( NULL ) -#endif { index1 = 0; #ifdef SAL_DLLPREFIX @@ -78,15 +74,15 @@ xdictionary::xdictionary(const sal_Char *lang) : hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT ); if( hModule ) { sal_IntPtr (*func)(); - func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getExistMark").pData ); + func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString(RTL_CONSTASCII_USTRINGPARAM("getExistMark")).pData ); existMark = (sal_uInt8*) (*func)(); - func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex1").pData ); + func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString(RTL_CONSTASCII_USTRINGPARAM("getIndex1")).pData ); index1 = (sal_Int16*) (*func)(); - func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex2").pData ); + func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString(RTL_CONSTASCII_USTRINGPARAM("getIndex2")).pData ); index2 = (sal_Int32*) (*func)(); - func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getLenArray").pData ); + func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString(RTL_CONSTASCII_USTRINGPARAM("getLenArray")).pData ); lenArray = (sal_Int32*) (*func)(); - func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getDataArea").pData ); + func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString(RTL_CONSTASCII_USTRINGPARAM("getDataArea")).pData ); dataArea = (sal_Unicode*) (*func)(); } else @@ -101,10 +97,6 @@ xdictionary::xdictionary(const sal_Char *lang) : for (sal_Int32 i = 0; i < CACHE_MAX; i++) cache[i].size = 0; -#if USE_CELL_BOUNDARY_CODE - useCellBoundary = sal_False; - cellBoundary = NULL; -#endif japaneseWordBreak = sal_False; } @@ -112,8 +104,8 @@ xdictionary::~xdictionary() { osl_unloadModule(hModule); for (sal_Int32 i = 0; i < 
CACHE_MAX; i++) { if (cache[i].size > 0) { - delete cache[i].contents; - delete cache[i].wordboundary; + delete [] cache[i].contents; + delete [] cache[i].wordboundary; } } } @@ -241,86 +233,70 @@ static sal_Int16 JapaneseCharType(sal_Unicode c) WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary) { + WordBreakCache& rCache = cache[text[0] & 0x1f]; - WordBreakCache& aCache = cache[text[0] & 0x1f]; - - if (aCache.size != 0 && aCache.equals(text, wordBoundary)) - return aCache; + if (rCache.size != 0 && rCache.equals(text, wordBoundary)) + return rCache; - sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos; + sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos; - if (aCache.size == 0 || len > aCache.size) { - if (aCache.size != 0) { - delete aCache.contents; - delete aCache.wordboundary; - aCache.size = len; - } - else - aCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE; - aCache.contents = new sal_Unicode[aCache.size + 1]; - aCache.wordboundary = new sal_Int32[aCache.size + 2]; + if (rCache.size == 0 || len > rCache.size) { + if (rCache.size != 0) { + delete [] rCache.contents; + delete [] rCache.wordboundary; + rCache.size = len; } - aCache.length = len; - memcpy(aCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode)); - *(aCache.contents + len) = 0x0000; - // reset the wordboundary in cache - memset(aCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2)); - - sal_Int32 i = 0; // loop variable - while (aCache.wordboundary[i] < aCache.length) { - len = 0; - // look the continuous white space as one word and cashe it - while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + aCache.wordboundary[i] + len])) - len ++; - - if (len == 0) { - const sal_Unicode *str = text + wordBoundary.startPos + aCache.wordboundary[i]; - sal_Int32 slen = aCache.length - aCache.wordboundary[i]; - sal_Int16 type = 0, count = 0; - for (;len == 0 && slen > 0; str++, slen--) { - len = getLongestMatch(str, slen); - if 
(len == 0) { - if (!japaneseWordBreak) { - len = 1; - } else { - if (count == 0) - type = JapaneseCharType(*str); - else if (type != JapaneseCharType(*str)) - break; - count++; - } - } - } - if (count) { - aCache.wordboundary[i+1] = aCache.wordboundary[i] + count; - i++; - -#if USE_CELL_BOUNDARY_CODE - if (useCellBoundary) { - sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1]; - if (cBoundary > 0) - aCache.wordboundary[i] = cBoundary - wordBoundary.startPos; + else + rCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE; + rCache.contents = new sal_Unicode[rCache.size + 1]; + rCache.wordboundary = new sal_Int32[rCache.size + 2]; + } + rCache.length = len; + memcpy(rCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode)); + *(rCache.contents + len) = 0x0000; + // reset the wordboundary in cache + memset(rCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2)); + + sal_Int32 i = 0; // loop variable + while (rCache.wordboundary[i] < rCache.length) { + len = 0; + // treat a run of continuous white space as one word and cache it + while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + rCache.wordboundary[i] + len])) + len ++; + + if (len == 0) { + const sal_Unicode *str = text + wordBoundary.startPos + rCache.wordboundary[i]; + sal_Int32 slen = rCache.length - rCache.wordboundary[i]; + sal_Int16 type = 0, count = 0; + for (;len == 0 && slen > 0; str++, slen--) { + len = getLongestMatch(str, slen); + if (len == 0) { + if (!japaneseWordBreak) { + len = 1; + } else { + if (count == 0) + type = JapaneseCharType(*str); + else if (type != JapaneseCharType(*str)) + break; + count++; } -#endif } } - - if (len) { - aCache.wordboundary[i+1] = aCache.wordboundary[i] + len; + if (count) + { + rCache.wordboundary[i+1] = rCache.wordboundary[i] + count; i++; - -#if USE_CELL_BOUNDARY_CODE - if (useCellBoundary) { - sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1]; - if (cBoundary > 0) - 
aCache.wordboundary[i] = cBoundary - wordBoundary.startPos; - } -#endif } } - aCache.wordboundary[i + 1] = aCache.length + 1; - return aCache; + if (len) { + rCache.wordboundary[i+1] = rCache.wordboundary[i] + len; + i++; + } + } + rCache.wordboundary[i + 1] = rCache.length + 1; + + return rCache; } Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType) @@ -391,12 +367,6 @@ Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, s return boundary; } -#if USE_CELL_BOUNDARY_CODE -void xdictionary::setCellBoundary(sal_Int32* cellArray) -{ - useCellBoundary = sal_True; - cellBoundary = cellArray; -} -#endif - } } } } + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |