1 files changed, 448 insertions, 0 deletions
diff --git a/i18npool/source/breakiterator/breakiterator_unicode.cxx b/i18npool/source/breakiterator/breakiterator_unicode.cxx
new file mode 100644
index 000000000000..ad934db2db11
--- /dev/null
+++ b/i18npool/source/breakiterator/breakiterator_unicode.cxx
@@ -0,0 +1,448 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*************************************************************************
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * Copyright 2000, 2010 Oracle and/or its affiliates.
+ *
+ * OpenOffice.org - a multi-platform office productivity suite
+ *
+ * This file is part of OpenOffice.org.
+ *
+ * OpenOffice.org is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 3
+ * only, as published by the Free Software Foundation.
+ *
+ * OpenOffice.org is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License version 3 for more details
+ * (a copy is included in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * version 3 along with OpenOffice.org.  If not, see
+ * <http://www.openoffice.org/license.html>
+ * for a copy of the LGPLv3 License.
+ *
+ ************************************************************************/
+
+// MARKER(update_precomp.py): autogen include statement, do not remove
+#include "precompiled_i18npool.hxx"
+#include <breakiterator_unicode.hxx>
+#include <localedata.hxx>
+#include <unicode/uchar.h>
+#include <unicode/locid.h>
+#include <unicode/rbbi.h>
+#include <unicode/udata.h>
+#include <rtl/strbuf.hxx>
+#include <rtl/ustring.hxx>
+
+U_CDECL_BEGIN
+extern const char OpenOffice_dat[];
+U_CDECL_END
+
+using namespace ::com::sun::star;
+using namespace ::com::sun::star::lang;
+using namespace ::rtl;
+
+namespace com { namespace sun { namespace star { namespace i18n {
+
+#define ERROR ::com::sun::star::uno::RuntimeException()    // used as `throw ERROR` when a break iterator cannot be set up
+
+//#define ImplementName "com.sun.star.i18n.BreakIterator_Unicode";
+
+
+/**
+ * Creates the object without loading any ICU break iterator.
+ *
+ * icuBI starts out as NULL; it is pointed at one of the four slots
+ * (character, word, sentence, line) by loadICUBreakIterator() the first time
+ * an iterator is requested. The remaining members listed here are
+ * value-initialised.
+ */
+BreakIterator_Unicode::BreakIterator_Unicode()
+    : cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" )   // implementation name
+    , wordRule( "word" )
+    , lineRule( "line" )
+    , result()
+    , character()
+    , word()
+    , sentence()
+    , line()
+    , icuBI( NULL )
+    , aLocale()
+    , aBreakType()
+    , aWordType()
+{
+}
+
+
+/**
+ * Destroys every ICU break iterator held by this object.
+ */
+BreakIterator_Unicode::~BreakIterator_Unicode()
+{
+    // Release the currently selected slot first and clear its pointer, so
+    // the per-slot deletes below find NULL there and do not free it twice.
+    if (icuBI) {
+        delete icuBI->aBreakIterator;   // deleting NULL is a no-op
+        icuBI->aBreakIterator = NULL;
+    }
+
+    delete character.aBreakIterator;
+    delete word.aBreakIterator;
+    delete sentence.aBreakIterator;
+    delete line.aBreakIterator;
+}
+
+/*
+    Thin subclass of RuleBasedBreakIterator whose only purpose is to make the
+    base class's setBreakType() method callable from outside the class.
+*/
+class OOoRuleBasedBreakIterator : public RuleBasedBreakIterator {
+    public:
+        // Forwards both arguments unchanged to the base class constructor.
+        OOoRuleBasedBreakIterator(UDataMemory* image, UErrorCode &status)
+            : RuleBasedBreakIterator(image, status)
+        {
+        }
+
+        // Public forwarder to RuleBasedBreakIterator::setBreakType().
+        inline void publicSetBreakType(int32_t type)
+        {
+            setBreakType(type);
+        }
+};
+
+/**
+ * Opens the precompiled break rule set `name` (type "brk") from the
+ * "OpenOffice" ICU application data and builds a rule based iterator on it.
+ *
+ * @param name  item name handed to udata_open().
+ * @return a new iterator owned by the caller, or NULL when the data could
+ *         not be opened or the iterator reported a failure while being
+ *         constructed. A half-built iterator is never returned.
+ */
+static OOoRuleBasedBreakIterator* lcl_openRuleBasedBreakIterator(const sal_Char *name)
+{
+    UErrorCode status = U_ZERO_ERROR;
+    UDataMemory* pUData = udata_open("OpenOffice", "brk", name, &status);
+    if ( !U_SUCCESS(status) )
+        return NULL;
+
+    // Per the ICU API documentation the iterator takes over the data image.
+    OOoRuleBasedBreakIterator *rbi = new OOoRuleBasedBreakIterator(pUData, status);
+    if ( !U_SUCCESS(status) ) {
+        delete rbi;
+        return NULL;
+    }
+    return rbi;
+}
+
+/**
+ * Loads the ICU break iterator for the requested break type on demand and
+ * attaches rText to it.
+ *
+ * On return icuBI points at the slot (character, word, sentence or line)
+ * selected by rBreakType and that slot holds a usable iterator whose text is
+ * rText.
+ *
+ * @param rLocale     locale the iterator is wanted for.
+ * @param rBreakType  one of the LOAD_*_BREAKITERATOR constants.
+ * @param rWordType   WordType constant; for word iterators it selects the
+ *                    rule set ("edit_word", "dict_word" or "count_word").
+ * @param rule        base name of the OpenOffice rule data to try first, or
+ *                    NULL to use only ICU's built-in iterator. For the word
+ *                    types handled below it is replaced by the rule name of
+ *                    that word type.
+ * @param rText       text the iterator shall operate on.
+ * @throws uno::RuntimeException if the application data cannot be
+ *         registered or no iterator can be created.
+ *
+ * NOTE(review): aLocale and aWordType form one cache key shared by all four
+ * slots. A slot loaded for one locale therefore looks up to date after a
+ * different slot was loaded for another locale - confirm whether the key
+ * should be kept per slot.
+ */
+void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::lang::Locale& rLocale,
+        sal_Int16 rBreakType, sal_Int16 rWordType, const sal_Char *rule, const OUString& rText) throw(uno::RuntimeException)
+{
+    sal_Bool newBreak = sal_False;
+    UErrorCode status = U_ZERO_ERROR;
+    // Index into the sequence returned by LocaleData::getBreakIteratorRules().
+    sal_Int16 breakType = 0;
+    switch (rBreakType) {
+        case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
+        case LOAD_WORD_BREAKITERATOR: icuBI=&word;
+            switch (rWordType) {
+                case WordType::ANYWORD_IGNOREWHITESPACES: breakType = 0; rule=wordRule = "edit_word"; break;
+                case WordType::DICTIONARY_WORD: breakType = 1; rule=wordRule = "dict_word"; break;
+                case WordType::WORD_COUNT: breakType = 2; rule=wordRule = "count_word"; break;
+            }
+            break;
+        case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
+        case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
+    }
+    // An unknown break type on a fresh object leaves no slot selected.
+    if (!icuBI)
+        throw ERROR;
+
+    if (!icuBI->aBreakIterator || rWordType != aWordType ||
+            rLocale.Language != aLocale.Language || rLocale.Country != aLocale.Country ||
+            rLocale.Variant != aLocale.Variant) {
+        delete icuBI->aBreakIterator;   // deleting NULL is a no-op
+        icuBI->aBreakIterator=NULL;
+
+        if (rule) {
+            uno::Sequence< OUString > breakRules = LocaleData().getBreakIteratorRules(rLocale);
+
+            status = U_ZERO_ERROR;
+            udata_setAppData("OpenOffice", OpenOffice_dat, &status);
+            if ( !U_SUCCESS(status) ) throw ERROR;
+
+            OOoRuleBasedBreakIterator *rbi = NULL;
+
+            if (breakRules.getLength() > breakType && breakRules[breakType].getLength() > 0) {
+                // The locale data names a dedicated rule set. If it cannot be
+                // loaded, rbi stays NULL and ICU's own iterator is used below.
+                rbi = lcl_openRuleBasedBreakIterator(
+                    OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr());
+            } else {
+                // Try "<rule>_<language>" first, then the plain "<rule>" set.
+                OStringBuffer aUDName(64);
+                aUDName.append(rule);
+                aUDName.append('_');
+                aUDName.append( OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US));
+                rbi = lcl_openRuleBasedBreakIterator(aUDName.getStr());
+                if (!rbi)
+                    rbi = lcl_openRuleBasedBreakIterator(rule);
+            }
+            if (rbi) {
+                switch (rBreakType) {
+                    case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break;
+                    case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break;
+                    case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break;
+                    case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break;
+                }
+                icuBI->aBreakIterator = rbi;
+            }
+        }
+
+        // No OpenOffice rule set available: fall back to ICU's own iterator.
+        if (!icuBI->aBreakIterator) {
+            icu::Locale icuLocale(
+                    OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US).getStr(),
+                    OUStringToOString(rLocale.Country, RTL_TEXTENCODING_ASCII_US).getStr(),
+                    OUStringToOString(rLocale.Variant, RTL_TEXTENCODING_ASCII_US).getStr());
+
+            status = U_ZERO_ERROR;
+            switch (rBreakType) {
+                case LOAD_CHARACTER_BREAKITERATOR:
+                    icuBI->aBreakIterator =  icu::BreakIterator::createCharacterInstance(icuLocale, status);
+                    break;
+                case LOAD_WORD_BREAKITERATOR:
+                    icuBI->aBreakIterator =  icu::BreakIterator::createWordInstance(icuLocale, status);
+                    break;
+                case LOAD_SENTENCE_BREAKITERATOR:
+                    icuBI->aBreakIterator = icu::BreakIterator::createSentenceInstance(icuLocale, status);
+                    break;
+                case LOAD_LINE_BREAKITERATOR:
+                    icuBI->aBreakIterator = icu::BreakIterator::createLineInstance(icuLocale, status);
+                    break;
+            }
+            if ( !U_SUCCESS(status) ) {
+                icuBI->aBreakIterator=NULL;
+                throw ERROR;
+            }
+        }
+        if (icuBI->aBreakIterator) {
+            // Remember what was loaded so the next call can reuse it.
+            aLocale=rLocale;
+            aWordType=rWordType;
+            aBreakType=rBreakType;
+            newBreak=sal_True;
+        } else {
+            throw ERROR;
+        }
+    }
+
+    // Hand the text to the iterator only if it is new or has changed.
+    const UnicodeString aNewText(reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength());   // UChar != sal_Unicode in MinGW
+    if (newBreak || icuBI->aICUText.compare(aNewText)) {
+        icuBI->aICUText=aNewText;
+        icuBI->aBreakIterator->setText(icuBI->aICUText);
+    }
+}
+
+
+/**
+ * Moves forward from nStartPos by up to nCount characters.
+ *
+ * In SKIPCELL mode the steps are the boundaries reported by the ICU
+ * character iterator; in every other mode each step is one code point.
+ *
+ * @param nDone  receives the number of steps counted.
+ * @return the index reached, or the text length if the character iterator
+ *         reports DONE before nCount steps were taken.
+ */
+sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
+        sal_Int32 nStartPos, const lang::Locale &rLocale,
+        sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
+        throw(uno::RuntimeException)
+{
+    if (nCharacterIteratorMode != CharacterIteratorMode::SKIPCELL) {
+        // CHARACTER mode: advance one code point at a time.
+        nDone = 0;
+        while (nDone < nCount && nStartPos < Text.getLength()) {
+            Text.iterateCodePoints(&nStartPos, 1);
+            ++nDone;
+        }
+        return nStartPos;
+    }
+
+    // CELL mode: advance one character-iterator boundary at a time.
+    loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
+    nDone = 0;
+    while (nDone < nCount) {
+        nStartPos = character.aBreakIterator->following(nStartPos);
+        if (nStartPos == BreakIterator::DONE)
+            return Text.getLength();
+        ++nDone;
+    }
+    return nStartPos;
+}
+
+/**
+ * Moves backward from nStartPos by up to nCount characters.
+ *
+ * In SKIPCELL mode the steps are the boundaries reported by the ICU
+ * character iterator; in every other mode each step is one code point.
+ *
+ * @param nDone  receives the number of steps counted.
+ * @return the index reached, or 0 if the character iterator reports DONE
+ *         before nCount steps were taken.
+ */
+sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
+        sal_Int32 nStartPos, const lang::Locale& rLocale,
+        sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
+        throw(uno::RuntimeException)
+{
+    if (nCharacterIteratorMode != CharacterIteratorMode::SKIPCELL) {
+        // CHARACTER mode (also used for BS deleting one char): step back by code points.
+        nDone = 0;
+        while (nDone < nCount && nStartPos > 0) {
+            Text.iterateCodePoints(&nStartPos, -1);
+            ++nDone;
+        }
+        return nStartPos;
+    }
+
+    // CELL mode: step back one character-iterator boundary at a time.
+    loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
+    nDone = 0;
+    while (nDone < nCount) {
+        nStartPos = character.aBreakIterator->preceding(nStartPos);
+        if (nStartPos == BreakIterator::DONE)
+            return 0;
+        ++nDone;
+    }
+    return nStartPos;
+}
+
+
+/**
+ * Finds the word that follows nStartPos.
+ *
+ * The start is the first word-iterator boundary after nStartPos. For
+ * ANYWORD_IGNOREWHITESPACES and DICTIONARY_WORD a white space character at
+ * that boundary is skipped by moving to the next boundary. The end is the
+ * boundary after the start, or the start itself if there is none.
+ *
+ * @return the member `result`; start and end are equal when the first
+ *         boundary is DONE or not before the end of the text.
+ */
+Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
+    const lang::Locale& rLocale, sal_Int16 rWordType ) throw(uno::RuntimeException)
+{
+    loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
+
+    result.startPos = word.aBreakIterator->following(nStartPos);
+    if (result.startPos == BreakIterator::DONE || result.startPos >= Text.getLength()) {
+        // Nothing follows: report an empty range.
+        result.endPos = result.startPos;
+        return result;
+    }
+
+    const bool bSkipWhitespace =
+        rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
+        rWordType == WordType::DICTIONARY_WORD;
+    if (bSkipWhitespace && u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)))
+        result.startPos = word.aBreakIterator->following(result.startPos);
+
+    result.endPos = word.aBreakIterator->following(result.startPos);
+    if (result.endPos == BreakIterator::DONE)
+        result.endPos = result.startPos;
+
+    return result;
+}
+
+
+/**
+ * Finds the word that precedes nStartPos.
+ *
+ * The start is the last word-iterator boundary before nStartPos. For
+ * ANYWORD_IGNOREWHITESPACES and DICTIONARY_WORD a white space character at
+ * that boundary is skipped by moving one boundary further back. The end is
+ * the boundary following the start, or the start itself if there is none.
+ *
+ * @return the member `result`; start and end are equal when the first
+ *         boundary is negative or DONE.
+ */
+Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
+        const lang::Locale& rLocale, sal_Int16 rWordType) throw(uno::RuntimeException)
+{
+    loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
+
+    result.startPos = word.aBreakIterator->preceding(nStartPos);
+    if (result.startPos == BreakIterator::DONE || result.startPos < 0) {
+        // Nothing precedes: report an empty range.
+        result.endPos = result.startPos;
+        return result;
+    }
+
+    const bool bSkipWhitespace =
+        rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
+        rWordType == WordType::DICTIONARY_WORD;
+    if (bSkipWhitespace && u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)))
+        result.startPos = word.aBreakIterator->preceding(result.startPos);
+
+    result.endPos = word.aBreakIterator->following(result.startPos);
+    if (result.endPos == BreakIterator::DONE)
+        result.endPos = result.startPos;
+
+    return result;
+}
+
+
+/**
+ * Determines the word-iterator boundaries around nPos.
+ *
+ * If nPos is itself a boundary, the range extends from nPos forward (when
+ * bDirection is set or nPos is 0, and nPos is before the end of the text)
+ * or otherwise backward to the neighbouring boundary. If nPos is not a
+ * boundary, the range is the pair of boundaries enclosing it, clamped to
+ * the start and end of the text.
+ *
+ * @return the member `result`; a side for which the iterator reported DONE
+ *         is replaced by the other side.
+ */
+Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
+        sal_Int16 rWordType, sal_Bool bDirection ) throw(uno::RuntimeException)
+{
+    loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
+    const sal_Int32 nLen = Text.getLength();
+
+    if (!word.aBreakIterator->isBoundary(nPos)) {
+        if (nPos <= 0) {
+            // At or before the start of the text.
+            result.startPos = 0;
+            if (nLen != 0)
+                result.endPos = word.aBreakIterator->following(static_cast<sal_Int32>(0));
+            else
+                result.endPos = 0;
+        } else if (nPos >= nLen) {
+            // At or beyond the end of the text.
+            result.startPos = word.aBreakIterator->preceding(nLen);
+            result.endPos = nLen;
+        } else {
+            // Strictly inside a segment.
+            result.startPos = word.aBreakIterator->preceding(nPos);
+            result.endPos = word.aBreakIterator->following(nPos);
+        }
+    } else {
+        result.startPos = nPos;
+        result.endPos = nPos;
+        const bool bForward = (bDirection || nPos == 0) && nPos < nLen;
+        if (bForward)
+            result.endPos = word.aBreakIterator->following(nPos);
+        else
+            result.startPos = word.aBreakIterator->preceding(nPos);
+    }
+
+    // Collapse the range onto the valid side if one side is DONE.
+    if (result.startPos == BreakIterator::DONE)
+        result.startPos = result.endPos;
+    else if (result.endPos == BreakIterator::DONE)
+        result.endPos = result.startPos;
+
+    return result;
+}
+
+
+/**
+ * Returns the index at which the sentence containing nStartPos begins.
+ *
+ * A position equal to the text length is first moved back by one code
+ * point. The position is then moved to the preceding sentence boundary
+ * unless it already is one, and white space following that boundary is
+ * skipped.
+ */
+sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
+        const lang::Locale &rLocale ) throw(uno::RuntimeException)
+{
+    loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
+
+    const sal_Int32 nLen = Text.getLength();
+    // issue #i27703# treat end position as part of last sentence
+    if (nLen > 0 && nStartPos == nLen)
+        Text.iterateCodePoints(&nStartPos, -1);
+
+    if (!sentence.aBreakIterator->isBoundary(nStartPos))
+        nStartPos = sentence.aBreakIterator->preceding(nStartPos);
+
+    // Step over white space at the boundary: read code points until one is
+    // not white space or the end of the text is reached, then step back
+    // over the code point read last.
+    sal_uInt32 nCodePoint;
+    do {
+        nCodePoint = Text.iterateCodePoints(&nStartPos, 1);
+    } while (nStartPos < nLen && u_isWhitespace(nCodePoint));
+    Text.iterateCodePoints(&nStartPos, -1);
+
+    return nStartPos;
+}
+
+/**
+ * Returns the index at which the sentence containing nStartPos ends.
+ *
+ * A position equal to the text length is first moved back by one code
+ * point. The result is the sentence boundary following the position, moved
+ * back over any white space directly in front of that boundary.
+ */
+sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
+        const lang::Locale &rLocale ) throw(uno::RuntimeException)
+{
+    loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
+
+    const sal_Int32 nLen = Text.getLength();
+    // issue #i27703# treat end position as part of last sentence
+    if (nLen > 0 && nStartPos == nLen)
+        Text.iterateCodePoints(&nStartPos, -1);
+
+    nStartPos = sentence.aBreakIterator->following(nStartPos);
+
+    // Walk backwards while the code point in front of the cursor is white
+    // space, pulling the result back with it.
+    for (sal_Int32 nScan = nStartPos; nScan > 0; ) {
+        if (!u_isWhitespace(Text.iterateCodePoints(&nScan, -1)))
+            break;
+        nStartPos = nScan;
+    }
+
+    return nStartPos;
+}
+
+/**
+ * Determines where a line ending at or before nStartPos may be broken.
+ *
+ * @param Text          text to break.
+ * @param nStartPos     index at which a break is wanted; at or beyond the
+ *                      text length the text length is returned as a word
+ *                      boundary break.
+ * @param rLocale       locale used to load the line (and word) iterator.
+ * @param nMinBreakPos  a hyphenation position before this index yields a
+ *                      break index of -1.
+ * @param hOptions      supplies the hyphenator (may be empty), the hyphen
+ *                      index and the hyphenation options.
+ * @return the break index and break type; for hyphenation breaks also the
+ *         hyphenated word.
+ *
+ * If nStartPos is a line boundary it is used as is. Otherwise, when a
+ * hyphenator is available, the word around nStartPos is hyphenated; failing
+ * that, the preceding line boundary is used. For word boundary breaks the
+ * white space and WORD JOINER (U+2060) characters in front of the break are
+ * skipped; if a WORD JOINER was found, the search is repeated from the
+ * position in front of them.
+ */
+LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
+        const OUString& Text, sal_Int32 nStartPos,
+        const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
+        const LineBreakHyphenationOptions& hOptions,
+        const LineBreakUserOptions& /*rOptions*/ ) throw(uno::RuntimeException)
+{
+        LineBreakResults lbr;
+
+        if (nStartPos >= Text.getLength()) {
+            lbr.breakIndex = Text.getLength();
+            lbr.breakType = BreakType::WORDBOUNDARY;
+            return lbr;
+        }
+
+        loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
+
+        // U+2060 WORD JOINER: no line break is allowed next to it.
+        const sal_Unicode cWordJoiner = 0x2060;
+
+        sal_Bool GlueSpace=sal_True;
+        while (GlueSpace) {
+            if (line.aBreakIterator->preceding(nStartPos + 1) == nStartPos) { //Line boundary break
+                lbr.breakIndex = nStartPos;
+                lbr.breakType = BreakType::WORDBOUNDARY;
+            } else if (hOptions.rHyphenator.is()) { //Hyphenation break
+                Boundary wBoundary = getWordBoundary( Text, nStartPos, rLocale,
+                                                WordType::DICTIONARY_WORD, sal_False);
+                uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord;
+                aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
+                    wBoundary.endPos - wBoundary.startPos), rLocale,
+                    static_cast<sal_Int16>(hOptions.hyphenIndex - wBoundary.startPos), hOptions.aHyphenationOptions);
+                if (aHyphenatedWord.is()) {
+                    lbr.rHyphenatedWord = aHyphenatedWord;
+                    if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
+                        lbr.breakIndex = -1;
+                    else
+                        lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
+                    lbr.breakType = BreakType::HYPHENATION;
+                } else {
+                    lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
+                    lbr.breakType = BreakType::WORDBOUNDARY;
+                }
+            } else { //word boundary break
+                lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
+                lbr.breakType = BreakType::WORDBOUNDARY;
+            }
+
+            GlueSpace=sal_False;
+            if (lbr.breakType == BreakType::WORDBOUNDARY) {
+                // Look at the character at the break, then walk backwards
+                // over white space and word joiners in front of it.
+                nStartPos = lbr.breakIndex;
+                if (Text[nStartPos--] == cWordJoiner)
+                    GlueSpace=sal_True;
+                while (nStartPos >= 0 &&
+                    (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == cWordJoiner)) {
+                    if (Text[nStartPos--] == cWordJoiner)
+                        GlueSpace=sal_True;
+                }
+                // Glued all the way back to the start: break at index 0.
+                if (GlueSpace && nStartPos < 0)  {
+                    lbr.breakIndex = 0;
+                    break;
+                }
+            }
+        }
+
+        return lbr;
+}
+
+
+
+/**
+ * Returns the implementation name stored in cBreakIterator.
+ */
+OUString SAL_CALL
+BreakIterator_Unicode::getImplementationName(void) throw( uno::RuntimeException )
+{
+    const OUString aImplName( OUString::createFromAscii(cBreakIterator) );
+    return aImplName;
+}
+
+/**
+ * Reports whether rServiceName compares equal to the name stored in
+ * cBreakIterator.
+ */
+sal_Bool SAL_CALL
+BreakIterator_Unicode::supportsService(const OUString& rServiceName) throw( uno::RuntimeException )
+{
+    // compareToAscii() yields 0 for equal strings.
+    return rServiceName.compareToAscii(cBreakIterator) == 0;
+}
+
+/**
+ * Returns a one-element sequence holding the name stored in cBreakIterator.
+ */
+uno::Sequence< OUString > SAL_CALL
+BreakIterator_Unicode::getSupportedServiceNames(void) throw( uno::RuntimeException )
+{
+    const OUString aServiceName( OUString::createFromAscii(cBreakIterator) );
+    return uno::Sequence< OUString >( &aServiceName, 1 );
+}
+
+} } } }
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */