summary | refs | log | tree | commit | diff
path: root/i18npool/source/breakiterator/breakiterator_unicode.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'i18npool/source/breakiterator/breakiterator_unicode.cxx')
-rw-r--r-- i18npool/source/breakiterator/breakiterator_unicode.cxx 448
1 files changed, 448 insertions, 0 deletions
diff --git a/i18npool/source/breakiterator/breakiterator_unicode.cxx b/i18npool/source/breakiterator/breakiterator_unicode.cxx
new file mode 100644
index 000000000000..ad934db2db11
--- /dev/null
+++ b/i18npool/source/breakiterator/breakiterator_unicode.cxx
@@ -0,0 +1,448 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*************************************************************************
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * Copyright 2000, 2010 Oracle and/or its affiliates.
+ *
+ * OpenOffice.org - a multi-platform office productivity suite
+ *
+ * This file is part of OpenOffice.org.
+ *
+ * OpenOffice.org is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 3
+ * only, as published by the Free Software Foundation.
+ *
+ * OpenOffice.org is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License version 3 for more details
+ * (a copy is included in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * version 3 along with OpenOffice.org. If not, see
+ * <http://www.openoffice.org/license.html>
+ * for a copy of the LGPLv3 License.
+ *
+ ************************************************************************/
+
+// MARKER(update_precomp.py): autogen include statement, do not remove
+#include "precompiled_i18npool.hxx"
+#include <breakiterator_unicode.hxx>
+#include <localedata.hxx>
+#include <unicode/uchar.h>
+#include <unicode/locid.h>
+#include <unicode/rbbi.h>
+#include <unicode/udata.h>
+#include <rtl/strbuf.hxx>
+#include <rtl/ustring.hxx>
+
+U_CDECL_BEGIN
+extern const char OpenOffice_dat[];
+U_CDECL_END
+
+using namespace ::com::sun::star;
+using namespace ::com::sun::star::lang;
+using namespace ::rtl;
+
+namespace com { namespace sun { namespace star { namespace i18n {
+
+#define ERROR ::com::sun::star::uno::RuntimeException()
+
+//#define ImplementName "com.sun.star.i18n.BreakIterator_Unicode";
+
+
+// Constructs the break iterator service with no ICU iterator loaded yet:
+// icuBI starts out NULL and the per-kind slots (character, word, sentence,
+// line) are value-initialized. loadICUBreakIterator() selects and fills a
+// slot on demand.
+BreakIterator_Unicode::BreakIterator_Unicode() :
+ cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ), // implementation name
+ wordRule( "word" ), // initial word rule name; loadICUBreakIterator() replaces it per word type
+ lineRule( "line" ), // rule name handed to loadICUBreakIterator() by getLineBreak()
+ result(), // Boundary reused as the return value of the word functions
+ character(),
+ word(),
+ sentence(),
+ line(),
+ icuBI( NULL ), // slot currently in use; set by loadICUBreakIterator()
+ aLocale(), // locale / break type / word type of the most recent load
+ aBreakType(),
+ aWordType()
+{
+}
+
+
+// Releases every ICU break iterator still held by this object.
+BreakIterator_Unicode::~BreakIterator_Unicode()
+{
+    // Free the iterator of the currently selected slot first and clear the
+    // pointer, so the per-slot deletes below cannot free it a second time.
+    if (icuBI != NULL) {
+        delete icuBI->aBreakIterator;
+        icuBI->aBreakIterator = NULL;
+    }
+
+    // Deleting a null pointer is a no-op, so no per-slot checks are needed.
+    delete character.aBreakIterator;
+    delete word.aBreakIterator;
+    delete sentence.aBreakIterator;
+    delete line.aBreakIterator;
+}
+
+/*
+    Thin subclass of RuleBasedBreakIterator whose only purpose is to expose
+    the base class's setBreakType() method through a public member.
+*/
+class OOoRuleBasedBreakIterator : public RuleBasedBreakIterator {
+public:
+    // Builds the iterator from a data image; errors are reported via status.
+    OOoRuleBasedBreakIterator(UDataMemory* image, UErrorCode &status)
+        : RuleBasedBreakIterator(image, status)
+    {
+    }
+
+    // Public forwarder to setBreakType().
+    void publicSetBreakType(int32_t type)
+    {
+        setBreakType(type);
+    }
+};
+
+// Opens the "brk" data item pName from the "OpenOffice" application data and
+// wraps it in a rule based break iterator.
+// Returns NULL (with rStatus holding the failure code) if the item cannot be
+// opened or the iterator cannot be built from it. An iterator whose
+// construction failed is deleted here, so callers never receive or leak an
+// unusable object.
+static OOoRuleBasedBreakIterator* lcl_openRuleBreakIterator(const sal_Char *pName, UErrorCode &rStatus)
+{
+    rStatus = U_ZERO_ERROR;
+    UDataMemory *pUData = udata_open("OpenOffice", "brk", pName, &rStatus);
+    if ( !U_SUCCESS(rStatus) )
+        return NULL;
+
+    // NOTE(review): the iterator is presumed to adopt pUData, so it is not
+    // closed separately on failure -- confirm against the ICU version in use.
+    OOoRuleBasedBreakIterator *pRbi = new OOoRuleBasedBreakIterator(pUData, rStatus);
+    if ( !U_SUCCESS(rStatus) ) {
+        delete pRbi;
+        return NULL;
+    }
+    return pRbi;
+}
+
+// Loads the ICU break iterator for a request on demand.
+//
+// Selects the slot matching rBreakType (character / word / sentence / line),
+// (re)creates its iterator when none exists yet or when the requested word
+// type or locale differs from the last load, and finally makes sure the
+// iterator is attached to rText.
+//
+// rLocale     locale the iterator is created for
+// rBreakType  one of the LOAD_*_BREAKITERATOR constants
+// rWordType   WordType constant; selects the rule set for word iterators
+// rule        base name of the rule data to try first; NULL skips the
+//             OpenOffice rule data and goes straight to ICU's default
+//             (word requests overwrite it according to rWordType)
+// rText       text the iterator will be run over
+//
+// Throws uno::RuntimeException if no iterator can be provided.
+//
+// NOTE(review): aLocale and aWordType record only the most recent load, but
+// each of the four slots keeps its own iterator. A slot filled for one locale
+// can therefore be reused for another locale after a different slot has been
+// loaded in between. Fixing this needs per-slot locale storage in the header.
+void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::lang::Locale& rLocale,
+        sal_Int16 rBreakType, sal_Int16 rWordType, const sal_Char *rule, const OUString& rText) throw(uno::RuntimeException)
+{
+    sal_Bool newBreak = sal_False;
+    UErrorCode status = U_ZERO_ERROR;
+    sal_Int16 breakType = 0; // index into the locale data's break iterator rule list
+    switch (rBreakType) {
+        case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
+        case LOAD_WORD_BREAKITERATOR: icuBI=&word;
+            switch (rWordType) {
+                case WordType::ANYWORD_IGNOREWHITESPACES: breakType = 0; rule=wordRule = "edit_word"; break;
+                case WordType::DICTIONARY_WORD: breakType = 1; rule=wordRule = "dict_word"; break;
+                case WordType::WORD_COUNT: breakType = 2; rule=wordRule = "count_word"; break;
+            }
+            break;
+        case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
+        case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
+    }
+
+    // An unrecognized rBreakType leaves icuBI untouched. If no slot has ever
+    // been selected, fail cleanly instead of dereferencing NULL below.
+    if (!icuBI)
+        throw ERROR;
+
+    if (!icuBI->aBreakIterator || rWordType != aWordType ||
+            rLocale.Language != aLocale.Language || rLocale.Country != aLocale.Country ||
+            rLocale.Variant != aLocale.Variant) {
+        // Drop the stale iterator (deleting NULL is a no-op).
+        delete icuBI->aBreakIterator;
+        icuBI->aBreakIterator=NULL;
+
+        if (rule) {
+            uno::Sequence< OUString > breakRules = LocaleData().getBreakIteratorRules(rLocale);
+
+            status = U_ZERO_ERROR;
+            udata_setAppData("OpenOffice", OpenOffice_dat, &status);
+            if ( !U_SUCCESS(status) ) throw ERROR;
+
+            OOoRuleBasedBreakIterator *rbi = NULL;
+
+            if (breakRules.getLength() > breakType && breakRules[breakType].getLength() > 0) {
+                // The locale data names an explicit rule file for this break type.
+                rbi = lcl_openRuleBreakIterator(
+                    OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), status);
+            } else {
+                // Try "<rule>_<language>" first, then the plain "<rule>".
+                OStringBuffer aUDName(64);
+                aUDName.append(rule);
+                aUDName.append('_');
+                aUDName.append( OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US));
+                rbi = lcl_openRuleBreakIterator(aUDName.getStr(), status);
+                if (!rbi)
+                    rbi = lcl_openRuleBreakIterator(rule, status);
+            }
+            if (rbi) {
+                switch (rBreakType) {
+                    case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break;
+                    case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break;
+                    case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break;
+                    case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break;
+                }
+                icuBI->aBreakIterator = rbi;
+            }
+        }
+
+        // No usable OpenOffice rule data: fall back to ICU's own iterator for the locale.
+        if (!icuBI->aBreakIterator) {
+            icu::Locale icuLocale(
+                    OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US).getStr(),
+                    OUStringToOString(rLocale.Country, RTL_TEXTENCODING_ASCII_US).getStr(),
+                    OUStringToOString(rLocale.Variant, RTL_TEXTENCODING_ASCII_US).getStr());
+
+            status = U_ZERO_ERROR;
+            switch (rBreakType) {
+                case LOAD_CHARACTER_BREAKITERATOR:
+                    icuBI->aBreakIterator = icu::BreakIterator::createCharacterInstance(icuLocale, status);
+                    break;
+                case LOAD_WORD_BREAKITERATOR:
+                    icuBI->aBreakIterator = icu::BreakIterator::createWordInstance(icuLocale, status);
+                    break;
+                case LOAD_SENTENCE_BREAKITERATOR:
+                    icuBI->aBreakIterator = icu::BreakIterator::createSentenceInstance(icuLocale, status);
+                    break;
+                case LOAD_LINE_BREAKITERATOR:
+                    icuBI->aBreakIterator = icu::BreakIterator::createLineInstance(icuLocale, status);
+                    break;
+            }
+            if ( !U_SUCCESS(status) ) {
+                icuBI->aBreakIterator=NULL;
+                throw ERROR;
+            }
+        }
+        if (icuBI->aBreakIterator) {
+            // Remember what the iterator was built for, so the next call can reuse it.
+            aLocale=rLocale;
+            aWordType=rWordType;
+            aBreakType=rBreakType;
+            newBreak=sal_True;
+        } else {
+            throw ERROR;
+        }
+    }
+
+    // Attach the text if the iterator is new or the text changed since the last call.
+    // UChar != sal_Unicode in MinGW, hence the reinterpret_cast.
+    const UnicodeString aNewText(reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength());
+    if (newBreak || icuBI->aICUText.compare(aNewText)) {
+        icuBI->aICUText=aNewText;
+        icuBI->aBreakIterator->setText(icuBI->aICUText);
+    }
+}
+
+
+// Advances nStartPos by up to nCount steps and returns the new position.
+// In SKIPCELL mode a step is one boundary of the ICU character iterator;
+// otherwise a step is one code point. nDone receives the number of steps
+// actually taken.
+sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
+        sal_Int32 nStartPos, const lang::Locale &rLocale,
+        sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
+        throw(uno::RuntimeException)
+{
+    if (nCharacterIteratorMode != CharacterIteratorMode::SKIPCELL) {
+        // CHARACTER mode: walk code point by code point, stopping at the end of the text.
+        nDone = 0;
+        while (nDone < nCount && nStartPos < Text.getLength()) {
+            Text.iterateCodePoints(&nStartPos, 1);
+            ++nDone;
+        }
+        return nStartPos;
+    }
+
+    // CELL mode: let the ICU character iterator find each following boundary.
+    loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
+    nDone = 0;
+    while (nDone < nCount) {
+        nStartPos = character.aBreakIterator->following(nStartPos);
+        if (nStartPos == BreakIterator::DONE)
+            return Text.getLength(); // ran off the end; the failed step is not counted
+        ++nDone;
+    }
+    return nStartPos;
+}
+
+// Moves nStartPos backwards by up to nCount steps and returns the new position.
+// In SKIPCELL mode a step is one boundary of the ICU character iterator;
+// otherwise a step is one code point. nDone receives the number of steps
+// actually taken.
+sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
+        sal_Int32 nStartPos, const lang::Locale& rLocale,
+        sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
+        throw(uno::RuntimeException)
+{
+    if (nCharacterIteratorMode != CharacterIteratorMode::SKIPCELL) {
+        // For BS to delete one char, and for CHARACTER mode: step back code point by code point.
+        nDone = 0;
+        while (nDone < nCount && nStartPos > 0) {
+            Text.iterateCodePoints(&nStartPos, -1);
+            ++nDone;
+        }
+        return nStartPos;
+    }
+
+    // CELL mode: let the ICU character iterator find each preceding boundary.
+    loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
+    nDone = 0;
+    while (nDone < nCount) {
+        nStartPos = character.aBreakIterator->preceding(nStartPos);
+        if (nStartPos == BreakIterator::DONE)
+            return 0; // ran off the start; the failed step is not counted
+        ++nDone;
+    }
+    return nStartPos;
+}
+
+
+// Returns the boundary of the word following nStartPos.
+// For ANYWORD_IGNOREWHITESPACES and DICTIONARY_WORD a segment that starts
+// with whitespace is skipped. If no following boundary exists, startPos and
+// endPos are both set to whatever the iterator reported.
+Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
+        const lang::Locale& rLocale, sal_Int16 rWordType ) throw(uno::RuntimeException)
+{
+    loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
+
+    const bool bSkipWhitespace =
+        rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
+        rWordType == WordType::DICTIONARY_WORD;
+
+    result.startPos = word.aBreakIterator->following(nStartPos);
+    if (result.startPos == BreakIterator::DONE || result.startPos >= Text.getLength()) {
+        // Nothing follows: report an empty boundary.
+        result.endPos = result.startPos;
+        return result;
+    }
+
+    // iterateCodePoints(.., 0) only peeks at the code point at startPos.
+    if (bSkipWhitespace && u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)))
+        result.startPos = word.aBreakIterator->following(result.startPos);
+
+    result.endPos = word.aBreakIterator->following(result.startPos);
+    if (result.endPos == BreakIterator::DONE)
+        result.endPos = result.startPos;
+
+    return result;
+}
+
+
+// Returns the boundary of the word preceding nStartPos.
+// For ANYWORD_IGNOREWHITESPACES and DICTIONARY_WORD a segment that starts
+// with whitespace is skipped. If no preceding boundary exists, startPos and
+// endPos are both set to whatever the iterator reported.
+Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
+        const lang::Locale& rLocale, sal_Int16 rWordType) throw(uno::RuntimeException)
+{
+    loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
+
+    const bool bSkipWhitespace =
+        rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
+        rWordType == WordType::DICTIONARY_WORD;
+
+    result.startPos = word.aBreakIterator->preceding(nStartPos);
+    if (result.startPos == BreakIterator::DONE || result.startPos < 0) {
+        // Nothing precedes: report an empty boundary.
+        result.endPos = result.startPos;
+        return result;
+    }
+
+    // iterateCodePoints(.., 0) only peeks at the code point at startPos.
+    if (bSkipWhitespace && u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)))
+        result.startPos = word.aBreakIterator->preceding(result.startPos);
+
+    result.endPos = word.aBreakIterator->following(result.startPos);
+    if (result.endPos == BreakIterator::DONE)
+        result.endPos = result.startPos;
+
+    return result;
+}
+
+
+// Returns the word boundary around nPos.
+// When nPos itself is a boundary, bDirection chooses whether the segment
+// after it (forward) or before it is reported; position 0 always looks
+// forward and the end of the text always looks backward. Otherwise the
+// segment enclosing nPos is reported, with nPos clamped to the text range.
+Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
+        sal_Int16 rWordType, sal_Bool bDirection ) throw(uno::RuntimeException)
+{
+    loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
+    const sal_Int32 nTextLen = Text.getLength();
+
+    if (!word.aBreakIterator->isBoundary(nPos)) {
+        if (nPos <= 0) {
+            // Before the text: first segment, or an empty one for empty text.
+            result.startPos = 0;
+            if (nTextLen)
+                result.endPos = word.aBreakIterator->following(static_cast<sal_Int32>(0));
+            else
+                result.endPos = 0;
+        } else if (nPos >= nTextLen) {
+            // Past the text: last segment.
+            result.startPos = word.aBreakIterator->preceding(nTextLen);
+            result.endPos = nTextLen;
+        } else {
+            // Inside a segment: report the boundaries on either side.
+            result.startPos = word.aBreakIterator->preceding(nPos);
+            result.endPos = word.aBreakIterator->following(nPos);
+        }
+    } else {
+        const bool bForward = (bDirection || nPos == 0) && nPos < nTextLen;
+        result.startPos = nPos;
+        result.endPos = nPos;
+        if (bForward)
+            result.endPos = word.aBreakIterator->following(nPos);
+        else
+            result.startPos = word.aBreakIterator->preceding(nPos);
+    }
+
+    // Collapse onto the valid end if the iterator found no boundary on one side.
+    if (result.startPos == BreakIterator::DONE)
+        result.startPos = result.endPos;
+    else if (result.endPos == BreakIterator::DONE)
+        result.endPos = result.startPos;
+
+    return result;
+}
+
+
+// Returns the start of the sentence containing nStartPos, moved forward over
+// any whitespace at the start of that sentence.
+sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
+        const lang::Locale &rLocale ) throw(uno::RuntimeException)
+{
+    loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
+
+    const sal_Int32 nTextLen = Text.getLength();
+
+    // issue #i27703# treat end position as part of last sentence
+    if (nTextLen > 0 && nStartPos == nTextLen)
+        Text.iterateCodePoints(&nStartPos, -1);
+
+    if (!sentence.aBreakIterator->isBoundary(nStartPos))
+        nStartPos = sentence.aBreakIterator->preceding(nStartPos);
+
+    // Skip whitespace at the sentence start: read code points until a
+    // non-whitespace one (or the end of the text) has been consumed ...
+    sal_uInt32 nCodePoint;
+    do {
+        nCodePoint = Text.iterateCodePoints(&nStartPos, 1);
+    } while (nStartPos < nTextLen && u_isWhitespace(nCodePoint));
+    // ... then step back onto the last code point that was read.
+    Text.iterateCodePoints(&nStartPos, -1);
+
+    return nStartPos;
+}
+
+// Returns the end of the sentence containing nStartPos, moved back over any
+// whitespace that precedes the sentence boundary.
+sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
+        const lang::Locale &rLocale ) throw(uno::RuntimeException)
+{
+    loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
+
+    const sal_Int32 nTextLen = Text.getLength();
+
+    // issue #i27703# treat end position as part of last sentence
+    if (nTextLen > 0 && nStartPos == nTextLen)
+        Text.iterateCodePoints(&nStartPos, -1);
+
+    nStartPos = sentence.aBreakIterator->following(nStartPos);
+
+    // Walk backwards from the boundary; every whitespace code point found
+    // pulls the reported end back to its position.
+    sal_Int32 nScan = nStartPos;
+    while (nScan > 0) {
+        if (!u_isWhitespace(Text.iterateCodePoints(&nScan, -1)))
+            break;
+        nStartPos = nScan;
+    }
+
+    return nStartPos;
+}
+
+// Finds where a line may be broken at or before nStartPos.
+//
+// Text          text to break
+// nStartPos     position the line would like to break at
+// rLocale       locale used to load the line (and word) break iterators
+// nMinBreakPos  a hyphenation point before this position is rejected
+//               (breakIndex is then set to -1)
+// hOptions      hyphenation options; rHyphenator may be empty
+//
+// Returns the break index together with its type: WORDBOUNDARY for a line
+// boundary, or HYPHENATION (with rHyphenatedWord set) when the hyphenator
+// supplied a break. A candidate break that sits on a Word Joiner (U+2060),
+// possibly with whitespace in between, is glued: the search is repeated
+// further left, and breakIndex becomes 0 if the start of the text is reached.
+LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
+        const OUString& Text, sal_Int32 nStartPos,
+        const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
+        const LineBreakHyphenationOptions& hOptions,
+        const LineBreakUserOptions& /*rOptions*/ ) throw(uno::RuntimeException)
+{
+    LineBreakResults lbr;
+
+    // At or past the end of the text the only possible break is the end itself.
+    if (nStartPos >= Text.getLength()) {
+        lbr.breakIndex = Text.getLength();
+        lbr.breakType = BreakType::WORDBOUNDARY;
+        return lbr;
+    }
+
+    loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
+
+    sal_Bool GlueSpace=sal_True;
+    while (GlueSpace) {
+        if (line.aBreakIterator->preceding(nStartPos + 1) == nStartPos) { //Line boundary break
+            lbr.breakIndex = nStartPos;
+            lbr.breakType = BreakType::WORDBOUNDARY;
+        } else if (hOptions.rHyphenator.is()) { //Hyphenation break
+            Boundary wBoundary = getWordBoundary( Text, nStartPos, rLocale,
+                    WordType::DICTIONARY_WORD, false);
+            uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord;
+            aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
+                        wBoundary.endPos - wBoundary.startPos), rLocale,
+                    static_cast<sal_Int16>(hOptions.hyphenIndex - wBoundary.startPos), hOptions.aHyphenationOptions);
+            if (aHyphenatedWord.is()) {
+                lbr.rHyphenatedWord = aHyphenatedWord;
+                if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
+                    lbr.breakIndex = -1;
+                else
+                    lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
+                lbr.breakType = BreakType::HYPHENATION;
+            } else {
+                // The hyphenator found nothing: fall back to the previous line boundary.
+                lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
+                lbr.breakType = BreakType::WORDBOUNDARY;
+            }
+        } else { //word boundary break
+            lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
+            lbr.breakType = BreakType::WORDBOUNDARY;
+        }
+
+#define WJ 0x2060 // Word Joiner
+        GlueSpace=sal_False;
+        if (lbr.breakType == BreakType::WORDBOUNDARY) {
+            nStartPos = lbr.breakIndex;
+            // preceding() may have reported BreakIterator::DONE (a negative
+            // index); never index Text with it.
+            if (nStartPos >= 0 && Text[nStartPos--] == WJ)
+                GlueSpace=sal_True;
+            // Walk left over whitespace and Word Joiners; any Word Joiner
+            // seen forces another search pass further left.
+            while (nStartPos >= 0 &&
+                    (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
+                if (Text[nStartPos--] == WJ)
+                    GlueSpace=sal_True;
+            }
+            if (GlueSpace && nStartPos < 0) {
+                // Glued all the way back to the start of the text.
+                lbr.breakIndex = 0;
+                break;
+            }
+        }
+    }
+
+    return lbr;
+}
+
+
+
+// Returns the implementation name stored in cBreakIterator.
+OUString SAL_CALL
+BreakIterator_Unicode::getImplementationName(void) throw( uno::RuntimeException )
+{
+    const OUString aImplName( OUString::createFromAscii(cBreakIterator) );
+    return aImplName;
+}
+
+// Reports whether rServiceName equals the name stored in cBreakIterator.
+sal_Bool SAL_CALL
+BreakIterator_Unicode::supportsService(const OUString& rServiceName) throw( uno::RuntimeException )
+{
+    // compareToAscii() yields 0 for equal strings.
+    return rServiceName.compareToAscii(cBreakIterator) == 0;
+}
+
+// Returns a one-element sequence holding the name stored in cBreakIterator.
+uno::Sequence< OUString > SAL_CALL
+BreakIterator_Unicode::getSupportedServiceNames(void) throw( uno::RuntimeException )
+{
+    const OUString aServiceName( OUString::createFromAscii(cBreakIterator) );
+    uno::Sequence< OUString > aNames( &aServiceName, 1 );
+    return aNames;
+}
+
+} } } }
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */