summary | refs | log | tree | commit | diff
path: root/i18npool/source/breakiterator/xdictionary.cxx
diff options
context:
space:
mode:
author: Ivo Hinkelmann <ihi@openoffice.org> 2009-09-16 14:55:33 +0000
committer: Ivo Hinkelmann <ihi@openoffice.org> 2009-09-16 14:55:33 +0000
commit: e260d2a64d6de337cc0aa991e577a1c00af4a936 (patch)
tree: eb0858a0f45a8037b16e07d8106599841d46c4cb /i18npool/source/breakiterator/xdictionary.cxx
parent: e66e25555bc9d6c95144d349235216db4420c099 (diff)
CWS-TOOLING: integrate CWS locales32
2009-09-06 19:13:15 +0200 er r275860 : #i104308# Lower and Upper Sorbian 2009-09-03 01:40:09 +0200 erack r275732 : CWS-TOOLING: rebase CWS locales32 to trunk@275331 (milestone: DEV300:m56) 2009-08-17 21:48:26 +0200 erack r275072 : * #i102920# use OUString::iterateCodePoints() in xdictionary; patch from <cmc> * Adapted local iterateCodePoints() in breakiteratorImpl.cxx to cope with surrogates at text end. * Use OUString::iterateCodePoints() in BreakIterator_CJK::getLineBreak() 2009-08-09 23:14:39 +0200 erack r274792 : mergeinfo 2009-08-09 23:09:31 +0200 erack r274791 : re-adding this again, merge from tag didn't work; SVN IS A PIECE OF CRAP 2009-08-09 22:45:02 +0200 erack r274790 : don't re-add file, merge instead 2009-08-09 22:09:49 +0200 erack r274789 : cws rebase ERRONEOUSLY REMOVED THIS FILE! 2009-08-09 22:01:02 +0200 erack r274788 : adapt to new postset.mk completelangiso content; get rid of /i modifier uglyness 2009-08-09 19:40:46 +0200 erack r274787 : #i99823# sort currency list ISO,symbol,language/country; removed unused STR_EUROPE 2009-08-09 01:23:35 +0200 erack r274786 : make AZM currency legacyOnly to avoid duplicate listing in number formatter because of the identical 'man.' 
currency symbol 2009-08-09 01:11:38 +0200 erack r274785 : #i94445# make ROL currency legacyOnly to avoid duplicate listing in number formatter because of the identical 'lei' currency symbol 2009-08-06 19:10:34 +0200 erack r274743 : CWS-TOOLING: rebase CWS locales32 to trunk@274622 (milestone: DEV300:m54) 2009-08-04 21:26:24 +0200 erack r274634 : #i94445# new currency RON 2009-08-04 21:06:55 +0200 erack r274633 : #i103193# corrected data; from <calibaashi> 2009-07-18 23:56:55 +0200 erack r274104 : #i103408# ignore an empty LANGUAGE variable; patch from <cmc> 2009-07-07 00:21:02 +0200 erack r273764 : Langpack.ulf is gone 2009-07-07 00:02:39 +0200 erack r273763 : #i103358# add 'is' Icelandic 2009-07-04 02:10:55 +0200 erack r273720 : #i101173# typographic quotation marks 2009-07-04 01:07:44 +0200 erack r273719 : #i65127# make it compile with OSL_DEBUG_LEVEL>2 again 2009-06-26 22:50:49 +0200 erack r273444 : #i97602# add Asturian_Spain [ast-ES]; locale data contributed by <astur>/<it46> 2009-06-26 22:01:25 +0200 erack r273443 : #i101173# add Oromo [om-ET] locale data; contributed by <barreessaa> 2009-06-26 21:55:41 +0200 erack r273442 : #i101173# add Oromo [om-ET] locale data; contributed by <barreessaa> 2009-06-26 21:11:46 +0200 erack r273441 : #i102991# linguistic corrections 2009-06-26 21:02:45 +0200 erack r273440 : #i102986# add Somali 'so' 2009-06-22 00:04:35 +0200 erack r273190 : #i101235# add Uyghur_China [ug-CN] locale; contributed by <sahran>/<it46> 2009-06-21 23:35:21 +0200 erack r273189 : blah 2009-06-21 23:22:40 +0200 erack r273188 : #i99972# add Quechua (Ecuador) [qu-EC] 2009-06-21 21:57:29 +0200 erack r273187 : #i97791# add Yiddish [yi-IL] 2009-06-21 21:24:29 +0200 erack r273186 : #i102186# add Greek, Ancient [grc-GR] 2009-06-21 21:00:59 +0200 erack r273185 : #i98489# add Arabic (Oman) [ar-OM] locale data; contributed by <zayed2001>/<it46> 2009-06-21 20:54:07 +0200 erack r273184 : check ListSeparator for ';' semicolon, for consistency 2009-06-21 20:20:13 
+0200 erack r273183 : ISO 4217 checks only if not legacy (e.g. Macau Pound 'P') 2009-06-21 20:00:27 +0200 erack r273182 : check CurrencyID and BankSymbol for ISO 4217 2009-06-21 19:10:36 +0200 erack r273181 : #i73118# Bokmål instead of Bokmal 2009-06-21 19:00:47 +0200 erack r273180 : #i99827# add Sardinian locale data; contributed by <valterubuntu> 2009-06-21 18:20:05 +0200 erack r273179 : give URL of svn instead of legacy cvs 2009-06-21 17:45:31 +0200 erack r273178 : #i87907# add Oromo [om-ET] 2009-06-21 17:23:14 +0200 erack r273177 : grep in completelangiso of postset.mk 2009-03-24 00:22:16 +0100 erack r269901 : #i100368# parentheses, parentheses, parentheses ... 2009-03-23 23:31:22 +0100 erack r269899 : #i99712# call OutlineNumberingLevel_Impl dtor 2009-03-23 22:37:31 +0100 erack r269897 : #i98347# register NumToCharHalfwidth 2009-03-22 18:07:40 +0100 erack r269852 : definitely ignore any output resulting from the cd command in list of rules, which may have led to a spurious rule file
Diffstat (limited to 'i18npool/source/breakiterator/xdictionary.cxx')
-rw-r--r-- i18npool/source/breakiterator/xdictionary.cxx 77
1 files changed, 52 insertions, 25 deletions
diff --git a/i18npool/source/breakiterator/xdictionary.cxx b/i18npool/source/breakiterator/xdictionary.cxx
index f286dd2449ac..add22f39d58e 100644
--- a/i18npool/source/breakiterator/xdictionary.cxx
+++ b/i18npool/source/breakiterator/xdictionary.cxx
@@ -126,8 +126,9 @@ void xdictionary::setJapaneseWordBreak()
japaneseWordBreak = sal_True;
}
-sal_Bool xdictionary::exists(const sal_Unicode c) {
- sal_Bool exist = existMark ? sal::static_int_cast<sal_Bool>((existMark[c>>3] & (1<<(c&0x07))) != 0) : sal_False;
+sal_Bool xdictionary::exists(const sal_uInt32 c) {
+ // 0x1FFF is the hardcoded limit in gendict for existMarks
+ sal_Bool exist = (existMark && ((c>>3) < 0x1FFF)) ? sal::static_int_cast<sal_Bool>((existMark[c>>3] & (1<<(c&0x07))) != 0) : sal_False;
if (!exist && japaneseWordBreak)
return BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN;
else
@@ -197,20 +198,35 @@ sal_Bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary) {
* @param pos : Position of the given character.
* @return true if CJK.
*/
-sal_Bool xdictionary::seekSegment(const sal_Unicode *text, sal_Int32 pos,
- sal_Int32 len, Boundary& segBoundary) {
- for (segBoundary.startPos = pos - 1;
- segBoundary.startPos >= 0 &&
- (u_isWhitespace((sal_uInt32)text[segBoundary.startPos]) || exists(text[segBoundary.startPos]));
- segBoundary.startPos--) ;
- segBoundary.startPos++;
-
- for (segBoundary.endPos = pos;
- segBoundary.endPos < len &&
- (u_isWhitespace((sal_uInt32)text[segBoundary.endPos]) || exists(text[segBoundary.endPos]));
- segBoundary.endPos++) ;
-
- return segBoundary.endPos > segBoundary.startPos + 1;
+sal_Bool xdictionary::seekSegment(const rtl::OUString &rText, sal_Int32 pos,
+ Boundary& segBoundary)
+{
+ sal_Int32 indexUtf16;
+ segBoundary.endPos = segBoundary.startPos = pos;
+
+ indexUtf16 = pos;
+ while (indexUtf16 > 0)
+ {
+ sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
+ if (u_isWhitespace(ch) || exists(ch))
+ segBoundary.startPos = indexUtf16;
+ else
+ break;
+ }
+
+ indexUtf16 = pos;
+ while (indexUtf16 < rText.getLength())
+ {
+ sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
+ if (u_isWhitespace(ch) || exists(ch))
+ segBoundary.endPos = indexUtf16;
+ else
+ break;
+ }
+
+ indexUtf16 = segBoundary.startPos;
+ rText.iterateCodePoints(&indexUtf16, 1);
+ return segBoundary.endPos > indexUtf16;
}
#define KANJA 1
@@ -340,19 +356,24 @@ Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, s
sal_Int32 len=rText.getLength();
if (anyPos >= len || anyPos < 0) {
boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
- } else if (seekSegment(text, anyPos, len, boundary)) { // character in dict
+ } else if (seekSegment(rText, anyPos, boundary)) { // character in dict
WordBreakCache& aCache = getCache(text, boundary);
sal_Int32 i = 0;
- while (aCache.wordboundary[i] <= (sal_Int32)anyPos - boundary.startPos) i++;
+ while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;
sal_Int32 startPos = aCache.wordboundary[i - 1];
// if bDirection is false
- if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos) &&
- u_isWhitespace((sal_uInt32) text[anyPos - 1]))
- i--;
- boundary.endPos = aCache.wordboundary[i] + boundary.startPos;
- boundary.startPos += aCache.wordboundary[i - 1];
+ if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
+ {
+ sal_Int32 indexUtf16 = anyPos-1;
+ sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
+ if (u_isWhitespace(ch))
+ i--;
+ }
+ boundary.endPos = boundary.startPos;
+ rText.iterateCodePoints(&boundary.endPos, aCache.wordboundary[i]);
+ rText.iterateCodePoints(&boundary.startPos, aCache.wordboundary[i-1]);
} else {
boundary.startPos = anyPos;
if (anyPos < len) rText.iterateCodePoints(&anyPos, 1);
@@ -360,8 +381,14 @@ Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, s
}
if (wordType == WordType::WORD_COUNT) {
// skip punctuation for word count.
- while (boundary.endPos < len && u_ispunct((sal_uInt32)text[boundary.endPos]))
- boundary.endPos++;
+ while (boundary.endPos < len)
+ {
+ sal_Int32 indexUtf16 = boundary.endPos;
+ if (u_ispunct(rText.iterateCodePoints(&indexUtf16, 1)))
+ boundary.endPos = indexUtf16;
+ else
+ break;
+ }
}
return boundary;