summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Martin Hosken <martin_hosken@sil.org> 2016-03-17 09:57:35 +0700
committer Andras Timar <andras.timar@collabora.com> 2017-04-23 22:00:54 +0200
commit 3da9c44f67a0f4e8ac31ca2b64871ff58ab9c9cb (patch)
tree 2169ec1739b9991bc22f6861f80047f9c000ae5e
parent 3ad820385901e9bcd976d1c90f74e77bc17a7cc8 (diff)
Fix bug in khmr linebreaking and update dictionary
Change-Id: I2b776925c2c95cb56ccd592d036823c26054e059
Reviewed-on: https://gerrit.libreoffice.org/23316
Tested-by: Jenkins <ci@libreoffice.org>
Reviewed-by: Martin Hosken <martin_hosken@sil.org>
(cherry picked from commit a976a19ca82661d8b459b85f5514b0e4c9222d47)
(cherry picked from commit 55dece94611e1b2a8a1974d11c10050d8d74b5f7)
-rw-r--r-- external/icu/khmerbreakengine.patch | 327
-rw-r--r-- external/icu/khmerdict.dict | bin 211340 -> 263537 bytes
2 files changed, 17 insertions, 310 deletions
diff --git a/external/icu/khmerbreakengine.patch b/external/icu/khmerbreakengine.patch
index ba3e392a27f3..bc0d287929b0 100644
--- a/external/icu/khmerbreakengine.patch
+++ b/external/icu/khmerbreakengine.patch
@@ -2,7 +2,7 @@ diff --git a/source/common/dictbe.cpp b/source/common/dictbe.cpp
index f1c874d..3ad1b3f 100644
--- misc/icu/source/common/dictbe.cpp
+++ build/icu/source/common/dictbe.cpp
-@@ -27,8 +27,16 @@ U_NAMESPACE_BEGIN
+@@ -27,8 +27,17 @@ U_NAMESPACE_BEGIN
******************************************************************
*/
@@ -14,13 +14,14 @@ index f1c874d..3ad1b3f 100644
fTypes = breakTypes;
+ fViramaSet.applyPattern(UNICODE_STRING_SIMPLE("[[:ccc=VR:]]"), status);
+
++ // note Skip Sets contain fIgnoreSet characters too.
+ fSkipStartSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=OP:][:lb=QU:]]\\u200C\\u200D\\u2060"), status);
+ fSkipEndSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CP:][:lb=QU:][:lb=EX:][:lb=CL:]]\\u200C\\u200D\\u2060"), status);
+ fNBeforeSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CR:][:lb=LF:][:lb=NL:][:lb=SP:][:lb=ZW:][:lb=IS:][:lb=BA:][:lb=NS:]]"), status);
}
DictionaryBreakEngine::~DictionaryBreakEngine() {
-@@ -90,7 +98,7 @@ DictionaryBreakEngine::findBreaks( UText *text,
+@@ -90,7 +99,7 @@ DictionaryBreakEngine::findBreaks( UText *text,
result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
utext_setNativeIndex(text, current);
}
@@ -29,7 +30,7 @@ index f1c874d..3ad1b3f 100644
return result;
}
-@@ -101,6 +109,163 @@ DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
+@@ -101,6 +110,169 @@ DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
fSet.compact();
}
@@ -87,6 +88,8 @@ index f1c874d..3ad1b3f 100644
+ }
+ for (int i = 0; i < clusterLimit; ++i) { // scan backwards clusterLimit clusters
+ while (start > textStart) {
++ while (fIgnoreSet.contains(c))
++ c = utext_previous32(text);
+ if (!fMarkSet.contains(c)) {
+ if (fBaseSet.contains(c)) {
+ c = utext_previous32(text);
@@ -125,6 +128,10 @@ index f1c874d..3ad1b3f 100644
+ ++end;
+ }
+ for (int i = 0; i < clusterLimit; ++i) { // scan forwards clusterLimit clusters
++ while (fIgnoreSet.contains(c)) {
++ utext_next32(text);
++ c = utext_current32(text);
++ }
+ if (fBaseSet.contains(c)) {
+ while (end < textEnd) {
+ utext_next32(text);
@@ -193,7 +200,7 @@ index f1c874d..3ad1b3f 100644
/*
******************************************************************
* PossibleWord
-@@ -128,35 +293,35 @@ private:
+@@ -128,35 +302,35 @@ private:
public:
PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
~PossibleWord() {};
@@ -238,242 +245,7 @@ index f1c874d..3ad1b3f 100644
// Dictionary leaves text after longest prefix, not longest word. Back up.
if (count <= 0) {
utext_setNativeIndex(text, start);
-@@ -261,16 +426,16 @@ ThaiBreakEngine::divideUpDictionaryRange( UText *text,
- int32_t current;
- UErrorCode status = U_ZERO_ERROR;
- PossibleWord words[THAI_LOOKAHEAD];
--
-+
- utext_setNativeIndex(text, rangeStart);
--
-+
- while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
- cpWordLength = 0;
- cuWordLength = 0;
-
- // Look for candidate words at the current position
- int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
--
-+
- // If we found exactly one, use that
- if (candidates == 1) {
- cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
-@@ -291,12 +456,12 @@ ThaiBreakEngine::divideUpDictionaryRange( UText *text,
- words[wordsFound%THAI_LOOKAHEAD].markCurrent();
- wordsMatched = 2;
- }
--
-+
- // If we're already at the end of the range, we're done
- if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
- goto foundBest;
- }
--
-+
- // See if any of the possible second words is followed by a third word
- do {
- // If we find a third word, stop right away
-@@ -315,13 +480,13 @@ foundBest:
- cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
- wordsFound += 1;
- }
--
-+
- // We come here after having either found a word or not. We look ahead to the
- // next word. If it's not a dictionary word, we will combine it with the word we
- // just found (if there is one), but only if the preceding word does not exceed
- // the threshold.
- // The text iterator should now be positioned at the end of the word we found.
--
-+
- UChar32 uc = 0;
- if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
- // if it is a dictionary word, do nothing. If it isn't, then if there is
-@@ -357,12 +522,12 @@ foundBest:
- }
- }
- }
--
-+
- // Bump the word count if there wasn't already one
- if (cuWordLength <= 0) {
- wordsFound += 1;
- }
--
-+
- // Update the length with the passed-over characters
- cuWordLength += chars;
- }
-@@ -371,14 +536,14 @@ foundBest:
- utext_setNativeIndex(text, current+cuWordLength);
- }
- }
--
-+
- // Never stop before a combining mark.
- int32_t currPos;
- while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
- utext_next32(text);
- cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
- }
--
-+
- // Look ahead for possible suffixes if a dictionary word does not follow.
- // We do this in code rather than using a rule so that the heuristic
- // resynch continues to function. For example, one of the suffix characters
-@@ -496,16 +661,16 @@ LaoBreakEngine::divideUpDictionaryRange( UText *text,
- int32_t current;
- UErrorCode status = U_ZERO_ERROR;
- PossibleWord words[LAO_LOOKAHEAD];
--
-+
- utext_setNativeIndex(text, rangeStart);
--
-+
- while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
- cuWordLength = 0;
- cpWordLength = 0;
-
- // Look for candidate words at the current position
- int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
--
-+
- // If we found exactly one, use that
- if (candidates == 1) {
- cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
-@@ -526,12 +691,12 @@ LaoBreakEngine::divideUpDictionaryRange( UText *text,
- words[wordsFound%LAO_LOOKAHEAD].markCurrent();
- wordsMatched = 2;
- }
--
-+
- // If we're already at the end of the range, we're done
- if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
- goto foundBest;
- }
--
-+
- // See if any of the possible second words is followed by a third word
- do {
- // If we find a third word, stop right away
-@@ -549,7 +714,7 @@ foundBest:
- cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
- wordsFound += 1;
- }
--
-+
- // We come here after having either found a word or not. We look ahead to the
- // next word. If it's not a dictionary word, we will combine it withe the word we
- // just found (if there is one), but only if the preceding word does not exceed
-@@ -587,12 +752,12 @@ foundBest:
- }
- }
- }
--
-+
- // Bump the word count if there wasn't already one
- if (cuWordLength <= 0) {
- wordsFound += 1;
- }
--
-+
- // Update the length with the passed-over characters
- cuWordLength += chars;
- }
-@@ -601,14 +766,14 @@ foundBest:
- utext_setNativeIndex(text, current + cuWordLength);
- }
- }
--
-+
- // Never stop before a combining mark.
- int32_t currPos;
- while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
- utext_next32(text);
- cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
- }
--
-+
- // Look ahead for possible suffixes if a dictionary word does not follow.
- // We do this in code rather than using a rule so that the heuristic
- // resynch continues to function. For example, one of the suffix characters
-@@ -689,16 +854,16 @@ BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
- int32_t current;
- UErrorCode status = U_ZERO_ERROR;
- PossibleWord words[BURMESE_LOOKAHEAD];
--
-+
- utext_setNativeIndex(text, rangeStart);
--
-+
- while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
- cuWordLength = 0;
- cpWordLength = 0;
-
- // Look for candidate words at the current position
- int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
--
-+
- // If we found exactly one, use that
- if (candidates == 1) {
- cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
-@@ -719,12 +884,12 @@ BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
- words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
- wordsMatched = 2;
- }
--
-+
- // If we're already at the end of the range, we're done
- if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
- goto foundBest;
- }
--
-+
- // See if any of the possible second words is followed by a third word
- do {
- // If we find a third word, stop right away
-@@ -742,7 +907,7 @@ foundBest:
- cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
- wordsFound += 1;
- }
--
-+
- // We come here after having either found a word or not. We look ahead to the
- // next word. If it's not a dictionary word, we will combine it withe the word we
- // just found (if there is one), but only if the preceding word does not exceed
-@@ -780,12 +945,12 @@ foundBest:
- }
- }
- }
--
-+
- // Bump the word count if there wasn't already one
- if (cuWordLength <= 0) {
- wordsFound += 1;
- }
--
-+
- // Update the length with the passed-over characters
- cuWordLength += chars;
- }
-@@ -794,14 +959,14 @@ foundBest:
- utext_setNativeIndex(text, current + cuWordLength);
- }
- }
--
-+
- // Never stop before a combining mark.
- int32_t currPos;
- while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
- utext_next32(text);
- cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
- }
--
-+
- // Look ahead for possible suffixes if a dictionary word does not follow.
- // We do this in code rather than using a rule so that the heuristic
- // resynch continues to function. For example, one of the suffix characters
-@@ -828,51 +993,28 @@ foundBest:
+@@ -828,51 +1002,28 @@ foundBest:
* KhmerBreakEngine
*/
@@ -536,7 +308,7 @@ index f1c874d..3ad1b3f 100644
}
KhmerBreakEngine::~KhmerBreakEngine() {
-@@ -884,180 +1027,204 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text,
+@@ -884,180 +1036,204 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UStack &foundBreaks ) const {
@@ -560,10 +332,10 @@ index f1c874d..3ad1b3f 100644
+ startZwsp = scanBeforeStart(text, scanStart, breakStart);
+ }
+ utext_setNativeIndex(text, rangeStart);
-+ scanFwdClusters(text, rangeEnd, initAfter);
++ scanFwdClusters(text, rangeStart, initAfter);
+ bool endZwsp = scanAfterEnd(text, utext_nativeLength(text), scanEnd, breakEnd);
+ utext_setNativeIndex(text, rangeEnd - 1);
-+ scanBackClusters(text, rangeStart, finalBefore);
++ scanBackClusters(text, rangeEnd, finalBefore);
+ if (finalBefore < initAfter) { // the whole run is tented so no breaks
+ if (breakStart || fTypes < UBRK_LINE)
+ foundBreaks.push(rangeStart, status);
@@ -715,7 +487,7 @@ index f1c874d..3ad1b3f 100644
+ if (count == 0) {
+ utext_setNativeIndex(text, ix);
+ int32_t c = utext_current32(text);
-+ if (fPuncSet.contains(c) || c == ZWSP || c == WJ) {
++ if (fPuncSet.contains(c) || fIgnoreSet.contains(c) || c == ZWSP) {
+ values.setElementAt(0, count);
+ lengths.setElementAt(1, count++);
+ } else if (fBaseSet.contains(c)) {
@@ -767,7 +539,7 @@ index f1c874d..3ad1b3f 100644
+ int32_t ln = lengths.elementAti(j);
+ utext_setNativeIndex(text, ln+ix);
+ int32_t c = utext_current32(text);
-+ while (fPuncSet.contains(c)) {
++ while (fPuncSet.contains(c) || fIgnoreSet.contains(c)) {
+ ++ln;
+ utext_next32(text);
+ c = utext_current32(text);
@@ -887,71 +659,6 @@ index f1c874d..3ad1b3f 100644
}
#if !UCONFIG_NO_NORMALIZATION
-@@ -1121,7 +1288,7 @@ static inline int32_t utext_i32_flag(int32_t bitIndex) {
- return (int32_t)1 << bitIndex;
- }
-
--
-+
- /*
- * @param text A UText representing the text
- * @param rangeStart The start of the range of dictionary characters
-@@ -1129,7 +1296,7 @@ static inline int32_t utext_i32_flag(int32_t bitIndex) {
- * @param foundBreaks Output of C array of int32_t break positions, or 0
- * @return The number of breaks found
- */
--int32_t
-+int32_t
- CjkBreakEngine::divideUpDictionaryRange( UText *inText,
- int32_t rangeStart,
- int32_t rangeEnd,
-@@ -1192,7 +1359,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
- if (U_FAILURE(status)) {
- return 0;
- }
--
-+
- UnicodeString fragment;
- UnicodeString normalizedFragment;
- for (int32_t srcI = 0; srcI < inString.length();) { // Once per normalization chunk
-@@ -1261,7 +1428,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
- }
- }
- }
--
-+
- // bestSnlp[i] is the snlp of the best segmentation of the first i
- // code points in the range to be matched.
- UVector32 bestSnlp(numCodePts + 1, status);
-@@ -1271,7 +1438,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
- }
-
-
-- // prev[i] is the index of the last CJK code point in the previous word in
-+ // prev[i] is the index of the last CJK code point in the previous word in
- // the best segmentation of the first i characters.
- UVector32 prev(numCodePts + 1, status);
- for(int32_t i = 0; i <= numCodePts; i++){
-@@ -1305,8 +1472,8 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
- // Note: lengths is filled with code point lengths
- // The NULL parameter is the ignored code unit lengths.
-
-- // if there are no single character matches found in the dictionary
-- // starting with this charcter, treat character as a 1-character word
-+ // if there are no single character matches found in the dictionary
-+ // starting with this charcter, treat character as a 1-character word
- // with the highest value possible, i.e. the least likely to occur.
- // Exclude Korean characters from this treatment, as they should be left
- // together by default.
-@@ -1380,7 +1547,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
- numBreaks++;
- }
-
-- // Now that we're done, convert positions in t_boundary[] (indices in
-+ // Now that we're done, convert positions in t_boundary[] (indices in
- // the normalized input string) back to indices in the original input UText
- // while reversing t_boundary and pushing values to foundBreaks.
- for (int32_t i = numBreaks-1; i >= 0; i--) {
diff --git a/source/common/dictbe.h b/source/common/dictbe.h
index d3488cd..26caa75 100644
--- misc/icu/source/common/dictbe.h
diff --git a/external/icu/khmerdict.dict b/external/icu/khmerdict.dict
index c935cd088659..52605b65469d 100644
--- a/external/icu/khmerdict.dict
+++ b/external/icu/khmerdict.dict
Binary files differ