Drop custom Indic grapheme rules and rely on contemporary ICU defaults

Similar to:

    commit 8578a1c9d167c19f1d8038fac5946b4b3cae305e
    Date: Thu Nov 26 15:47:26 2020 +0200

    tdf#138481: Trust the built-in break iterator character data in ICU

    Don't use our own char.txt.

The char_in.txt hasn't really changed since 2008 and is woefully out of date at this point. We have cppunit tests for the only documented bugs that touched char_in.txt, #i111152# and tdf#40292; for tdf#40292, change the test to test what was actually reported as a bug.

Change-Id: I8e35b102b0a46d2c63e47e055e472892f65022ac
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/106763
Tested-by: Jenkins
Reviewed-by: Caolán McNamara <caolanm@redhat.com>
author: Caolán McNamara <caolanm@redhat.com> 2020-11-27 12:14:44 +0000
committer: Caolán McNamara <caolanm@redhat.com> 2020-11-28 20:31:29 +0100
commit: fc011ac027da7574baa4b431ac800020170ba8b0 (patch)
tree: 21066a3bda35695cd55c4cba4913f34da8e2eacf /i18npool
parent: 5ec2cf4e1ff448a3d9b3b30a825a40f962e0a53d (diff)
13 files changed, 18 insertions, 193 deletions
diff --git a/i18npool/CustomTarget_breakiterator.mk b/i18npool/CustomTarget_breakiterator.mk
index e6d658f01405..89827cfc6a92 100644
--- a/i18npool/CustomTarget_breakiterator.mk
+++ b/i18npool/CustomTarget_breakiterator.mk
@@ -42,15 +42,7 @@ $(i18npool_BIDIR)/dict_%.cxx : \
 
 endif
 
-# Do we want the char_in.brk data? It's for languages in India (and bn_BD)
-ifeq ($(WITH_LOCALES),)
-i18npool_breakiterator_want_in=TRUE
-else ifneq ($(filter bn bn_% gu gu_% hi hi_% kn kn_% ml ml_% or or_% pa pa_% ta ta_% te te_%,$(WITH_LOCALES)),)
-i18npool_breakiterator_want_in=TRUE
-endif
-
 i18npool_BRKTXTS := \
-    $(if $(i18npool_breakiterator_want_in),char_in.brk) \
     count_word.brk \
     $(call gb_Helper_optional_locale,he,dict_word_he.brk) \
     $(call gb_Helper_optional_locale,hu,dict_word_hu.brk) \
diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx
index 7d900fcfdbac..384041ccd5db 100644
--- a/i18npool/qa/cppunit/test_breakiterator.cxx
+++ b/i18npool/qa/cppunit/test_breakiterator.cxx
@@ -662,17 +662,29 @@ void TestBreakIterator::testGraphemeIteration()
     aLocale.Country = "IN";
 
     {
-        const sal_Unicode KA_VIRAMA_SSA[] = { 0x0B95, 0x0BCD, 0x0BB7 };
-        OUString aTest(KA_VIRAMA_SSA, SAL_N_ELEMENTS(KA_VIRAMA_SSA));
+        const sal_Unicode CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI [] = { 0x0B9A, 0x0BBF, 0x0BA4, 0x0BCD, 0x0BA4, 0x0BBF, 0x0BB0, 0x0BC8 };
+
+        OUString aTest(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI, SAL_N_ELEMENTS(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI));
 
         sal_Int32 nDone=0;
         sal_Int32 nPos = 0;
 
-        nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
-            i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
-        CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(KA_VIRAMA_SSA)), nPos);
-        nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VIRAMA_SSA), aLocale,
+        nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+        CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(2), nPos);
+        nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+        CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(4), nPos);
+        nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+        CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(6), nPos);
+        nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+        CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI)), nPos);
+        nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI), aLocale,
             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+        CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(6), nPos);
+        nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+        CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(4), nPos);
+        nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+        CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(2), nPos);
+        nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
     }
 
diff --git a/i18npool/source/breakiterator/data/char_in.txt b/i18npool/source/breakiterator/data/char_in.txt
deleted file mode 100644
index 414259524302..000000000000
--- a/i18npool/source/breakiterator/data/char_in.txt
+++ /dev/null
@@ -1,109 +0,0 @@
-#
-#   Copyright (C) 2002-2009, International Business Machines Corporation and others.
-#       All Rights Reserved.
-#
-#   file:  char.txt 
-#
-#   ICU Character Break Rules, also known as Grapheme Cluster Boundaries
-#      See Unicode Standard Annex #29.
-#      These rules are based on TR29 Revision 13, for Unicode Version 5.1
-#
-
-#
-#  Character Class Definitions.
-#
-$CR          = [\p{Grapheme_Cluster_Break = CR}];
-$LF          = [\p{Grapheme_Cluster_Break = LF}];
-$Control     = [\p{Grapheme_Cluster_Break = Control}];
-$Prepend     = [\p{Grapheme_Cluster_Break = Prepend}];
-$Extend      = [\p{Grapheme_Cluster_Break = Extend}];
-$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
-$BengaliLetter = [\u0985-\u09B9 \u09CE \u09DC-\u09E1 \u09F0-\u09F1];
-$BengaliSignVirama = \u09CD;
-$GujaratiLetter = [\u0A85-\u0A8C \u0A8F-\u0A90 \u0A93-\u0AB9 \u0AE0-\u0AE1];
-$GujaratiSignVirama = \u0ACD;
-$DevanagariLetter = [\u0904-\u0939 \u0958-\u0961 \u0972-\u097F];
-$DevanagariSignVirama = \u094D;
-$KannadaLetter = [\u0C85-\u0CB9 \u0CDE-\u0CE1];
-$KannadaSignVirama = \u0CCD;
-$MalayalamLetter = [\u0D05-\u0D39 \u0D60-\u0D61 \u0D7A-\u0D7F];
-$MalayalamSignVirama = \u0D4D;
-$OdiaLetter = [\u0B05-\u0B39 \u0B5C-\u0B61 \u0B71];
-$OdiaSignVirama = \u0B4D;
-$GurmukhiLetter = [\u0A05-\u0A39 \u0A59-\u0A5E];
-$GurmukhiSignVirama = \u0A4D;
-$TamilKa = \u0B95;
-$TamilSignVirama = \u0BCD;
-$TamilSsa = \u0BB7;
-$TeluguLetter = [\u0C05-\u0C39 \u0C58-\u0C61];
-$TeluguSignVirama = \u0C4D;
-
-#
-# Korean Syllable Definitions
-#
-$L       = [\p{Grapheme_Cluster_Break = L}];
-$V       = [\p{Grapheme_Cluster_Break = V}];
-$T       = [\p{Grapheme_Cluster_Break = T}];
-
-$LV      = [\p{Grapheme_Cluster_Break = LV}];
-$LVT     = [\p{Grapheme_Cluster_Break = LVT}];
-
-
-## -------------------------------------------------
-!!chain;
-
-!!forward;
-
-$CR $LF;
-
-$BengaliLetter ($BengaliSignVirama $BengaliLetter?)+;
-$GujaratiLetter ($GujaratiSignVirama $GujaratiLetter?)+;
-$DevanagariLetter ($DevanagariSignVirama $DevanagariLetter?)+;
-$KannadaLetter ($KannadaSignVirama $KannadaLetter?)+;
-$MalayalamLetter ($MalayalamSignVirama $MalayalamLetter?)+;
-$OdiaLetter ($OdiaSignVirama $OdiaLetter?)+;
-$GurmukhiLetter ($GurmukhiSignVirama $GurmukhiLetter?)+;
-$TamilKa $TamilSignVirama $TamilSsa;
-$TeluguLetter ($TeluguSignVirama $TeluguLetter?)+;
-
-$L ($L | $V | $LV | $LVT);
-($LV | $V) ($V | $T);
-($LVT | $T) $T;
-
-[^$Control $CR $LF] $Extend;
-
-[^$Control $CR $LF] $SpacingMark;
-$Prepend [^$Control $CR $LF];
-
-
-## -------------------------------------------------
-
-!!reverse;
-$LF $CR;
-($BengaliLetter? $BengaliSignVirama)+ $BengaliLetter;
-($GujaratiLetter? $GujaratiSignVirama)+ $GujaratiLetter;
-($DevanagariLetter? $DevanagariSignVirama)+ $DevanagariLetter;
-($KannadaLetter? $KannadaSignVirama)+ $KannadaLetter;
-($MalayalamLetter? $MalayalamSignVirama)+ $MalayalamLetter;
-($OdiaLetter? $OdiaSignVirama)+ $OdiaLetter;
-($GurmukhiLetter? $GurmukhiSignVirama)+ $GurmukhiLetter;
-$TamilSsa $TamilSignVirama $TamilKa;
-($TeluguLetter? $TeluguSignVirama)+ $TeluguLetter;
-($L | $V | $LV | $LVT) $L;
-($V | $T) ($LV | $V);
-$T ($LVT | $T);
-
-$Extend      [^$Control $CR $LF];
-$SpacingMark [^$Control $CR $LF];
-[^$Control $CR $LF] $Prepend;
-
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-
-## -------------------------------------------------
-
-!!safe_forward;
-
diff --git a/i18npool/source/localedata/data/bn_BD.xml b/i18npool/source/localedata/data/bn_BD.xml
index 221ba6f997fa..40b51e02c45a 100644
--- a/i18npool/source/localedata/data/bn_BD.xml
+++ b/i18npool/source/localedata/data/bn_BD.xml
@@ -168,13 +168,6 @@
   </LC_CURRENCY>
   <LC_TRANSLITERATION ref="en_US"/>
   <LC_MISC>
-    <BreakIteratorRules>
-      <EditMode/>
-      <DictionaryMode/>
-      <WordCountMode/>
-      <CharacterMode>char_in</CharacterMode>
-      <LineMode/>
-    </BreakIteratorRules>
     <ReservedWords>
       <trueWord>সত্য</trueWord>
       <falseWord>মিথ্যা</falseWord>
diff --git a/i18npool/source/localedata/data/bn_IN.xml b/i18npool/source/localedata/data/bn_IN.xml
index 081a87cb79ad..7064336deb0a 100644
--- a/i18npool/source/localedata/data/bn_IN.xml
+++ b/i18npool/source/localedata/data/bn_IN.xml
@@ -346,13 +346,6 @@
   </LC_CURRENCY>
   <LC_TRANSLITERATION ref="en_US"/>
   <LC_MISC>
-    <BreakIteratorRules>
-      <EditMode/>
-      <DictionaryMode/>
-      <WordCountMode/>
-      <CharacterMode>char_in</CharacterMode>
-      <LineMode/>
-    </BreakIteratorRules>
     <ReservedWords>
       <trueWord>সত্য</trueWord>
       <falseWord>মিথ্যা</falseWord>
diff --git a/i18npool/source/localedata/data/gu_IN.xml b/i18npool/source/localedata/data/gu_IN.xml
index 186197731533..67d72b60dad0 100644
--- a/i18npool/source/localedata/data/gu_IN.xml
+++ b/i18npool/source/localedata/data/gu_IN.xml
@@ -170,13 +170,6 @@
   </LC_CURRENCY>
   <LC_TRANSLITERATION ref="en_US"/>
   <LC_MISC>
-    <BreakIteratorRules>
-      <EditMode/>
-      <DictionaryMode/>
-      <WordCountMode/>
-      <CharacterMode>char_in</CharacterMode>
-      <LineMode/>
-    </BreakIteratorRules>
     <ReservedWords>
       <trueWord>ખરું</trueWord>
       <falseWord>ખોટું</falseWord>
diff --git a/i18npool/source/localedata/data/hi_IN.xml b/i18npool/source/localedata/data/hi_IN.xml
index eaf88ca57364..b5607a94171b 100644
--- a/i18npool/source/localedata/data/hi_IN.xml
+++ b/i18npool/source/localedata/data/hi_IN.xml
@@ -344,13 +344,6 @@
   </LC_CURRENCY>
   <LC_TRANSLITERATION ref="en_US"/>
   <LC_MISC>
-    <BreakIteratorRules>
-      <EditMode/>
-      <DictionaryMode/>
-      <WordCountMode/>
-      <CharacterMode>char_in</CharacterMode>
-      <LineMode/>
-    </BreakIteratorRules>
     <ReservedWords>
       <trueWord>सही</trueWord>
       <falseWord>गलत</falseWord>
diff --git a/i18npool/source/localedata/data/kn_IN.xml b/i18npool/source/localedata/data/kn_IN.xml
index 77276027012c..fee83d08db12 100644
--- a/i18npool/source/localedata/data/kn_IN.xml
+++ b/i18npool/source/localedata/data/kn_IN.xml
@@ -170,13 +170,6 @@
   </LC_CURRENCY>
   <LC_TRANSLITERATION ref="en_US"/>
   <LC_MISC>
-    <BreakIteratorRules>
-      <EditMode/>
-      <DictionaryMode/>
-      <WordCountMode/>
-      <CharacterMode>char_in</CharacterMode>
-      <LineMode/>
-    </BreakIteratorRules>
     <ReservedWords>
       <trueWord>ಸಹೀ</trueWord>
       <falseWord>ಗಲತ್</falseWord>
diff --git a/i18npool/source/localedata/data/ml_IN.xml b/i18npool/source/localedata/data/ml_IN.xml
index b2484457b239..1fb10fef3559 100644
--- a/i18npool/source/localedata/data/ml_IN.xml
+++ b/i18npool/source/localedata/data/ml_IN.xml
@@ -344,13 +344,6 @@
   </LC_CURRENCY>
   <LC_TRANSLITERATION ref="en_US"/>
   <LC_MISC>
-    <BreakIteratorRules>
-      <EditMode/>
-      <DictionaryMode/>
-      <WordCountMode/>
-      <CharacterMode>char_in</CharacterMode>
-      <LineMode/>
-    </BreakIteratorRules>
     <ReservedWords>
       <trueWord>അതെ</trueWord>
       <falseWord>അല്ല</falseWord>
diff --git a/i18npool/source/localedata/data/or_IN.xml b/i18npool/source/localedata/data/or_IN.xml
index bc7109a2e72e..98ab9d7992f2 100644
--- a/i18npool/source/localedata/data/or_IN.xml
+++ b/i18npool/source/localedata/data/or_IN.xml
@@ -360,13 +360,6 @@
     <Transliteration unoid="IGNORE_CASE"/>
   </LC_TRANSLITERATION>
   <LC_MISC>
-    <BreakIteratorRules>
-      <EditMode/>
-      <DictionaryMode/>
-      <WordCountMode/>
-      <CharacterMode>char_in</CharacterMode>
-      <LineMode/>
-    </BreakIteratorRules>
     <ReservedWords>
       <trueWord>ସତ</trueWord>
       <falseWord>ମିଛ</falseWord>
diff --git a/i18npool/source/localedata/data/pa_IN.xml b/i18npool/source/localedata/data/pa_IN.xml
index 08d92a4d651a..7d967984bf00 100644
--- a/i18npool/source/localedata/data/pa_IN.xml
+++ b/i18npool/source/localedata/data/pa_IN.xml
@@ -192,13 +192,6 @@
   </LC_CURRENCY>
   <LC_TRANSLITERATION ref="en_US"/>
   <LC_MISC>
-    <BreakIteratorRules>
-      <EditMode/>
-      <DictionaryMode/>
-      <WordCountMode/>
-      <CharacterMode>char_in</CharacterMode>
-      <LineMode/>
-    </BreakIteratorRules>
     <ReservedWords>
       <trueWord>ਸਹੀ</trueWord>
       <falseWord>ਗਲਤ</falseWord>
diff --git a/i18npool/source/localedata/data/ta_IN.xml b/i18npool/source/localedata/data/ta_IN.xml
index e7cf0796f198..f7c6c260a35e 100644
--- a/i18npool/source/localedata/data/ta_IN.xml
+++ b/i18npool/source/localedata/data/ta_IN.xml
@@ -170,13 +170,6 @@
   </LC_CURRENCY>
   <LC_TRANSLITERATION ref="en_US"/>
   <LC_MISC>
-    <BreakIteratorRules>
-      <EditMode/>
-      <DictionaryMode/>
-      <WordCountMode/>
-      <CharacterMode>char_in</CharacterMode>
-      <LineMode/>
-    </BreakIteratorRules>
     <ReservedWords>
       <trueWord>ஸரி</trueWord>
       <falseWord>தப்ப</falseWord>
diff --git a/i18npool/source/localedata/data/te_IN.xml b/i18npool/source/localedata/data/te_IN.xml
index 7039d1ae2f3e..9eecd0f3ebd0 100644
--- a/i18npool/source/localedata/data/te_IN.xml
+++ b/i18npool/source/localedata/data/te_IN.xml
@@ -169,13 +169,6 @@
   </LC_CURRENCY>
   <LC_TRANSLITERATION ref="en_US"/>
   <LC_MISC>
-    <BreakIteratorRules>
-      <EditMode/>
-      <DictionaryMode/>
-      <WordCountMode/>
-      <CharacterMode>char_in</CharacterMode>
-      <LineMode/>
-    </BreakIteratorRules>
     <ReservedWords>
       <trueWord>నిజము</trueWord>
       <falseWord>అపాదము</falseWord>
author	Caolán McNamara <caolanm@redhat.com>	2020-11-27 12:14:44 +0000
committer	Caolán McNamara <caolanm@redhat.com>	2020-11-28 20:31:29 +0100
commit	fc011ac027da7574baa4b431ac800020170ba8b0 (patch)
tree	21066a3bda35695cd55c4cba4913f34da8e2eacf /i18npool
parent	5ec2cf4e1ff448a3d9b3b30a825a40f962e0a53d (diff)