fdo#72219: Fix for corruption of symbols in docx

Issue: OUString uses UTF-16, so a Unicode character outside the Basic Multilingual Plane is stored as a surrogate pair, i.e. two UTF-16 code units rather than one. This caused an assertion failure in the "rtl_uString_iterateCodePoints" function. erAck: The underlying cause was that the dictionary breakiterator misused UTF-16 positions as Unicode code point positions. Change-Id: I923485f56c2d879b63687adaea2b489a3479991c Reviewed-on: https://gerrit.libreoffice.org/6955 Reviewed-by: Eike Rathke <erack@redhat.com> Tested-by: Eike Rathke <erack@redhat.com> (cherry picked from commit d8fd15875901d584a4bbcc07c927fa20332e4841) Reviewed-on: https://gerrit.libreoffice.org/7322 (cherry picked from commit 994d0c9e7aa8d1a7602e61b770991da980c1cde5) Reviewed-on: https://gerrit.libreoffice.org/7324
author: Rohit Deshmukh <rohit.deshmukh@synerzip.com> 2013-12-06 15:42:53 +0530
committer: Eike Rathke <erack@redhat.com> 2014-01-08 19:48:27 +0000
commit: 2421317990d00e14325298f34db3c60735527697 (patch)
tree: aaa3835c1baac1271697f1a975f02abf30781029
parent: 515c6cf7a3832bfc7a6eeed65704bc9eee96adc1 (diff)
2 files changed, 22 insertions, 3 deletions
diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx
index 41e40779de88..b4174bae17d8 100644
--- a/i18npool/qa/cppunit/test_breakiterator.cxx
+++ b/i18npool/qa/cppunit/test_breakiterator.cxx
@@ -44,7 +44,7 @@ public:
 #endif
     void testKhmer();
     void testJapanese();
-
+    void testChinese();
     CPPUNIT_TEST_SUITE(TestBreakIterator);
     CPPUNIT_TEST(testLineBreaking);
     CPPUNIT_TEST(testGraphemeIteration);
@@ -60,6 +60,7 @@ public:
     CPPUNIT_TEST(testKhmer);
 #endif
     CPPUNIT_TEST(testJapanese);
+    CPPUNIT_TEST(testChinese);
     CPPUNIT_TEST_SUITE_END();
 private:
     uno::Reference<i18n::XBreakIterator> m_xBreak;
@@ -909,6 +910,22 @@ void TestBreakIterator::testJapanese()
     }
 }
 
+void TestBreakIterator::testChinese()
+{
+    lang::Locale aLocale;
+    aLocale.Language = "zh";
+    aLocale.Country = "CN";
+    i18n::Boundary aBounds;
+
+    {
+        const sal_Unicode CHINESE[] = { 0x6A35, 0x6A30, 0x69FE, 0x8919, 0xD867, 0xDEDB  };
+
+        OUString aTest(CHINESE, SAL_N_ELEMENTS(CHINESE));
+        aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale,
+            i18n::WordType::DICTIONARY_WORD, true);
+        CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 6);
+    }
+}
 void TestBreakIterator::setUp()
 {
     BootstrapFixtureBase::setUp();
diff --git a/i18npool/source/breakiterator/xdictionary.cxx b/i18npool/source/breakiterator/xdictionary.cxx
index 72da09f87629..3b43fa31e4e2 100644
--- a/i18npool/source/breakiterator/xdictionary.cxx
+++ b/i18npool/source/breakiterator/xdictionary.cxx
@@ -383,9 +383,11 @@ Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, s
                 if (u_isWhitespace(ch))
                     i--;
             }
+
             boundary.endPos = boundary.startPos;
-            rText.iterateCodePoints(&boundary.endPos, aCache.wordboundary[i]);
-            rText.iterateCodePoints(&boundary.startPos, aCache.wordboundary[i-1]);
+            boundary.endPos += aCache.wordboundary[i];
+            boundary.startPos += aCache.wordboundary[i-1];
+
         } else {
             boundary.startPos = anyPos;
             if (anyPos < len) rText.iterateCodePoints(&anyPos, 1);
author	Rohit Deshmukh <rohit.deshmukh@synerzip.com>	2013-12-06 15:42:53 +0530
committer	Eike Rathke <erack@redhat.com>	2014-01-08 19:48:27 +0000
commit	2421317990d00e14325298f34db3c60735527697 (patch)
tree	aaa3835c1baac1271697f1a975f02abf30781029
parent	515c6cf7a3832bfc7a6eeed65704bc9eee96adc1 (diff)