fdo#53399 Word count is inconsistent and wrong with non-breaking space

This change replaces lcl_IsSkippableWhiteSpace with a call to ICU's u_isspace, which covers all Unicode separators. It also updates and fixes one of the SwScanner unit tests. Bug details: SwScanner::NextWord skips whitespace before calling into ICU's BreakIterator. The function used to identify whitespace (lcl_IsSkippableWhiteSpace) doesn't cover the full category of Unicode space separators (category [Zs], 18 code points in total; see http://www.fileformat.info/info/unicode/category/Zs/index.htm). Since 0xA0 (no-break space) is not identified as whitespace and is not skipped, we end up calling ICU starting at the position of 0xA0, asking it to get us the boundary of the next word forward. ICU sees that it's called at the end of a word, reverses the query direction to backward, and returns the word before. This causes NextWord to think we've hit the end of the string and call it a day, terminating the word count for the rest of the line. Change-Id: I29c89ddb0b26e88da822501253898856b28e3fa5 Reviewed-on: https://gerrit.libreoffice.org/453 Reviewed-by: Andras Timar <atimar@suse.com> Tested-by: Andras Timar <atimar@suse.com>
author: Muhammad Haggag <mhaggag@gmail.com> 2012-08-22 16:46:23 +0200
committer: Andras Timar <atimar@suse.com> 2012-08-22 15:23:07 +0000
commit: 3ba107606682b5e675127483a514f0e6580ecfd1 (patch)
tree: eccfab8231bfb0a65b2c280e68083cf7e6f19351
parent: daf23128482894e6efee21463c613bc857e15b5e (diff)
2 files changed, 11 insertions, 12 deletions
diff --git a/sw/qa/core/swdoc-test.cxx b/sw/qa/core/swdoc-test.cxx
index a5df1433aaa6..54a1cffcd4cd 100644
--- a/sw/qa/core/swdoc-test.cxx
+++ b/sw/qa/core/swdoc-test.cxx
@@ -423,6 +423,7 @@ void SwDocTest::testSwScanner()
     }
 
     //See https://issues.apache.org/ooo/show_bug.cgi?id=89042
+    //See https://bugs.freedesktop.org/show_bug.cgi?id=53399
     {
         SwDocStat aDocStat;
 
@@ -439,15 +440,20 @@ void SwDocTest::testSwScanner()
         CPPUNIT_ASSERT_MESSAGE("Should be 3", aDocStat.nWord == 3);
 
         const sal_Unicode aShouldBeFive[] = {
+            // f    r       e       n       c       h       space
             0x0046, 0x0072, 0x0065, 0x006E, 0x0063, 0x0068, 0x0020,
+            // <<   nbsp    s       a       v       o       i
             0x00AB, 0x00A0, 0x0073, 0x0061, 0x0076, 0x006F, 0x0069,
-            0x0072, 0x0020, 0x0063, 0x0061, 0x006C, 0x0063, 0x0075,
-            0x006C, 0x0065, 0x0072, 0x00A0, 0x00BB
+            // r    nnbsp   c       a       l       c       u
+            0x0072, 0x202f, 0x0063, 0x0061, 0x006C, 0x0063, 0x0075,
+            // l    e       r       idspace >>
+            0x006C, 0x0065, 0x0072, 0x3000, 0x00BB
         };
 
         m_pDoc->AppendTxtNode(*aPaM.GetPoint());
         m_pDoc->InsertString(aPaM, rtl::OUString(aShouldBeFive, SAL_N_ELEMENTS(aShouldBeFive)));
         pTxtNode = aPaM.GetNode()->GetTxtNode();
+        aDocStat.Reset();
         pTxtNode->CountWords(aDocStat, 0, SAL_N_ELEMENTS(aShouldBeFive));
         CPPUNIT_ASSERT_MESSAGE("Should be 5", aDocStat.nWord == 5);
     }
diff --git a/sw/source/core/txtnode/txtedt.cxx b/sw/source/core/txtnode/txtedt.cxx
index 66650bab0d4c..ca3b9614d25c 100644
--- a/sw/source/core/txtnode/txtedt.cxx
+++ b/sw/source/core/txtnode/txtedt.cxx
@@ -71,6 +71,7 @@
 #include <txtatr.hxx>
 #include <fmtautofmt.hxx>
 #include <istyleaccess.hxx>
+#include <unicode/uchar.h>
 
 #include <unomid.h>
 
@@ -96,14 +97,6 @@ using namespace ::com::sun::star::smarttags;
 extern const SwTxtNode *pLinguNode;
 extern       SwTxtFrm  *pLinguFrm;
 
-bool lcl_IsSkippableWhiteSpace( xub_Unicode cCh )
-{
-    return 0x3000 == cCh ||
-           ' ' == cCh ||
-           '\t' == cCh ||
-           0x0a == cCh;
-}
-
 /*
  * This has basically the same function as SwScriptInfo::MaskHiddenRanges,
  * only for deleted redlines
@@ -731,7 +724,7 @@ sal_Bool SwScanner::NextWord()
         // skip non-letter characters:
         while ( nBegin < aText.getLength() )
         {
-            if ( !lcl_IsSkippableWhiteSpace( aText[nBegin] ) )
+            if ( !u_isspace( aText[nBegin] ) )
             {
                 if ( !pLanguage )
                 {
@@ -1877,7 +1870,7 @@ void SwTxtNode::CountWords( SwDocStat& rStat,
     //do the count
     // all counts exclude hidden paras and hidden+redlined within para
     // definition of space/white chars in SwScanner (and BreakIter!)
-    // uses both lcl_IsSkippableWhiteSpace and BreakIter getWordBoundary in SwScanner
+    // uses both u_isspace and BreakIter getWordBoundary in SwScanner
     sal_uInt32 nTmpWords = 0;        // count of all words
     sal_uInt32 nTmpAsianWords = 0;   //count of all Asian codepoints
     sal_uInt32 nTmpChars = 0;        // count of all chars
author	Muhammad Haggag <mhaggag@gmail.com>	2012-08-22 16:46:23 +0200
committer	Andras Timar <atimar@suse.com>	2012-08-22 15:23:07 +0000
commit	3ba107606682b5e675127483a514f0e6580ecfd1 (patch)
tree	eccfab8231bfb0a65b2c280e68083cf7e6f19351
parent	daf23128482894e6efee21463c613bc857e15b5e (diff)