Resolves: fdo#38983 allow extra word boundary characters

i.e. Word overrides em dash and en dash to be word boundary characters for the purposes of counting words, and some users want to treat characters such as '=' and '-' similarly. Default to a configuration that gives the same results as Word for word counting.

Change-Id: Ia8ce6ac12011a1d6e547f11644c76163c4c993c5
author: Caolán McNamara <caolanm@redhat.com> 2012-08-28 17:10:35 +0100
committer: Caolán McNamara <caolanm@redhat.com> 2012-08-29 09:02:50 +0100
commit: 42a15f45ff4e02f98229de02efd0d8c19f10bcd5 (patch)
tree: 67031948d50d251825c1d05d5547a499a1c5e51b /sw/source/core/txtnode
parent: 02f6e55231c8b1646cbafc0e3e591da8122e2bf1 (diff)
1 files changed, 59 insertions, 20 deletions
diff --git a/sw/source/core/txtnode/txtedt.cxx b/sw/source/core/txtnode/txtedt.cxx
index e37dada60149..40e6dc28f42a 100644
--- a/sw/source/core/txtnode/txtedt.cxx
+++ b/sw/source/core/txtnode/txtedt.cxx
@@ -37,6 +37,7 @@
 #include <editeng/hangulhanja.hxx>
 #include <SwSmartTagMgr.hxx>
 #include <linguistic/lngprops.hxx>
+#include <officecfg/Office/Writer.hxx>
 #include <unotools/transliterationwrapper.hxx>
 #include <unotools/charclass.hxx>
 #include <dlelstnr.hxx>
@@ -655,12 +656,44 @@ XubString SwTxtNode::GetCurWord( xub_StrLen nPos ) const
 SwScanner::SwScanner( const SwTxtNode& rNd, const rtl::OUString& rTxt,
     const LanguageType* pLang, const ModelToViewHelper& rConvMap,
     sal_uInt16 nType, sal_Int32 nStart, sal_Int32 nEnde, sal_Bool bClp )
-    : rNode( rNd ), aText( rTxt), pLanguage( pLang ), rConversionMap( rConvMap ), nLen( 0 ), nWordType( nType ), bClip( bClp )
+    : rNode( rNd )
+    , aPreDashReplacementText(rTxt)
+    , pLanguage( pLang )
+    , rConversionMap( rConvMap )
+    , nLen( 0 )
+    , nOverriddenDashCount( 0 )
+    , nWordType( nType )
+    , bClip( bClp )
 {
-    OSL_ENSURE( !aText.isEmpty(), "SwScanner: EmptyString" );
+    OSL_ENSURE( !aPreDashReplacementText.isEmpty(), "SwScanner: EmptyString" );
     nStartPos = nBegin = nStart;
     nEndPos = nEnde;
 
+    //MSWord f.e has special emdash and endash behaviour in that they break
+    //words for the purposes of word counting, while a hyphen etc. doesn't.
+    //
+    //The default configuration treats emdash/endash as a word break, but
+    //additional ones can be added in under tools->options
+    if (nWordType == i18n::WordType::WORD_COUNT)
+    {
+        rtl::OUString sDashes = officecfg::Office::Writer::WordCount::AdditionalSeperators::get();
+        rtl::OUStringBuffer aBuf(aPreDashReplacementText);
+        for (sal_Int32 i = nStartPos; i < nEndPos; ++i)
+        {
+            sal_Unicode cChar = aBuf[i];
+            if (sDashes.indexOf(cChar) != -1)
+            {
+                aBuf[i] = ' ';
+                ++nOverriddenDashCount;
+            }
+        }
+        aText = aBuf.makeStringAndClear();
+    }
+    else
+        aText = aPreDashReplacementText;
+
+    assert(aPreDashReplacementText.getLength() == aText.getLength());
+
     if ( pLanguage )
     {
         aCurrLang = *pLanguage;
@@ -836,7 +869,7 @@ sal_Bool SwScanner::NextWord()
     if ( nWordType == i18n::WordType::WORD_COUNT )
         nLen = forceEachAsianCodePointToWord(aText, nBegin, nLen);
 
-    aWord = aText.copy( nBegin, nLen );
+    aWord = aPreDashReplacementText.copy( nBegin, nLen );
 
     return sal_True;
 }
@@ -1892,30 +1925,35 @@ void SwTxtNode::CountWords( SwDocStat& rStat,
     sal_uInt32 nTmpCharsExcludingSpaces = 0;  // all non-white chars
 
     // count words in masked and expanded text:
-    if (!aExpandText.isEmpty() && pBreakIt->GetBreakIter().is())
+    if (!aExpandText.isEmpty())
     {
-        // zero is NULL for pLanguage -----------v               last param = true for clipping
-        SwScanner aScanner( *this, aExpandText, 0, aConversionMap, i18n::WordType::WORD_COUNT,
-                            nExpandBegin, nExpandEnd, true );
+        if (pBreakIt->GetBreakIter().is())
+        {
+            // zero is NULL for pLanguage -----------v               last param = true for clipping
+            SwScanner aScanner( *this, aExpandText, 0, aConversionMap, i18n::WordType::WORD_COUNT,
+                                nExpandBegin, nExpandEnd, true );
 
-        // used to filter out scanner returning almost empty strings (len=1; unichar=0x0001)
-        const rtl::OUString aBreakWord( CH_TXTATR_BREAKWORD );
+            // used to filter out scanner returning almost empty strings (len=1; unichar=0x0001)
+            const rtl::OUString aBreakWord( CH_TXTATR_BREAKWORD );
 
-        while ( aScanner.NextWord() )
-        {
-            //  1 is len(CH_TXTATR_BREAKWORD) : match returns length of match
-            if( 1 != aExpandText.match(aBreakWord, aScanner.GetBegin() ))
+            while ( aScanner.NextWord() )
             {
-                ++nTmpWords;
-                const rtl::OUString &rWord = aScanner.GetWord();
-                if (pBreakIt->GetBreakIter()->getScriptType(rWord, 0) == i18n::ScriptType::ASIAN)
-                    ++nTmpAsianWords;
-                nTmpCharsExcludingSpaces += pBreakIt->getGraphemeCount(rWord);
+                //  1 is len(CH_TXTATR_BREAKWORD) : match returns length of match
+                if( 1 != aExpandText.match(aBreakWord, aScanner.GetBegin() ))
+                {
+                    ++nTmpWords;
+                    const rtl::OUString &rWord = aScanner.GetWord();
+                    if (pBreakIt->GetBreakIter()->getScriptType(rWord, 0) == i18n::ScriptType::ASIAN)
+                        ++nTmpAsianWords;
+                    nTmpCharsExcludingSpaces += pBreakIt->getGraphemeCount(rWord);
+                }
             }
+
+            nTmpCharsExcludingSpaces += aScanner.getOverriddenDashCount();
         }
-    }
 
-    nTmpChars = pBreakIt->getGraphemeCount(aExpandText, nExpandBegin, nExpandEnd);
+        nTmpChars = pBreakIt->getGraphemeCount(aExpandText, nExpandBegin, nExpandEnd);
+    }
 
     // no nTmpCharsExcludingSpaces adjust needed neither for blanked out MaskedChars
     // nor for mid-word selection - set scanner bClip = true at creation
@@ -1938,6 +1976,7 @@ void SwTxtNode::CountWords( SwDocStat& rStat,
             nTmpCharsExcludingSpaces += pBreakIt->getGraphemeCount(rWord);
         }
 
+        nTmpCharsExcludingSpaces += aScanner.getOverriddenDashCount();
         nTmpChars += pBreakIt->getGraphemeCount(sNumString);
     }
     else if ( bHasBullet )
author	Caolán McNamara <caolanm@redhat.com>	2012-08-28 17:10:35 +0100
committer	Caolán McNamara <caolanm@redhat.com>	2012-08-29 09:02:50 +0100
commit	42a15f45ff4e02f98229de02efd0d8c19f10bcd5 (patch)
tree	67031948d50d251825c1d05d5547a499a1c5e51b /sw/source/core/txtnode
parent	02f6e55231c8b1646cbafc0e3e591da8122e2bf1 (diff)