diff options
author | Caolán McNamara <caolanm@redhat.com> | 2012-08-28 17:10:35 +0100 |
---|---|---|
committer | Caolán McNamara <caolanm@redhat.com> | 2012-08-29 09:02:50 +0100 |
commit | 42a15f45ff4e02f98229de02efd0d8c19f10bcd5 (patch) | |
tree | 67031948d50d251825c1d05d5547a499a1c5e51b | |
parent | 02f6e55231c8b1646cbafc0e3e591da8122e2bf1 (diff) |
Resolves: fdo#38983 allow extra word boundary characters
i.e. MS Word overrides emdash and endash to be word boundary characters
for the purposes of counting words, and there are some who want
to treat '=', '-', etc. similarly.
Default to a configuration that gives the same results as Word for
word counting.
Change-Id: Ia8ce6ac12011a1d6e547f11644c76163c4c993c5
-rw-r--r-- | i18npool/source/breakiterator/data/README | 7 | ||||
-rw-r--r-- | officecfg/registry/data/org/openoffice/Office/Writer.xcu | 10 | ||||
-rw-r--r-- | officecfg/registry/schema/org/openoffice/Office/Writer.xcs | 13 | ||||
-rw-r--r-- | sw/inc/swscanner.hxx | 6 | ||||
-rw-r--r-- | sw/qa/core/swdoc-test.cxx | 110 | ||||
-rw-r--r-- | sw/source/core/txtnode/txtedt.cxx | 79 | ||||
-rw-r--r-- | sw/source/ui/config/optload.cxx | 24 | ||||
-rw-r--r-- | sw/source/ui/config/optload.hrc | 5 | ||||
-rw-r--r-- | sw/source/ui/config/optload.src | 20 | ||||
-rw-r--r-- | sw/source/ui/inc/optload.hxx | 3 |
10 files changed, 238 insertions, 39 deletions
diff --git a/i18npool/source/breakiterator/data/README b/i18npool/source/breakiterator/data/README index 7d67cf0c6766..6858f7a538da 100644 --- a/i18npool/source/breakiterator/data/README +++ b/i18npool/source/breakiterator/data/README @@ -22,12 +22,6 @@ Date: Sat Jan 29 12:51:52 2011 +0000 Resolves: fdo#31271 wrong line break with ( -commit 109fa8224194edfc4ca75ee5cc5e760e54d76a3f -Author: Thomas Lange [tl] <tl@openoffice.org> -Date: Wed Dec 8 14:39:09 2010 +0100 - - cws tl84: #i89042# word count fix - commit 42be5541baf18e3292a14a9d478eda33f61e10ab Author: Mattias Johnsson <m.t.johnsson@gmail.com> Date: Thu Nov 4 23:25:02 2010 +1100 @@ -585,6 +579,7 @@ Date: Mon Mar 8 16:17:05 2004 +0000 done, regression tests added: +#i89042# word count fix (regression test is in writer) #i58513# add break iterator rules for Finish #i19716# fix wrong line break on bracket characters #i21290# extend Greek script type diff --git a/officecfg/registry/data/org/openoffice/Office/Writer.xcu b/officecfg/registry/data/org/openoffice/Office/Writer.xcu index 55ab299cb1db..861b777f29b6 100644 --- a/officecfg/registry/data/org/openoffice/Office/Writer.xcu +++ b/officecfg/registry/data/org/openoffice/Office/Writer.xcu @@ -735,9 +735,9 @@ </prop> </node> </node> -<node oor:name="Notes"> - <prop oor:name="ShowAnkor"> - <value>false</value> - </prop> -</node> + <node oor:name="Notes"> + <prop oor:name="ShowAnkor"> + <value>false</value> + </prop> + </node> </oor:component-data> diff --git a/officecfg/registry/schema/org/openoffice/Office/Writer.xcs b/officecfg/registry/schema/org/openoffice/Office/Writer.xcs index e79798885a83..cffc7406f643 100644 --- a/officecfg/registry/schema/org/openoffice/Office/Writer.xcs +++ b/officecfg/registry/schema/org/openoffice/Office/Writer.xcs @@ -5736,6 +5736,19 @@ <value>false</value> </prop> </group> + <group oor:name="WordCount"> + <info> + <desc>Contains settings for word counting</desc> + </info> + <prop oor:name="AdditionalSeperators" oor:type="xs:string" 
oor:nillable="false"> + <info> + <author>cmc</author> + <desc>configures additional word seperators for word counting</desc> + <label>Additional Word Seperators</label> + </info> + <value>—–</value> + </prop> + </group> <group oor:name="Navigator"> <info> <desc>Contains settings for the Navigator.</desc> diff --git a/sw/inc/swscanner.hxx b/sw/inc/swscanner.hxx index f657f238c2ca..7a71c0a1e9e9 100644 --- a/sw/inc/swscanner.hxx +++ b/sw/inc/swscanner.hxx @@ -43,13 +43,15 @@ class SwScanner { rtl::OUString aWord; const SwTxtNode& rNode; - const rtl::OUString aText; + const rtl::OUString aPreDashReplacementText; + rtl::OUString aText; const LanguageType* pLanguage; const ModelToViewHelper& rConversionMap; sal_Int32 nStartPos; sal_Int32 nEndPos; sal_Int32 nBegin; sal_Int32 nLen; + sal_Int32 nOverriddenDashCount; LanguageType aCurrLang; sal_uInt16 nWordType; sal_Bool bClip; @@ -74,6 +76,8 @@ public: sal_Int32 GetLen() const { return nLen; } LanguageType GetCurrentLanguage() const {return aCurrLang;} + + sal_Int32 getOverriddenDashCount() const {return nOverriddenDashCount; } }; #endif diff --git a/sw/qa/core/swdoc-test.cxx b/sw/qa/core/swdoc-test.cxx index 54a1cffcd4cd..2f1bf8668824 100644 --- a/sw/qa/core/swdoc-test.cxx +++ b/sw/qa/core/swdoc-test.cxx @@ -567,6 +567,116 @@ void SwDocTest::testSwScanner() CPPUNIT_ASSERT_EQUAL(aDocStat.nWord, static_cast<sal_uLong>(0)); CPPUNIT_ASSERT_EQUAL(aDocStat.nChar, static_cast<sal_uLong>(0)); } + + //See https://bugs.freedesktop.org/show_bug.cgi?id=38983 + { + SwDocStat aDocStat; + + rtl::OUString sTemplate("ThisXis a test."); + + m_pDoc->AppendTxtNode(*aPaM.GetPoint()); + m_pDoc->InsertString(aPaM, sTemplate.replace('X', ' ')); + pTxtNode = aPaM.GetNode()->GetTxtNode(); + pTxtNode->CountWords(aDocStat, 0, pTxtNode->Len()); + CPPUNIT_ASSERT(aDocStat.nWord == 4 && + aDocStat.nCharExcludingSpaces == 12 && + aDocStat.nChar == 15); + aDocStat.Reset(); + + m_pDoc->AppendTxtNode(*aPaM.GetPoint()); + m_pDoc->InsertString(aPaM, 
sTemplate.replaceAll(rtl::OUString('X'), rtl::OUString(" = "))); + pTxtNode = aPaM.GetNode()->GetTxtNode(); + pTxtNode->CountWords(aDocStat, 0, pTxtNode->Len()); + CPPUNIT_ASSERT(aDocStat.nWord == 5 && + aDocStat.nCharExcludingSpaces == 13 && + aDocStat.nChar == 17); + aDocStat.Reset(); + + m_pDoc->AppendTxtNode(*aPaM.GetPoint()); + m_pDoc->InsertString(aPaM, sTemplate.replaceAll(rtl::OUString('X'), rtl::OUString(" _ "))); + pTxtNode = aPaM.GetNode()->GetTxtNode(); + pTxtNode->CountWords(aDocStat, 0, pTxtNode->Len()); + CPPUNIT_ASSERT(aDocStat.nWord == 5 && + aDocStat.nCharExcludingSpaces == 13 && + aDocStat.nChar == 17); + aDocStat.Reset(); + + m_pDoc->AppendTxtNode(*aPaM.GetPoint()); + m_pDoc->InsertString(aPaM, sTemplate.replaceAll(rtl::OUString('X'), rtl::OUString(" -- "))); + pTxtNode = aPaM.GetNode()->GetTxtNode(); + pTxtNode->CountWords(aDocStat, 0, pTxtNode->Len()); + CPPUNIT_ASSERT(aDocStat.nWord == 5 && + aDocStat.nCharExcludingSpaces == 14 && + aDocStat.nChar == 18); + aDocStat.Reset(); + + m_pDoc->AppendTxtNode(*aPaM.GetPoint()); + m_pDoc->InsertString(aPaM, sTemplate.replace('X', '_')); + pTxtNode = aPaM.GetNode()->GetTxtNode(); + pTxtNode->CountWords(aDocStat, 0, pTxtNode->Len()); + CPPUNIT_ASSERT(aDocStat.nWord == 3 && + aDocStat.nCharExcludingSpaces == 13 && + aDocStat.nChar == 15); + aDocStat.Reset(); + + m_pDoc->AppendTxtNode(*aPaM.GetPoint()); + m_pDoc->InsertString(aPaM, sTemplate.replace('X', '-')); + pTxtNode = aPaM.GetNode()->GetTxtNode(); + pTxtNode->CountWords(aDocStat, 0, pTxtNode->Len()); + CPPUNIT_ASSERT(aDocStat.nWord == 3 && + aDocStat.nCharExcludingSpaces == 13 && + aDocStat.nChar == 15); + aDocStat.Reset(); + + m_pDoc->AppendTxtNode(*aPaM.GetPoint()); + m_pDoc->InsertString(aPaM, sTemplate.replace('X', 0x2012)); + pTxtNode = aPaM.GetNode()->GetTxtNode(); + pTxtNode->CountWords(aDocStat, 0, pTxtNode->Len()); + CPPUNIT_ASSERT(aDocStat.nWord == 3 && + aDocStat.nCharExcludingSpaces == 13 && + aDocStat.nChar == 15); + aDocStat.Reset(); + 
+ m_pDoc->AppendTxtNode(*aPaM.GetPoint()); + m_pDoc->InsertString(aPaM, sTemplate.replace('X', 0x2015)); + pTxtNode = aPaM.GetNode()->GetTxtNode(); + pTxtNode->CountWords(aDocStat, 0, pTxtNode->Len()); + CPPUNIT_ASSERT(aDocStat.nWord == 3 && + aDocStat.nCharExcludingSpaces == 13 && + aDocStat.nChar == 15); + aDocStat.Reset(); + + //But default configuration should, msword-alike treak emdash + //and endash as word seperators for word-counting + m_pDoc->AppendTxtNode(*aPaM.GetPoint()); + m_pDoc->InsertString(aPaM, sTemplate.replace('X', 0x2013)); + pTxtNode = aPaM.GetNode()->GetTxtNode(); + pTxtNode->CountWords(aDocStat, 0, pTxtNode->Len()); + CPPUNIT_ASSERT(aDocStat.nWord == 4 && + aDocStat.nCharExcludingSpaces == 13 && + aDocStat.nChar == 15); + aDocStat.Reset(); + + m_pDoc->AppendTxtNode(*aPaM.GetPoint()); + m_pDoc->InsertString(aPaM, sTemplate.replace('X', 0x2014)); + pTxtNode = aPaM.GetNode()->GetTxtNode(); + pTxtNode->CountWords(aDocStat, 0, pTxtNode->Len()); + CPPUNIT_ASSERT(aDocStat.nWord == 4 && + aDocStat.nCharExcludingSpaces == 13 && + aDocStat.nChar == 15); + aDocStat.Reset(); + + const sal_Unicode aChunk[] = {' ', 0x2013, ' '}; + rtl::OUString sChunk(aChunk, SAL_N_ELEMENTS(aChunk)); + m_pDoc->AppendTxtNode(*aPaM.GetPoint()); + m_pDoc->InsertString(aPaM, sTemplate.replaceAll(rtl::OUString('X'), sChunk)); + pTxtNode = aPaM.GetNode()->GetTxtNode(); + pTxtNode->CountWords(aDocStat, 0, pTxtNode->Len()); + CPPUNIT_ASSERT(aDocStat.nWord == 4 && + aDocStat.nCharExcludingSpaces == 13 && + aDocStat.nChar == 17); + aDocStat.Reset(); + } } //See https://bugs.freedesktop.org/show_bug.cgi?id=40599 diff --git a/sw/source/core/txtnode/txtedt.cxx b/sw/source/core/txtnode/txtedt.cxx index e37dada60149..40e6dc28f42a 100644 --- a/sw/source/core/txtnode/txtedt.cxx +++ b/sw/source/core/txtnode/txtedt.cxx @@ -37,6 +37,7 @@ #include <editeng/hangulhanja.hxx> #include <SwSmartTagMgr.hxx> #include <linguistic/lngprops.hxx> +#include <officecfg/Office/Writer.hxx> #include 
<unotools/transliterationwrapper.hxx> #include <unotools/charclass.hxx> #include <dlelstnr.hxx> @@ -655,12 +656,44 @@ XubString SwTxtNode::GetCurWord( xub_StrLen nPos ) const SwScanner::SwScanner( const SwTxtNode& rNd, const rtl::OUString& rTxt, const LanguageType* pLang, const ModelToViewHelper& rConvMap, sal_uInt16 nType, sal_Int32 nStart, sal_Int32 nEnde, sal_Bool bClp ) - : rNode( rNd ), aText( rTxt), pLanguage( pLang ), rConversionMap( rConvMap ), nLen( 0 ), nWordType( nType ), bClip( bClp ) + : rNode( rNd ) + , aPreDashReplacementText(rTxt) + , pLanguage( pLang ) + , rConversionMap( rConvMap ) + , nLen( 0 ) + , nOverriddenDashCount( 0 ) + , nWordType( nType ) + , bClip( bClp ) { - OSL_ENSURE( !aText.isEmpty(), "SwScanner: EmptyString" ); + OSL_ENSURE( !aPreDashReplacementText.isEmpty(), "SwScanner: EmptyString" ); nStartPos = nBegin = nStart; nEndPos = nEnde; + //MSWord f.e has special emdash and endash behaviour in that they break + //words for the purposes of word counting, while a hyphen etc. doesn't. 
+ // + //The default configuration treats emdash/endash as a word break, but + //additional ones can be added in under tools->options + if (nWordType == i18n::WordType::WORD_COUNT) + { + rtl::OUString sDashes = officecfg::Office::Writer::WordCount::AdditionalSeperators::get(); + rtl::OUStringBuffer aBuf(aPreDashReplacementText); + for (sal_Int32 i = nStartPos; i < nEndPos; ++i) + { + sal_Unicode cChar = aBuf[i]; + if (sDashes.indexOf(cChar) != -1) + { + aBuf[i] = ' '; + ++nOverriddenDashCount; + } + } + aText = aBuf.makeStringAndClear(); + } + else + aText = aPreDashReplacementText; + + assert(aPreDashReplacementText.getLength() == aText.getLength()); + if ( pLanguage ) { aCurrLang = *pLanguage; @@ -836,7 +869,7 @@ sal_Bool SwScanner::NextWord() if ( nWordType == i18n::WordType::WORD_COUNT ) nLen = forceEachAsianCodePointToWord(aText, nBegin, nLen); - aWord = aText.copy( nBegin, nLen ); + aWord = aPreDashReplacementText.copy( nBegin, nLen ); return sal_True; } @@ -1892,30 +1925,35 @@ void SwTxtNode::CountWords( SwDocStat& rStat, sal_uInt32 nTmpCharsExcludingSpaces = 0; // all non-white chars // count words in masked and expanded text: - if (!aExpandText.isEmpty() && pBreakIt->GetBreakIter().is()) + if (!aExpandText.isEmpty()) { - // zero is NULL for pLanguage -----------v last param = true for clipping - SwScanner aScanner( *this, aExpandText, 0, aConversionMap, i18n::WordType::WORD_COUNT, - nExpandBegin, nExpandEnd, true ); + if (pBreakIt->GetBreakIter().is()) + { + // zero is NULL for pLanguage -----------v last param = true for clipping + SwScanner aScanner( *this, aExpandText, 0, aConversionMap, i18n::WordType::WORD_COUNT, + nExpandBegin, nExpandEnd, true ); - // used to filter out scanner returning almost empty strings (len=1; unichar=0x0001) - const rtl::OUString aBreakWord( CH_TXTATR_BREAKWORD ); + // used to filter out scanner returning almost empty strings (len=1; unichar=0x0001) + const rtl::OUString aBreakWord( CH_TXTATR_BREAKWORD ); - while ( 
aScanner.NextWord() ) - { - // 1 is len(CH_TXTATR_BREAKWORD) : match returns length of match - if( 1 != aExpandText.match(aBreakWord, aScanner.GetBegin() )) + while ( aScanner.NextWord() ) { - ++nTmpWords; - const rtl::OUString &rWord = aScanner.GetWord(); - if (pBreakIt->GetBreakIter()->getScriptType(rWord, 0) == i18n::ScriptType::ASIAN) - ++nTmpAsianWords; - nTmpCharsExcludingSpaces += pBreakIt->getGraphemeCount(rWord); + // 1 is len(CH_TXTATR_BREAKWORD) : match returns length of match + if( 1 != aExpandText.match(aBreakWord, aScanner.GetBegin() )) + { + ++nTmpWords; + const rtl::OUString &rWord = aScanner.GetWord(); + if (pBreakIt->GetBreakIter()->getScriptType(rWord, 0) == i18n::ScriptType::ASIAN) + ++nTmpAsianWords; + nTmpCharsExcludingSpaces += pBreakIt->getGraphemeCount(rWord); + } } + + nTmpCharsExcludingSpaces += aScanner.getOverriddenDashCount(); } - } - nTmpChars = pBreakIt->getGraphemeCount(aExpandText, nExpandBegin, nExpandEnd); + nTmpChars = pBreakIt->getGraphemeCount(aExpandText, nExpandBegin, nExpandEnd); + } // no nTmpCharsExcludingSpaces adjust needed neither for blanked out MaskedChars // nor for mid-word selection - set scanner bClip = true at creation @@ -1938,6 +1976,7 @@ void SwTxtNode::CountWords( SwDocStat& rStat, nTmpCharsExcludingSpaces += pBreakIt->getGraphemeCount(rWord); } + nTmpCharsExcludingSpaces += aScanner.getOverriddenDashCount(); nTmpChars += pBreakIt->getGraphemeCount(sNumString); } else if ( bHasBullet ) diff --git a/sw/source/ui/config/optload.cxx b/sw/source/ui/config/optload.cxx index e2d519a62651..760cf2035103 100644 --- a/sw/source/ui/config/optload.cxx +++ b/sw/source/ui/config/optload.cxx @@ -26,6 +26,7 @@ * ************************************************************************/ +#include <officecfg/Office/Writer.hxx> #include <comphelper/string.hxx> #include <tools/shl.hxx> #include <swtypes.hxx> @@ -88,7 +89,10 @@ SwLoadOptPage::SwLoadOptPage( Window* pParent, const SfxItemSet& rSet ) : aTabFT ( this, SW_RES( FT_TAB 
) ), aTabMF ( this, SW_RES( MF_TAB ) ), aUseSquaredPageMode ( this, SW_RES( CB_USE_SQUARE_PAGE_MODE ) ), - aUseCharUnit ( this , SW_RES( CB_USE_CHAR_UNIT ) ), + aUseCharUnit ( this , SW_RES( CB_USE_CHAR_UNIT ) ), + aWordCountFL ( this , SW_RES( FL_WORDCOUNT ) ), + aWordCountFT ( this , SW_RES( FT_WORDCOUNT ) ), + aWordCountED ( this , SW_RES( ED_WORDCOUNT ) ), pWrtShell ( NULL ), bHTMLMode ( sal_False ), @@ -131,10 +135,10 @@ SwLoadOptPage::SwLoadOptPage( Window* pParent, const SfxItemSet& rSet ) : SvtCJKOptions aCJKOptions; if(!aCJKOptions.IsAsianTypographyEnabled()) - { + { aUseSquaredPageMode.Hide(); - aUseCharUnit.Hide(); - } + aUseCharUnit.Hide(); + } } SwLoadOptPage::~SwLoadOptPage() @@ -209,6 +213,15 @@ sal_Bool SwLoadOptPage::FillItemSet( SfxItemSet& rSet ) bRet = sal_True; } + if (aWordCountED.GetText() != aWordCountED.GetSavedValue()) + { + boost::shared_ptr< comphelper::ConfigurationChanges > batch( + comphelper::ConfigurationChanges::create()); + officecfg::Office::Writer::WordCount::AdditionalSeperators::set(aWordCountED.GetText(), batch); + batch->commit(); + bRet = sal_True; + } + sal_Bool bIsSquaredPageModeFlag = aUseSquaredPageMode.IsChecked(); if ( bIsSquaredPageModeFlag != aUseSquaredPageMode.GetSavedValue() ) { @@ -304,6 +317,9 @@ void SwLoadOptPage::Reset( const SfxItemSet& rSet) aUseCharUnit.Check(pUsrPref->IsApplyCharUnit()); } aUseCharUnit.SaveValue(); + + aWordCountED.SetText(officecfg::Office::Writer::WordCount::AdditionalSeperators::get()); + aWordCountED.SaveValue(); } IMPL_LINK_NOARG(SwLoadOptPage, MetricHdl) diff --git a/sw/source/ui/config/optload.hrc b/sw/source/ui/config/optload.hrc index 2d123c07ffa1..8ee917e93875 100644 --- a/sw/source/ui/config/optload.hrc +++ b/sw/source/ui/config/optload.hrc @@ -32,7 +32,10 @@ #define FT_TAB 21 #define MF_TAB 22 #define CB_USE_SQUARE_PAGE_MODE 23 -#define CB_USE_CHAR_UNIT 24 +#define CB_USE_CHAR_UNIT 24 +#define FL_WORDCOUNT 25 +#define FT_WORDCOUNT 26 +#define ED_WORDCOUNT 27 // 
SwCaptionOptPage ----------------------------- diff --git a/sw/source/ui/config/optload.src b/sw/source/ui/config/optload.src index 8c7089a83968..5dc8dd05fd2c 100644 --- a/sw/source/ui/config/optload.src +++ b/sw/source/ui/config/optload.src @@ -142,14 +142,30 @@ TabPage TP_OPTLOAD_PAGE Size = MAP_APPFONT ( 248 , 10 ) ; Text [ en-US ] = "Use square page mode for text grid"; }; - CheckBox CB_USE_CHAR_UNIT { Pos = MAP_APPFONT ( 12 , 130) ; Size = MAP_APPFONT ( 109 , 10 ) ; Text [ en-US ] = "Enable char unit"; }; - + FixedLine FL_WORDCOUNT + { + Pos = MAP_APPFONT ( 6 , 144 ) ; + Size = MAP_APPFONT ( 248 , 8 ) ; + Text [ en-US ] = "Word Count"; + }; + FixedText FT_WORDCOUNT + { + Pos = MAP_APPFONT ( 12 , 157 ) ; + Size = MAP_APPFONT ( 80 , 8 ) ; + Text [ en-US ] = "Additional separators"; + }; + Edit ED_WORDCOUNT + { + Pos = MAP_APPFONT ( 95 , 155 ) ; + Size = MAP_APPFONT ( 159 , 12 ) ; + Border = TRUE ; + }; }; TabPage TP_OPTCAPTION_PAGE diff --git a/sw/source/ui/inc/optload.hxx b/sw/source/ui/inc/optload.hxx index 88c04b0fe310..6889c227d3fb 100644 --- a/sw/source/ui/inc/optload.hxx +++ b/sw/source/ui/inc/optload.hxx @@ -55,6 +55,9 @@ private: MetricField aTabMF; CheckBox aUseSquaredPageMode; CheckBox aUseCharUnit; + FixedLine aWordCountFL; + FixedText aWordCountFT; + Edit aWordCountED; SwWrtShell* pWrtShell; sal_Bool bHTMLMode; |