cws sw33bf08: #i113584#, #i113587# transliteration fixed

author: Thomas Lange [tl] <tl@openoffice.org> 2010-08-17 16:46:36 +0200
committer: Thomas Lange [tl] <tl@openoffice.org> 2010-08-17 16:46:36 +0200
commit: 30ff826ddc974f37c72d2d74b3934ed691a7ef7e (patch)
tree: 4e99fe1a0c7e2069a452bd5d6b777422c4521f39 /i18npool
parent: 6e027a4662f04e311670ec5f36344d931b75a589 (diff)
2 files changed, 116 insertions, 158 deletions
diff --git a/i18npool/source/transliteration/transliterationImpl.cxx b/i18npool/source/transliteration/transliterationImpl.cxx
index dfadecfd5eb7..2109c310b233 100644
--- a/i18npool/source/transliteration/transliterationImpl.cxx
+++ b/i18npool/source/transliteration/transliterationImpl.cxx
@@ -295,7 +295,6 @@ OUString SAL_CALL
 TransliterationImpl::transliterate( const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
                     Sequence< sal_Int32 >& offset ) throw(RuntimeException)
 {
-
     if (numCascade == 0)
         return inStr;
 
diff --git a/i18npool/source/transliteration/transliteration_body.cxx b/i18npool/source/transliteration/transliteration_body.cxx
index b58347826470..43222b7a41eb 100755
--- a/i18npool/source/transliteration/transliteration_body.cxx
+++ b/i18npool/source/transliteration/transliteration_body.cxx
@@ -35,6 +35,7 @@
 #include <comphelper/processfactory.hxx>
 #include <osl/diagnose.h>
 
+#include <string.h>
 
 #include "characterclassificationImpl.hxx"
 #include "breakiteratorImpl.hxx"
@@ -96,7 +97,7 @@ static sal_uInt8 lcl_getMappingTypeForToggleCase( sal_uInt8 nMappingType, sal_Un
             nRes = MappingTypeLowerToUpper;
         else
         {
-            OSL_ENSURE( nType & 0x01 /* upper case */, "uppercase character expected! 'Toggle case' failed?" );
+            // should also work properly for non-upper characters like white spacs, numbers, ...
             nRes = MappingTypeUpperToLower;
         }
     }
@@ -330,185 +331,143 @@ Transliteration_titlecase::Transliteration_titlecase()
     implementationName = "com.sun.star.i18n.Transliteration.Transliteration_titlecase";
 }
 
-rtl::OUString SAL_CALL Transliteration_titlecase::transliterate(
-    const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
-    Sequence< sal_Int32 >& /*offset*/ )
-    throw(RuntimeException)
+#if 0
+struct LigatureData
 {
-    Reference< XMultiServiceFactory > xMSF = ::comphelper::getProcessServiceFactory();
-    CharacterClassificationImpl aCharClassImpl( xMSF );
-
-    // possible problem: the locale is not exactly specific for each word in the text...
-    OUString aRes( aCharClassImpl.toTitle( inStr, startPos, nCount, aLocale ) );
-    return aRes;
+    sal_uInt32  cChar;
+    sal_Char *  pUtf8Text;
+};
+
+// available Unicode ligatures:
+// http://www.unicode.org/charts
+// http://www.unicode.org/charts/PDF/UFB00.pdf
+static LigatureData aLigatures[] =
+{
+    { 0x0FB00,     "ff" },
+    { 0x0FB01,     "fi" },
+    { 0x0FB02,     "fl" },
+    { 0x0FB03,     "ffi" },
+    { 0x0FB04,     "ffl" },
+    { 0x0FB05,     "ft" },
+    { 0x0FB06,     "st" },
+
+    { 0x0FB13,     "\xD5\xB4\xD5\xB6" },     // Armenian small men now
+    { 0x0FB14,     "\xD5\xB4\xD5\xA5" },     // Armenian small men ech
+    { 0x0FB15,     "\xD5\xB4\xD5\xAB" },     // Armenian small men ini
+    { 0x0FB16,     "\xD5\xBE\xD5\xB6" },     // Armenian small vew now
+    { 0x0FB17,     "\xD5\xB4\xD5\xAD" },     // Armenian small men xeh
+    { 0x00000,     "" }
+};
+
+static inline bool lcl_IsLigature( sal_uInt32 cChar )
+{
+    return (0x0FB00 <= cChar && cChar <= 0x0FB06) || (0x0FB13 <= cChar && cChar <= 0x0FB17);
 }
 
-Transliteration_sentencecase::Transliteration_sentencecase()
+static rtl::OUString lcl_ResolveLigature( sal_uInt32 cChar )
 {
-    nMappingType = MappingTypeToTitle;  // though only to be applied to the first word...
-    transliterationName = "sentence(generic)";
-    implementationName = "com.sun.star.i18n.Transliteration.Transliteration_sentencecase";
+    rtl::OUString aRes;
+    if (lcl_IsLigature( cChar ))
+    {
+        LigatureData *pFound = NULL;
+        LigatureData *pData = aLigatures;
+        while (!pFound && pData->cChar != 0)
+        {
+            if (pData->cChar == cChar)
+                pFound = pData;
+            ++pData;
+        }
+        if (pFound)
+            aRes = rtl::OUString( pFound->pUtf8Text, strlen( pFound->pUtf8Text ), RTL_TEXTENCODING_UTF8 );
+    }
+    else
+        aRes = rtl::OUString( &cChar, 1 );
+    return aRes;
 }
+#endif // if 0
 
-rtl::OUString SAL_CALL Transliteration_sentencecase::transliterate(
+static rtl::OUString transliterate_titlecase_Impl(
     const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
+    const Locale &rLocale,
     Sequence< sal_Int32 >& offset )
     throw(RuntimeException)
 {
-    // inspired from Transliteration_body::transliterate
-    sal_Int32 nOffCount = 0, i;
-    bool bPoint = true;
-    if (useOffset)
-    {
-        for( i = 0; i < nCount; ++i ) {
-            sal_Unicode c = inStr.getStr()[ i + startPos ];
-            if( sal_Unicode('.') == c || sal_Unicode('!') == c || sal_Unicode('?') == c ) {
-                bPoint = true;
-                nOffCount++;
-            }
-            else if( unicode::isAlpha( c ) || unicode::isDigit( c ) )
-            {
-                const Mapping* map = 0;
-                if( bPoint && unicode::isLower( c ))
-                {
-                    map = &casefolding::getValue(&c, 0, 1, aLocale, MappingTypeLowerToUpper);
-                    bPoint = false;
-                }
-                else if (!bPoint && unicode::isUpper( c ))
-                {
-                    map = &casefolding::getValue(&c, 0, 1, aLocale, MappingTypeUpperToLower);
-                }
-
-                if(map == 0)
-                {
-                    nOffCount++;
-                }
-                else
-                {
-                    nOffCount += map->nmap;
-                }
-            }
-            else
-            {
-                nOffCount++;
-            }
-        }
-    }
-
-    bPoint = true;
-    rtl::OUStringBuffer result;
+    const OUString aText( inStr.copy( startPos, nCount ) );
 
-    if (useOffset)
+    OUString aRes;
+    if (aText.getLength() > 0)
     {
-        result.ensureCapacity(nOffCount);
-        if ( nOffCount != offset.getLength() )
-            offset.realloc( nOffCount );
-    }
-
-
-    sal_Int32 j = 0;
-    sal_Int32 * pArr = offset.getArray();
-    for(  i = 0; i < nCount; ++i ) {
-        sal_Unicode c = inStr.getStr()[ i + startPos ];
-        if( sal_Unicode('.') == c || sal_Unicode('!') == c || sal_Unicode('?') == c ) {
-            bPoint = true;
-            result.append(c);
-            pArr[j++] = i + startPos;
-        }
-        else if( unicode::isAlpha( c ) || unicode::isDigit( c ) )
-        {
-            const Mapping* map = 0;
-            if( bPoint && unicode::isLower( c ))
-            {
-                map = &casefolding::getValue(&c, 0, 1, aLocale, MappingTypeLowerToUpper);
-            }
-            else if (!bPoint && unicode::isUpper( c ))
-            {
-                map = &casefolding::getValue(&c, 0, 1, aLocale, MappingTypeUpperToLower);
-            }
-
-            if(map == 0)
-            {
-                result.append( c );
-                pArr[j++] = i + startPos;
-            }
-            else
-            {
-                for (sal_Int32 k = 0; k < map->nmap; k++)
-                {
-                    result.append( map->map[k] );
-                    pArr[j++] = i + startPos;
-                }
-            }
-            bPoint = false;
-        }
-        else
+        Reference< XMultiServiceFactory > xMSF = ::comphelper::getProcessServiceFactory();
+        CharacterClassificationImpl aCharClassImpl( xMSF );
+
+        // because aCharClassImpl.toTitle does not handle ligatures or � but will raise
+        // an exception we need to handle the first chara manually...
+
+        // we don't want to change surrogates by accident, thuse we use proper code point iteration
+        sal_Int32 nPos = 0;
+        sal_uInt32 cFirstChar = aText.iterateCodePoints( &nPos );
+        OUString aResolvedLigature( &cFirstChar, 1 ); //lcl_ResolveLigature( cFirstChar ) );
+        // toUpper can be used to properly resolve ligatures and characters like �
+        aResolvedLigature = aCharClassImpl.toUpper( aResolvedLigature, 0, aResolvedLigature.getLength(), rLocale );
+        // since toTitle will leave all-uppercase text unchanged we first need to
+        // use toLower to bring possible 2nd and following charas in lowercase
+        aResolvedLigature = aCharClassImpl.toLower( aResolvedLigature, 0, aResolvedLigature.getLength(), rLocale );
+        sal_Int32 nResolvedLen = aResolvedLigature.getLength();
+
+        // now we can properly use toTitle to get the expected result for the resolved string.
+        // The rest of the text should just become lowercase.
+        aRes = aCharClassImpl.toTitle( aResolvedLigature, 0, nResolvedLen, rLocale );
+        aRes += aCharClassImpl.toLower( aText, 1, aText.getLength() - 1, rLocale );
+        offset.realloc( aRes.getLength() );
+
+        sal_Int32 *pOffset = offset.getArray();
+        sal_Int32 nLen = offset.getLength();
+        for (sal_Int32 i = 0; i < nLen; ++i)
         {
-            result.append( c );
-            pArr[j++] = i + startPos;
+            sal_Int32 nIdx = 0;
+            if (i >= nResolvedLen)
+                nIdx = i - nResolvedLen + 1;
+            pOffset[i] = nIdx;
         }
     }
-    return result.makeStringAndClear();
+#if OSL_DEBUG_LEVEL > 1
+    const sal_Int32 *pCOffset = offset.getConstArray();
+    (void) pCOffset;
+#endif
+
+    return aRes;
 }
 
-#if 0
-// TL: alternative implemntation try. But breakiterator has its problem too since
-// beginOfSentence does not work as expected with '.'. See comment below.
-// For the time being I will leave this code here as a from-scratch sample if the
-// breakiterator works better at some point...
-rtl::OUString SAL_CALL Transliteration_sentencecase::transliterate(
-    const OUString& inStr, sal_Int32 nStartPos, sal_Int32 nCount,
-    Sequence< sal_Int32 >& /*offset*/ )
+
+// this function expects to be called on a word-by-word basis,
+// namely that startPos points to the first char of the word
+rtl::OUString SAL_CALL Transliteration_titlecase::transliterate(
+    const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
+    Sequence< sal_Int32 >& offset )
     throw(RuntimeException)
 {
-    OUString aRes( inStr.copy( nStartPos, nCount ) );
+    return transliterate_titlecase_Impl( inStr, startPos, nCount, aLocale, offset );
+}
 
-    if (nStartPos >= 0 && nStartPos < inStr.getLength() && nCount > 0)
-    {
-        Reference< XMultiServiceFactory > xMSF = ::comphelper::getProcessServiceFactory();
-        BreakIteratorImpl brk( xMSF );
 
-        sal_Int32 nSentenceStart = -1, nOldSentenceStart = -1;
-        sal_Int32 nPos = nStartPos + nCount - 1;
-        while (nPos >= nStartPos && nPos != -1)
-        {
-            // possible problem: the locale is not exactly specific for each sentence in the text,
-            // but it is the only one we have...
-            nOldSentenceStart = nSentenceStart;
-            nSentenceStart = brk.beginOfSentence( inStr, nPos, aLocale );
-
-            // since the breakiterator completely ignores '.' characvters as end-of-sentence when
-            // the next word is lower case we need to take care of that ourself. The drawback:
-            // la mid-sentence abbreviation like e.g. will now be identified as end-of-sentence. :-(
-            // Well, at least the other product does it in the same way...
-            sal_Int32 nFullStopPos = inStr.lastIndexOf( (sal_Unicode)'.', nPos );
-            nPos = nSentenceStart;
-            if (nFullStopPos > 0 && nFullStopPos > nSentenceStart)
-            {
-                Boundary aBd2 = brk.nextWord( inStr, nFullStopPos, aLocale, WordType::DICTIONARY_WORD );
-                nSentenceStart = aBd2.startPos;
-                nPos = nFullStopPos;
-            }
+Transliteration_sentencecase::Transliteration_sentencecase()
+{
+    nMappingType = MappingTypeToTitle;  // though only to be applied to the first word...
+    transliterationName = "sentence(generic)";
+    implementationName = "com.sun.star.i18n.Transliteration.Transliteration_sentencecase";
+}
 
-            if (nSentenceStart < nOldSentenceStart || nOldSentenceStart == -1)
-            {
-                // the sentence start might be a quotation mark or some kind of bracket, thus
-                // we need the first dictionary word starting or following this position
-    //            Boundary aBd1 = brk.nextWord( inStr, nSentenceStart, aLocale, WordType::DICTIONARY_WORD );
-                Boundary aBd2 = brk.getWordBoundary( inStr, nSentenceStart, aLocale, WordType::DICTIONARY_WORD, true );
-    //            OUString aWord1( inStr.copy( aBd1.startPos, aBd1.endPos - aBd1.startPos + 1 ) );
-                OUString aWord2( inStr.copy( aBd2.startPos, aBd2.endPos - aBd2.startPos + 1 ) );
-            }
-            else
-                break;  // prevent endless loop
 
-            // continue with previous sentence
-            if (nPos != -1)
-                --nPos;
-        }
-    }
-    return aRes;
+// this function expects to be called on a sentence-by-sentence basis,
+// namely that startPos points to the first word (NOT first char!) in the sentence
+rtl::OUString SAL_CALL Transliteration_sentencecase::transliterate(
+    const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
+    Sequence< sal_Int32 >& offset )
+    throw(RuntimeException)
+{
+    return transliterate_titlecase_Impl( inStr, startPos, nCount, aLocale, offset );
 }
-#endif
+
 
 } } } }
+
author	Thomas Lange [tl] <tl@openoffice.org>	2010-08-17 16:46:36 +0200
committer	Thomas Lange [tl] <tl@openoffice.org>	2010-08-17 16:46:36 +0200
commit	30ff826ddc974f37c72d2d74b3934ed691a7ef7e (patch)
tree	4e99fe1a0c7e2069a452bd5d6b777422c4521f39 /i18npool
parent	6e027a4662f04e311670ec5f36344d931b75a589 (diff)