diff options
author | Thomas Lange <tl@openoffice.org> | 2009-09-23 12:29:56 +0000 |
---|---|---|
committer | Thomas Lange <tl@openoffice.org> | 2009-09-23 12:29:56 +0000 |
commit | 892a6935027c6817ddf45720109273bb4fd282c6 (patch) | |
tree | 05c0a995a22a7560003d1e76bf61e44f4681b45f /i18npool/source/transliteration | |
parent | 3327c079e5160ba94d92e4a728a017c904ed2b99 (diff) |
#i1601# transliteration for sentence case, title case and toggle case
Diffstat (limited to 'i18npool/source/transliteration')
3 files changed, 247 insertions, 200 deletions
diff --git a/i18npool/source/transliteration/makefile.mk b/i18npool/source/transliteration/makefile.mk index 5ad615c64dbe..daf3068d4d20 100644 --- a/i18npool/source/transliteration/makefile.mk +++ b/i18npool/source/transliteration/makefile.mk @@ -50,7 +50,6 @@ SLOFILES= \ $(SLO)$/transliteration_OneToOne.obj \ $(SLO)$/transliteration_Ignore.obj \ $(SLO)$/transliteration_Numeric.obj \ - $(SLO)$/transliteration_sentencecase.obj \ $(SLO)$/hiraganaToKatakana.obj \ $(SLO)$/katakanaToHiragana.obj \ $(SLO)$/ignoreKana.obj \ diff --git a/i18npool/source/transliteration/transliteration_body.cxx b/i18npool/source/transliteration/transliteration_body.cxx index d75eede13bf9..0426f89b604d 100644 --- a/i18npool/source/transliteration/transliteration_body.cxx +++ b/i18npool/source/transliteration/transliteration_body.cxx @@ -31,7 +31,17 @@ // MARKER(update_precomp.py): autogen include statement, do not remove #include "precompiled_i18npool.hxx" +#include <rtl/ustrbuf.hxx> #include <i18nutil/casefolding.hxx> +#include <i18nutil/unicode.hxx> + +#include <comphelper/processfactory.hxx> +#include <tools/debug.hxx> + + +#include "CharacterClassificationImpl.hxx" +#include "breakiteratorImpl.hxx" + #define TRANSLITERATION_ALL #include "transliteration_body.hxx" @@ -39,8 +49,11 @@ using namespace ::com::sun::star::uno; using namespace ::com::sun::star::lang; using namespace ::rtl; +#define A2OU(x) OUString::createFromAscii(x) + namespace com { namespace sun { namespace star { namespace i18n { + Transliteration_body::Transliteration_body() { nMappingType = 0; @@ -71,9 +84,35 @@ Transliteration_body::transliterateRange( const OUString& str1, const OUString& return ostr; } + +static sal_uInt8 lcl_getMappingTypeForToggleCase( sal_uInt8 nMappingType, sal_Unicode cChar ) +{ + sal_uInt8 nRes = nMappingType; + + // take care of TOGGLE_CASE transliteration: + // nMappingType should not be a combination of flags, thuse we decide now + // which one to use. + if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) + { + const sal_Int16 nType = unicode::getUnicodeType( cChar ); + if (nType & 0x02 /* lower case*/) + nRes = MappingTypeLowerToUpper; + else + { + DBG_ASSERT( nType & 0x01 /* upper case */, "uppercase character expected! 'Toggle case' failed?" ); + nRes = MappingTypeUpperToLower; + } + } + + return nRes; +} + + OUString SAL_CALL -Transliteration_body::transliterate( const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, - Sequence< sal_Int32 >& offset) throw(RuntimeException) +Transliteration_body::transliterate( + const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, + Sequence< sal_Int32 >& offset) + throw(RuntimeException) { #if 0 /* Performance optimization: @@ -142,7 +181,12 @@ Transliteration_body::transliterate( const OUString& inStr, sal_Int32 startPos, sal_Int32 nOffCount = 0, i; for (i = 0; i < nCount; i++) { - const Mapping &map = casefolding::getValue(in, i, nCount, aLocale, nMappingType); + // take care of TOGGLE_CASE transliteration: + sal_uInt8 nTmpMappingType = nMappingType; + if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) + nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] ); + + const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType ); nOffCount += map.nmap; } rtl_uString* pStr = x_rtl_uString_new_WithLength( nOffCount, 1 ); // our x_rtl_ustring.h @@ -155,7 +199,12 @@ Transliteration_body::transliterate( const OUString& inStr, sal_Int32 startPos, sal_Int32 * pArr = offset.getArray(); for (i = 0; i < nCount; i++) { - const Mapping &map = casefolding::getValue(in, i, nCount, aLocale, nMappingType); + // take care of TOGGLE_CASE transliteration: + sal_uInt8 nTmpMappingType = nMappingType; + if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) + nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] ); + + const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType ); for (sal_Int32 k = 0; k < map.nmap; k++) { pArr[j] = i + startPos; @@ -187,7 +236,12 @@ Transliteration_body::transliterate( const OUString& inStr, sal_Int32 startPos, sal_Int32 j = 0; for ( sal_Int32 i = 0; i < nCount; i++) { - const Mapping &map = casefolding::getValue(in, i, nCount, aLocale, nMappingType); + // take care of TOGGLE_CASE transliteration: + sal_uInt8 nTmpMappingType = nMappingType; + if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) + nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] ); + + const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType ); for (sal_Int32 k = 0; k < map.nmap; k++) { out[j++] = map.map[k]; @@ -261,6 +315,17 @@ Transliteration_l2u::Transliteration_l2u() implementationName = "com.sun.star.i18n.Transliteration.Transliteration_l2u"; } +Transliteration_togglecase::Transliteration_togglecase() +{ + // usually nMappingType must NOT be a combiantion of different flages here, + // but we take care of that problem in Transliteration_body::transliterate above + // before that value is used. There we will decide which of both is to be used on + // a per character basis. + nMappingType = MappingTypeLowerToUpper | MappingTypeUpperToLower; + transliterationName = "toggle(generic)"; + implementationName = "com.sun.star.i18n.Transliteration.Transliteration_togglecase"; +} + Transliteration_titlecase::Transliteration_titlecase() { nMappingType = MappingTypeToTitle; @@ -268,12 +333,185 @@ Transliteration_titlecase::Transliteration_titlecase() implementationName = "com.sun.star.i18n.Transliteration.Transliteration_titlecase"; } -Transliteration_togglecase::Transliteration_togglecase() +rtl::OUString SAL_CALL Transliteration_titlecase::transliterate(
+ const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, + Sequence< sal_Int32 >& /*offset*/ ) + throw(RuntimeException) +{
+ Reference< XMultiServiceFactory > xMSF = ::comphelper::getProcessServiceFactory();
+ CharacterClassificationImpl aCharClassImpl( xMSF );
+
+ // possible problem: the locale is not exactly specific for each word in the text...
+ OUString aRes( aCharClassImpl.toTitle( inStr, startPos, nCount, aLocale ) );
+ return aRes;
+}
+ +Transliteration_sentencecase::Transliteration_sentencecase() { - nMappingType = MappingTypeLowerToUpper | MappingTypeUpperToLower; - transliterationName = "toggle(generic)"; - implementationName = "com.sun.star.i18n.Transliteration.Transliteration_togglecase"; + nMappingType = MappingTypeToTitle; // though only to be applied to the first word... + transliterationName = "sentence(generic)"; + implementationName = "com.sun.star.i18n.Transliteration.Transliteration_sentencecase"; } +rtl::OUString SAL_CALL Transliteration_sentencecase::transliterate( + const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, + Sequence< sal_Int32 >& offset ) + throw(RuntimeException) +{ + // inspired from Transliteration_body::transliterate + sal_Int32 nOffCount = 0, i; + bool bPoint = true; + if (useOffset) + { + for( i = 0; i < nCount; ++i ) { + sal_Unicode c = inStr.getStr()[ i + startPos ]; + if( sal_Unicode('.') == c || sal_Unicode('!') == c || sal_Unicode('?') == c ) { + bPoint = true; + nOffCount++; + } + else if( unicode::isAlpha( c ) || unicode::isDigit( c ) ) + { + const Mapping* map = 0; + if( bPoint && unicode::isLower( c )) + { + map = &casefolding::getValue(&c, 0, 1, aLocale, MappingTypeLowerToUpper); + bPoint = false; + } + else if (!bPoint && unicode::isUpper( c )) + { + map = &casefolding::getValue(&c, 0, 1, aLocale, MappingTypeUpperToLower); + } + + if(map == 0) + { + nOffCount++; + } + else + { + nOffCount += map->nmap; + } + } + else + { + nOffCount++; + } + } + } + + bPoint = true; + rtl::OUStringBuffer result; + + if (useOffset) + { + result.ensureCapacity(nOffCount); + if ( nOffCount != offset.getLength() ) + offset.realloc( nOffCount ); + } + + + sal_Int32 j = 0; + sal_Int32 * pArr = offset.getArray(); + for( i = 0; i < nCount; ++i ) { + sal_Unicode c = inStr.getStr()[ i + startPos ]; + if( sal_Unicode('.') == c || sal_Unicode('!') == c || sal_Unicode('?') == c ) { + bPoint = true; + result.append(c); + pArr[j++] = i + startPos; + } + else if( unicode::isAlpha( c ) || unicode::isDigit( c ) ) + { + const Mapping* map = 0; + if( bPoint && unicode::isLower( c )) + { + map = &casefolding::getValue(&c, 0, 1, aLocale, MappingTypeLowerToUpper); + } + else if (!bPoint && unicode::isUpper( c )) + { + map = &casefolding::getValue(&c, 0, 1, aLocale, MappingTypeUpperToLower); + } + + if(map == 0) + { + result.append( c ); + pArr[j++] = i + startPos; + } + else + { + for (sal_Int32 k = 0; k < map->nmap; k++) + { + result.append( map->map[k] ); + pArr[j++] = i + startPos; + } + } + bPoint = false; + } + else + { + result.append( c ); + pArr[j++] = i + startPos; + } + } + return result.makeStringAndClear(); +} + +#if 0 +// TL: alternative implemntation try. But breakiterator has its problem too since +// beginOfSentence does not work as expected with '.'. See comment below. +// For the time being I will leave this code here as a from-scratch sample if the +// breakiterator works better at some point... +rtl::OUString SAL_CALL Transliteration_sentencecase::transliterate(
+ const OUString& inStr, sal_Int32 nStartPos, sal_Int32 nCount, + Sequence< sal_Int32 >& /*offset*/ ) + throw(RuntimeException) +{
+ OUString aRes( inStr.copy( nStartPos, nCount ) ); + + if (nStartPos >= 0 && nStartPos < inStr.getLength() && nCount > 0) + { + Reference< XMultiServiceFactory > xMSF = ::comphelper::getProcessServiceFactory();
+ BreakIteratorImpl brk( xMSF ); + + sal_Int32 nSentenceStart = -1, nOldSentenceStart = -1; + sal_Int32 nPos = nStartPos + nCount - 1; + while (nPos >= nStartPos && nPos != -1) + { + // possible problem: the locale is not exactly specific for each sentence in the text, + // but it is the only one we have...
+ nOldSentenceStart = nSentenceStart;
+ nSentenceStart = brk.beginOfSentence( inStr, nPos, aLocale ); + + // since the breakiterator completely ignores '.' characvters as end-of-sentence when + // the next word is lower case we need to take care of that ourself. The drawback: + // la mid-sentence abbreviation like e.g. will now be identified as end-of-sentence. :-( + // Well, at least the other product does it in the same way... + sal_Int32 nFullStopPos = inStr.lastIndexOf( (sal_Unicode)'.', nPos ); + nPos = nSentenceStart; + if (nFullStopPos > 0 && nFullStopPos > nSentenceStart) + { + Boundary aBd2 = brk.nextWord( inStr, nFullStopPos, aLocale, WordType::DICTIONARY_WORD ); + nSentenceStart = aBd2.startPos; + nPos = nFullStopPos; + } + + if (nSentenceStart < nOldSentenceStart || nOldSentenceStart == -1) + { + // the sentence start might be a quotation mark or some kind of bracket, thus + // we need the first dictionary word starting or following this position + // Boundary aBd1 = brk.nextWord( inStr, nSentenceStart, aLocale, WordType::DICTIONARY_WORD ); + Boundary aBd2 = brk.getWordBoundary( inStr, nSentenceStart, aLocale, WordType::DICTIONARY_WORD, true ); + // OUString aWord1( inStr.copy( aBd1.startPos, aBd1.endPos - aBd1.startPos + 1 ) ); + OUString aWord2( inStr.copy( aBd2.startPos, aBd2.endPos - aBd2.startPos + 1 ) ); + } + else + break; // prevent endless loop + + // continue with previous sentence + if (nPos != -1) + --nPos; + } + } + return aRes;
+}
+#endif } } } } diff --git a/i18npool/source/transliteration/transliteration_sentencecase.cxx b/i18npool/source/transliteration/transliteration_sentencecase.cxx deleted file mode 100644 index 35f9a69abb4d..000000000000 --- a/i18npool/source/transliteration/transliteration_sentencecase.cxx +++ /dev/null @@ -1,190 +0,0 @@ -/************************************************************************* - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * Copyright 2008 by Sun Microsystems, Inc. - * - * OpenOffice.org - a multi-platform office productivity suite - * - * $RCSfile: halfwidthToFullwidth.cxx,v $ - * $Revision: 1.12 $ - * - * This file is part of OpenOffice.org. - * - * OpenOffice.org is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License version 3 - * only, as published by the Free Software Foundation. - * - * OpenOffice.org is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License version 3 for more details - * (a copy is included in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU Lesser General Public License - * version 3 along with OpenOffice.org. If not, see - * <http://www.openoffice.org/license.html> - * for a copy of the LGPLv3 License. - * - ************************************************************************/ - -// MARKER(update_precomp.py): autogen include statement, do not remove -#include "precompiled_i18npool.hxx" - -// prevent internal compiler error with MSVC6SP3 -#include <utility> - -#include <i18nutil/widthfolding.hxx> -#include <i18nutil/casefolding.hxx> -#define TRANSLITERATION_sentencecase -#include <transliteration_sentencecase.hxx> -#include <rtl/ustring.hxx> -#include <i18nutil/x_rtl_ustring.h> -#include <i18nutil/unicode.hxx> -#include <rtl/ustrbuf.hxx> -#include <rtl/strbuf.hxx> - -using namespace com::sun::star::uno; -using namespace com::sun::star::lang; -using namespace rtl; - -namespace com { namespace sun { namespace star { namespace i18n { - - -Transliteration_sentencecase::Transliteration_sentencecase() -{ - transliterationName = "sentenceCase"; - implementationName = "com.sun.star.i18n.Transliteration.SENTENCE_CASE"; -} - -OUString SAL_CALL -Transliteration_sentencecase::transliterate( const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, Sequence< sal_Int32 >& offset ) - throw(RuntimeException) -{ - // inspired from Transliteration_body::transliterate - sal_Int32 nOffCount = 0, i; - bool bPoint = true; - if (useOffset) - { - for( i = 0; i < nCount; ++i ) { - sal_Unicode c = inStr.getStr()[ i + startPos ]; - if( sal_Unicode('.') == c || sal_Unicode('!') == c || sal_Unicode('?') == c ) { - bPoint = true; - nOffCount++; - } - else if( unicode::isAlpha( c ) || unicode::isDigit( c ) ) - { - const Mapping* map = 0; - if( bPoint && unicode::isLower( c )) - { - map = &casefolding::getValue(&c, 0, 1, aLocale, MappingTypeLowerToUpper); - } - bPoint = false; - - if(map == 0) - { - nOffCount++; - } - else - { - nOffCount += map->nmap; - } - } - else - { - nOffCount++; - } - } - } - - bPoint = true; - rtl::OUStringBuffer result; - - if (useOffset) - { - result.ensureCapacity(nOffCount); - if ( nOffCount != offset.getLength() ) - offset.realloc( nOffCount ); - } - - - sal_Int32 j = 0; - sal_Int32 * pArr = offset.getArray(); - for( i = 0; i < nCount; ++i ) { - sal_Unicode c = inStr.getStr()[ i + startPos ]; - if( sal_Unicode('.') == c || sal_Unicode('!') == c || sal_Unicode('?') == c ) { - bPoint = true; - result.append(c); - pArr[j++] = i + startPos; - } - else if( unicode::isAlpha( c ) || unicode::isDigit( c ) ) - { - const Mapping* map = 0; - if( bPoint && unicode::isLower( c )) - { - map = &casefolding::getValue(&c, 0, 1, aLocale, MappingTypeLowerToUpper); - } - bPoint = false; - - if(map == 0) - { - result.append( c ); - pArr[j++] = i + startPos; - } - else - { - for (sal_Int32 k = 0; k < map->nmap; k++) - { - result.append( map->map[k] ); - pArr[j++] = i + startPos; - } - } - } - else - { - result.append( c ); - pArr[j++] = i + startPos; - } - } - return result.makeStringAndClear(); -} - -sal_Unicode SAL_CALL -Transliteration_sentencecase::transliterateChar2Char( sal_Unicode inChar) - throw(RuntimeException, MultipleCharsOutputException) -{ - return inChar; -} - -sal_Int16 SAL_CALL Transliteration_sentencecase::getType() throw(RuntimeException) -{ - return TransliterationType::IGNORE; -} - -OUString SAL_CALL -Transliteration_sentencecase::folding( const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, - Sequence< sal_Int32 >& offset) throw(RuntimeException) -{ - return this->transliterate(inStr, startPos, nCount, offset); -} - -sal_Bool SAL_CALL Transliteration_sentencecase::equals( - const OUString& /*str1*/, sal_Int32 /*pos1*/, sal_Int32 /*nCount1*/, sal_Int32& /*nMatch1*/, - const OUString& /*str2*/, sal_Int32 /*pos2*/, sal_Int32 /*nCount2*/, sal_Int32& /*nMatch2*/) - throw(RuntimeException) -{ - throw RuntimeException(); -} - -Sequence< OUString > SAL_CALL -Transliteration_sentencecase::transliterateRange( const OUString& str1, const OUString& str2 ) - throw( RuntimeException) -{ - Sequence< OUString > ostr(2); - ostr[0] = str1; - ostr[1] = str2; - return ostr; -} - -} } } } - |