summary refs log tree commit diff
path: root/i18npool/source/transliteration
diff options
context:
space:
mode:
author Thomas Lange <tl@openoffice.org> 2009-09-23 12:29:56 +0000
committer Thomas Lange <tl@openoffice.org> 2009-09-23 12:29:56 +0000
commit 892a6935027c6817ddf45720109273bb4fd282c6 (patch)
tree 05c0a995a22a7560003d1e76bf61e44f4681b45f /i18npool/source/transliteration
parent 3327c079e5160ba94d92e4a728a017c904ed2b99 (diff)
#i1601# transliteration for sentence case, title case and toggle case
Diffstat (limited to 'i18npool/source/transliteration')
-rw-r--r-- i18npool/source/transliteration/makefile.mk 1
-rw-r--r-- i18npool/source/transliteration/transliteration_body.cxx 256
-rw-r--r-- i18npool/source/transliteration/transliteration_sentencecase.cxx 190
3 files changed, 247 insertions, 200 deletions
diff --git a/i18npool/source/transliteration/makefile.mk b/i18npool/source/transliteration/makefile.mk
index 5ad615c64dbe..daf3068d4d20 100644
--- a/i18npool/source/transliteration/makefile.mk
+++ b/i18npool/source/transliteration/makefile.mk
@@ -50,7 +50,6 @@ SLOFILES= \
$(SLO)$/transliteration_OneToOne.obj \
$(SLO)$/transliteration_Ignore.obj \
$(SLO)$/transliteration_Numeric.obj \
- $(SLO)$/transliteration_sentencecase.obj \
$(SLO)$/hiraganaToKatakana.obj \
$(SLO)$/katakanaToHiragana.obj \
$(SLO)$/ignoreKana.obj \
diff --git a/i18npool/source/transliteration/transliteration_body.cxx b/i18npool/source/transliteration/transliteration_body.cxx
index d75eede13bf9..0426f89b604d 100644
--- a/i18npool/source/transliteration/transliteration_body.cxx
+++ b/i18npool/source/transliteration/transliteration_body.cxx
@@ -31,7 +31,17 @@
// MARKER(update_precomp.py): autogen include statement, do not remove
#include "precompiled_i18npool.hxx"
+#include <rtl/ustrbuf.hxx>
#include <i18nutil/casefolding.hxx>
+#include <i18nutil/unicode.hxx>
+
+#include <comphelper/processfactory.hxx>
+#include <tools/debug.hxx>
+
+
+#include "CharacterClassificationImpl.hxx"
+#include "breakiteratorImpl.hxx"
+
#define TRANSLITERATION_ALL
#include "transliteration_body.hxx"
@@ -39,8 +49,11 @@ using namespace ::com::sun::star::uno;
using namespace ::com::sun::star::lang;
using namespace ::rtl;
+#define A2OU(x) OUString::createFromAscii(x)
+
namespace com { namespace sun { namespace star { namespace i18n {
+
Transliteration_body::Transliteration_body()
{
nMappingType = 0;
@@ -71,9 +84,35 @@ Transliteration_body::transliterateRange( const OUString& str1, const OUString&
return ostr;
}
+
+static sal_uInt8 lcl_getMappingTypeForToggleCase( sal_uInt8 nMappingType, sal_Unicode cChar )
+{
+ sal_uInt8 nRes = nMappingType;
+
+ // take care of TOGGLE_CASE transliteration:
+ // nMappingType should not be a combination of flags, thus we decide now
+ // which one to use.
+ if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower))
+ {
+ const sal_Int16 nType = unicode::getUnicodeType( cChar );
+ if (nType & 0x02 /* lower case*/)
+ nRes = MappingTypeLowerToUpper;
+ else
+ {
+ DBG_ASSERT( nType & 0x01 /* upper case */, "uppercase character expected! 'Toggle case' failed?" );
+ nRes = MappingTypeUpperToLower;
+ }
+ }
+
+ return nRes;
+}
+
+
OUString SAL_CALL
-Transliteration_body::transliterate( const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
- Sequence< sal_Int32 >& offset) throw(RuntimeException)
+Transliteration_body::transliterate(
+ const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
+ Sequence< sal_Int32 >& offset)
+ throw(RuntimeException)
{
#if 0
/* Performance optimization:
@@ -142,7 +181,12 @@ Transliteration_body::transliterate( const OUString& inStr, sal_Int32 startPos,
sal_Int32 nOffCount = 0, i;
for (i = 0; i < nCount; i++)
{
- const Mapping &map = casefolding::getValue(in, i, nCount, aLocale, nMappingType);
+ // take care of TOGGLE_CASE transliteration:
+ sal_uInt8 nTmpMappingType = nMappingType;
+ if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower))
+ nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] );
+
+ const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType );
nOffCount += map.nmap;
}
rtl_uString* pStr = x_rtl_uString_new_WithLength( nOffCount, 1 ); // our x_rtl_ustring.h
@@ -155,7 +199,12 @@ Transliteration_body::transliterate( const OUString& inStr, sal_Int32 startPos,
sal_Int32 * pArr = offset.getArray();
for (i = 0; i < nCount; i++)
{
- const Mapping &map = casefolding::getValue(in, i, nCount, aLocale, nMappingType);
+ // take care of TOGGLE_CASE transliteration:
+ sal_uInt8 nTmpMappingType = nMappingType;
+ if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower))
+ nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] );
+
+ const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType );
for (sal_Int32 k = 0; k < map.nmap; k++)
{
pArr[j] = i + startPos;
@@ -187,7 +236,12 @@ Transliteration_body::transliterate( const OUString& inStr, sal_Int32 startPos,
sal_Int32 j = 0;
for ( sal_Int32 i = 0; i < nCount; i++)
{
- const Mapping &map = casefolding::getValue(in, i, nCount, aLocale, nMappingType);
+ // take care of TOGGLE_CASE transliteration:
+ sal_uInt8 nTmpMappingType = nMappingType;
+ if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower))
+ nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] );
+
+ const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType );
for (sal_Int32 k = 0; k < map.nmap; k++)
{
out[j++] = map.map[k];
@@ -261,6 +315,17 @@ Transliteration_l2u::Transliteration_l2u()
implementationName = "com.sun.star.i18n.Transliteration.Transliteration_l2u";
}
+Transliteration_togglecase::Transliteration_togglecase()
+{
+ // usually nMappingType must NOT be a combination of different flags here,
+ // but we take care of that problem in Transliteration_body::transliterate above
+ // before that value is used. There we will decide which of both is to be used on
+ // a per character basis.
+ nMappingType = MappingTypeLowerToUpper | MappingTypeUpperToLower;
+ transliterationName = "toggle(generic)";
+ implementationName = "com.sun.star.i18n.Transliteration.Transliteration_togglecase";
+}
+
Transliteration_titlecase::Transliteration_titlecase()
{
nMappingType = MappingTypeToTitle;
@@ -268,12 +333,185 @@ Transliteration_titlecase::Transliteration_titlecase()
implementationName = "com.sun.star.i18n.Transliteration.Transliteration_titlecase";
}
-Transliteration_togglecase::Transliteration_togglecase()
+rtl::OUString SAL_CALL Transliteration_titlecase::transliterate(
+ const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
+ Sequence< sal_Int32 >& /*offset*/ )
+ throw(RuntimeException)
+{
+ Reference< XMultiServiceFactory > xMSF = ::comphelper::getProcessServiceFactory();
+ CharacterClassificationImpl aCharClassImpl( xMSF );
+
+ // possible problem: the locale is not exactly specific for each word in the text...
+ OUString aRes( aCharClassImpl.toTitle( inStr, startPos, nCount, aLocale ) );
+ return aRes;
+}
+
+Transliteration_sentencecase::Transliteration_sentencecase()
{
- nMappingType = MappingTypeLowerToUpper | MappingTypeUpperToLower;
- transliterationName = "toggle(generic)";
- implementationName = "com.sun.star.i18n.Transliteration.Transliteration_togglecase";
+ nMappingType = MappingTypeToTitle; // though only to be applied to the first word...
+ transliterationName = "sentence(generic)";
+ implementationName = "com.sun.star.i18n.Transliteration.Transliteration_sentencecase";
}
+rtl::OUString SAL_CALL Transliteration_sentencecase::transliterate(
+ const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
+ Sequence< sal_Int32 >& offset )
+ throw(RuntimeException)
+{
+ // inspired from Transliteration_body::transliterate
+ sal_Int32 nOffCount = 0, i;
+ bool bPoint = true;
+ if (useOffset)
+ {
+ for( i = 0; i < nCount; ++i ) {
+ sal_Unicode c = inStr.getStr()[ i + startPos ];
+ if( sal_Unicode('.') == c || sal_Unicode('!') == c || sal_Unicode('?') == c ) {
+ bPoint = true;
+ nOffCount++;
+ }
+ else if( unicode::isAlpha( c ) || unicode::isDigit( c ) )
+ {
+ const Mapping* map = 0;
+ if( bPoint && unicode::isLower( c ))
+ {
+ map = &casefolding::getValue(&c, 0, 1, aLocale, MappingTypeLowerToUpper);
+ bPoint = false;
+ }
+ else if (!bPoint && unicode::isUpper( c ))
+ {
+ map = &casefolding::getValue(&c, 0, 1, aLocale, MappingTypeUpperToLower);
+ }
+
+ if(map == 0)
+ {
+ nOffCount++;
+ }
+ else
+ {
+ nOffCount += map->nmap;
+ }
+ }
+ else
+ {
+ nOffCount++;
+ }
+ }
+ }
+
+ bPoint = true;
+ rtl::OUStringBuffer result;
+
+ if (useOffset)
+ {
+ result.ensureCapacity(nOffCount);
+ if ( nOffCount != offset.getLength() )
+ offset.realloc( nOffCount );
+ }
+
+
+ sal_Int32 j = 0;
+ sal_Int32 * pArr = offset.getArray();
+ for( i = 0; i < nCount; ++i ) {
+ sal_Unicode c = inStr.getStr()[ i + startPos ];
+ if( sal_Unicode('.') == c || sal_Unicode('!') == c || sal_Unicode('?') == c ) {
+ bPoint = true;
+ result.append(c);
+ pArr[j++] = i + startPos;
+ }
+ else if( unicode::isAlpha( c ) || unicode::isDigit( c ) )
+ {
+ const Mapping* map = 0;
+ if( bPoint && unicode::isLower( c ))
+ {
+ map = &casefolding::getValue(&c, 0, 1, aLocale, MappingTypeLowerToUpper);
+ }
+ else if (!bPoint && unicode::isUpper( c ))
+ {
+ map = &casefolding::getValue(&c, 0, 1, aLocale, MappingTypeUpperToLower);
+ }
+
+ if(map == 0)
+ {
+ result.append( c );
+ pArr[j++] = i + startPos;
+ }
+ else
+ {
+ for (sal_Int32 k = 0; k < map->nmap; k++)
+ {
+ result.append( map->map[k] );
+ pArr[j++] = i + startPos;
+ }
+ }
+ bPoint = false;
+ }
+ else
+ {
+ result.append( c );
+ pArr[j++] = i + startPos;
+ }
+ }
+ return result.makeStringAndClear();
+}
+
+#if 0
+// TL: alternative implementation attempt. But the breakiterator has its problems too, since
+// beginOfSentence does not work as expected with '.'. See comment below.
+// For the time being I will leave this code here as a from-scratch sample if the
+// breakiterator works better at some point...
+rtl::OUString SAL_CALL Transliteration_sentencecase::transliterate(
+ const OUString& inStr, sal_Int32 nStartPos, sal_Int32 nCount,
+ Sequence< sal_Int32 >& /*offset*/ )
+ throw(RuntimeException)
+{
+ OUString aRes( inStr.copy( nStartPos, nCount ) );
+
+ if (nStartPos >= 0 && nStartPos < inStr.getLength() && nCount > 0)
+ {
+ Reference< XMultiServiceFactory > xMSF = ::comphelper::getProcessServiceFactory();
+ BreakIteratorImpl brk( xMSF );
+
+ sal_Int32 nSentenceStart = -1, nOldSentenceStart = -1;
+ sal_Int32 nPos = nStartPos + nCount - 1;
+ while (nPos >= nStartPos && nPos != -1)
+ {
+ // possible problem: the locale is not exactly specific for each sentence in the text,
+ // but it is the only one we have...
+ nOldSentenceStart = nSentenceStart;
+ nSentenceStart = brk.beginOfSentence( inStr, nPos, aLocale );
+
+ // since the breakiterator completely ignores '.' characters as end-of-sentence when
+ // the next word is lower case, we need to take care of that ourselves. The drawback:
+ // a mid-sentence abbreviation like "e.g." will now be identified as end-of-sentence. :-(
+ // Well, at least the other product does it in the same way...
+ sal_Int32 nFullStopPos = inStr.lastIndexOf( (sal_Unicode)'.', nPos );
+ nPos = nSentenceStart;
+ if (nFullStopPos > 0 && nFullStopPos > nSentenceStart)
+ {
+ Boundary aBd2 = brk.nextWord( inStr, nFullStopPos, aLocale, WordType::DICTIONARY_WORD );
+ nSentenceStart = aBd2.startPos;
+ nPos = nFullStopPos;
+ }
+
+ if (nSentenceStart < nOldSentenceStart || nOldSentenceStart == -1)
+ {
+ // the sentence start might be a quotation mark or some kind of bracket, thus
+ // we need the first dictionary word starting or following this position
+ // Boundary aBd1 = brk.nextWord( inStr, nSentenceStart, aLocale, WordType::DICTIONARY_WORD );
+ Boundary aBd2 = brk.getWordBoundary( inStr, nSentenceStart, aLocale, WordType::DICTIONARY_WORD, true );
+ // OUString aWord1( inStr.copy( aBd1.startPos, aBd1.endPos - aBd1.startPos + 1 ) );
+ OUString aWord2( inStr.copy( aBd2.startPos, aBd2.endPos - aBd2.startPos + 1 ) );
+ }
+ else
+ break; // prevent endless loop
+
+ // continue with previous sentence
+ if (nPos != -1)
+ --nPos;
+ }
+ }
+ return aRes;
+}
+#endif
} } } }
diff --git a/i18npool/source/transliteration/transliteration_sentencecase.cxx b/i18npool/source/transliteration/transliteration_sentencecase.cxx
deleted file mode 100644
index 35f9a69abb4d..000000000000
--- a/i18npool/source/transliteration/transliteration_sentencecase.cxx
+++ /dev/null
@@ -1,190 +0,0 @@
-/*************************************************************************
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * Copyright 2008 by Sun Microsystems, Inc.
- *
- * OpenOffice.org - a multi-platform office productivity suite
- *
- * $RCSfile: halfwidthToFullwidth.cxx,v $
- * $Revision: 1.12 $
- *
- * This file is part of OpenOffice.org.
- *
- * OpenOffice.org is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License version 3
- * only, as published by the Free Software Foundation.
- *
- * OpenOffice.org is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Lesser General Public License version 3 for more details
- * (a copy is included in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU Lesser General Public License
- * version 3 along with OpenOffice.org. If not, see
- * <http://www.openoffice.org/license.html>
- * for a copy of the LGPLv3 License.
- *
- ************************************************************************/
-
-// MARKER(update_precomp.py): autogen include statement, do not remove
-#include "precompiled_i18npool.hxx"
-
-// prevent internal compiler error with MSVC6SP3
-#include <utility>
-
-#include <i18nutil/widthfolding.hxx>
-#include <i18nutil/casefolding.hxx>
-#define TRANSLITERATION_sentencecase
-#include <transliteration_sentencecase.hxx>
-#include <rtl/ustring.hxx>
-#include <i18nutil/x_rtl_ustring.h>
-#include <i18nutil/unicode.hxx>
-#include <rtl/ustrbuf.hxx>
-#include <rtl/strbuf.hxx>
-
-using namespace com::sun::star::uno;
-using namespace com::sun::star::lang;
-using namespace rtl;
-
-namespace com { namespace sun { namespace star { namespace i18n {
-
-
-Transliteration_sentencecase::Transliteration_sentencecase()
-{
- transliterationName = "sentenceCase";
- implementationName = "com.sun.star.i18n.Transliteration.SENTENCE_CASE";
-}
-
-OUString SAL_CALL
-Transliteration_sentencecase::transliterate( const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, Sequence< sal_Int32 >& offset )
- throw(RuntimeException)
-{
- // inspired from Transliteration_body::transliterate
- sal_Int32 nOffCount = 0, i;
- bool bPoint = true;
- if (useOffset)
- {
- for( i = 0; i < nCount; ++i ) {
- sal_Unicode c = inStr.getStr()[ i + startPos ];
- if( sal_Unicode('.') == c || sal_Unicode('!') == c || sal_Unicode('?') == c ) {
- bPoint = true;
- nOffCount++;
- }
- else if( unicode::isAlpha( c ) || unicode::isDigit( c ) )
- {
- const Mapping* map = 0;
- if( bPoint && unicode::isLower( c ))
- {
- map = &casefolding::getValue(&c, 0, 1, aLocale, MappingTypeLowerToUpper);
- }
- bPoint = false;
-
- if(map == 0)
- {
- nOffCount++;
- }
- else
- {
- nOffCount += map->nmap;
- }
- }
- else
- {
- nOffCount++;
- }
- }
- }
-
- bPoint = true;
- rtl::OUStringBuffer result;
-
- if (useOffset)
- {
- result.ensureCapacity(nOffCount);
- if ( nOffCount != offset.getLength() )
- offset.realloc( nOffCount );
- }
-
-
- sal_Int32 j = 0;
- sal_Int32 * pArr = offset.getArray();
- for( i = 0; i < nCount; ++i ) {
- sal_Unicode c = inStr.getStr()[ i + startPos ];
- if( sal_Unicode('.') == c || sal_Unicode('!') == c || sal_Unicode('?') == c ) {
- bPoint = true;
- result.append(c);
- pArr[j++] = i + startPos;
- }
- else if( unicode::isAlpha( c ) || unicode::isDigit( c ) )
- {
- const Mapping* map = 0;
- if( bPoint && unicode::isLower( c ))
- {
- map = &casefolding::getValue(&c, 0, 1, aLocale, MappingTypeLowerToUpper);
- }
- bPoint = false;
-
- if(map == 0)
- {
- result.append( c );
- pArr[j++] = i + startPos;
- }
- else
- {
- for (sal_Int32 k = 0; k < map->nmap; k++)
- {
- result.append( map->map[k] );
- pArr[j++] = i + startPos;
- }
- }
- }
- else
- {
- result.append( c );
- pArr[j++] = i + startPos;
- }
- }
- return result.makeStringAndClear();
-}
-
-sal_Unicode SAL_CALL
-Transliteration_sentencecase::transliterateChar2Char( sal_Unicode inChar)
- throw(RuntimeException, MultipleCharsOutputException)
-{
- return inChar;
-}
-
-sal_Int16 SAL_CALL Transliteration_sentencecase::getType() throw(RuntimeException)
-{
- return TransliterationType::IGNORE;
-}
-
-OUString SAL_CALL
-Transliteration_sentencecase::folding( const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount,
- Sequence< sal_Int32 >& offset) throw(RuntimeException)
-{
- return this->transliterate(inStr, startPos, nCount, offset);
-}
-
-sal_Bool SAL_CALL Transliteration_sentencecase::equals(
- const OUString& /*str1*/, sal_Int32 /*pos1*/, sal_Int32 /*nCount1*/, sal_Int32& /*nMatch1*/,
- const OUString& /*str2*/, sal_Int32 /*pos2*/, sal_Int32 /*nCount2*/, sal_Int32& /*nMatch2*/)
- throw(RuntimeException)
-{
- throw RuntimeException();
-}
-
-Sequence< OUString > SAL_CALL
-Transliteration_sentencecase::transliterateRange( const OUString& str1, const OUString& str2 )
- throw( RuntimeException)
-{
- Sequence< OUString > ostr(2);
- ostr[0] = str1;
- ostr[1] = str2;
- return ostr;
-}
-
-} } } }
-