summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEike Rathke <erack@redhat.com>2013-09-02 18:01:13 +0200
committerEike Rathke <erack@redhat.com>2013-09-02 18:20:51 +0200
commit49656398d43fa03f8adb70b9be417f2fd65dd9ea (patch)
treeab6f355984b0cf8d81ec566debdc76fd6e73d877
parent8614ff2993b2fe49b27311546290d957f036487c (diff)
simpleExtract() with variants
rsc needs to resolve all known defined languages without access to liblangtag because that would need /usr/local/share/liblangtag/language-subtag-registry.xml so we'd end up with a bootstrap problem (or would have to pass the database path). Change-Id: I6b966d45080da26cb89169cdb40cd8a58c04a276
-rw-r--r--i18nlangtag/source/languagetag/languagetag.cxx143
-rw-r--r--include/i18nlangtag/languagetag.hxx25
2 files changed, 131 insertions, 37 deletions
diff --git a/i18nlangtag/source/languagetag/languagetag.cxx b/i18nlangtag/source/languagetag/languagetag.cxx
index b5c24eeb0c60..22f82fe67dfb 100644
--- a/i18nlangtag/source/languagetag/languagetag.cxx
+++ b/i18nlangtag/source/languagetag/languagetag.cxx
@@ -155,6 +155,7 @@ LanguageTag::LanguageTag( const OUString & rBcp47LanguageTag, bool bCanonicalize
mbCachedLanguage( false),
mbCachedScript( false),
mbCachedCountry( false),
+ mbCachedVariants( false),
mbIsFallback( false)
{
if (bCanonicalize)
@@ -178,6 +179,7 @@ LanguageTag::LanguageTag( const com::sun::star::lang::Locale & rLocale )
mbCachedLanguage( false),
mbCachedScript( false),
mbCachedCountry( false),
+ mbCachedVariants( false),
mbIsFallback( false)
{
}
@@ -198,6 +200,7 @@ LanguageTag::LanguageTag( LanguageType nLanguage )
mbCachedLanguage( false),
mbCachedScript( false),
mbCachedCountry( false),
+ mbCachedVariants( false),
mbIsFallback( false)
{
}
@@ -220,6 +223,7 @@ LanguageTag::LanguageTag( const OUString& rBcp47, const OUString& rLanguage,
mbCachedLanguage( false),
mbCachedScript( false),
mbCachedCountry( false),
+ mbCachedVariants( false),
mbIsFallback( false)
{
if (!mbSystemLocale && !mbInitializedBcp47)
@@ -257,6 +261,7 @@ LanguageTag::LanguageTag( const rtl_Locale & rLocale )
mbCachedLanguage( false),
mbCachedScript( false),
mbCachedCountry( false),
+ mbCachedVariants( false),
mbIsFallback( false)
{
convertFromRtlLocale();
@@ -284,6 +289,7 @@ LanguageTag::LanguageTag( const LanguageTag & rLanguageTag )
mbCachedLanguage( rLanguageTag.mbCachedLanguage),
mbCachedScript( rLanguageTag.mbCachedScript),
mbCachedCountry( rLanguageTag.mbCachedCountry),
+ mbCachedVariants( rLanguageTag.mbCachedVariants),
mbIsFallback( rLanguageTag.mbIsFallback)
{
if (mpImplLangtag)
@@ -313,6 +319,7 @@ LanguageTag& LanguageTag::operator=( const LanguageTag & rLanguageTag )
mbCachedLanguage = rLanguageTag.mbCachedLanguage;
mbCachedScript = rLanguageTag.mbCachedScript;
mbCachedCountry = rLanguageTag.mbCachedCountry;
+ mbCachedVariants = rLanguageTag.mbCachedVariants;
mbIsFallback = rLanguageTag.mbIsFallback;
if (mpImplLangtag)
theDataRef::get().incRef();
@@ -360,6 +367,7 @@ void LanguageTag::resetVars()
mbCachedLanguage = false;
mbCachedScript = false;
mbCachedCountry = false;
+ mbCachedVariants = false;
mbIsFallback = false;
}
@@ -441,20 +449,22 @@ bool LanguageTag::canonicalize()
// and want to determine if parsing it would be possible
// without using liblangtag just to see if it is a simple known
// locale.
- OUString aLanguage, aScript, aCountry;
- Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry);
+ OUString aLanguage, aScript, aCountry, aVariants;
+ Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry, aVariants);
if (eExt != EXTRACTED_NONE)
{
- if (eExt == EXTRACTED_LSC)
+ if (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV)
{
// Rebuild bcp47 with proper casing of tags.
OUStringBuffer aBuf( aLanguage.getLength() + 1 + aScript.getLength() +
- 1 + aCountry.getLength());
+ 1 + aCountry.getLength() + 1 + aVariants.getLength());
aBuf.append( aLanguage);
if (!aScript.isEmpty())
aBuf.append("-" + aScript);
if (!aCountry.isEmpty())
aBuf.append("-" + aCountry);
+ if (!aVariants.isEmpty())
+ aBuf.append("-" + aVariants);
OUString aStr( aBuf.makeStringAndClear());
if (maBcp47 != aStr)
@@ -749,7 +759,7 @@ OUString LanguageTag::getLanguageFromLangtag()
}
else
{
- if (mbCachedLanguage || cacheSimpleLSC())
+ if (mbCachedLanguage || cacheSimpleLSCV())
aLanguage = maCachedLanguage;
}
return aLanguage;
@@ -775,7 +785,7 @@ OUString LanguageTag::getScriptFromLangtag()
}
else
{
- if (mbCachedScript || cacheSimpleLSC())
+ if (mbCachedScript || cacheSimpleLSCV())
aScript = maCachedScript;
}
return aScript;
@@ -808,7 +818,7 @@ OUString LanguageTag::getRegionFromLangtag()
}
else
{
- if (mbCachedCountry || cacheSimpleLSC())
+ if (mbCachedCountry || cacheSimpleLSCV())
aRegion = maCachedCountry;
}
return aRegion;
@@ -840,6 +850,11 @@ OUString LanguageTag::getVariantsFromLangtag()
}
}
}
+ else
+ {
+ if (mbCachedVariants || cacheSimpleLSCV())
+ aVariants = maCachedVariants;
+ }
return aVariants;
}
@@ -1020,7 +1035,12 @@ OUString LanguageTag::getRegion() const
OUString LanguageTag::getVariants() const
{
- return const_cast<LanguageTag*>(this)->getVariantsFromLangtag();
+ if (!mbCachedVariants)
+ {
+ maCachedVariants = const_cast<LanguageTag*>(this)->getVariantsFromLangtag();
+ mbCachedVariants = true;
+ }
+ return maCachedVariants;
}
@@ -1055,16 +1075,18 @@ bool LanguageTag::hasScript() const
}
-bool LanguageTag::cacheSimpleLSC()
+bool LanguageTag::cacheSimpleLSCV()
{
- OUString aLanguage, aScript, aCountry;
- bool bRet = (simpleExtract( maBcp47, aLanguage, aScript, aCountry) == EXTRACTED_LSC);
+ OUString aLanguage, aScript, aCountry, aVariants;
+ Extraction eExt = simpleExtract( maBcp47, aLanguage, aScript, aCountry, aVariants);
+ bool bRet = (eExt == EXTRACTED_LSC || eExt == EXTRACTED_LV);
if (bRet)
{
maCachedLanguage = aLanguage;
maCachedScript = aScript;
maCachedCountry = aCountry;
- mbCachedLanguage = mbCachedScript = mbCachedCountry = true;
+ maCachedVariants = aVariants;
+ mbCachedLanguage = mbCachedScript = mbCachedCountry = mbCachedVariants = true;
}
return bRet;
}
@@ -1275,11 +1297,14 @@ bool LanguageTag::operator!=( const LanguageTag & rLanguageTag ) const
// static
LanguageTag::Extraction LanguageTag::simpleExtract( const OUString& rBcp47,
- OUString& rLanguage, OUString& rScript, OUString& rCountry )
+ OUString& rLanguage, OUString& rScript, OUString& rCountry, OUString& rVariants )
{
Extraction eRet = EXTRACTED_NONE;
const sal_Int32 nLen = rBcp47.getLength();
const sal_Int32 nHyph1 = rBcp47.indexOf( '-');
+ const sal_Int32 nHyph2 = (nHyph1 < 0 ? -1 : rBcp47.indexOf( '-', nHyph1 + 1));
+ const sal_Int32 nHyph3 = (nHyph2 < 0 ? -1 : rBcp47.indexOf( '-', nHyph2 + 1));
+ const sal_Int32 nHyph4 = (nHyph3 < 0 ? -1 : rBcp47.indexOf( '-', nHyph3 + 1));
if (nLen == 1 && rBcp47[0] == '*') // * the dreaded jolly joker
{
// It's f*d up but we need to recognize this.
@@ -1290,34 +1315,96 @@ LanguageTag::Extraction LanguageTag::simpleExtract( const OUString& rBcp47,
// x-... privateuse tags MUST be known to us by definition.
eRet = EXTRACTED_X;
}
- else if ((nLen == 2 || nLen == 3) && nHyph1 < 0) // ll or lll
+ else if (nLen == 2 || nLen == 3) // ll or lll
{
- rLanguage = rBcp47.toAsciiLowerCase();
- rScript = rCountry = OUString();
- eRet = EXTRACTED_LSC;
+ if (nHyph1 < 0)
+ {
+ rLanguage = rBcp47.toAsciiLowerCase();
+ rScript = rCountry = rVariants = OUString();
+ eRet = EXTRACTED_LSC;
+ }
}
- else if ( (nLen == 5 && nHyph1 == 2) // ll-CC
- || (nLen == 6 && nHyph1 == 3)) // lll-CC
+ else if ( (nHyph1 == 2 && nLen == 5) // ll-CC
+ || (nHyph1 == 3 && nLen == 6)) // lll-CC
{
- rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
- rCountry = rBcp47.copy( nHyph1 + 1, 2).toAsciiUpperCase();
- rScript = OUString();
- eRet = EXTRACTED_LSC;
+ if (nHyph2 < 0)
+ {
+ rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
+ rCountry = rBcp47.copy( nHyph1 + 1, 2).toAsciiUpperCase();
+ rScript = rVariants = OUString();
+ eRet = EXTRACTED_LSC;
+ }
+ }
+ else if ( (nHyph1 == 2 && nLen == 7) // ll-Ssss
+ || (nHyph1 == 3 && nLen == 8)) // lll-Ssss
+ {
+ /* TODO: also accept a (DIGIT 3*ALNUM) vvvv variant instead of Ssss */
+ if (nHyph2 < 0)
+ {
+ rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
+ rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase();
+ rCountry = rVariants = OUString();
+ eRet = EXTRACTED_LSC;
+ }
}
- else if ( (nHyph1 == 2 && nLen == 10) // ll-Ssss-CC check
- || (nHyph1 == 3 && nLen == 11)) // lll-Ssss-CC check
+ else if ( (nHyph1 == 2 && nHyph2 == 7 && nLen == 10) // ll-Ssss-CC
+ || (nHyph1 == 3 && nHyph2 == 8 && nLen == 11)) // lll-Ssss-CC
{
- const sal_Int32 nHyph2 = rBcp47.indexOf( '-', nHyph1 + 1);
- if (nHyph2 == nHyph1 + 5)
+ if (nHyph3 < 0)
{
rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase();
rCountry = rBcp47.copy( nHyph2 + 1, 2).toAsciiUpperCase();
+ rVariants = OUString();
eRet = EXTRACTED_LSC;
}
}
+ else if ( (nHyph1 == 2 && nHyph2 == 7 && nHyph3 == 10 && nLen >= 15) // ll-Ssss-CC-vvvv[vvvv][-...]
+ || (nHyph1 == 3 && nHyph2 == 8 && nHyph3 == 11 && nLen >= 16)) // lll-Ssss-CC-vvvv[vvvv][-...]
+ {
+ if (nHyph4 < 0 || (nHyph4 - nHyph3 > 4 && nHyph4 - nHyph3 <= 9))
+ {
+ rVariants = rBcp47.copy( nHyph3 + 1);
+ if (nHyph4 < 0 && (rVariants.getLength() < 4 || 8 < rVariants.getLength()))
+ {
+ rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
+ rScript = rBcp47.copy( nHyph1 + 1, 1).toAsciiUpperCase() + rBcp47.copy( nHyph1 + 2, 3).toAsciiLowerCase();
+ rCountry = rBcp47.copy( nHyph2 + 1, 2).toAsciiUpperCase();
+ eRet = EXTRACTED_LV;
+ }
+ }
+ }
+ else if ( (nHyph1 == 2 && nHyph2 == 5 && nLen >= 10) // ll-CC-vvvv[vvvv][-...]
+ || (nHyph1 == 3 && nHyph2 == 6 && nLen >= 11)) // lll-CC-vvvv[vvvv][-...]
+ {
+ if (nHyph3 < 0 || (nHyph3 - nHyph2 > 4 && nHyph3 - nHyph2 <= 9))
+ {
+ rVariants = rBcp47.copy( nHyph2 + 1);
+ if (nHyph3 < 0 && (rVariants.getLength() < 4 || 8 < rVariants.getLength()))
+ {
+ rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
+ rCountry = rBcp47.copy( nHyph1 + 1, 2).toAsciiUpperCase();
+ rScript = OUString();
+ eRet = EXTRACTED_LV;
+ }
+ }
+ }
+ else if ( (nHyph1 == 2 && nLen >= 8) // ll-vvvvv[vvv][-...]
+ || (nHyph1 == 3 && nLen >= 9)) // lll-vvvvv[vvv][-...]
+ {
+ if (nHyph2 < 0 || (nHyph2 - nHyph1 > 5 && nHyph2 - nHyph1 <= 9))
+ {
+ rVariants = rBcp47.copy( nHyph1 + 1);
+ if (nHyph2 < 0 && (rVariants.getLength() < 5 || 8 < rVariants.getLength()))
+ {
+ rLanguage = rBcp47.copy( 0, nHyph1).toAsciiLowerCase();
+ rScript = rCountry = OUString();
+ eRet = EXTRACTED_LV;
+ }
+ }
+ }
if (eRet == EXTRACTED_NONE)
- rLanguage = rScript = rCountry = OUString();
+ rLanguage = rScript = rCountry = rVariants = OUString();
return eRet;
}
diff --git a/include/i18nlangtag/languagetag.hxx b/include/i18nlangtag/languagetag.hxx
index 403221fe754f..d87c5e3f7821 100644
--- a/include/i18nlangtag/languagetag.hxx
+++ b/include/i18nlangtag/languagetag.hxx
@@ -449,6 +449,7 @@ private:
mutable OUString maCachedLanguage; ///< cache getLanguage()
mutable OUString maCachedScript; ///< cache getScript()
mutable OUString maCachedCountry; ///< cache getCountry()
+ mutable OUString maCachedVariants; ///< cache getVariants()
mutable void* mpImplLangtag; ///< actually lt_tag_t pointer, encapsulated
mutable LanguageType mnLangID;
mutable Decision meIsValid;
@@ -462,6 +463,7 @@ private:
mutable bool mbCachedLanguage : 1;
mutable bool mbCachedScript : 1;
mutable bool mbCachedCountry : 1;
+ mutable bool mbCachedVariants : 1;
bool mbIsFallback : 1;
void convertLocaleToBcp47();
@@ -489,12 +491,12 @@ private:
void resetVars();
- /** Obtain Language, Script and Country via simpleExtract() and assign them
- to the cached variables if successful.
+ /** Obtain Language, Script, Country and Variants via simpleExtract() and
+ assign them to the cached variables if successful.
@return return of simpleExtract()
*/
- bool cacheSimpleLSC();
+ bool cacheSimpleLSCV();
static bool isIsoLanguage( const OUString& rLanguage );
static bool isIsoScript( const OUString& rScript );
@@ -504,23 +506,28 @@ private:
{
EXTRACTED_NONE,
EXTRACTED_LSC,
+ EXTRACTED_LV,
EXTRACTED_X,
EXTRACTED_X_JOKER
};
- /** Of a simple language tag of the form lll[-Ssss][-CC] (i.e. one that
- would fulfill the isIsoODF() condition) extract the portions.
+ /** Of a language tag of the form lll[-Ssss][-CC][-vvvvvvvv] extract the
+ portions.
Does not check case or content!
- @return EXTRACTED_LSC if simple tag was detected, EXTRACTED_X if x-...
- privateuse tag was detected, EXTRACTED_X_JOKER if "*" joker was
- detected, else EXTRACTED_NONE.
+ @return EXTRACTED_LSC if simple tag was detected (i.e. one that
+ would fulfill the isIsoODF() condition),
+ EXTRACTED_LV if a tag with variant was detected,
+ EXTRACTED_X if x-... privateuse tag was detected,
+ EXTRACTED_X_JOKER if "*" joker was detected,
+ EXTRACTED_NONE else.
*/
static Extraction simpleExtract( const OUString& rBcp47,
OUString& rLanguage,
OUString& rScript,
- OUString& rCountry );
+ OUString& rCountry,
+ OUString& rVariants );
};