From bfeeed3e986f3ce5090d5a03c50546cbda3b99e6 Mon Sep 17 00:00:00 2001 From: Caolán McNamara Date: Mon, 26 Nov 2012 16:22:49 +0000 Subject: implement a new iscii (devangari) <-> unicode converter this time with support for the multi-byte encodings possible in ISCII Change-Id: I1dc09e8836676ab614b531e8dc10f91a90b7c4fd --- sal/Library_sal_textenc.mk | 1 + sal/qa/rtl/textenc/rtl_tencinfo.cxx | 2 +- sal/qa/rtl/textenc/rtl_textcvt.cxx | 81 +++--- sal/textenc/convertisciidevangari.cxx | 496 ++++++++++++++++++++++++++++++++++ sal/textenc/convertisciidevangari.hxx | 34 +++ sal/textenc/convertisciidevangari.tab | 35 +++ sal/textenc/tables.cxx | 5 +- 7 files changed, 611 insertions(+), 43 deletions(-) create mode 100644 sal/textenc/convertisciidevangari.cxx create mode 100644 sal/textenc/convertisciidevangari.hxx create mode 100644 sal/textenc/convertisciidevangari.tab (limited to 'sal') diff --git a/sal/Library_sal_textenc.mk b/sal/Library_sal_textenc.mk index 5d5d40e0bb1b..972f0ee5d405 100644 --- a/sal/Library_sal_textenc.mk +++ b/sal/Library_sal_textenc.mk @@ -41,6 +41,7 @@ $(eval $(call gb_Library_add_exception_objects,sal_textenc,\ sal/textenc/convertbig5hkscs \ sal/textenc/converteuctw \ sal/textenc/convertgb18030 \ + sal/textenc/convertisciidevangari \ sal/textenc/convertiso2022cn \ sal/textenc/convertiso2022jp \ sal/textenc/convertiso2022kr \ diff --git a/sal/qa/rtl/textenc/rtl_tencinfo.cxx b/sal/qa/rtl/textenc/rtl_tencinfo.cxx index 7d3ade94ae56..70e2f2e13564 100644 --- a/sal/qa/rtl/textenc/rtl_tencinfo.cxx +++ b/sal/qa/rtl/textenc/rtl_tencinfo.cxx @@ -509,8 +509,8 @@ namespace CPPUNIT_TEST( MimeCharsetFromTextEncoding_BIG5_HKSCS ); CPPUNIT_TEST( MimeCharsetFromTextEncoding_TIS_620 ); CPPUNIT_TEST( MimeCharsetFromTextEncoding_KOI8_U ); -#if 0 CPPUNIT_TEST( MimeCharsetFromTextEncoding_ISCII_DEVANAGARI ); +#if 0 CPPUNIT_TEST( MimeCharsetFromTextEncoding_JAVA_UTF8 ); #endif diff --git a/sal/qa/rtl/textenc/rtl_textcvt.cxx b/sal/qa/rtl/textenc/rtl_textcvt.cxx index 
3c8c33e156bb..ba7074f26ec3 100644 --- a/sal/qa/rtl/textenc/rtl_textcvt.cxx +++ b/sal/qa/rtl/textenc/rtl_textcvt.cxx @@ -134,12 +134,13 @@ void testSingleByteCharSet(SingleByteCharSet const & rSet) { | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR | RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR), &nInfo, &nConverted); + + sal_uInt32 nExpectedInfo = (RTL_TEXTTOUNICODE_INFO_ERROR | RTL_TEXTTOUNICODE_INFO_UNDEFINED); + CPPUNIT_ASSERT_MESSAGE( "failure #9", (nSize == 0 - && (nInfo - == (RTL_TEXTTOUNICODE_INFO_ERROR - | RTL_TEXTTOUNICODE_INFO_UNDEFINED)) + && (nInfo == nExpectedInfo) && nConverted == 0)); rtl_destroyTextToUnicodeContext(aConverter, aContext); rtl_destroyTextToUnicodeConverter(aConverter); @@ -1098,41 +1099,6 @@ void Test::testSingleByte() { 0x0425,0x0418,0x0419,0x041A,0x041B,0x041C,0x041D,0x041E, 0x041F,0x042F,0x0420,0x0421,0x0422,0x0423,0x0416,0x0412, 0x042C,0x042B,0x0417,0x0428,0x042D,0x0429,0x0427,0x042A } }, -#if 0 - { RTL_TEXTENCODING_ISCII_DEVANAGARI, - { 0x0000,0x0001,0x0002,0x0003,0x0004,0x0005,0x0006,0x0007, - 0x0008,0x0009,0x000A,0x000B,0x000C,0x000D,0x000E,0x000F, - 0x0010,0x0011,0x0012,0x0013,0x0014,0x0015,0x0016,0x0017, - 0x0018,0x0019,0x001A,0x001B,0x001C,0x001D,0x001E,0x001F, - 0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0027, - 0x0028,0x0029,0x002A,0x002B,0x002C,0x002D,0x002E,0x002F, - 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037, - 0x0038,0x0039,0x003A,0x003B,0x003C,0x003D,0x003E,0x003F, - 0x0040,0x0041,0x0042,0x0043,0x0044,0x0045,0x0046,0x0047, - 0x0048,0x0049,0x004A,0x004B,0x004C,0x004D,0x004E,0x004F, - 0x0050,0x0051,0x0052,0x0053,0x0054,0x0055,0x0056,0x0057, - 0x0058,0x0059,0x005A,0x005B,0x005C,0x005D,0x005E,0x005F, - 0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067, - 0x0068,0x0069,0x006A,0x006B,0x006C,0x006D,0x006E,0x006F, - 0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077, - 0x0078,0x0079,0x007A,0x007B,0x007C,0x007D,0x007E,0x007F, - 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF, - 
0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF, - 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF, - 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF, - 0xFFFF,0x0901,0x0902,0x0903,0x0905,0x0906,0x0907,0x0908, - 0x0909,0x090A,0x090B,0x090E,0x090F,0x0910,0x090D,0x0912, - 0x0913,0x0914,0x0911,0x0915,0x0916,0x0917,0x0918,0x0919, - 0x091A,0x091B,0x091C,0x091D,0x091E,0x091F,0x0920,0x0921, - 0x0922,0x0923,0x0924,0x0925,0x0926,0x0927,0x0928,0x0929, - 0x092A,0x092B,0x092C,0x092D,0x092E,0x092F,0x095F,0x0930, - 0x0931,0x0932,0x0933,0x0934,0x0935,0x0936,0x0937,0x0938, - 0x0939,0xFFFF,0x093E,0x093F,0x0940,0x0941,0x0942,0x0943, - 0x0946,0x0947,0x0948,0x0945,0x094A,0x094B,0x094C,0x0949, - 0x094D,0x093C,0x0964,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF, - 0xFFFF,0x0966,0x0967,0x0968,0x0969,0x096A,0x096B,0x096C, - 0x096D,0x096E,0x096F,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF } }, -#endif { RTL_TEXTENCODING_ADOBE_STANDARD, { 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF, 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF, @@ -2498,6 +2464,43 @@ void Test::testComplex() { false, true, false, + RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR }, + { RTL_TEXTENCODING_ISCII_DEVANAGARI, + RTL_CONSTASCII_STRINGPARAM( + "\xD7\xE6\x20\xD4\xCF\xE8\xD6\x20" + "\xC8\xD8\xD1\xE1\x20\xB3\xCA\xDC" + "\xCF\xC4\xDA\xD7\x20\xD8\xDB\xA2" + "\xC4\xDE\x20\xB1\xCF\x20\xCC\xDD" + "\xD7\xD1\xCC\xDA\xC6\x20\xC4\xE5" + "\xC6\xE5\xA2\x20\xB3\xE1\x20\xB3" + "\xBD\xE8\xBD\xCF\xC8\xC6\x20\xB3" + "\xE5\x20\xC9\xBD\xB3\xDA\xCF\x20" + "\xB8\xDD\xB3\xE1\x20\xC3\xE1\x20" + "\xEA"), + { 0x0938, 0x094C, 0x0020, 0x0935, 0x0930, 0x094D, 0x0937, 0x0020, + 0x092A, 0x0939, 0x0932, 0x0947, 0x0020, 0x0915, 0x092C, 0x0940, + 0x0930, 0x0926, 0x093E, 0x0938, 0x0020, 0x0939, 0x093F, 0x0902, + 0x0926, 0x0942, 0x0020, 0x0914, 0x0930, 0x0020, 0x092E, 0x0941, + 0x0938, 0x0932, 0x092E, 0x093E, 0x0928, 0x0020, 0x0926, 0x094B, + 0x0928, 0x094B, 0x0902, 0x0020, 0x0915, 0x0947, 0x0020, 0x0915, + 0x091F, 0x094D, 0x091F, 
0x0930, 0x092A, 0x0928, 0x0020, 0x0915, + 0x094B, 0x0020, 0x092B, 0x091F, 0x0915, 0x093E, 0x0930, 0x0020, + 0x091A, 0x0941, 0x0915, 0x0947, 0x0020, 0x0925, 0x0947, 0x0020, + 0x0964 }, + 73, + false, + true, + true, + false, + RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR }, + { RTL_TEXTENCODING_ISCII_DEVANAGARI, + RTL_CONSTASCII_STRINGPARAM("\xE8\xE8\xE8\xE9\xA1\xE9\xEA\xE9"), + { 0x094D, 0x200C, 0x094D, 0x200D, 0x0950, 0x93D }, + 6, + false, + true, + true, + false, RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR } }; for (std::size_t i = 0; i < SAL_N_ELEMENTS(data); ++i) { @@ -2870,10 +2873,8 @@ void Test::testInfo() { { RTL_TEXTENCODING_IBM_861, RTL_TEXTENCODING_INFO_MIME, true }, { RTL_TEXTENCODING_IBM_863, RTL_TEXTENCODING_INFO_MIME, true }, { RTL_TEXTENCODING_IBM_865, RTL_TEXTENCODING_INFO_MIME, true }, -#if 0 { RTL_TEXTENCODING_ISCII_DEVANAGARI, RTL_TEXTENCODING_INFO_ASCII, true }, { RTL_TEXTENCODING_ISCII_DEVANAGARI, RTL_TEXTENCODING_INFO_MIME, false }, -#endif { RTL_TEXTENCODING_ADOBE_STANDARD, RTL_TEXTENCODING_INFO_ASCII, false }, { RTL_TEXTENCODING_ADOBE_STANDARD, RTL_TEXTENCODING_INFO_MIME, true }, { RTL_TEXTENCODING_ADOBE_STANDARD, RTL_TEXTENCODING_INFO_SYMBOL, false }, diff --git a/sal/textenc/convertisciidevangari.cxx b/sal/textenc/convertisciidevangari.cxx new file mode 100644 index 000000000000..8adeb4b37095 --- /dev/null +++ b/sal/textenc/convertisciidevangari.cxx @@ -0,0 +1,496 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */
+
+#include "converter.hxx"
+#include "convertisciidevangari.hxx"
+#include "convertsinglebytetobmpunicode.hxx"
+#include <rtl/textcvt.h>
+
+using namespace sal::detail::textenc;
+using namespace rtl::textenc;
+
+// Decoder state for ISCII Devanagari -> Unicode. It remembers the previous
+// plainly-mapped input byte so that a halant (0xE8) followed by a second
+// halant or by a nukta (0xE9) can be emitted as ZWNJ / ZWJ.
+struct IsciiDevanagariToUnicode
+{
+    sal_uInt8 m_cPrevChar;
+    IsciiDevanagariToUnicode()
+        : m_cPrevChar(0)
+    {
+    }
+    void reset()
+    {
+        m_cPrevChar = 0;
+    }
+    sal_Size convert(char const* pSrcBuf, sal_Size nSrcBytes,
+        sal_Unicode* pDestBuf, sal_Size nDestChars, sal_uInt32 nFlags,
+        sal_uInt32* pInfo, sal_Size* pSrcCvtBytes);
+};
+
+// Encoder state for Unicode -> ISCII Devanagari. It remembers the previous
+// table-mapped character (to recognise halant + ZWNJ/ZWJ) and a high
+// surrogate left pending at the end of the previous input buffer.
+struct UnicodeToIsciiDevanagari
+{
+    sal_Unicode m_cPrevChar;
+    sal_Unicode m_cHighSurrogate;
+    UnicodeToIsciiDevanagari()
+        : m_cPrevChar(0)
+        , m_cHighSurrogate(0)
+    {
+    }
+    void reset()
+    {
+        m_cPrevChar = 0;
+        m_cHighSurrogate = 0;
+    }
+    sal_Size convert(sal_Unicode const* pSrcBuf, sal_Size nSrcChars,
+        char* pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
+        sal_uInt32 * pInfo, sal_Size * pSrcCvtChars);
+};
+
+// Single-byte ISCII -> Unicode table, indexed by the input byte.
+// 0xFFFF marks a byte that has no mapping of its own.
+static const sal_Unicode IsciiDevanagariMap[256] =
+{
+    0x0000,0x0001,0x0002,0x0003,0x0004,0x0005,0x0006,0x0007,
+    0x0008,0x0009,0x000A,0x000B,0x000C,0x000D,0x000E,0x000F,
+    0x0010,0x0011,0x0012,0x0013,0x0014,0x0015,0x0016,0x0017,
+    0x0018,0x0019,0x001A,0x001B,0x001C,0x001D,0x001E,0x001F,
+    0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0027,
+    0x0028,0x0029,0x002A,0x002B,0x002C,0x002D,0x002E,0x002F,
+    0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,
+    0x0038,0x0039,0x003A,0x003B,0x003C,0x003D,0x003E,0x003F,
+    0x0040,0x0041,0x0042,0x0043,0x0044,0x0045,0x0046,0x0047,
+    0x0048,0x0049,0x004A,0x004B,0x004C,0x004D,0x004E,0x004F,
+    0x0050,0x0051,0x0052,0x0053,0x0054,0x0055,0x0056,0x0057,
+    0x0058,0x0059,0x005A,0x005B,0x005C,0x005D,0x005E,0x005F,
+    0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067,
+    0x0068,0x0069,0x006A,0x006B,0x006C,0x006D,0x006E,0x006F,
+    0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077,
+    0x0078,0x0079,0x007A,0x007B,0x007C,0x007D,0x007E,0x007F,
+    0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
+    0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
+    0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
+    0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
+    0xFFFF,0x0901,0x0902,0x0903,0x0905,0x0906,0x0907,0x0908,
+    0x0909,0x090A,0x090B,0x090E,0x090F,0x0910,0x090D,0x0912,
+    0x0913,0x0914,0x0911,0x0915,0x0916,0x0917,0x0918,0x0919,
+    0x091A,0x091B,0x091C,0x091D,0x091E,0x091F,0x0920,0x0921,
+    0x0922,0x0923,0x0924,0x0925,0x0926,0x0927,0x0928,0x0929,
+    0x092A,0x092B,0x092C,0x092D,0x092E,0x092F,0x095F,0x0930,
+    0x0931,0x0932,0x0933,0x0934,0x0935,0x0936,0x0937,0x0938,
+    0x0939,0xFFFF,0x093E,0x093F,0x0940,0x0941,0x0942,0x0943,
+    0x0946,0x0947,0x0948,0x0945,0x094A,0x094B,0x094C,0x0949,
+    0x094D,0x093C,0x0964,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
+    0xFFFF,0x0966,0x0967,0x0968,0x0969,0x096A,0x096B,0x096C,
+    0x096D,0x096E,0x096F,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF
+};
+
+// Converts ISCII Devanagari bytes to UTF-16 code units.
+//
+// pSrcBuf/nSrcBytes   input bytes
+// pDestBuf/nDestChars output buffer and its capacity in sal_Unicode units
+// nFlags              passed on to handleBadInputTextToUnicodeConversion
+// pInfo               if non-null, receives the accumulated RTL_TEXTTOUNICODE_INFO_* bits
+// pSrcCvtBytes        if non-null, receives the number of input bytes consumed
+//
+// Returns the number of sal_Unicode units written to pDestBuf.
+sal_Size IsciiDevanagariToUnicode::convert(
+    char const* pSrcBuf, sal_Size nSrcBytes,
+    sal_Unicode* pDestBuf, sal_Size nDestChars, sal_uInt32 nFlags,
+    sal_uInt32* pInfo, sal_Size* pSrcCvtBytes)
+{
+    sal_uInt32 nInfo = 0;
+    sal_Size nConverted = 0;
+    sal_Unicode* pDestBufPtr = pDestBuf;
+    sal_Unicode* pDestBufEnd = pDestBuf + nDestChars;
+
+    while (nConverted < nSrcBytes)
+    {
+        if (pDestBufPtr == pDestBufEnd)
+        {
+            nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
+            break;
+        }
+
+        sal_Unicode cChar = 0;
+        sal_uInt8 nIn = static_cast<sal_uInt8>(pSrcBuf[nConverted]);
+        // Peek at the following byte only if it lies inside the buffer.
+        // The previous guard (nConverted < nSrcBytes + 1) was always true
+        // here and read one byte past the end on the last input byte.
+        sal_uInt8 nNext = nConverted + 1 < nSrcBytes
+            ? static_cast<sal_uInt8>(pSrcBuf[nConverted + 1]) : 0;
+        bool bNormal = true;
+        bool bDouble = false;
+        //halant + halant E8 E8 -> halant + ZWNJ 094D 200C
+        //halant + nukta  E8 E9 -> halant + ZWJ  094D 200D
+        if (m_cPrevChar == 0xE8 && nIn == 0xE8)
+        {
+            cChar = 0x200C;
+            bNormal = false;
+        }
+        else if (m_cPrevChar == 0xE8 && nIn == 0xE9)
+        {
+            cChar = 0x200D;
+            bNormal = false;
+        }
+        else if (nNext == 0xE9)
+        {
+            // <byte> + nukta: some pairs form one Unicode character and
+            // consume both bytes; all other bytes fall back to the table.
+            bNormal = false;
+            bDouble = true;
+            switch(nIn)
+            {
+                case 0xA1:
+                    cChar = 0x0950;
+                    break;
+                case 0xA6:
+                    cChar = 0x090C;
+                    break;
+                case 0xA7:
+                    cChar = 0x0961;
+                    break;
+                case 0xAA:
+                    cChar = 0x0960;
+                    break;
+                case 0xB3:
+                    cChar = 0x0958;
+                    break;
+                case 0xB4:
+                    cChar = 0x0959;
+                    break;
+                case 0xB5:
+                    cChar = 0x095A;
+                    break;
+                case 0xBA:
+                    cChar = 0x095B;
+                    break;
+                case 0xBF:
+                    cChar = 0x095C;
+                    break;
+                case 0xC0:
+                    cChar = 0x095D;
+                    break;
+                case 0xC9:
+                    cChar = 0x095E;
+                    break;
+                case 0xDB:
+                    cChar = 0x0962;
+                    break;
+                case 0xDC:
+                    cChar = 0x0963;
+                    break;
+                case 0xDF:
+                    cChar = 0x0944;
+                    break;
+                case 0xEA:
+                    cChar = 0x093D;
+                    break;
+                default:
+                    bNormal = true;
+                    bDouble = false;
+                    break;
+            };
+        }
+
+        if (bNormal)
+            cChar = IsciiDevanagariMap[nIn];
+
+        bool bUndefined = cChar == 0xffff;
+
+        if (bUndefined)
+        {
+            BadInputConversionAction eAction = handleBadInputTextToUnicodeConversion(
+                bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
+                &nInfo);
+            if (eAction == BAD_INPUT_CONTINUE)
+            {
+                // Step over the offending byte. This is a while loop, so a
+                // bare "continue" would look at the same byte again forever.
+                // NOTE(review): assumes BAD_INPUT_CONTINUE means the helper
+                // has fully dealt with the byte - confirm in converter.hxx.
+                ++nConverted;
+                m_cPrevChar = 0;
+                continue;
+            }
+            if (eAction == BAD_INPUT_STOP)
+                break;
+            else if (eAction == BAD_INPUT_NO_OUTPUT)
+            {
+                nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
+                break;
+            }
+        }
+        ++nConverted;
+        if (bDouble)
+            ++nConverted;
+
+        *pDestBufPtr++ = cChar;
+        // Only a plainly mapped byte may start a halant sequence next round.
+        m_cPrevChar = bNormal ? nIn : 0;
+    }
+
+    if (pInfo)
+        *pInfo = nInfo;
+    if (pSrcCvtBytes)
+        *pSrcCvtBytes = nConverted;
+
+    return pDestBufPtr - pDestBuf;
+}
+
+// Unicode -> single ISCII byte ranges, sorted by first code point. Each entry
+// is { first code point, number of further code points, first byte }.
+BmpUnicodeToSingleByteRange const unicodeToISCIIEncoding[] =
+{
+    { 0x0000, 0x007F - 0x0000, 0x00 }, { 0x0901, 0x0903 - 0x0901, 0xA1 },
+    { 0x0905, 0x090B - 0x0905, 0xA4 }, { 0x090D, 0x090D - 0x090D, 0xAE },
+    { 0x090E, 0x0910 - 0x090E, 0xAB }, { 0x0911, 0x0911 - 0x0911, 0xB2 },
+    { 0x0912, 0x0914 - 0x0912, 0xAF }, { 0x0915, 0x092F - 0x0915, 0xB3 },
+    { 0x0930, 0x0939 - 0x0930, 0xCF }, { 0x093C, 0x093C - 0x093C, 0xE9 },
+    { 0x093E, 0x0943 - 0x093E, 0xDA }, { 0x0945, 0x0945 - 0x0945, 0xE3 },
+    { 0x0946, 0x0948 - 0x0946, 0xE0 }, { 0x0949, 0x0949 - 0x0949, 0xE7 },
+    { 0x094A, 0x094C - 0x094A, 0xE4 }, { 0x094D, 0x094D - 0x094D, 0xE8 },
+    { 0x095F, 0x095F - 0x095F, 0xCE }, { 0x0964, 0x0964 - 0x0964, 0xEA },
+    { 0x0966, 0x096F - 0x0966, 0xF1 }
+};
+
+// Converts UTF-16 code units to ISCII Devanagari bytes.
+//
+// pSrcBuf/nSrcChars   input UTF-16 code units
+// pDestBuf/nDestBytes output buffer and its capacity in bytes
+// nFlags              passed on to handleBadInputUnicodeToTextConversion;
+//                     RTL_UNICODETOTEXT_FLAGS_FLUSH is also tested here
+// pInfo               if non-null, receives the accumulated RTL_UNICODETOTEXT_INFO_* bits
+// pSrcCvtChars        if non-null, receives the number of input units consumed
+//
+// Returns the number of bytes written to pDestBuf.
+sal_Size UnicodeToIsciiDevanagari::convert(sal_Unicode const* pSrcBuf, sal_Size nSrcChars,
+    char* pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
+    sal_uInt32 * pInfo, sal_Size* pSrcCvtChars)
+{
+    size_t entries = SAL_N_ELEMENTS(unicodeToISCIIEncoding);
+    BmpUnicodeToSingleByteRange const * ranges = unicodeToISCIIEncoding;
+
+    sal_Unicode cHighSurrogate = m_cHighSurrogate;
+    sal_uInt32 nInfo = 0;
+    sal_Size nConverted = 0;
+    sal_Char* pDestBufPtr = pDestBuf;
+    sal_Char* pDestBufEnd = pDestBuf + nDestBytes;
+    for (; nConverted < nSrcChars; ++nConverted)
+    {
+        bool bUndefined = true;
+        sal_uInt32 c = *pSrcBuf++;
+        sal_Char cSpecialChar = 0;
+        if (cHighSurrogate == 0)
+        {
+            if (ImplIsHighSurrogate(c))
+            {
+                // Hold the high surrogate until its partner arrives.
+                cHighSurrogate = static_cast< sal_Unicode >(c);
+                continue;
+            }
+        }
+        else if (ImplIsLowSurrogate(c))
+        {
+            c = ImplCombineSurrogates(cHighSurrogate, c);
+        }
+        else
+        {
+            // Pending high surrogate not followed by a low one.
+            bUndefined = false;
+            goto bad_input;
+        }
+        if (ImplIsLowSurrogate(c) || ImplIsNoncharacter(c))
+        {
+            bUndefined = false;
+            goto bad_input;
+        }
+
+        //halant + ZWNJ 094D 200C -> halant + halant E8 E8
+        //halant + ZWJ  094D 200D -> halant + nukta  E8 E9
+        if (m_cPrevChar == 0x094D && c == 0x200C)
+            cSpecialChar = 0xE8;
+        else if (m_cPrevChar == 0x094D && c == 0x200D)
+            cSpecialChar = 0xE9;
+        if (cSpecialChar)
+        {
+            if (pDestBufEnd - pDestBufPtr < 1)
+            {
+                nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
+                break;
+            }
+            *pDestBufPtr++ = cSpecialChar;
+            m_cPrevChar = 0;
+            goto done;
+        }
+        // Characters written in ISCII as <byte> + nukta (0xE9).
+        switch (c)
+        {
+            case 0x0950:
+                cSpecialChar = 0xA1;
+                break;
+            case 0x090C:
+                cSpecialChar = 0xA6;
+                break;
+            case 0x0961:
+                cSpecialChar = 0xA7;
+                break;
+            case 0x0960:
+                cSpecialChar = 0xAA;
+                break;
+            case 0x0958:
+                cSpecialChar = 0xB3;
+                break;
+            case 0x0959:
+                cSpecialChar = 0xB4;
+                break;
+            case 0x095A:
+                cSpecialChar = 0xB5;
+                break;
+            case 0x095B:
+                cSpecialChar = 0xBA;
+                break;
+            case 0x095C:
+                cSpecialChar = 0xBF;
+                break;
+            case 0x095D:
+                cSpecialChar = 0xC0;
+                break;
+            case 0x095E:
+                cSpecialChar = 0xC9;
+                break;
+            case 0x0962:
+                cSpecialChar = 0xDB;
+                break;
+            case 0x0963:
+                cSpecialChar = 0xDC;
+                break;
+            case 0x0944:
+                cSpecialChar = 0xDF;
+                break;
+            case 0x093D:
+                cSpecialChar = 0xEA;
+                break;
+            default:
+                break;
+        }
+        if (cSpecialChar)
+        {
+            if (pDestBufEnd - pDestBufPtr < 2)
+            {
+                nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
+                break;
+            }
+            *pDestBufPtr++ = cSpecialChar;
+            *pDestBufPtr++ = 0xE9;
+            m_cPrevChar = 0;
+            goto done;
+        }
+
+        // Linearly searching through the ranges is probably fastest, assuming
+        // that most converted characters belong to the ASCII subset:
+        for (size_t i = 0; i < entries; ++i)
+        {
+            if (c < ranges[i].unicode)
+            {
+                break;
+            }
+            else if (c <= sal::static_int_cast< sal_uInt32 >(
+                         ranges[i].unicode + ranges[i].range))
+            {
+                if (pDestBufEnd - pDestBufPtr < 1)
+                {
+                    goto no_output;
+                }
+                *pDestBufPtr++ = static_cast< sal_Char >(
+                    ranges[i].byte + (c - ranges[i].unicode));
+                m_cPrevChar = c;
+                goto done;
+            }
+        }
+        goto bad_input;
+    done:
+        cHighSurrogate = 0;
+        continue;
+    bad_input:
+        switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
+                    bUndefined, c, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, 0,
+                    0, 0))
+        {
+        case sal::detail::textenc::BAD_INPUT_STOP:
+            cHighSurrogate = 0;
+            break;
+
+        case sal::detail::textenc::BAD_INPUT_CONTINUE:
+            cHighSurrogate = 0;
+            continue;
+
+        case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
+            goto no_output;
+        }
+        break;
+    no_output:
+        --pSrcBuf;
+        nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
+        break;
+    }
+
+    // Input ended on an unpaired high surrogate and nothing else went wrong.
+    if (cHighSurrogate != 0
+        && ((nInfo
+             & (RTL_UNICODETOTEXT_INFO_ERROR
+                | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
+            == 0))
+    {
+        if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
+        {
+            nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
+        }
+        else
+        {
+            switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
+                        false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, 0,
+                        0, 0))
+            {
+            case sal::detail::textenc::BAD_INPUT_STOP:
+            case sal::detail::textenc::BAD_INPUT_CONTINUE:
+                cHighSurrogate = 0;
+                break;
+
+            case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
+                nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
+                break;
+            }
+        }
+    }
+    m_cHighSurrogate = cHighSurrogate;
+    if (pInfo)
+        *pInfo = nInfo;
+    if (pSrcCvtChars)
+        *pSrcCvtChars = nConverted;
+
+    return pDestBufPtr - pDestBuf;
+}
+
+// Converter entry point: forwards to the IsciiDevanagariToUnicode object
+// behind pContext. The first (data) argument is unused.
+sal_Size ImplConvertIsciiDevanagariToUnicode(void const*,
+    void* pContext, char const* pSrcBuf, sal_Size nSrcBytes,
+    sal_Unicode* pDestBuf, sal_Size nDestChars, sal_uInt32 nFlags,
+    sal_uInt32* pInfo, sal_Size* pSrcCvtBytes)
+{
+    IsciiDevanagariToUnicode *pCtx =
+        static_cast<IsciiDevanagariToUnicode*>(pContext);
+    return pCtx->convert(pSrcBuf, nSrcBytes, pDestBuf, nDestChars, nFlags,
+        pInfo, pSrcCvtBytes);
+}
+
+// Converter entry point: forwards to the UnicodeToIsciiDevanagari object
+// behind pContext. The first (data) argument is unused.
+sal_Size ImplConvertUnicodeToIsciiDevanagari(void const*,
+    void * pContext, sal_Unicode const * pSrcBuf, sal_Size nSrcChars,
+    char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
+    sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
+{
+    UnicodeToIsciiDevanagari *pCtx =
+        static_cast<UnicodeToIsciiDevanagari*>(pContext);
+    return pCtx->convert(pSrcBuf, nSrcChars,
+        pDestBuf, nDestBytes, nFlags, pInfo, pSrcCvtChars);
+}
+
+// Allocates a fresh decoder state; release it with
+// ImplDestroyIsciiDevanagariToUnicodeContext.
+void *ImplCreateIsciiDevanagariToUnicodeContext()
+{
+    return new IsciiDevanagariToUnicode;
+}
+
+// Deletes a decoder state created by ImplCreateIsciiDevanagariToUnicodeContext.
+void ImplDestroyIsciiDevanagariToUnicodeContext(void * pContext)
+{
+    IsciiDevanagariToUnicode *pCtx =
+        static_cast<IsciiDevanagariToUnicode*>(pContext);
+    delete pCtx;
+}
+
+// Returns a decoder state to its initial condition.
+void ImplResetIsciiDevanagariToUnicodeContext(void * pContext)
+{
+    IsciiDevanagariToUnicode *pCtx =
+        static_cast<IsciiDevanagariToUnicode*>(pContext);
+    pCtx->reset();
+}
+
+// Allocates a fresh encoder state; release it with
+// ImplDestroyUnicodeToIsciiDevanagariContext.
+void *ImplCreateUnicodeToIsciiDevanagariContext()
+{
+    return new UnicodeToIsciiDevanagari;
+}
+
+// Returns an encoder state to its initial condition.
+void ImplResetUnicodeToIsciiDevanagariContext(void * pContext)
+{
+    UnicodeToIsciiDevanagari *pCtx =
+        static_cast<UnicodeToIsciiDevanagari*>(pContext);
+    pCtx->reset();
+}
+
+// Deletes an encoder state created by ImplCreateUnicodeToIsciiDevanagariContext.
+void ImplDestroyUnicodeToIsciiDevanagariContext(void * pContext)
+{
+    UnicodeToIsciiDevanagari *pCtx =
+        static_cast<UnicodeToIsciiDevanagari*>(pContext);
+    delete pCtx;
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sal/textenc/convertisciidevangari.hxx b/sal/textenc/convertisciidevangari.hxx
new file mode 100644
index 000000000000..b3f5f309836d
--- /dev/null
+++ b/sal/textenc/convertisciidevangari.hxx
@@ -0,0 +1,34 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#ifndef INCLUDED_SAL_TEXTENC_CONVERTISCIIDEVANGARI_HXX
+#define INCLUDED_SAL_TEXTENC_CONVERTISCIIDEVANGARI_HXX
+
+#include <sal/types.h>
+
+// Converts ISCII Devanagari bytes to UTF-16. pContext must point to a state
+// object obtained from ImplCreateIsciiDevanagariToUnicodeContext; pData is
+// not used. Returns the number of sal_Unicode units written; *pInfo and
+// *pSrcCvtBytes are filled in when the pointers are non-null.
+sal_Size ImplConvertIsciiDevanagariToUnicode(void const * pData,
+    void * pContext, char const * pSrcBuf, sal_Size nSrcBytes,
+    sal_Unicode * pDestBuf, sal_Size nDestChars, sal_uInt32 nFlags,
+    sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes);
+
+// Converts UTF-16 to ISCII Devanagari bytes. pContext must point to a state
+// object obtained from ImplCreateUnicodeToIsciiDevanagariContext; pData is
+// not used. Returns the number of bytes written; *pInfo and *pSrcCvtChars
+// are filled in when the pointers are non-null.
+sal_Size ImplConvertUnicodeToIsciiDevanagari(void const * pData,
+    void * pContext, sal_Unicode const * pSrcBuf, sal_Size nSrcChars,
+    char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
+    sal_uInt32 * pInfo, sal_Size * pSrcCvtChars);
+
+// Lifetime management of the ISCII -> Unicode conversion state.
+void *ImplCreateIsciiDevanagariToUnicodeContext();
+
+void ImplDestroyIsciiDevanagariToUnicodeContext(void * pContext);
+
+void ImplResetIsciiDevanagariToUnicodeContext(void * pContext);
+
+// Lifetime management of the Unicode -> ISCII conversion state.
+void *ImplCreateUnicodeToIsciiDevanagariContext();
+
+void ImplResetUnicodeToIsciiDevanagariContext(void * pContext);
+
+void ImplDestroyUnicodeToIsciiDevanagariContext(void * pContext);
+
+#endif
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sal/textenc/convertisciidevangari.tab b/sal/textenc/convertisciidevangari.tab
new file mode 100644
index 000000000000..6c4a24b99029
--- /dev/null
+++ b/sal/textenc/convertisciidevangari.tab
@@ -0,0 +1,35 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include "sal/config.h"
+
+#include "rtl/tencinfo.h"
+#include "rtl/textenc.h"
+
+#include "convertisciidevangari.hxx"
+/* Descriptor that registers the ISCII Devanagari converter: an inner table
+ * of the conversion and context-management functions declared in
+ * convertisciidevangari.hxx, followed by size, charset and flag fields.
+ * NOTE(review): the field meanings noted below are assumptions - confirm
+ * them against the ImplTextEncodingData declaration. */
+static ImplTextEncodingData const aImplIsciiDevanagariTextEncodingData
+    = { { NULL, /* presumably the per-converter data pointer; the convert functions take an unused void const* first argument */
+          &ImplConvertIsciiDevanagariToUnicode,
+          &ImplConvertUnicodeToIsciiDevanagari,
+          &ImplCreateIsciiDevanagariToUnicodeContext,
+          &ImplDestroyIsciiDevanagariToUnicodeContext,
+          &ImplResetIsciiDevanagariToUnicodeContext,
+          &ImplCreateUnicodeToIsciiDevanagariContext,
+          &ImplResetUnicodeToIsciiDevanagariContext,
+          &ImplDestroyUnicodeToIsciiDevanagariContext },
+      1, /* presumably minimum bytes per character - TODO confirm */
+      2, /* presumably maximum bytes per character (byte + nukta pairs) - TODO confirm */
+      2, /* presumably average bytes per character - TODO confirm */
+      1, /* presumably a charset identifier - TODO confirm */
+      NULL,
+      "x-iscii-de ", /* NOTE(review): literal ends in a space as shown here - confirm it is intended and not an extraction artefact */
+      RTL_TEXTENCODING_INFO_ASCII | RTL_TEXTENCODING_INFO_MULTIBYTE };
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sal/textenc/tables.cxx b/sal/textenc/tables.cxx
index 7400abb1111e..bdb391ea7681 100644
--- a/sal/textenc/tables.cxx
+++ b/sal/textenc/tables.cxx
@@ -100,13 +100,14 @@ static sal_uInt16 const aImplDoubleByteIdentifierTab[1] = { 0 };
 #include "tcvttcn6.tab"
 #include "tcvtuni1.tab"
 
+#include "convertadobe.tab"
 #include "convertbig5hkscs.tab"
 #include "converteuctw.tab"
 #include "convertgb18030.tab"
+#include "convertisciidevangari.tab"
 #include "convertiso2022cn.tab"
 #include "convertiso2022jp.tab"
 #include "convertiso2022kr.tab"
-#include "convertadobe.tab"
 
 extern "C" SAL_DLLPUBLIC_EXPORT ImplTextEncodingData const *
     sal_getFullTextEncodingData( rtl_TextEncoding nEncoding )
@@ -208,7 +209,7 @@ extern "C" SAL_DLLPUBLIC_EXPORT ImplTextEncodingData const *
             &aImplBig5HkscsTextEncodingData, /* BIG5_HKSCS */
             &aImplTis620TextEncodingData, /* TIS_620 */
             &aImplKoi8UTextEncodingData, /* KOI8_U */
-            NULL, /* TODO! ISCII_DEVANAGARI */
+            &aImplIsciiDevanagariTextEncodingData, /* ISCII_DEVANAGARI */
             NULL, /* JAVA_UTF8, see above */
             &adobeStandardEncodingData, /* ADOBE_STANDARD */
             &adobeSymbolEncodingData, /* ADOBE_SYMBOL */
-- 
cgit v1.2.3