diff options
Diffstat (limited to 'src/mm-charsets.c')
-rw-r--r-- | src/mm-charsets.c | 924 |
1 files changed, 492 insertions, 432 deletions
diff --git a/src/mm-charsets.c b/src/mm-charsets.c index ffdcad52..a48da368 100644 --- a/src/mm-charsets.c +++ b/src/mm-charsets.c @@ -11,6 +11,7 @@ * GNU General Public License for more details: * * Copyright (C) 2010 Red Hat, Inc. + * Copyright (C) 2020 Aleksander Morgado <aleksander@aleksander.es> */ #include <config.h> @@ -24,204 +25,79 @@ #include <libmm-glib.h> #include "mm-charsets.h" +#include "mm-log.h" -typedef struct { - const char *gsm_name; - const char *other_name; - const char *iconv_from_name; - const char *iconv_to_name; - MMModemCharset charset; -} CharsetEntry; - -static CharsetEntry charset_map[] = { - { "UTF-8", "UTF8", "UTF-8", "UTF-8//TRANSLIT", MM_MODEM_CHARSET_UTF8 }, - { "UCS2", NULL, "UCS-2BE", "UCS-2BE//TRANSLIT", MM_MODEM_CHARSET_UCS2 }, - { "IRA", "ASCII", "ASCII", "ASCII//TRANSLIT", MM_MODEM_CHARSET_IRA }, - { "GSM", NULL, NULL, NULL, MM_MODEM_CHARSET_GSM }, - { "8859-1", NULL, "ISO8859-1", "ISO8859-1//TRANSLIT", MM_MODEM_CHARSET_8859_1 }, - { "PCCP437", "CP437", "CP437", "CP437//TRANSLIT", MM_MODEM_CHARSET_PCCP437 }, - { "PCDN", "CP850", "CP850", "CP850//TRANSLIT", MM_MODEM_CHARSET_PCDN }, - { "HEX", NULL, NULL, NULL, MM_MODEM_CHARSET_HEX }, - { NULL, NULL, NULL, NULL, MM_MODEM_CHARSET_UNKNOWN } -}; - -const char * -mm_modem_charset_to_string (MMModemCharset charset) -{ - CharsetEntry *iter = &charset_map[0]; +/* Common fallback character when transliteration is enabled */ +static const gchar *translit_fallback = "?"; - g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL); +/******************************************************************************/ +/* Expected charset settings */ - while (iter->gsm_name) { - if (iter->charset == charset) - return iter->gsm_name; - iter++; - } - g_warn_if_reached (); - return NULL; -} +typedef struct { + MMModemCharset charset; + const gchar *gsm_name; + const gchar *other_name; + const gchar *iconv_name; +} CharsetSettings; + +static const CharsetSettings charset_settings[] = { + { MM_MODEM_CHARSET_UTF8, "UTF-8", "UTF8", "UTF-8" }, + { MM_MODEM_CHARSET_UCS2, "UCS2", NULL, "UCS-2BE" }, + { MM_MODEM_CHARSET_IRA, "IRA", "ASCII", "ASCII" }, + { MM_MODEM_CHARSET_GSM, "GSM", NULL, NULL }, + { MM_MODEM_CHARSET_8859_1, "8859-1", NULL, "ISO8859-1" }, + { MM_MODEM_CHARSET_PCCP437, "PCCP437", "CP437", "CP437" }, + { MM_MODEM_CHARSET_PCDN, "PCDN", "CP850", "CP850" }, + { MM_MODEM_CHARSET_UTF16, "UTF-16", "UTF16", "UTF-16BE" }, +}; MMModemCharset -mm_modem_charset_from_string (const char *string) +mm_modem_charset_from_string (const gchar *string) { - CharsetEntry *iter = &charset_map[0]; + guint i; g_return_val_if_fail (string != NULL, MM_MODEM_CHARSET_UNKNOWN); - while (iter->gsm_name) { - if (strcasestr (string, iter->gsm_name)) - return iter->charset; - if (iter->other_name && strcasestr (string, iter->other_name)) - return iter->charset; - iter++; + for (i = 0; i < G_N_ELEMENTS (charset_settings); i++) { + if (strcasestr (string, charset_settings[i].gsm_name)) + return charset_settings[i].charset; + if (charset_settings[i].other_name && strcasestr (string, charset_settings[i].other_name)) + return charset_settings[i].charset; } return MM_MODEM_CHARSET_UNKNOWN; } -static const char * -charset_iconv_to (MMModemCharset charset) +static const CharsetSettings * +lookup_charset_settings (MMModemCharset charset) { - CharsetEntry *iter = &charset_map[0]; + guint i; g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL); - - while (iter->gsm_name) { - if (iter->charset == charset) - return iter->iconv_to_name; - iter++; - } - g_warn_if_reached (); - return NULL; -} - -static const char * -charset_iconv_from (MMModemCharset charset) -{ - CharsetEntry *iter = &charset_map[0]; - - g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL); - - while (iter->gsm_name) { - if (iter->charset == charset) - return iter->iconv_from_name; - iter++; + for (i = 0; i < G_N_ELEMENTS (charset_settings); i++) { + if (charset_settings[i].charset == charset) + return &charset_settings[i]; } g_warn_if_reached (); return NULL; } -gboolean -mm_modem_charset_byte_array_append (GByteArray *array, - const char *utf8, - gboolean quoted, - MMModemCharset charset) -{ - const char *iconv_to; - char *converted; - GError *error = NULL; - gsize written = 0; - - g_return_val_if_fail (array != NULL, FALSE); - g_return_val_if_fail (utf8 != NULL, FALSE); - - iconv_to = charset_iconv_to (charset); - g_return_val_if_fail (iconv_to != NULL, FALSE); - - converted = g_convert (utf8, -1, iconv_to, "UTF-8", NULL, &written, &error); - if (!converted) { - if (error) { - g_warning ("%s: failed to convert '%s' to %s character set: (%d) %s", - __func__, utf8, iconv_to, - error->code, error->message); - g_error_free (error); - } - return FALSE; - } - - if (quoted) - g_byte_array_append (array, (const guint8 *) "\"", 1); - g_byte_array_append (array, (const guint8 *) converted, written); - if (quoted) - g_byte_array_append (array, (const guint8 *) "\"", 1); - - g_free (converted); - return TRUE; -} - -char * -mm_modem_charset_hex_to_utf8 (const char *src, MMModemCharset charset) -{ - char *unconverted, *converted; - const char *iconv_from; - gsize unconverted_len = 0; - GError *error = NULL; - - g_return_val_if_fail (src != NULL, NULL); - g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL); - - iconv_from = charset_iconv_from (charset); - g_return_val_if_fail (iconv_from != NULL, FALSE); - - unconverted = mm_utils_hexstr2bin (src, &unconverted_len); - if (!unconverted) - return NULL; - - if (charset == MM_MODEM_CHARSET_UTF8 || charset == MM_MODEM_CHARSET_IRA) - return unconverted; - - converted = g_convert (unconverted, unconverted_len, - "UTF-8//TRANSLIT", iconv_from, - NULL, NULL, &error); - if (!converted || error) { - g_clear_error (&error); - converted = NULL; - } - - g_free (unconverted); - - return converted; -} - -char * -mm_modem_charset_utf8_to_hex (const char *src, MMModemCharset charset) +const gchar * +mm_modem_charset_to_string (MMModemCharset charset) { - gsize converted_len = 0; - char *converted; - const char *iconv_to; - GError *error = NULL; - gchar *hex; - - g_return_val_if_fail (src != NULL, NULL); - g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL); - - iconv_to = charset_iconv_from (charset); - g_return_val_if_fail (iconv_to != NULL, FALSE); - - if (charset == MM_MODEM_CHARSET_UTF8 || charset == MM_MODEM_CHARSET_IRA) - return g_strdup (src); - - converted = g_convert (src, strlen (src), - iconv_to, "UTF-8//TRANSLIT", - NULL, &converted_len, &error); - if (!converted || error) { - g_clear_error (&error); - g_free (converted); - return NULL; - } + const CharsetSettings *settings; - /* Get hex representation of the string */ - hex = mm_utils_bin2hexstr ((guint8 *)converted, converted_len); - g_free (converted); - return hex; + settings = lookup_charset_settings (charset); + return settings ? settings->gsm_name : NULL; } +/******************************************************************************/ /* GSM 03.38 encoding conversion stuff */ #define GSM_DEF_ALPHABET_SIZE 128 #define GSM_EXT_ALPHABET_SIZE 10 typedef struct GsmUtf8Mapping { - gchar chars[3]; + gchar chars[3]; guint8 len; guint8 gsm; /* only used for extended GSM charset */ } GsmUtf8Mapping; @@ -305,7 +181,8 @@ static const GsmUtf8Mapping gsm_def_utf8_alphabet[GSM_DEF_ALPHABET_SIZE] = { }; static guint8 -gsm_def_char_to_utf8 (const guint8 gsm, guint8 out_utf8[2]) +gsm_def_char_to_utf8 (const guint8 gsm, + guint8 out_utf8[2]) { g_return_val_if_fail (gsm < GSM_DEF_ALPHABET_SIZE, 0); memcpy (&out_utf8[0], &gsm_def_utf8_alphabet[gsm].chars[0], gsm_def_utf8_alphabet[gsm].len); @@ -313,9 +190,11 @@ gsm_def_char_to_utf8 (const guint8 gsm, guint8 out_utf8[2]) } static gboolean -utf8_to_gsm_def_char (const char *utf8, guint32 len, guint8 *out_gsm) +utf8_to_gsm_def_char (const gchar *utf8, + guint32 len, + guint8 *out_gsm) { - int i; + gint i; if (len > 0 && len < 4) { for (i = 0; i < GSM_DEF_ALPHABET_SIZE; i++) { @@ -330,6 +209,22 @@ utf8_to_gsm_def_char (const char *utf8, guint32 len, guint8 *out_gsm) return FALSE; } +static gboolean +translit_gsm_nul_byte (GByteArray *gsm) +{ + guint i; + guint n_replaces = 0; + + for (i = 0; i < gsm->len; i++) { + if (gsm->data[i] == 0x00) { + utf8_to_gsm_def_char (translit_fallback, strlen (translit_fallback), &gsm->data[i]); + n_replaces++; + } + } + + return (n_replaces > 0); +} + #define EONE(a, g) { {a, 0x00, 0x00}, 1, g } #define ETHR(a, b, c, g) { {a, b, c}, 3, g } @@ -352,7 +247,8 @@ static const GsmUtf8Mapping gsm_ext_utf8_alphabet[GSM_EXT_ALPHABET_SIZE] = { #define GSM_ESCAPE_CHAR 0x1b static guint8 -gsm_ext_char_to_utf8 (const guint8 gsm, guint8 out_utf8[3]) +gsm_ext_char_to_utf8 (const guint8 gsm, + guint8 out_utf8[3]) { int i; @@ -366,7 +262,9 @@ gsm_ext_char_to_utf8 (const guint8 gsm, guint8 out_utf8[3]) } static gboolean -utf8_to_gsm_ext_char (const char *utf8, guint32 len, guint8 *out_gsm) +utf8_to_gsm_ext_char (const gchar *utf8, + guint32 len, + guint8 *out_gsm) { int i; @@ -383,11 +281,14 @@ utf8_to_gsm_ext_char (const char *utf8, guint32 len, guint8 *out_gsm) return FALSE; } -guint8 * -mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm, guint32 len) +static guint8 * +charset_gsm_unpacked_to_utf8 (const guint8 *gsm, + guint32 len, + gboolean translit, + GError **error) { - int i; - GByteArray *utf8; + g_autoptr(GByteArray) utf8 = NULL; + guint i; g_return_val_if_fail (gsm != NULL, NULL); g_return_val_if_fail (len < 4096, NULL); @@ -399,6 +300,28 @@ mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm, guint32 len) guint8 uchars[4]; guint8 ulen; + /* + * 0x00 is NULL (when followed only by 0x00 up to the + * end of (fixed byte length) message, possibly also up to + * FORM FEED. But 0x00 is also the code for COMMERCIAL AT + * when some other character (CARRIAGE RETURN if nothing else) + * comes after the 0x00. + * http://unicode.org/Public/MAPPINGS/ETSI/GSM0338.TXT + * + * So, if we find a '@' (0x00) and all the next chars after that + * are also 0x00, we can consider the string finished already. + */ + if (gsm[i] == 0x00) { + gsize j; + + for (j = i + 1; j < len; j++) { + if (gsm[j] != 0x00) + break; + } + if (j == len) + break; + } + if (gsm[i] == GSM_ESCAPE_CHAR) { /* Extended alphabet, decode next char */ ulen = gsm_ext_char_to_utf8 (gsm[i+1], uchars); @@ -411,25 +334,36 @@ mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm, guint32 len) if (ulen) g_byte_array_append (utf8, &uchars[0], ulen); - else - g_byte_array_append (utf8, (guint8 *) "?", 1); + else if (translit) + g_byte_array_append (utf8, (guint8 *) translit_fallback, strlen (translit_fallback)); + else { + g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS, + "Invalid conversion from GSM7"); + return NULL; + } } - g_byte_array_append (utf8, (guint8 *) "\0", 1); /* NULL terminator */ - return g_byte_array_free (utf8, FALSE); + /* Always make sure returned string is NUL terminated */ + g_byte_array_append (utf8, (guint8 *) "\0", 1); + return g_byte_array_free (g_steal_pointer (&utf8), FALSE); } -guint8 * -mm_charset_utf8_to_unpacked_gsm (const char *utf8, guint32 *out_len) +static guint8 * +charset_utf8_to_unpacked_gsm (const gchar *utf8, + gboolean translit, + guint32 *out_len, + GError **error) { - GByteArray *gsm; - const char *c = utf8, *next = c; - static const guint8 gesc = GSM_ESCAPE_CHAR; - int i = 0; - - g_return_val_if_fail (utf8 != NULL, NULL); - g_return_val_if_fail (out_len != NULL, NULL); - g_return_val_if_fail (g_utf8_validate (utf8, -1, NULL), NULL); + g_autoptr(GByteArray) gsm = NULL; + const gchar *c; + const gchar *next; + static const guint8 gesc = GSM_ESCAPE_CHAR; + + if (!utf8 || !g_utf8_validate (utf8, -1, NULL)) { + g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS, + "Couldn't convert UTF-8 to GSM: input UTF-8 validation failed"); + return NULL; + } /* worst case initial length */ gsm = g_byte_array_sized_new (g_utf8_strlen (utf8, -1) * 2 + 1); @@ -437,10 +371,13 @@ mm_charset_utf8_to_unpacked_gsm (const char *utf8, guint32 *out_len) if (*utf8 == 0x00) { /* Zero-length string */ g_byte_array_append (gsm, (guint8 *) "\0", 1); - *out_len = 0; - return g_byte_array_free (gsm, FALSE); + if (out_len) + *out_len = 0; + return g_byte_array_free (g_steal_pointer (&gsm), FALSE); } + next = utf8; + c = utf8; while (next && *next) { guint8 gch = 0x3f; /* 0x3f == '?' */ @@ -451,55 +388,83 @@ mm_charset_utf8_to_unpacked_gsm (const char *utf8, guint32 *out_len) /* Add the escape char */ g_byte_array_append (gsm, &gesc, 1); g_byte_array_append (gsm, &gch, 1); - } else if (utf8_to_gsm_def_char (c, next - c, &gch)) + } else if (utf8_to_gsm_def_char (c, next - c, &gch)) { + g_byte_array_append (gsm, &gch, 1); + } else if (translit) { + /* add ? */ g_byte_array_append (gsm, &gch, 1); + } else { + g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS, + "Couldn't convert UTF-8 char to GSM"); + return NULL; + } c = next; - i++; } - *out_len = gsm->len; - return g_byte_array_free (gsm, FALSE); + /* Output length doesn't consider terminating NUL byte */ + if (out_len) + *out_len = gsm->len; + + /* Always make sure returned string is NUL terminated */ + g_byte_array_append (gsm, (guint8 *) "\0", 1); + return g_byte_array_free (g_steal_pointer (&gsm), FALSE); } +/******************************************************************************/ +/* Checks to see whether conversion to a target charset may be done without + * any loss. */ + static gboolean -gsm_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) +gsm_is_subset (gunichar c, + const gchar *utf8, + gsize ulen) { guint8 gsm; - *out_clen = 1; if (utf8_to_gsm_def_char (utf8, ulen, &gsm)) return TRUE; - if (utf8_to_gsm_ext_char (utf8, ulen, &gsm)) { - *out_clen = 2; + if (utf8_to_gsm_ext_char (utf8, ulen, &gsm)) return TRUE; - } return FALSE; } static gboolean -ira_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) +ira_is_subset (gunichar c, + const gchar *utf8, + gsize ulen) { - *out_clen = 1; return (ulen == 1); } static gboolean -ucs2_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) +ucs2_is_subset (gunichar c, + const gchar *utf8, + gsize ulen) { - *out_clen = 2; return (c <= 0xFFFF); } static gboolean -iso88591_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) +utf16_is_subset (gunichar c, + const gchar *utf8, + gsize ulen) +{ + return TRUE; +} + +static gboolean +iso88591_is_subset (gunichar c, + const gchar *utf8, + gsize ulen) { - *out_clen = 1; return (c <= 0xFF); } static gboolean -pccp437_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) +pccp437_is_subset (gunichar c, + const gchar *utf8, + gsize ulen) { static const gunichar t[] = { 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea, @@ -518,13 +483,11 @@ pccp437_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) 0x2321, 0x00f7, 0x2248, 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0 }; - int i; - - *out_clen = 1; + guint i; if (c <= 0x7F) return TRUE; - for (i = 0; i < sizeof (t) / sizeof (t[0]); i++) { + for (i = 0; i < G_N_ELEMENTS (t); i++) { if (c == t[i]) return TRUE; } @@ -532,7 +495,9 @@ pccp437_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) } static gboolean -pcdn_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) +pcdn_is_subset (gunichar c, + const gchar *utf8, + gsize ulen) { static const gunichar t[] = { 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea, @@ -551,9 +516,7 @@ pcdn_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) 0x00a7, 0x00f7, 0x00b8, 0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0 }; - int i; - - *out_clen = 1; + guint i; if (c <= 0x7F) return TRUE; @@ -566,84 +529,75 @@ pcdn_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) typedef struct { MMModemCharset cs; - gboolean (*func) (gunichar c, const char *utf8, gsize ulen, guint *out_clen); - guint charsize; + gboolean (*func) (gunichar c, + const gchar *utf8, + gsize ulen); } SubsetEntry; -SubsetEntry subset_table[] = { - { MM_MODEM_CHARSET_GSM, gsm_is_subset }, - { MM_MODEM_CHARSET_IRA, ira_is_subset }, - { MM_MODEM_CHARSET_UCS2, ucs2_is_subset }, +const SubsetEntry subset_table[] = { + { MM_MODEM_CHARSET_GSM, gsm_is_subset }, + { MM_MODEM_CHARSET_IRA, ira_is_subset }, + { MM_MODEM_CHARSET_UCS2, ucs2_is_subset }, + { MM_MODEM_CHARSET_UTF16, utf16_is_subset }, { MM_MODEM_CHARSET_8859_1, iso88591_is_subset }, - { MM_MODEM_CHARSET_PCCP437, pccp437_is_subset }, - { MM_MODEM_CHARSET_PCDN, pcdn_is_subset }, - { MM_MODEM_CHARSET_UNKNOWN, NULL }, + { MM_MODEM_CHARSET_PCCP437, pccp437_is_subset }, + { MM_MODEM_CHARSET_PCDN, pcdn_is_subset }, }; -/** - * mm_charset_get_encoded_len: - * - * @utf8: UTF-8 valid string - * @charset: the #MMModemCharset to check the length of @utf8 in - * @out_unsupported: on return, number of characters of @utf8 that are not fully - * representable in @charset - * - * Returns: the size in bytes of the string if converted from UTF-8 into @charset. - **/ -guint -mm_charset_get_encoded_len (const char *utf8, - MMModemCharset charset, - guint *out_unsupported) +gboolean +mm_charset_can_convert_to (const gchar *utf8, + MMModemCharset charset) { - const char *p = utf8, *next; - guint len = 0, unsupported = 0; - SubsetEntry *e; + const gchar *p; + guint i; - g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, 0); - g_return_val_if_fail (utf8 != NULL, 0); + g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, FALSE); + g_return_val_if_fail (utf8 != NULL, FALSE); if (charset == MM_MODEM_CHARSET_UTF8) - return strlen (utf8); + return TRUE; /* Find the charset in our subset table */ - for (e = &subset_table[0]; - e->cs != charset && e->cs != MM_MODEM_CHARSET_UNKNOWN; - e++); - g_return_val_if_fail (e->cs != MM_MODEM_CHARSET_UNKNOWN, 0); + for (i = 0; i < G_N_ELEMENTS (subset_table); i++) { + if (subset_table[i].cs == charset) + break; + } + g_return_val_if_fail (i < G_N_ELEMENTS (subset_table), FALSE); + p = utf8; while (*p) { gunichar c; const char *end; - guint clen = 0; c = g_utf8_get_char_validated (p, -1); g_return_val_if_fail (c != (gunichar) -1, 0); - end = next = g_utf8_find_next_char (p, NULL); + end = g_utf8_find_next_char (p, NULL); if (end == NULL) { - /* Find the end... */ + /* Find the string terminating NULL */ end = p; - while (*end++); + while (*++end); } - if (!e->func (c, p, (end - p), &clen)) - unsupported++; - len += clen; - p = next; + if (!subset_table[i].func (c, p, (end - p))) + return FALSE; + + p = end; } - if (out_unsupported) - *out_unsupported = unsupported; - return len; + return TRUE; } +/******************************************************************************/ +/* GSM-7 pack/unpack operations */ + guint8 * -gsm_unpack (const guint8 *gsm, - guint32 num_septets, - guint8 start_offset, /* in _bits_ */ - guint32 *out_unpacked_len) +mm_charset_gsm_unpack (const guint8 *gsm, + guint32 num_septets, + guint8 start_offset, /* in _bits_ */ + guint32 *out_unpacked_len) { GByteArray *unpacked; - int i; + guint i; unpacked = g_byte_array_sized_new (num_septets + 1); @@ -673,14 +627,14 @@ gsm_unpack (const guint8 *gsm, } guint8 * -gsm_pack (const guint8 *src, - guint32 src_len, - guint8 start_offset, - guint32 *out_packed_len) +mm_charset_gsm_pack (const guint8 *src, + guint32 src_len, + guint8 start_offset, + guint32 *out_packed_len) { guint8 *packed; guint octet = 0, lshift, plen; - int i = 0; + guint i = 0; g_return_val_if_fail (start_offset < 8, NULL); @@ -708,202 +662,308 @@ gsm_pack (const guint8 *src, return packed; } -/* We do all our best to get the given string, which is possibly given in the - * specified charset, to UTF8. It may happen that the given string is really - * the hex representation of the charset-encoded string, so we need to cope with - * that case. */ -gchar * -mm_charset_take_and_convert_to_utf8 (gchar *str, MMModemCharset charset) +/*****************************************************************************/ +/* Main conversion functions */ + +static guint8 * +charset_iconv_from_utf8 (const gchar *utf8, + const CharsetSettings *settings, + gboolean translit, + guint *out_size, + GError **error) { - gchar *utf8 = NULL; + g_autoptr(GError) inner_error = NULL; + gsize bytes_written = 0; + g_autofree guint8 *encoded = NULL; + + encoded = (guint8 *) g_convert (utf8, -1, + settings->iconv_name, "UTF-8", + NULL, &bytes_written, &inner_error); + if (encoded) { + if (out_size) + *out_size = (guint) bytes_written; + return g_steal_pointer (&encoded); + } - if (!str) + if (!translit) { + g_propagate_error (error, g_steal_pointer (&inner_error)); + g_prefix_error (error, "Couldn't convert from UTF-8 to %s: ", settings->gsm_name); return NULL; - - switch (charset) { - case MM_MODEM_CHARSET_UNKNOWN: - g_warn_if_reached (); - utf8 = str; - break; - - case MM_MODEM_CHARSET_HEX: - /* We'll assume that the HEX string is really valid ASCII at the end */ - utf8 = str; - break; - - case MM_MODEM_CHARSET_GSM: - case MM_MODEM_CHARSET_8859_1: - case MM_MODEM_CHARSET_PCCP437: - case MM_MODEM_CHARSET_PCDN: { - const gchar *iconv_from; - GError *error = NULL; - - iconv_from = charset_iconv_from (charset); - utf8 = g_convert (str, strlen (str), - "UTF-8//TRANSLIT", iconv_from, - NULL, NULL, &error); - if (!utf8 || error) { - g_clear_error (&error); - utf8 = NULL; - } - - g_free (str); - break; } - case MM_MODEM_CHARSET_UCS2: { - gsize len; - gboolean possibly_hex = TRUE; - gsize bread = 0, bwritten = 0; + encoded = (guint8 *) g_convert_with_fallback (utf8, -1, + settings->iconv_name, "UTF-8", translit_fallback, + NULL, &bytes_written, error); + if (encoded) { + if (out_size) + *out_size = (guint) bytes_written; + return g_steal_pointer (&encoded); + } - /* If the string comes in hex-UCS-2, len needs to be a multiple of 4 */ - len = strlen (str); - if ((len < 4) || ((len % 4) != 0)) - possibly_hex = FALSE; - else { - const gchar *p = str; + g_prefix_error (error, "Couldn't convert from UTF-8 to %s with translit: ", settings->gsm_name); + return NULL; +} - /* All chars in the string must be hex */ - while (*p && possibly_hex) - possibly_hex = isxdigit (*p++); - } +GByteArray * +mm_modem_charset_bytearray_from_utf8 (const gchar *utf8, + MMModemCharset charset, + gboolean translit, + GError **error) +{ + const CharsetSettings *settings; + guint8 *encoded = NULL; + guint encoded_size = 0; - /* If hex, then we expect hex-encoded UCS-2 */ - if (possibly_hex) { - utf8 = mm_modem_charset_hex_to_utf8 (str, charset); - if (utf8) { - g_free (str); - break; - } - } + settings = lookup_charset_settings (charset); - /* If not hex, then it might be raw UCS-2 (very unlikely) or ASCII/UTF-8 - * (much more likely). Try to convert to UTF-8 and if that fails, use - * the partial conversion length to re-convert the part of the string - * that is UTF-8, if any. - */ - utf8 = g_convert (str, strlen (str), - "UTF-8//TRANSLIT", "UTF-8//TRANSLIT", - &bread, &bwritten, NULL); + if (charset == MM_MODEM_CHARSET_UNKNOWN) { + g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS, + "Cannot convert from UTF-8: unknown target charset"); + return NULL; + } - /* Valid conversion, or we didn't get enough valid UTF-8 */ - if (utf8 || (bwritten <= 2)) { - g_free (str); + switch (charset) { + case MM_MODEM_CHARSET_GSM: + encoded = charset_utf8_to_unpacked_gsm (utf8, translit, &encoded_size, error); break; - } + case MM_MODEM_CHARSET_IRA: + case MM_MODEM_CHARSET_8859_1: + case MM_MODEM_CHARSET_UTF8: + case MM_MODEM_CHARSET_UCS2: + case MM_MODEM_CHARSET_PCCP437: + case MM_MODEM_CHARSET_PCDN: + case MM_MODEM_CHARSET_UTF16: + encoded = charset_iconv_from_utf8 (utf8, settings, translit, &encoded_size, error); + break; + case MM_MODEM_CHARSET_UNKNOWN: + default: + g_assert_not_reached (); + } - /* Last try; chop off the original string at the conversion failure - * location and get what we can. - */ - str[bread] = '\0'; - utf8 = g_convert (str, strlen (str), - "UTF-8//TRANSLIT", "UTF-8//TRANSLIT", - NULL, NULL, NULL); - g_free (str); - break; + return g_byte_array_new_take (encoded, encoded_size); +} + +gchar * +mm_modem_charset_str_from_utf8 (const gchar *utf8, + MMModemCharset charset, + gboolean translit, + GError **error) +{ + g_autoptr(GByteArray) bytearray = NULL; + + if (charset == MM_MODEM_CHARSET_UNKNOWN) { + g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS, + "Cannot convert from UTF-8: unknown target charset"); + return NULL; } - /* If the given charset is ASCII or UTF8, we really expect the final string - * already here */ - case MM_MODEM_CHARSET_IRA: - case MM_MODEM_CHARSET_UTF8: - utf8 = str; - break; + bytearray = mm_modem_charset_bytearray_from_utf8 (utf8, charset, translit, error); + if (!bytearray) + return NULL; + + switch (charset) { + case MM_MODEM_CHARSET_GSM: + /* Note: strings encoded in unpacked GSM-7 can be used as plain + * strings as long as the string doesn't contain character '@', which + * is the one encoded as 0x00. At this point, we perform transliteration + * of the NUL bytes in the GSM-7 bytearray, and we fail the operation + * if one or more replacements were done and transliteration wasn't + * requested */ + if (translit_gsm_nul_byte (bytearray) && !translit) { + g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS, + "Cannot convert to GSM-7 string: transliteration required for embedded '@'"); + return NULL; + } + /* fall through */ + case MM_MODEM_CHARSET_IRA: + case MM_MODEM_CHARSET_8859_1: + case MM_MODEM_CHARSET_UTF8: + case MM_MODEM_CHARSET_PCCP437: + case MM_MODEM_CHARSET_PCDN: + return (gchar *) g_byte_array_free (g_steal_pointer (&bytearray), FALSE); + case MM_MODEM_CHARSET_UCS2: + case MM_MODEM_CHARSET_UTF16: + return mm_utils_bin2hexstr (bytearray->data, bytearray->len); + default: + case MM_MODEM_CHARSET_UNKNOWN: + g_assert_not_reached (); } +} - /* Validate UTF-8 always before returning. This result will be exposed in DBus - * very likely... */ - if (utf8 && !g_utf8_validate (utf8, -1, NULL)) { - /* Better return NULL than an invalid UTF-8 string */ - g_free (utf8); - utf8 = NULL; +static gchar * +charset_iconv_to_utf8 (const guint8 *data, + guint32 len, + const CharsetSettings *settings, + gboolean translit, + GError **error) +{ + g_autoptr(GError) inner_error = NULL; + g_autofree gchar *utf8 = NULL; + + utf8 = g_convert ((const gchar *) data, len, + "UTF-8", + settings->iconv_name, + NULL, NULL, &inner_error); + if (utf8) + return g_steal_pointer (&utf8); + + if (!translit) { + g_propagate_error (error, g_steal_pointer (&inner_error)); + g_prefix_error (error, "Couldn't convert from %s to UTF-8: ", settings->gsm_name); + return NULL; } - return utf8; + utf8 = g_convert_with_fallback ((const gchar *) data, len, + "UTF-8", settings->iconv_name, translit_fallback, + NULL, NULL, error); + if (utf8) + return g_steal_pointer (&utf8); + + g_prefix_error (error, "Couldn't convert from %s to UTF-8 with translit: ", settings->gsm_name); + return NULL; } -/* We do all our best to convert the given string, which comes in UTF-8, to the - * specified charset. It may be that the output string needs to be the hex - * representation of the charset-encoded string, so we need to cope with that - * case. */ gchar * -mm_utf8_take_and_convert_to_charset (gchar *str, - MMModemCharset charset) +mm_modem_charset_bytearray_to_utf8 (GByteArray *bytearray, + MMModemCharset charset, + gboolean translit, + GError **error) { - gchar *encoded = NULL; + const CharsetSettings *settings; + g_autofree gchar *utf8 = NULL; - if (!str) + if (charset == MM_MODEM_CHARSET_UNKNOWN) { + g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS, + "Cannot convert from UTF-8: unknown target charset"); return NULL; + } + + settings = lookup_charset_settings (charset); + + switch (charset) { + case MM_MODEM_CHARSET_GSM: + utf8 = (gchar *) charset_gsm_unpacked_to_utf8 (bytearray->data, + bytearray->len, + translit, + error); + break; + case MM_MODEM_CHARSET_IRA: + case MM_MODEM_CHARSET_UTF8: + case MM_MODEM_CHARSET_8859_1: + case MM_MODEM_CHARSET_PCCP437: + case MM_MODEM_CHARSET_PCDN: + case MM_MODEM_CHARSET_UCS2: + case MM_MODEM_CHARSET_UTF16: + utf8 = charset_iconv_to_utf8 (bytearray->data, + bytearray->len, + settings, + translit, + error); + break; + case MM_MODEM_CHARSET_UNKNOWN: + default: + g_assert_not_reached (); + } - /* Validate UTF-8 always before converting */ - if (!g_utf8_validate (str, -1, NULL)) { - /* Better return NULL than an invalid encoded string */ - g_free (str); + if (utf8 && g_utf8_validate (utf8, -1, NULL)) + return g_steal_pointer (&utf8); + + g_prefix_error (error, "Invalid conversion from %s to UTF-8: ", settings->gsm_name); + return NULL; +} + +gchar * +mm_modem_charset_str_to_utf8 (const gchar *str, + gssize len, + MMModemCharset charset, + gboolean translit, + GError **error) +{ + g_autoptr(GByteArray) bytearray = NULL; + + if (charset == MM_MODEM_CHARSET_UNKNOWN) { + g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS, + "Cannot convert from UTF-8: unknown target charset"); return NULL; } + /* Note: if the input string is GSM-7 encoded and it contains the '@' + * character, using -1 to indicate string length won't work properly, + * as '@' is encoded as 0x00. Whenever possible, if using GSM-7, + * give a proper len value or otherwise use the bytearray_to_utf8() + * method instead. */ + if (len < 0) + len = strlen (str); + switch (charset) { - case MM_MODEM_CHARSET_UNKNOWN: - g_warn_if_reached (); - encoded = str; - break; - - case MM_MODEM_CHARSET_HEX: - /* FIXME: What encoding is this? */ - g_warn_if_reached (); - encoded = str; - break; - - case MM_MODEM_CHARSET_GSM: - case MM_MODEM_CHARSET_8859_1: - case MM_MODEM_CHARSET_PCCP437: - case MM_MODEM_CHARSET_PCDN: { - const gchar *iconv_to; - GError *error = NULL; - - iconv_to = charset_iconv_from (charset); - encoded = g_convert (str, strlen (str), - iconv_to, "UTF-8", - NULL, NULL, &error); - if (!encoded || error) { - g_clear_error (&error); - encoded = NULL; - } + case MM_MODEM_CHARSET_GSM: + case MM_MODEM_CHARSET_IRA: + case MM_MODEM_CHARSET_8859_1: + case MM_MODEM_CHARSET_UTF8: + case MM_MODEM_CHARSET_PCCP437: + case MM_MODEM_CHARSET_PCDN: + bytearray = g_byte_array_sized_new (len); + g_byte_array_append (bytearray, (const guint8 *)str, len); + break; + case MM_MODEM_CHARSET_UCS2: + case MM_MODEM_CHARSET_UTF16: { + guint8 *bin = NULL; + gsize bin_len; - g_free (str); - break; + bin = (guint8 *) mm_utils_hexstr2bin (str, len, &bin_len, error); + if (!bin) + return NULL; + + bytearray = g_byte_array_new_take (bin, bin_len); + break; + } + case MM_MODEM_CHARSET_UNKNOWN: + default: + g_assert_not_reached (); } - case MM_MODEM_CHARSET_UCS2: { - const gchar *iconv_to; - gsize encoded_len = 0; - GError *error = NULL; - gchar *hex; - - iconv_to = charset_iconv_from (charset); - encoded = g_convert (str, strlen (str), - iconv_to, "UTF-8", - NULL, &encoded_len, &error); - if (!encoded || error) { - g_clear_error (&error); - encoded = NULL; + return mm_modem_charset_bytearray_to_utf8 (bytearray, charset, translit, error); +} + +/******************************************************************************/ +/* Runtime charset support via iconv() */ + +void +mm_modem_charsets_init (void) +{ + /* As test string, something we can convert to/from all the encodings */ + static const gchar *default_test_str = "ModemManager"; + guint i; + + mm_obj_dbg (NULL, "[charsets] detecting platform iconv() support..."); + for (i = 0; i < G_N_ELEMENTS (charset_settings); i++) { + g_autofree guint8 *enc = NULL; + guint enc_size; + g_autofree gchar *dec = NULL; + + if (!charset_settings[i].iconv_name) + continue; + + enc = charset_iconv_from_utf8 (default_test_str, + &charset_settings[i], + FALSE, + &enc_size, + NULL); + if (!enc) { + mm_obj_dbg (NULL, "[charsets] %s: iconv conversion to charset not supported", charset_settings[i].iconv_name); + continue; } - /* Get hex representation of the string */ - hex = mm_utils_bin2hexstr ((guint8 *)encoded, encoded_len); - g_free (encoded); - encoded = hex; - g_free (str); - break; - } + dec = charset_iconv_to_utf8 (enc, + enc_size, + &charset_settings[i], + FALSE, + NULL); + if (!enc) { + mm_obj_dbg (NULL, "[charsets] %s: iconv conversion from charset not supported", charset_settings[i].iconv_name); + continue; + } - /* If the given charset is ASCII or UTF8, we really expect the final string - * already here. */ - case MM_MODEM_CHARSET_IRA: - case MM_MODEM_CHARSET_UTF8: - encoded = str; - break; + mm_obj_dbg (NULL, "[charsets] %s: iconv conversion to/from charset is supported", charset_settings[i].iconv_name); } - - return encoded; } |