summaryrefslogtreecommitdiff
path: root/src/mm-charsets.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/mm-charsets.c')
-rw-r--r--src/mm-charsets.c924
1 files changed, 492 insertions, 432 deletions
diff --git a/src/mm-charsets.c b/src/mm-charsets.c
index ffdcad52..a48da368 100644
--- a/src/mm-charsets.c
+++ b/src/mm-charsets.c
@@ -11,6 +11,7 @@
* GNU General Public License for more details:
*
* Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (C) 2020 Aleksander Morgado <aleksander@aleksander.es>
*/
#include <config.h>
@@ -24,204 +25,79 @@
#include <libmm-glib.h>
#include "mm-charsets.h"
+#include "mm-log.h"
-typedef struct {
- const char *gsm_name;
- const char *other_name;
- const char *iconv_from_name;
- const char *iconv_to_name;
- MMModemCharset charset;
-} CharsetEntry;
-
-static CharsetEntry charset_map[] = {
- { "UTF-8", "UTF8", "UTF-8", "UTF-8//TRANSLIT", MM_MODEM_CHARSET_UTF8 },
- { "UCS2", NULL, "UCS-2BE", "UCS-2BE//TRANSLIT", MM_MODEM_CHARSET_UCS2 },
- { "IRA", "ASCII", "ASCII", "ASCII//TRANSLIT", MM_MODEM_CHARSET_IRA },
- { "GSM", NULL, NULL, NULL, MM_MODEM_CHARSET_GSM },
- { "8859-1", NULL, "ISO8859-1", "ISO8859-1//TRANSLIT", MM_MODEM_CHARSET_8859_1 },
- { "PCCP437", "CP437", "CP437", "CP437//TRANSLIT", MM_MODEM_CHARSET_PCCP437 },
- { "PCDN", "CP850", "CP850", "CP850//TRANSLIT", MM_MODEM_CHARSET_PCDN },
- { "HEX", NULL, NULL, NULL, MM_MODEM_CHARSET_HEX },
- { NULL, NULL, NULL, NULL, MM_MODEM_CHARSET_UNKNOWN }
-};
-
-const char *
-mm_modem_charset_to_string (MMModemCharset charset)
-{
- CharsetEntry *iter = &charset_map[0];
+/* Common fallback character when transliteration is enabled */
+static const gchar *translit_fallback = "?";
- g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL);
+/******************************************************************************/
+/* Expected charset settings */
- while (iter->gsm_name) {
- if (iter->charset == charset)
- return iter->gsm_name;
- iter++;
- }
- g_warn_if_reached ();
- return NULL;
-}
+typedef struct {
+ MMModemCharset charset;
+ const gchar *gsm_name;
+ const gchar *other_name;
+ const gchar *iconv_name;
+} CharsetSettings;
+
+static const CharsetSettings charset_settings[] = {
+ { MM_MODEM_CHARSET_UTF8, "UTF-8", "UTF8", "UTF-8" },
+ { MM_MODEM_CHARSET_UCS2, "UCS2", NULL, "UCS-2BE" },
+ { MM_MODEM_CHARSET_IRA, "IRA", "ASCII", "ASCII" },
+ { MM_MODEM_CHARSET_GSM, "GSM", NULL, NULL },
+ { MM_MODEM_CHARSET_8859_1, "8859-1", NULL, "ISO8859-1" },
+ { MM_MODEM_CHARSET_PCCP437, "PCCP437", "CP437", "CP437" },
+ { MM_MODEM_CHARSET_PCDN, "PCDN", "CP850", "CP850" },
+ { MM_MODEM_CHARSET_UTF16, "UTF-16", "UTF16", "UTF-16BE" },
+};
MMModemCharset
-mm_modem_charset_from_string (const char *string)
+mm_modem_charset_from_string (const gchar *string)
{
- CharsetEntry *iter = &charset_map[0];
+ guint i;
g_return_val_if_fail (string != NULL, MM_MODEM_CHARSET_UNKNOWN);
- while (iter->gsm_name) {
- if (strcasestr (string, iter->gsm_name))
- return iter->charset;
- if (iter->other_name && strcasestr (string, iter->other_name))
- return iter->charset;
- iter++;
+ for (i = 0; i < G_N_ELEMENTS (charset_settings); i++) {
+ if (strcasestr (string, charset_settings[i].gsm_name))
+ return charset_settings[i].charset;
+ if (charset_settings[i].other_name && strcasestr (string, charset_settings[i].other_name))
+ return charset_settings[i].charset;
}
return MM_MODEM_CHARSET_UNKNOWN;
}
-static const char *
-charset_iconv_to (MMModemCharset charset)
+static const CharsetSettings *
+lookup_charset_settings (MMModemCharset charset)
{
- CharsetEntry *iter = &charset_map[0];
+ guint i;
g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL);
-
- while (iter->gsm_name) {
- if (iter->charset == charset)
- return iter->iconv_to_name;
- iter++;
- }
- g_warn_if_reached ();
- return NULL;
-}
-
-static const char *
-charset_iconv_from (MMModemCharset charset)
-{
- CharsetEntry *iter = &charset_map[0];
-
- g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL);
-
- while (iter->gsm_name) {
- if (iter->charset == charset)
- return iter->iconv_from_name;
- iter++;
+ for (i = 0; i < G_N_ELEMENTS (charset_settings); i++) {
+ if (charset_settings[i].charset == charset)
+ return &charset_settings[i];
}
g_warn_if_reached ();
return NULL;
}
-gboolean
-mm_modem_charset_byte_array_append (GByteArray *array,
- const char *utf8,
- gboolean quoted,
- MMModemCharset charset)
-{
- const char *iconv_to;
- char *converted;
- GError *error = NULL;
- gsize written = 0;
-
- g_return_val_if_fail (array != NULL, FALSE);
- g_return_val_if_fail (utf8 != NULL, FALSE);
-
- iconv_to = charset_iconv_to (charset);
- g_return_val_if_fail (iconv_to != NULL, FALSE);
-
- converted = g_convert (utf8, -1, iconv_to, "UTF-8", NULL, &written, &error);
- if (!converted) {
- if (error) {
- g_warning ("%s: failed to convert '%s' to %s character set: (%d) %s",
- __func__, utf8, iconv_to,
- error->code, error->message);
- g_error_free (error);
- }
- return FALSE;
- }
-
- if (quoted)
- g_byte_array_append (array, (const guint8 *) "\"", 1);
- g_byte_array_append (array, (const guint8 *) converted, written);
- if (quoted)
- g_byte_array_append (array, (const guint8 *) "\"", 1);
-
- g_free (converted);
- return TRUE;
-}
-
-char *
-mm_modem_charset_hex_to_utf8 (const char *src, MMModemCharset charset)
-{
- char *unconverted, *converted;
- const char *iconv_from;
- gsize unconverted_len = 0;
- GError *error = NULL;
-
- g_return_val_if_fail (src != NULL, NULL);
- g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL);
-
- iconv_from = charset_iconv_from (charset);
- g_return_val_if_fail (iconv_from != NULL, FALSE);
-
- unconverted = mm_utils_hexstr2bin (src, &unconverted_len);
- if (!unconverted)
- return NULL;
-
- if (charset == MM_MODEM_CHARSET_UTF8 || charset == MM_MODEM_CHARSET_IRA)
- return unconverted;
-
- converted = g_convert (unconverted, unconverted_len,
- "UTF-8//TRANSLIT", iconv_from,
- NULL, NULL, &error);
- if (!converted || error) {
- g_clear_error (&error);
- converted = NULL;
- }
-
- g_free (unconverted);
-
- return converted;
-}
-
-char *
-mm_modem_charset_utf8_to_hex (const char *src, MMModemCharset charset)
+const gchar *
+mm_modem_charset_to_string (MMModemCharset charset)
{
- gsize converted_len = 0;
- char *converted;
- const char *iconv_to;
- GError *error = NULL;
- gchar *hex;
-
- g_return_val_if_fail (src != NULL, NULL);
- g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL);
-
- iconv_to = charset_iconv_from (charset);
- g_return_val_if_fail (iconv_to != NULL, FALSE);
-
- if (charset == MM_MODEM_CHARSET_UTF8 || charset == MM_MODEM_CHARSET_IRA)
- return g_strdup (src);
-
- converted = g_convert (src, strlen (src),
- iconv_to, "UTF-8//TRANSLIT",
- NULL, &converted_len, &error);
- if (!converted || error) {
- g_clear_error (&error);
- g_free (converted);
- return NULL;
- }
+ const CharsetSettings *settings;
- /* Get hex representation of the string */
- hex = mm_utils_bin2hexstr ((guint8 *)converted, converted_len);
- g_free (converted);
- return hex;
+ settings = lookup_charset_settings (charset);
+ return settings ? settings->gsm_name : NULL;
}
+/******************************************************************************/
/* GSM 03.38 encoding conversion stuff */
#define GSM_DEF_ALPHABET_SIZE 128
#define GSM_EXT_ALPHABET_SIZE 10
typedef struct GsmUtf8Mapping {
- gchar chars[3];
+ gchar chars[3];
guint8 len;
guint8 gsm; /* only used for extended GSM charset */
} GsmUtf8Mapping;
@@ -305,7 +181,8 @@ static const GsmUtf8Mapping gsm_def_utf8_alphabet[GSM_DEF_ALPHABET_SIZE] = {
};
static guint8
-gsm_def_char_to_utf8 (const guint8 gsm, guint8 out_utf8[2])
+gsm_def_char_to_utf8 (const guint8 gsm,
+ guint8 out_utf8[2])
{
g_return_val_if_fail (gsm < GSM_DEF_ALPHABET_SIZE, 0);
memcpy (&out_utf8[0], &gsm_def_utf8_alphabet[gsm].chars[0], gsm_def_utf8_alphabet[gsm].len);
@@ -313,9 +190,11 @@ gsm_def_char_to_utf8 (const guint8 gsm, guint8 out_utf8[2])
}
static gboolean
-utf8_to_gsm_def_char (const char *utf8, guint32 len, guint8 *out_gsm)
+utf8_to_gsm_def_char (const gchar *utf8,
+ guint32 len,
+ guint8 *out_gsm)
{
- int i;
+ gint i;
if (len > 0 && len < 4) {
for (i = 0; i < GSM_DEF_ALPHABET_SIZE; i++) {
@@ -330,6 +209,22 @@ utf8_to_gsm_def_char (const char *utf8, guint32 len, guint8 *out_gsm)
return FALSE;
}
+static gboolean
+translit_gsm_nul_byte (GByteArray *gsm)
+{
+ guint i;
+ guint n_replaces = 0;
+
+ for (i = 0; i < gsm->len; i++) {
+ if (gsm->data[i] == 0x00) {
+ utf8_to_gsm_def_char (translit_fallback, strlen (translit_fallback), &gsm->data[i]);
+ n_replaces++;
+ }
+ }
+
+ return (n_replaces > 0);
+}
+
#define EONE(a, g) { {a, 0x00, 0x00}, 1, g }
#define ETHR(a, b, c, g) { {a, b, c}, 3, g }
@@ -352,7 +247,8 @@ static const GsmUtf8Mapping gsm_ext_utf8_alphabet[GSM_EXT_ALPHABET_SIZE] = {
#define GSM_ESCAPE_CHAR 0x1b
static guint8
-gsm_ext_char_to_utf8 (const guint8 gsm, guint8 out_utf8[3])
+gsm_ext_char_to_utf8 (const guint8 gsm,
+ guint8 out_utf8[3])
{
int i;
@@ -366,7 +262,9 @@ gsm_ext_char_to_utf8 (const guint8 gsm, guint8 out_utf8[3])
}
static gboolean
-utf8_to_gsm_ext_char (const char *utf8, guint32 len, guint8 *out_gsm)
+utf8_to_gsm_ext_char (const gchar *utf8,
+ guint32 len,
+ guint8 *out_gsm)
{
int i;
@@ -383,11 +281,14 @@ utf8_to_gsm_ext_char (const char *utf8, guint32 len, guint8 *out_gsm)
return FALSE;
}
-guint8 *
-mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm, guint32 len)
+static guint8 *
+charset_gsm_unpacked_to_utf8 (const guint8 *gsm,
+ guint32 len,
+ gboolean translit,
+ GError **error)
{
- int i;
- GByteArray *utf8;
+ g_autoptr(GByteArray) utf8 = NULL;
+ guint i;
g_return_val_if_fail (gsm != NULL, NULL);
g_return_val_if_fail (len < 4096, NULL);
@@ -399,6 +300,28 @@ mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm, guint32 len)
guint8 uchars[4];
guint8 ulen;
+ /*
+ * 0x00 is NULL (when followed only by 0x00 up to the
+ * end of (fixed byte length) message, possibly also up to
+ * FORM FEED. But 0x00 is also the code for COMMERCIAL AT
+ * when some other character (CARRIAGE RETURN if nothing else)
+ * comes after the 0x00.
+ * http://unicode.org/Public/MAPPINGS/ETSI/GSM0338.TXT
+ *
+ * So, if we find a '@' (0x00) and all the next chars after that
+ * are also 0x00, we can consider the string finished already.
+ */
+ if (gsm[i] == 0x00) {
+ gsize j;
+
+ for (j = i + 1; j < len; j++) {
+ if (gsm[j] != 0x00)
+ break;
+ }
+ if (j == len)
+ break;
+ }
+
if (gsm[i] == GSM_ESCAPE_CHAR) {
/* Extended alphabet, decode next char */
ulen = gsm_ext_char_to_utf8 (gsm[i+1], uchars);
@@ -411,25 +334,36 @@ mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm, guint32 len)
if (ulen)
g_byte_array_append (utf8, &uchars[0], ulen);
- else
- g_byte_array_append (utf8, (guint8 *) "?", 1);
+ else if (translit)
+ g_byte_array_append (utf8, (guint8 *) translit_fallback, strlen (translit_fallback));
+ else {
+ g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+ "Invalid conversion from GSM7");
+ return NULL;
+ }
}
- g_byte_array_append (utf8, (guint8 *) "\0", 1); /* NULL terminator */
- return g_byte_array_free (utf8, FALSE);
+ /* Always make sure returned string is NUL terminated */
+ g_byte_array_append (utf8, (guint8 *) "\0", 1);
+ return g_byte_array_free (g_steal_pointer (&utf8), FALSE);
}
-guint8 *
-mm_charset_utf8_to_unpacked_gsm (const char *utf8, guint32 *out_len)
+static guint8 *
+charset_utf8_to_unpacked_gsm (const gchar *utf8,
+ gboolean translit,
+ guint32 *out_len,
+ GError **error)
{
- GByteArray *gsm;
- const char *c = utf8, *next = c;
- static const guint8 gesc = GSM_ESCAPE_CHAR;
- int i = 0;
-
- g_return_val_if_fail (utf8 != NULL, NULL);
- g_return_val_if_fail (out_len != NULL, NULL);
- g_return_val_if_fail (g_utf8_validate (utf8, -1, NULL), NULL);
+ g_autoptr(GByteArray) gsm = NULL;
+ const gchar *c;
+ const gchar *next;
+ static const guint8 gesc = GSM_ESCAPE_CHAR;
+
+ if (!utf8 || !g_utf8_validate (utf8, -1, NULL)) {
+ g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+ "Couldn't convert UTF-8 to GSM: input UTF-8 validation failed");
+ return NULL;
+ }
/* worst case initial length */
gsm = g_byte_array_sized_new (g_utf8_strlen (utf8, -1) * 2 + 1);
@@ -437,10 +371,13 @@ mm_charset_utf8_to_unpacked_gsm (const char *utf8, guint32 *out_len)
if (*utf8 == 0x00) {
/* Zero-length string */
g_byte_array_append (gsm, (guint8 *) "\0", 1);
- *out_len = 0;
- return g_byte_array_free (gsm, FALSE);
+ if (out_len)
+ *out_len = 0;
+ return g_byte_array_free (g_steal_pointer (&gsm), FALSE);
}
+ next = utf8;
+ c = utf8;
while (next && *next) {
guint8 gch = 0x3f; /* 0x3f == '?' */
@@ -451,55 +388,83 @@ mm_charset_utf8_to_unpacked_gsm (const char *utf8, guint32 *out_len)
/* Add the escape char */
g_byte_array_append (gsm, &gesc, 1);
g_byte_array_append (gsm, &gch, 1);
- } else if (utf8_to_gsm_def_char (c, next - c, &gch))
+ } else if (utf8_to_gsm_def_char (c, next - c, &gch)) {
+ g_byte_array_append (gsm, &gch, 1);
+ } else if (translit) {
+ /* add ? */
g_byte_array_append (gsm, &gch, 1);
+ } else {
+ g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+ "Couldn't convert UTF-8 char to GSM");
+ return NULL;
+ }
c = next;
- i++;
}
- *out_len = gsm->len;
- return g_byte_array_free (gsm, FALSE);
+ /* Output length doesn't consider terminating NUL byte */
+ if (out_len)
+ *out_len = gsm->len;
+
+ /* Always make sure returned string is NUL terminated */
+ g_byte_array_append (gsm, (guint8 *) "\0", 1);
+ return g_byte_array_free (g_steal_pointer (&gsm), FALSE);
}
+/******************************************************************************/
+/* Checks to see whether conversion to a target charset may be done without
+ * any loss. */
+
static gboolean
-gsm_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
+gsm_is_subset (gunichar c,
+ const gchar *utf8,
+ gsize ulen)
{
guint8 gsm;
- *out_clen = 1;
if (utf8_to_gsm_def_char (utf8, ulen, &gsm))
return TRUE;
- if (utf8_to_gsm_ext_char (utf8, ulen, &gsm)) {
- *out_clen = 2;
+ if (utf8_to_gsm_ext_char (utf8, ulen, &gsm))
return TRUE;
- }
return FALSE;
}
static gboolean
-ira_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
+ira_is_subset (gunichar c,
+ const gchar *utf8,
+ gsize ulen)
{
- *out_clen = 1;
return (ulen == 1);
}
static gboolean
-ucs2_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
+ucs2_is_subset (gunichar c,
+ const gchar *utf8,
+ gsize ulen)
{
- *out_clen = 2;
return (c <= 0xFFFF);
}
static gboolean
-iso88591_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
+utf16_is_subset (gunichar c,
+ const gchar *utf8,
+ gsize ulen)
+{
+ return TRUE;
+}
+
+static gboolean
+iso88591_is_subset (gunichar c,
+ const gchar *utf8,
+ gsize ulen)
{
- *out_clen = 1;
return (c <= 0xFF);
}
static gboolean
-pccp437_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
+pccp437_is_subset (gunichar c,
+ const gchar *utf8,
+ gsize ulen)
{
static const gunichar t[] = {
0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea,
@@ -518,13 +483,11 @@ pccp437_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
0x2321, 0x00f7, 0x2248, 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2,
0x25a0, 0x00a0
};
- int i;
-
- *out_clen = 1;
+ guint i;
if (c <= 0x7F)
return TRUE;
- for (i = 0; i < sizeof (t) / sizeof (t[0]); i++) {
+ for (i = 0; i < G_N_ELEMENTS (t); i++) {
if (c == t[i])
return TRUE;
}
@@ -532,7 +495,9 @@ pccp437_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
}
static gboolean
-pcdn_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
+pcdn_is_subset (gunichar c,
+ const gchar *utf8,
+ gsize ulen)
{
static const gunichar t[] = {
0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea,
@@ -551,9 +516,7 @@ pcdn_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
0x00a7, 0x00f7, 0x00b8, 0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2,
0x25a0, 0x00a0
};
- int i;
-
- *out_clen = 1;
+ guint i;
if (c <= 0x7F)
return TRUE;
@@ -566,84 +529,75 @@ pcdn_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
typedef struct {
MMModemCharset cs;
- gboolean (*func) (gunichar c, const char *utf8, gsize ulen, guint *out_clen);
- guint charsize;
+ gboolean (*func) (gunichar c,
+ const gchar *utf8,
+ gsize ulen);
} SubsetEntry;
-SubsetEntry subset_table[] = {
- { MM_MODEM_CHARSET_GSM, gsm_is_subset },
- { MM_MODEM_CHARSET_IRA, ira_is_subset },
- { MM_MODEM_CHARSET_UCS2, ucs2_is_subset },
+const SubsetEntry subset_table[] = {
+ { MM_MODEM_CHARSET_GSM, gsm_is_subset },
+ { MM_MODEM_CHARSET_IRA, ira_is_subset },
+ { MM_MODEM_CHARSET_UCS2, ucs2_is_subset },
+ { MM_MODEM_CHARSET_UTF16, utf16_is_subset },
{ MM_MODEM_CHARSET_8859_1, iso88591_is_subset },
- { MM_MODEM_CHARSET_PCCP437, pccp437_is_subset },
- { MM_MODEM_CHARSET_PCDN, pcdn_is_subset },
- { MM_MODEM_CHARSET_UNKNOWN, NULL },
+ { MM_MODEM_CHARSET_PCCP437, pccp437_is_subset },
+ { MM_MODEM_CHARSET_PCDN, pcdn_is_subset },
};
-/**
- * mm_charset_get_encoded_len:
- *
- * @utf8: UTF-8 valid string
- * @charset: the #MMModemCharset to check the length of @utf8 in
- * @out_unsupported: on return, number of characters of @utf8 that are not fully
- * representable in @charset
- *
- * Returns: the size in bytes of the string if converted from UTF-8 into @charset.
- **/
-guint
-mm_charset_get_encoded_len (const char *utf8,
- MMModemCharset charset,
- guint *out_unsupported)
+gboolean
+mm_charset_can_convert_to (const gchar *utf8,
+ MMModemCharset charset)
{
- const char *p = utf8, *next;
- guint len = 0, unsupported = 0;
- SubsetEntry *e;
+ const gchar *p;
+ guint i;
- g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, 0);
- g_return_val_if_fail (utf8 != NULL, 0);
+ g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, FALSE);
+ g_return_val_if_fail (utf8 != NULL, FALSE);
if (charset == MM_MODEM_CHARSET_UTF8)
- return strlen (utf8);
+ return TRUE;
/* Find the charset in our subset table */
- for (e = &subset_table[0];
- e->cs != charset && e->cs != MM_MODEM_CHARSET_UNKNOWN;
- e++);
- g_return_val_if_fail (e->cs != MM_MODEM_CHARSET_UNKNOWN, 0);
+ for (i = 0; i < G_N_ELEMENTS (subset_table); i++) {
+ if (subset_table[i].cs == charset)
+ break;
+ }
+ g_return_val_if_fail (i < G_N_ELEMENTS (subset_table), FALSE);
+ p = utf8;
while (*p) {
gunichar c;
const char *end;
- guint clen = 0;
c = g_utf8_get_char_validated (p, -1);
g_return_val_if_fail (c != (gunichar) -1, 0);
- end = next = g_utf8_find_next_char (p, NULL);
+ end = g_utf8_find_next_char (p, NULL);
if (end == NULL) {
- /* Find the end... */
+ /* Find the string terminating NULL */
end = p;
- while (*end++);
+ while (*++end);
}
- if (!e->func (c, p, (end - p), &clen))
- unsupported++;
- len += clen;
- p = next;
+ if (!subset_table[i].func (c, p, (end - p)))
+ return FALSE;
+
+ p = end;
}
- if (out_unsupported)
- *out_unsupported = unsupported;
- return len;
+ return TRUE;
}
+/******************************************************************************/
+/* GSM-7 pack/unpack operations */
+
guint8 *
-gsm_unpack (const guint8 *gsm,
- guint32 num_septets,
- guint8 start_offset, /* in _bits_ */
- guint32 *out_unpacked_len)
+mm_charset_gsm_unpack (const guint8 *gsm,
+ guint32 num_septets,
+ guint8 start_offset, /* in _bits_ */
+ guint32 *out_unpacked_len)
{
GByteArray *unpacked;
- int i;
+ guint i;
unpacked = g_byte_array_sized_new (num_septets + 1);
@@ -673,14 +627,14 @@ gsm_unpack (const guint8 *gsm,
}
guint8 *
-gsm_pack (const guint8 *src,
- guint32 src_len,
- guint8 start_offset,
- guint32 *out_packed_len)
+mm_charset_gsm_pack (const guint8 *src,
+ guint32 src_len,
+ guint8 start_offset,
+ guint32 *out_packed_len)
{
guint8 *packed;
guint octet = 0, lshift, plen;
- int i = 0;
+ guint i = 0;
g_return_val_if_fail (start_offset < 8, NULL);
@@ -708,202 +662,308 @@ gsm_pack (const guint8 *src,
return packed;
}
-/* We do all our best to get the given string, which is possibly given in the
- * specified charset, to UTF8. It may happen that the given string is really
- * the hex representation of the charset-encoded string, so we need to cope with
- * that case. */
-gchar *
-mm_charset_take_and_convert_to_utf8 (gchar *str, MMModemCharset charset)
+/*****************************************************************************/
+/* Main conversion functions */
+
+static guint8 *
+charset_iconv_from_utf8 (const gchar *utf8,
+ const CharsetSettings *settings,
+ gboolean translit,
+ guint *out_size,
+ GError **error)
{
- gchar *utf8 = NULL;
+ g_autoptr(GError) inner_error = NULL;
+ gsize bytes_written = 0;
+ g_autofree guint8 *encoded = NULL;
+
+ encoded = (guint8 *) g_convert (utf8, -1,
+ settings->iconv_name, "UTF-8",
+ NULL, &bytes_written, &inner_error);
+ if (encoded) {
+ if (out_size)
+ *out_size = (guint) bytes_written;
+ return g_steal_pointer (&encoded);
+ }
- if (!str)
+ if (!translit) {
+ g_propagate_error (error, g_steal_pointer (&inner_error));
+ g_prefix_error (error, "Couldn't convert from UTF-8 to %s: ", settings->gsm_name);
return NULL;
-
- switch (charset) {
- case MM_MODEM_CHARSET_UNKNOWN:
- g_warn_if_reached ();
- utf8 = str;
- break;
-
- case MM_MODEM_CHARSET_HEX:
- /* We'll assume that the HEX string is really valid ASCII at the end */
- utf8 = str;
- break;
-
- case MM_MODEM_CHARSET_GSM:
- case MM_MODEM_CHARSET_8859_1:
- case MM_MODEM_CHARSET_PCCP437:
- case MM_MODEM_CHARSET_PCDN: {
- const gchar *iconv_from;
- GError *error = NULL;
-
- iconv_from = charset_iconv_from (charset);
- utf8 = g_convert (str, strlen (str),
- "UTF-8//TRANSLIT", iconv_from,
- NULL, NULL, &error);
- if (!utf8 || error) {
- g_clear_error (&error);
- utf8 = NULL;
- }
-
- g_free (str);
- break;
}
- case MM_MODEM_CHARSET_UCS2: {
- gsize len;
- gboolean possibly_hex = TRUE;
- gsize bread = 0, bwritten = 0;
+ encoded = (guint8 *) g_convert_with_fallback (utf8, -1,
+ settings->iconv_name, "UTF-8", translit_fallback,
+ NULL, &bytes_written, error);
+ if (encoded) {
+ if (out_size)
+ *out_size = (guint) bytes_written;
+ return g_steal_pointer (&encoded);
+ }
- /* If the string comes in hex-UCS-2, len needs to be a multiple of 4 */
- len = strlen (str);
- if ((len < 4) || ((len % 4) != 0))
- possibly_hex = FALSE;
- else {
- const gchar *p = str;
+ g_prefix_error (error, "Couldn't convert from UTF-8 to %s with translit: ", settings->gsm_name);
+ return NULL;
+}
- /* All chars in the string must be hex */
- while (*p && possibly_hex)
- possibly_hex = isxdigit (*p++);
- }
+GByteArray *
+mm_modem_charset_bytearray_from_utf8 (const gchar *utf8,
+ MMModemCharset charset,
+ gboolean translit,
+ GError **error)
+{
+ const CharsetSettings *settings;
+ guint8 *encoded = NULL;
+ guint encoded_size = 0;
- /* If hex, then we expect hex-encoded UCS-2 */
- if (possibly_hex) {
- utf8 = mm_modem_charset_hex_to_utf8 (str, charset);
- if (utf8) {
- g_free (str);
- break;
- }
- }
+ settings = lookup_charset_settings (charset);
- /* If not hex, then it might be raw UCS-2 (very unlikely) or ASCII/UTF-8
- * (much more likely). Try to convert to UTF-8 and if that fails, use
- * the partial conversion length to re-convert the part of the string
- * that is UTF-8, if any.
- */
- utf8 = g_convert (str, strlen (str),
- "UTF-8//TRANSLIT", "UTF-8//TRANSLIT",
- &bread, &bwritten, NULL);
+ if (charset == MM_MODEM_CHARSET_UNKNOWN) {
+ g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+ "Cannot convert from UTF-8: unknown target charset");
+ return NULL;
+ }
- /* Valid conversion, or we didn't get enough valid UTF-8 */
- if (utf8 || (bwritten <= 2)) {
- g_free (str);
+ switch (charset) {
+ case MM_MODEM_CHARSET_GSM:
+ encoded = charset_utf8_to_unpacked_gsm (utf8, translit, &encoded_size, error);
break;
- }
+ case MM_MODEM_CHARSET_IRA:
+ case MM_MODEM_CHARSET_8859_1:
+ case MM_MODEM_CHARSET_UTF8:
+ case MM_MODEM_CHARSET_UCS2:
+ case MM_MODEM_CHARSET_PCCP437:
+ case MM_MODEM_CHARSET_PCDN:
+ case MM_MODEM_CHARSET_UTF16:
+ encoded = charset_iconv_from_utf8 (utf8, settings, translit, &encoded_size, error);
+ break;
+ case MM_MODEM_CHARSET_UNKNOWN:
+ default:
+ g_assert_not_reached ();
+ }
- /* Last try; chop off the original string at the conversion failure
- * location and get what we can.
- */
- str[bread] = '\0';
- utf8 = g_convert (str, strlen (str),
- "UTF-8//TRANSLIT", "UTF-8//TRANSLIT",
- NULL, NULL, NULL);
- g_free (str);
- break;
+ return g_byte_array_new_take (encoded, encoded_size);
+}
+
+gchar *
+mm_modem_charset_str_from_utf8 (const gchar *utf8,
+ MMModemCharset charset,
+ gboolean translit,
+ GError **error)
+{
+ g_autoptr(GByteArray) bytearray = NULL;
+
+ if (charset == MM_MODEM_CHARSET_UNKNOWN) {
+ g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+ "Cannot convert from UTF-8: unknown target charset");
+ return NULL;
}
- /* If the given charset is ASCII or UTF8, we really expect the final string
- * already here */
- case MM_MODEM_CHARSET_IRA:
- case MM_MODEM_CHARSET_UTF8:
- utf8 = str;
- break;
+ bytearray = mm_modem_charset_bytearray_from_utf8 (utf8, charset, translit, error);
+ if (!bytearray)
+ return NULL;
+
+ switch (charset) {
+ case MM_MODEM_CHARSET_GSM:
+ /* Note: strings encoded in unpacked GSM-7 can be used as plain
+ * strings as long as the string doesn't contain character '@', which
+ * is the one encoded as 0x00. At this point, we perform transliteration
+ * of the NUL bytes in the GSM-7 bytearray, and we fail the operation
+ * if one or more replacements were done and transliteration wasn't
+ * requested */
+ if (translit_gsm_nul_byte (bytearray) && !translit) {
+ g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+ "Cannot convert to GSM-7 string: transliteration required for embedded '@'");
+ return NULL;
+ }
+ /* fall through */
+ case MM_MODEM_CHARSET_IRA:
+ case MM_MODEM_CHARSET_8859_1:
+ case MM_MODEM_CHARSET_UTF8:
+ case MM_MODEM_CHARSET_PCCP437:
+ case MM_MODEM_CHARSET_PCDN:
+ return (gchar *) g_byte_array_free (g_steal_pointer (&bytearray), FALSE);
+ case MM_MODEM_CHARSET_UCS2:
+ case MM_MODEM_CHARSET_UTF16:
+ return mm_utils_bin2hexstr (bytearray->data, bytearray->len);
+ default:
+ case MM_MODEM_CHARSET_UNKNOWN:
+ g_assert_not_reached ();
}
+}
- /* Validate UTF-8 always before returning. This result will be exposed in DBus
- * very likely... */
- if (utf8 && !g_utf8_validate (utf8, -1, NULL)) {
- /* Better return NULL than an invalid UTF-8 string */
- g_free (utf8);
- utf8 = NULL;
+static gchar *
+charset_iconv_to_utf8 (const guint8 *data,
+ guint32 len,
+ const CharsetSettings *settings,
+ gboolean translit,
+ GError **error)
+{
+ g_autoptr(GError) inner_error = NULL;
+ g_autofree gchar *utf8 = NULL;
+
+ utf8 = g_convert ((const gchar *) data, len,
+ "UTF-8",
+ settings->iconv_name,
+ NULL, NULL, &inner_error);
+ if (utf8)
+ return g_steal_pointer (&utf8);
+
+ if (!translit) {
+ g_propagate_error (error, g_steal_pointer (&inner_error));
+ g_prefix_error (error, "Couldn't convert from %s to UTF-8: ", settings->gsm_name);
+ return NULL;
}
- return utf8;
+ utf8 = g_convert_with_fallback ((const gchar *) data, len,
+ "UTF-8", settings->iconv_name, translit_fallback,
+ NULL, NULL, error);
+ if (utf8)
+ return g_steal_pointer (&utf8);
+
+ g_prefix_error (error, "Couldn't convert from %s to UTF-8 with translit: ", settings->gsm_name);
+ return NULL;
}
-/* We do all our best to convert the given string, which comes in UTF-8, to the
- * specified charset. It may be that the output string needs to be the hex
- * representation of the charset-encoded string, so we need to cope with that
- * case. */
gchar *
-mm_utf8_take_and_convert_to_charset (gchar *str,
- MMModemCharset charset)
+mm_modem_charset_bytearray_to_utf8 (GByteArray *bytearray,
+ MMModemCharset charset,
+ gboolean translit,
+ GError **error)
{
- gchar *encoded = NULL;
+ const CharsetSettings *settings;
+ g_autofree gchar *utf8 = NULL;
- if (!str)
+ if (charset == MM_MODEM_CHARSET_UNKNOWN) {
+ g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+ "Cannot convert from UTF-8: unknown target charset");
return NULL;
+ }
+
+ settings = lookup_charset_settings (charset);
+
+ switch (charset) {
+ case MM_MODEM_CHARSET_GSM:
+ utf8 = (gchar *) charset_gsm_unpacked_to_utf8 (bytearray->data,
+ bytearray->len,
+ translit,
+ error);
+ break;
+ case MM_MODEM_CHARSET_IRA:
+ case MM_MODEM_CHARSET_UTF8:
+ case MM_MODEM_CHARSET_8859_1:
+ case MM_MODEM_CHARSET_PCCP437:
+ case MM_MODEM_CHARSET_PCDN:
+ case MM_MODEM_CHARSET_UCS2:
+ case MM_MODEM_CHARSET_UTF16:
+ utf8 = charset_iconv_to_utf8 (bytearray->data,
+ bytearray->len,
+ settings,
+ translit,
+ error);
+ break;
+ case MM_MODEM_CHARSET_UNKNOWN:
+ default:
+ g_assert_not_reached ();
+ }
- /* Validate UTF-8 always before converting */
- if (!g_utf8_validate (str, -1, NULL)) {
- /* Better return NULL than an invalid encoded string */
- g_free (str);
+ if (utf8 && g_utf8_validate (utf8, -1, NULL))
+ return g_steal_pointer (&utf8);
+
+ g_prefix_error (error, "Invalid conversion from %s to UTF-8: ", settings->gsm_name);
+ return NULL;
+}
+
+gchar *
+mm_modem_charset_str_to_utf8 (const gchar *str,
+ gssize len,
+ MMModemCharset charset,
+ gboolean translit,
+ GError **error)
+{
+ g_autoptr(GByteArray) bytearray = NULL;
+
+ if (charset == MM_MODEM_CHARSET_UNKNOWN) {
+ g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+ "Cannot convert from UTF-8: unknown target charset");
return NULL;
}
+ /* Note: if the input string is GSM-7 encoded and it contains the '@'
+ * character, using -1 to indicate string length won't work properly,
+ * as '@' is encoded as 0x00. Whenever possible, if using GSM-7,
+ * give a proper len value or otherwise use the bytearray_to_utf8()
+ * method instead. */
+ if (len < 0)
+ len = strlen (str);
+
switch (charset) {
- case MM_MODEM_CHARSET_UNKNOWN:
- g_warn_if_reached ();
- encoded = str;
- break;
-
- case MM_MODEM_CHARSET_HEX:
- /* FIXME: What encoding is this? */
- g_warn_if_reached ();
- encoded = str;
- break;
-
- case MM_MODEM_CHARSET_GSM:
- case MM_MODEM_CHARSET_8859_1:
- case MM_MODEM_CHARSET_PCCP437:
- case MM_MODEM_CHARSET_PCDN: {
- const gchar *iconv_to;
- GError *error = NULL;
-
- iconv_to = charset_iconv_from (charset);
- encoded = g_convert (str, strlen (str),
- iconv_to, "UTF-8",
- NULL, NULL, &error);
- if (!encoded || error) {
- g_clear_error (&error);
- encoded = NULL;
- }
+ case MM_MODEM_CHARSET_GSM:
+ case MM_MODEM_CHARSET_IRA:
+ case MM_MODEM_CHARSET_8859_1:
+ case MM_MODEM_CHARSET_UTF8:
+ case MM_MODEM_CHARSET_PCCP437:
+ case MM_MODEM_CHARSET_PCDN:
+ bytearray = g_byte_array_sized_new (len);
+ g_byte_array_append (bytearray, (const guint8 *)str, len);
+ break;
+ case MM_MODEM_CHARSET_UCS2:
+ case MM_MODEM_CHARSET_UTF16: {
+ guint8 *bin = NULL;
+ gsize bin_len;
- g_free (str);
- break;
+ bin = (guint8 *) mm_utils_hexstr2bin (str, len, &bin_len, error);
+ if (!bin)
+ return NULL;
+
+ bytearray = g_byte_array_new_take (bin, bin_len);
+ break;
+ }
+ case MM_MODEM_CHARSET_UNKNOWN:
+ default:
+ g_assert_not_reached ();
}
- case MM_MODEM_CHARSET_UCS2: {
- const gchar *iconv_to;
- gsize encoded_len = 0;
- GError *error = NULL;
- gchar *hex;
-
- iconv_to = charset_iconv_from (charset);
- encoded = g_convert (str, strlen (str),
- iconv_to, "UTF-8",
- NULL, &encoded_len, &error);
- if (!encoded || error) {
- g_clear_error (&error);
- encoded = NULL;
+ return mm_modem_charset_bytearray_to_utf8 (bytearray, charset, translit, error);
+}
+
+/******************************************************************************/
+/* Runtime charset support via iconv() */
+
+void
+mm_modem_charsets_init (void)
+{
+ /* As test string, something we can convert to/from all the encodings */
+ static const gchar *default_test_str = "ModemManager";
+ guint i;
+
+ mm_obj_dbg (NULL, "[charsets] detecting platform iconv() support...");
+ for (i = 0; i < G_N_ELEMENTS (charset_settings); i++) {
+ g_autofree guint8 *enc = NULL;
+ guint enc_size;
+ g_autofree gchar *dec = NULL;
+
+ if (!charset_settings[i].iconv_name)
+ continue;
+
+ enc = charset_iconv_from_utf8 (default_test_str,
+ &charset_settings[i],
+ FALSE,
+ &enc_size,
+ NULL);
+ if (!enc) {
+ mm_obj_dbg (NULL, "[charsets] %s: iconv conversion to charset not supported", charset_settings[i].iconv_name);
+ continue;
}
- /* Get hex representation of the string */
- hex = mm_utils_bin2hexstr ((guint8 *)encoded, encoded_len);
- g_free (encoded);
- encoded = hex;
- g_free (str);
- break;
- }
+ dec = charset_iconv_to_utf8 (enc,
+ enc_size,
+ &charset_settings[i],
+ FALSE,
+ NULL);
+ if (!enc) {
+ mm_obj_dbg (NULL, "[charsets] %s: iconv conversion from charset not supported", charset_settings[i].iconv_name);
+ continue;
+ }
- /* If the given charset is ASCII or UTF8, we really expect the final string
- * already here. */
- case MM_MODEM_CHARSET_IRA:
- case MM_MODEM_CHARSET_UTF8:
- encoded = str;
- break;
+ mm_obj_dbg (NULL, "[charsets] %s: iconv conversion to/from charset is supported", charset_settings[i].iconv_name);
}
-
- return encoded;
}