1 files changed, 492 insertions, 432 deletions
diff --git a/src/mm-charsets.c b/src/mm-charsets.c
index ffdcad52..a48da368 100644
--- a/src/mm-charsets.c
+++ b/src/mm-charsets.c
@@ -11,6 +11,7 @@
  * GNU General Public License for more details:
  *
  * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (C) 2020 Aleksander Morgado <aleksander@aleksander.es>
  */
 
 #include <config.h>
@@ -24,204 +25,79 @@
 #include <libmm-glib.h>
 
 #include "mm-charsets.h"
+#include "mm-log.h"
 
-typedef struct {
-    const char *gsm_name;
-    const char *other_name;
-    const char *iconv_from_name;
-    const char *iconv_to_name;
-    MMModemCharset charset;
-} CharsetEntry;
-
-static CharsetEntry charset_map[] = {
-    { "UTF-8",   "UTF8",   "UTF-8",     "UTF-8//TRANSLIT",     MM_MODEM_CHARSET_UTF8 },
-    { "UCS2",    NULL,     "UCS-2BE",   "UCS-2BE//TRANSLIT",   MM_MODEM_CHARSET_UCS2 },
-    { "IRA",     "ASCII",  "ASCII",     "ASCII//TRANSLIT",     MM_MODEM_CHARSET_IRA },
-    { "GSM",     NULL,     NULL,        NULL,                  MM_MODEM_CHARSET_GSM },
-    { "8859-1",  NULL,     "ISO8859-1", "ISO8859-1//TRANSLIT", MM_MODEM_CHARSET_8859_1 },
-    { "PCCP437", "CP437",  "CP437",     "CP437//TRANSLIT",     MM_MODEM_CHARSET_PCCP437 },
-    { "PCDN",    "CP850",  "CP850",     "CP850//TRANSLIT",     MM_MODEM_CHARSET_PCDN },
-    { "HEX",     NULL,     NULL,        NULL,                  MM_MODEM_CHARSET_HEX },
-    { NULL,      NULL,     NULL,        NULL,                  MM_MODEM_CHARSET_UNKNOWN }
-};
-
-const char *
-mm_modem_charset_to_string (MMModemCharset charset)
-{
-    CharsetEntry *iter = &charset_map[0];
+/* Common fallback character when transliteration is enabled */
+static const gchar *translit_fallback = "?";
 
-    g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL);
+/******************************************************************************/
+/* Expected charset settings */
 
-    while (iter->gsm_name) {
-        if (iter->charset == charset)
-            return iter->gsm_name;
-        iter++;
-    }
-    g_warn_if_reached ();
-    return NULL;
-}
+typedef struct {
+    MMModemCharset  charset;
+    const gchar    *gsm_name;
+    const gchar    *other_name;
+    const gchar    *iconv_name;
+} CharsetSettings;
+
+static const CharsetSettings charset_settings[] = {
+    { MM_MODEM_CHARSET_UTF8,    "UTF-8",   "UTF8",   "UTF-8"     },
+    { MM_MODEM_CHARSET_UCS2,    "UCS2",    NULL,     "UCS-2BE"   },
+    { MM_MODEM_CHARSET_IRA,     "IRA",     "ASCII",  "ASCII"     },
+    { MM_MODEM_CHARSET_GSM,     "GSM",     NULL,     NULL        },
+    { MM_MODEM_CHARSET_8859_1,  "8859-1",  NULL,     "ISO8859-1" },
+    { MM_MODEM_CHARSET_PCCP437, "PCCP437", "CP437",  "CP437"     },
+    { MM_MODEM_CHARSET_PCDN,    "PCDN",    "CP850",  "CP850"     },
+    { MM_MODEM_CHARSET_UTF16,   "UTF-16",  "UTF16",  "UTF-16BE"  },
+};
 
 MMModemCharset
-mm_modem_charset_from_string (const char *string)
+mm_modem_charset_from_string (const gchar *string)
 {
-    CharsetEntry *iter = &charset_map[0];
+    guint i;
 
     g_return_val_if_fail (string != NULL, MM_MODEM_CHARSET_UNKNOWN);
 
-    while (iter->gsm_name) {
-        if (strcasestr (string, iter->gsm_name))
-            return iter->charset;
-        if (iter->other_name && strcasestr (string, iter->other_name))
-            return iter->charset;
-        iter++;
+    for (i = 0; i < G_N_ELEMENTS (charset_settings); i++) {
+        if (strcasestr (string, charset_settings[i].gsm_name))
+            return charset_settings[i].charset;
+        if (charset_settings[i].other_name && strcasestr (string, charset_settings[i].other_name))
+            return charset_settings[i].charset;
     }
     return MM_MODEM_CHARSET_UNKNOWN;
 }
 
-static const char *
-charset_iconv_to (MMModemCharset charset)
+static const CharsetSettings *
+lookup_charset_settings (MMModemCharset charset)
 {
-    CharsetEntry *iter = &charset_map[0];
+    guint i;
 
     g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL);
-
-    while (iter->gsm_name) {
-        if (iter->charset == charset)
-            return iter->iconv_to_name;
-        iter++;
-    }
-    g_warn_if_reached ();
-    return NULL;
-}
-
-static const char *
-charset_iconv_from (MMModemCharset charset)
-{
-    CharsetEntry *iter = &charset_map[0];
-
-    g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL);
-
-    while (iter->gsm_name) {
-        if (iter->charset == charset)
-            return iter->iconv_from_name;
-        iter++;
+    for (i = 0; i < G_N_ELEMENTS (charset_settings); i++) {
+        if (charset_settings[i].charset == charset)
+            return &charset_settings[i];
     }
     g_warn_if_reached ();
     return NULL;
 }
 
-gboolean
-mm_modem_charset_byte_array_append (GByteArray *array,
-                                    const char *utf8,
-                                    gboolean quoted,
-                                    MMModemCharset charset)
-{
-    const char *iconv_to;
-    char *converted;
-    GError *error = NULL;
-    gsize written = 0;
-
-    g_return_val_if_fail (array != NULL, FALSE);
-    g_return_val_if_fail (utf8 != NULL, FALSE);
-
-    iconv_to = charset_iconv_to (charset);
-    g_return_val_if_fail (iconv_to != NULL, FALSE);
-
-    converted = g_convert (utf8, -1, iconv_to, "UTF-8", NULL, &written, &error);
-    if (!converted) {
-        if (error) {
-            g_warning ("%s: failed to convert '%s' to %s character set: (%d) %s",
-                       __func__, utf8, iconv_to,
-                       error->code, error->message);
-            g_error_free (error);
-        }
-        return FALSE;
-    }
-
-    if (quoted)
-        g_byte_array_append (array, (const guint8 *) "\"", 1);
-    g_byte_array_append (array, (const guint8 *) converted, written);
-    if (quoted)
-        g_byte_array_append (array, (const guint8 *) "\"", 1);
-
-    g_free (converted);
-    return TRUE;
-}
-
-char *
-mm_modem_charset_hex_to_utf8 (const char *src, MMModemCharset charset)
-{
-    char *unconverted, *converted;
-    const char *iconv_from;
-    gsize unconverted_len = 0;
-    GError *error = NULL;
-
-    g_return_val_if_fail (src != NULL, NULL);
-    g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL);
-
-    iconv_from = charset_iconv_from (charset);
-    g_return_val_if_fail (iconv_from != NULL, FALSE);
-
-    unconverted = mm_utils_hexstr2bin (src, &unconverted_len);
-    if (!unconverted)
-        return NULL;
-
-    if (charset == MM_MODEM_CHARSET_UTF8 || charset == MM_MODEM_CHARSET_IRA)
-        return unconverted;
-
-    converted = g_convert (unconverted, unconverted_len,
-                           "UTF-8//TRANSLIT", iconv_from,
-                           NULL, NULL, &error);
-    if (!converted || error) {
-        g_clear_error (&error);
-        converted = NULL;
-    }
-
-    g_free (unconverted);
-
-    return converted;
-}
-
-char *
-mm_modem_charset_utf8_to_hex (const char *src, MMModemCharset charset)
+const gchar *
+mm_modem_charset_to_string (MMModemCharset charset)
 {
-    gsize converted_len = 0;
-    char *converted;
-    const char *iconv_to;
-    GError *error = NULL;
-    gchar *hex;
-
-    g_return_val_if_fail (src != NULL, NULL);
-    g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, NULL);
-
-    iconv_to = charset_iconv_from (charset);
-    g_return_val_if_fail (iconv_to != NULL, FALSE);
-
-    if (charset == MM_MODEM_CHARSET_UTF8 || charset == MM_MODEM_CHARSET_IRA)
-        return g_strdup (src);
-
-    converted = g_convert (src, strlen (src),
-                           iconv_to, "UTF-8//TRANSLIT",
-                           NULL, &converted_len, &error);
-    if (!converted || error) {
-        g_clear_error (&error);
-        g_free (converted);
-        return NULL;
-    }
+    const CharsetSettings *settings;
 
-    /* Get hex representation of the string */
-    hex = mm_utils_bin2hexstr ((guint8 *)converted, converted_len);
-    g_free (converted);
-    return hex;
+    settings = lookup_charset_settings (charset);
+    return settings ? settings->gsm_name : NULL;
 }
 
+/******************************************************************************/
 /* GSM 03.38 encoding conversion stuff */
 
 #define GSM_DEF_ALPHABET_SIZE 128
 #define GSM_EXT_ALPHABET_SIZE 10
 
 typedef struct GsmUtf8Mapping {
-    gchar chars[3];
+    gchar  chars[3];
     guint8 len;
     guint8 gsm;  /* only used for extended GSM charset */
 } GsmUtf8Mapping;
@@ -305,7 +181,8 @@ static const GsmUtf8Mapping gsm_def_utf8_alphabet[GSM_DEF_ALPHABET_SIZE] = {
 };
 
 static guint8
-gsm_def_char_to_utf8 (const guint8 gsm, guint8 out_utf8[2])
+gsm_def_char_to_utf8 (const guint8 gsm,
+                      guint8       out_utf8[2])
 {
     g_return_val_if_fail (gsm < GSM_DEF_ALPHABET_SIZE, 0);
     memcpy (&out_utf8[0], &gsm_def_utf8_alphabet[gsm].chars[0], gsm_def_utf8_alphabet[gsm].len);
@@ -313,9 +190,11 @@ gsm_def_char_to_utf8 (const guint8 gsm, guint8 out_utf8[2])
 }
 
 static gboolean
-utf8_to_gsm_def_char (const char *utf8, guint32 len, guint8 *out_gsm)
+utf8_to_gsm_def_char (const gchar *utf8,
+                      guint32      len,
+                      guint8      *out_gsm)
 {
-    int i;
+    gint i;
 
     if (len > 0 && len < 4) {
         for (i = 0; i < GSM_DEF_ALPHABET_SIZE; i++) {
@@ -330,6 +209,22 @@ utf8_to_gsm_def_char (const char *utf8, guint32 len, guint8 *out_gsm)
     return FALSE;
 }
 
+static gboolean
+translit_gsm_nul_byte (GByteArray *gsm)
+{
+    gboolean replaced = FALSE;
+    guint    idx;
+
+    /* Overwrite each 0x00 byte with the GSM code of the fallback character */
+    for (idx = 0; idx < gsm->len; idx++) {
+        if (gsm->data[idx] != 0x00)
+            continue;
+        utf8_to_gsm_def_char (translit_fallback, strlen (translit_fallback), &gsm->data[idx]);
+        replaced = TRUE;
+    }
+    return replaced;
+}
+
 
 #define EONE(a, g)        { {a, 0x00, 0x00}, 1, g }
 #define ETHR(a, b, c, g)  { {a, b,    c},    3, g }
@@ -352,7 +247,8 @@ static const GsmUtf8Mapping gsm_ext_utf8_alphabet[GSM_EXT_ALPHABET_SIZE] = {
 #define GSM_ESCAPE_CHAR 0x1b
 
 static guint8
-gsm_ext_char_to_utf8 (const guint8 gsm, guint8 out_utf8[3])
+gsm_ext_char_to_utf8 (const guint8 gsm,
+                      guint8       out_utf8[3])
 {
     int i;
 
@@ -366,7 +262,9 @@ gsm_ext_char_to_utf8 (const guint8 gsm, guint8 out_utf8[3])
 }
 
 static gboolean
-utf8_to_gsm_ext_char (const char *utf8, guint32 len, guint8 *out_gsm)
+utf8_to_gsm_ext_char (const gchar *utf8,
+                      guint32      len,
+                      guint8      *out_gsm)
 {
     int i;
 
@@ -383,11 +281,14 @@ utf8_to_gsm_ext_char (const char *utf8, guint32 len, guint8 *out_gsm)
     return FALSE;
 }
 
-guint8 *
-mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm, guint32 len)
+static guint8 *
+charset_gsm_unpacked_to_utf8 (const guint8  *gsm,
+                              guint32        len,
+                              gboolean       translit,
+                              GError       **error)
 {
-    int i;
-    GByteArray *utf8;
+    g_autoptr(GByteArray) utf8 = NULL;
+    guint                 i;
 
     g_return_val_if_fail (gsm != NULL, NULL);
     g_return_val_if_fail (len < 4096, NULL);
@@ -399,6 +300,28 @@ mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm, guint32 len)
         guint8 uchars[4];
         guint8 ulen;
 
+        /*
+         * From http://unicode.org/Public/MAPPINGS/ETSI/GSM0338.TXT:
+         *   0x00 is NULL when followed only by 0x00 up to the end of the
+         *   (fixed byte length) message, possibly also up to FORM FEED.
+         *   But 0x00 is also the code for COMMERCIAL AT when some other
+         *   character (CARRIAGE RETURN if nothing else) comes after the
+         *   0x00.
+         *
+         * So, if we find a 0x00 and every remaining byte up to the end of the
+         * input is also 0x00 (or none remain), consider the string finished.
+         */
+        if (gsm[i] == 0x00) {
+            gsize j;
+
+            for (j = i + 1; j < len; j++) {
+                if (gsm[j] != 0x00)
+                    break;
+            }
+            if (j == len)
+                break;
+        }
+
         if (gsm[i] == GSM_ESCAPE_CHAR) {
             /* Extended alphabet, decode next char */
             ulen = gsm_ext_char_to_utf8 (gsm[i+1], uchars);
@@ -411,25 +334,36 @@ mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm, guint32 len)
 
         if (ulen)
             g_byte_array_append (utf8, &uchars[0], ulen);
-        else
-            g_byte_array_append (utf8, (guint8 *) "?", 1);
+        else if (translit)
+            g_byte_array_append (utf8, (guint8 *) translit_fallback, strlen (translit_fallback));
+        else {
+            g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+                         "Invalid conversion from GSM7");
+            return NULL;
+        }
     }
 
-    g_byte_array_append (utf8, (guint8 *) "\0", 1);  /* NULL terminator */
-    return g_byte_array_free (utf8, FALSE);
+    /* Always make sure returned string is NUL terminated */
+    g_byte_array_append (utf8, (guint8 *) "\0", 1);
+    return g_byte_array_free (g_steal_pointer (&utf8), FALSE);
 }
 
-guint8 *
-mm_charset_utf8_to_unpacked_gsm (const char *utf8, guint32 *out_len)
+static guint8 *
+charset_utf8_to_unpacked_gsm (const gchar  *utf8,
+                              gboolean      translit,
+                              guint32      *out_len,
+                              GError      **error)
 {
-    GByteArray *gsm;
-    const char *c = utf8, *next = c;
-    static const guint8 gesc = GSM_ESCAPE_CHAR;
-    int i = 0;
-
-    g_return_val_if_fail (utf8 != NULL, NULL);
-    g_return_val_if_fail (out_len != NULL, NULL);
-    g_return_val_if_fail (g_utf8_validate (utf8, -1, NULL), NULL);
+    g_autoptr(GByteArray)  gsm = NULL;
+    const gchar           *c;
+    const gchar           *next;
+    static const guint8    gesc = GSM_ESCAPE_CHAR;
+
+    if (!utf8 || !g_utf8_validate (utf8, -1, NULL)) {
+        g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+                     "Couldn't convert UTF-8 to GSM: input UTF-8 validation failed");
+        return NULL;
+    }
 
     /* worst case initial length */
     gsm = g_byte_array_sized_new (g_utf8_strlen (utf8, -1) * 2 + 1);
@@ -437,10 +371,13 @@ mm_charset_utf8_to_unpacked_gsm (const char *utf8, guint32 *out_len)
     if (*utf8 == 0x00) {
         /* Zero-length string */
         g_byte_array_append (gsm, (guint8 *) "\0", 1);
-        *out_len = 0;
-        return g_byte_array_free (gsm, FALSE);
+        if (out_len)
+            *out_len = 0;
+        return g_byte_array_free (g_steal_pointer (&gsm), FALSE);
     }
 
+    next = utf8;
+    c    = utf8;
     while (next && *next) {
         guint8 gch = 0x3f;  /* 0x3f == '?' */
 
@@ -451,55 +388,83 @@ mm_charset_utf8_to_unpacked_gsm (const char *utf8, guint32 *out_len)
             /* Add the escape char */
             g_byte_array_append (gsm, &gesc, 1);
             g_byte_array_append (gsm, &gch, 1);
-        } else if (utf8_to_gsm_def_char (c, next - c, &gch))
+        } else if (utf8_to_gsm_def_char (c, next - c, &gch)) {
+            g_byte_array_append (gsm, &gch, 1);
+        } else if (translit) {
+            /* add ? */
             g_byte_array_append (gsm, &gch, 1);
+        } else {
+            g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+                         "Couldn't convert UTF-8 char to GSM");
+            return NULL;
+        }
 
         c = next;
-        i++;
     }
 
-    *out_len = gsm->len;
-    return g_byte_array_free (gsm, FALSE);
+    /* Output length doesn't consider terminating NUL byte */
+    if (out_len)
+        *out_len = gsm->len;
+
+    /* Always make sure returned string is NUL terminated */
+    g_byte_array_append (gsm, (guint8 *) "\0", 1);
+    return g_byte_array_free (g_steal_pointer (&gsm), FALSE);
 }
 
+/******************************************************************************/
+/* Checks to see whether conversion to a target charset may be done without
+ * any loss. */
+
 static gboolean
-gsm_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
+gsm_is_subset (gunichar     c,
+               const gchar *utf8,
+               gsize        ulen)
 {
     guint8 gsm;
 
-    *out_clen = 1;
     if (utf8_to_gsm_def_char (utf8, ulen, &gsm))
         return TRUE;
-    if (utf8_to_gsm_ext_char (utf8, ulen, &gsm)) {
-        *out_clen = 2;
+    if (utf8_to_gsm_ext_char (utf8, ulen, &gsm))
         return TRUE;
-    }
     return FALSE;
 }
 
 static gboolean
-ira_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
+ira_is_subset (gunichar     c,
+               const gchar *utf8,
+               gsize        ulen)
 {
-    *out_clen = 1;
     return (ulen == 1);
 }
 
 static gboolean
-ucs2_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
+ucs2_is_subset (gunichar     c,
+                const gchar *utf8,
+                gsize        ulen)
 {
-    *out_clen = 2;
     return (c <= 0xFFFF);
 }
 
 static gboolean
-iso88591_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
+utf16_is_subset (gunichar     c,
+                 const gchar *utf8,
+                 gsize        ulen)
+{
+    return TRUE;
+}
+
+static gboolean
+iso88591_is_subset (gunichar     c,
+                    const gchar *utf8,
+                    gsize        ulen)
 {
-    *out_clen = 1;
     return (c <= 0xFF);
 }
 
 static gboolean
-pccp437_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
+pccp437_is_subset (gunichar     c,
+                   const gchar *utf8,
+                   gsize        ulen)
 {
     static const gunichar t[] = {
         0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea,
@@ -518,13 +483,11 @@ pccp437_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
         0x2321, 0x00f7, 0x2248, 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2,
         0x25a0, 0x00a0
     };
-    int i;
-
-    *out_clen = 1;
+    guint i;
 
     if (c <= 0x7F)
         return TRUE;
-    for (i = 0; i < sizeof (t) / sizeof (t[0]); i++) {
+    for (i = 0; i < G_N_ELEMENTS (t); i++) {
         if (c == t[i])
             return TRUE;
     }
@@ -532,7 +495,9 @@ pccp437_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
 }
 
 static gboolean
-pcdn_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
+pcdn_is_subset (gunichar     c,
+                const gchar *utf8,
+                gsize        ulen)
 {
     static const gunichar t[] = {
         0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea,
@@ -551,9 +516,7 @@ pcdn_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
         0x00a7, 0x00f7, 0x00b8, 0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2,
         0x25a0, 0x00a0
     };
-    int i;
-
-    *out_clen = 1;
+    guint i;
 
     if (c <= 0x7F)
         return TRUE;
@@ -566,84 +529,75 @@ pcdn_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
 
 typedef struct {
     MMModemCharset cs;
-    gboolean (*func) (gunichar c, const char *utf8, gsize ulen, guint *out_clen);
-    guint charsize;
+    gboolean (*func) (gunichar     c,
+                      const gchar *utf8,
+                      gsize        ulen);
 } SubsetEntry;
 
-SubsetEntry subset_table[] = {
-    { MM_MODEM_CHARSET_GSM,     gsm_is_subset },
-    { MM_MODEM_CHARSET_IRA,     ira_is_subset },
-    { MM_MODEM_CHARSET_UCS2,    ucs2_is_subset },
+static const SubsetEntry subset_table[] = {
+    { MM_MODEM_CHARSET_GSM,     gsm_is_subset      },
+    { MM_MODEM_CHARSET_IRA,     ira_is_subset      },
+    { MM_MODEM_CHARSET_UCS2,    ucs2_is_subset     },
+    { MM_MODEM_CHARSET_UTF16,   utf16_is_subset    },
     { MM_MODEM_CHARSET_8859_1,  iso88591_is_subset },
-    { MM_MODEM_CHARSET_PCCP437, pccp437_is_subset },
-    { MM_MODEM_CHARSET_PCDN,    pcdn_is_subset },
-    { MM_MODEM_CHARSET_UNKNOWN, NULL },
+    { MM_MODEM_CHARSET_PCCP437, pccp437_is_subset  },
+    { MM_MODEM_CHARSET_PCDN,    pcdn_is_subset     },
 };
 
-/**
- * mm_charset_get_encoded_len:
- *
- * @utf8: UTF-8 valid string
- * @charset: the #MMModemCharset to check the length of @utf8 in
- * @out_unsupported: on return, number of characters of @utf8 that are not fully
- * representable in @charset
- *
- * Returns: the size in bytes of the string if converted from UTF-8 into @charset.
- **/
-guint
-mm_charset_get_encoded_len (const char *utf8,
-                            MMModemCharset charset,
-                            guint *out_unsupported)
+gboolean
+mm_charset_can_convert_to (const gchar    *utf8,
+                           MMModemCharset  charset)
 {
-    const char *p = utf8, *next;
-    guint len = 0, unsupported = 0;
-    SubsetEntry *e;
+    const gchar *p;
+    guint        i;
 
-    g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, 0);
-    g_return_val_if_fail (utf8 != NULL, 0);
+    g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, FALSE);
+    g_return_val_if_fail (utf8 != NULL, FALSE);
 
     if (charset == MM_MODEM_CHARSET_UTF8)
-        return strlen (utf8);
+        return TRUE;
 
     /* Find the charset in our subset table */
-    for (e = &subset_table[0];
-         e->cs != charset && e->cs != MM_MODEM_CHARSET_UNKNOWN;
-         e++);
-    g_return_val_if_fail (e->cs != MM_MODEM_CHARSET_UNKNOWN, 0);
+    for (i = 0; i < G_N_ELEMENTS (subset_table); i++) {
+        if (subset_table[i].cs == charset)
+            break;
+    }
+    g_return_val_if_fail (i < G_N_ELEMENTS (subset_table), FALSE);
 
+    p = utf8;
     while (*p) {
         gunichar c;
         const char *end;
-        guint clen = 0;
 
         c = g_utf8_get_char_validated (p, -1);
-        g_return_val_if_fail (c != (gunichar) -1, 0);
-        end = next = g_utf8_find_next_char (p, NULL);
+        /* -1: invalid sequence; -2: truncated sequence at the end of the string */
+        g_return_val_if_fail (c != (gunichar) -1 && c != (gunichar) -2, FALSE);
+        end = g_utf8_find_next_char (p, NULL);
         if (end == NULL) {
-            /* Find the end... */
+            /* Find the string terminating NUL byte */
             end = p;
-            while (*end++);
+            while (*++end);
         }
 
-        if (!e->func (c, p, (end - p), &clen))
-            unsupported++;
-        len += clen;
-        p = next;
+        if (!subset_table[i].func (c, p, (end - p)))
+            return FALSE;
+        p = end;
     }
 
-    if (out_unsupported)
-        *out_unsupported = unsupported;
-    return len;
+    return TRUE;
 }
 
+/******************************************************************************/
+/* GSM-7 pack/unpack operations */
+
 guint8 *
-gsm_unpack (const guint8 *gsm,
-            guint32 num_septets,
-            guint8 start_offset,  /* in _bits_ */
-            guint32 *out_unpacked_len)
+mm_charset_gsm_unpack (const guint8 *gsm,
+                       guint32       num_septets,
+                       guint8        start_offset,  /* in _bits_ */
+                       guint32      *out_unpacked_len)
 {
     GByteArray *unpacked;
-    int i;
+    guint i;
 
     unpacked = g_byte_array_sized_new (num_septets + 1);
 
@@ -673,14 +627,14 @@ gsm_unpack (const guint8 *gsm,
 }
 
 guint8 *
-gsm_pack (const guint8 *src,
-          guint32 src_len,
-          guint8 start_offset,
-          guint32 *out_packed_len)
+mm_charset_gsm_pack (const guint8 *src,
+                     guint32       src_len,
+                     guint8        start_offset,
+                     guint32      *out_packed_len)
 {
     guint8 *packed;
     guint octet = 0, lshift, plen;
-    int i = 0;
+    guint i = 0;
 
     g_return_val_if_fail (start_offset < 8, NULL);
 
@@ -708,202 +662,308 @@ gsm_pack (const guint8 *src,
     return packed;
 }
 
-/* We do all our best to get the given string, which is possibly given in the
- * specified charset, to UTF8. It may happen that the given string is really
- * the hex representation of the charset-encoded string, so we need to cope with
- * that case. */
-gchar *
-mm_charset_take_and_convert_to_utf8 (gchar *str, MMModemCharset charset)
+/*****************************************************************************/
+/* Main conversion functions */
+
+static guint8 *
+charset_iconv_from_utf8 (const gchar            *utf8,
+                         const CharsetSettings  *settings,
+                         gboolean                translit,
+                         guint                  *out_size,
+                         GError                **error)
 {
-    gchar *utf8 = NULL;
+    g_autoptr(GError)      inner_error = NULL;
+    gsize                  bytes_written = 0;
+    g_autofree guint8     *encoded = NULL;
+
+    encoded = (guint8 *) g_convert (utf8, -1,
+                                    settings->iconv_name, "UTF-8",
+                                    NULL, &bytes_written, &inner_error);
+    if (encoded) {
+        if (out_size)
+            *out_size = (guint) bytes_written;
+        return g_steal_pointer (&encoded);
+    }
 
-    if (!str)
+    if (!translit) {
+        g_propagate_error (error, g_steal_pointer (&inner_error));
+        g_prefix_error (error, "Couldn't convert from UTF-8 to %s: ", settings->gsm_name);
         return NULL;
-
-    switch (charset) {
-    case MM_MODEM_CHARSET_UNKNOWN:
-        g_warn_if_reached ();
-        utf8 = str;
-        break;
-
-    case MM_MODEM_CHARSET_HEX:
-        /* We'll assume that the HEX string is really valid ASCII at the end */
-        utf8 = str;
-        break;
-
-    case MM_MODEM_CHARSET_GSM:
-    case MM_MODEM_CHARSET_8859_1:
-    case MM_MODEM_CHARSET_PCCP437:
-    case MM_MODEM_CHARSET_PCDN: {
-        const gchar *iconv_from;
-        GError *error = NULL;
-
-        iconv_from = charset_iconv_from (charset);
-        utf8 = g_convert (str, strlen (str),
-                          "UTF-8//TRANSLIT", iconv_from,
-                          NULL, NULL, &error);
-        if (!utf8 || error) {
-            g_clear_error (&error);
-            utf8 = NULL;
-        }
-
-        g_free (str);
-        break;
     }
 
-    case MM_MODEM_CHARSET_UCS2: {
-        gsize len;
-        gboolean possibly_hex = TRUE;
-        gsize bread = 0, bwritten = 0;
+    encoded = (guint8 *) g_convert_with_fallback (utf8, -1,
+                                                  settings->iconv_name, "UTF-8", translit_fallback,
+                                                  NULL, &bytes_written, error);
+    if (encoded) {
+        if (out_size)
+            *out_size = (guint) bytes_written;
+        return g_steal_pointer (&encoded);
+    }
 
-        /* If the string comes in hex-UCS-2, len needs to be a multiple of 4 */
-        len = strlen (str);
-        if ((len < 4) || ((len % 4) != 0))
-            possibly_hex = FALSE;
-        else {
-            const gchar *p = str;
+    g_prefix_error (error, "Couldn't convert from UTF-8 to %s with translit: ", settings->gsm_name);
+    return NULL;
+}
 
-            /* All chars in the string must be hex */
-            while (*p && possibly_hex)
-                possibly_hex = isxdigit (*p++);
-        }
+GByteArray *
+mm_modem_charset_bytearray_from_utf8 (const gchar     *utf8,
+                                      MMModemCharset   charset,
+                                      gboolean         translit,
+                                      GError         **error)
+{
+    const CharsetSettings *settings;
+    guint8                *encoded = NULL;
+    guint                  encoded_size = 0;
 
-        /* If hex, then we expect hex-encoded UCS-2 */
-        if (possibly_hex) {
-            utf8 = mm_modem_charset_hex_to_utf8 (str, charset);
-            if (utf8) {
-                g_free (str);
-                break;
-            }
-        }
+    if (charset == MM_MODEM_CHARSET_UNKNOWN) {
+        g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+                     "Cannot convert from UTF-8: unknown target charset");
+        return NULL;
+    }
 
-        /* If not hex, then it might be raw UCS-2 (very unlikely) or ASCII/UTF-8
-         * (much more likely).  Try to convert to UTF-8 and if that fails, use
-         * the partial conversion length to re-convert the part of the string
-         * that is UTF-8, if any.
-         */
-        utf8 = g_convert (str, strlen (str),
-                          "UTF-8//TRANSLIT", "UTF-8//TRANSLIT",
-                          &bread, &bwritten, NULL);
+    settings = lookup_charset_settings (charset);
 
-        /* Valid conversion, or we didn't get enough valid UTF-8 */
-        if (utf8 || (bwritten <= 2)) {
-            g_free (str);
+    switch (charset) {
+        case MM_MODEM_CHARSET_GSM:
+            encoded = charset_utf8_to_unpacked_gsm (utf8, translit, &encoded_size, error);
             break;
-        }
+        case MM_MODEM_CHARSET_IRA:
+        case MM_MODEM_CHARSET_8859_1:
+        case MM_MODEM_CHARSET_UTF8:
+        case MM_MODEM_CHARSET_UCS2:
+        case MM_MODEM_CHARSET_PCCP437:
+        case MM_MODEM_CHARSET_PCDN:
+        case MM_MODEM_CHARSET_UTF16:
+            encoded = charset_iconv_from_utf8 (utf8, settings, translit, &encoded_size, error);
+            break;
+        case MM_MODEM_CHARSET_UNKNOWN:
+        default:
+            g_assert_not_reached ();
+    }
 
-        /* Last try; chop off the original string at the conversion failure
-         * location and get what we can.
-         */
-        str[bread] = '\0';
-        utf8 = g_convert (str, strlen (str),
-                          "UTF-8//TRANSLIT", "UTF-8//TRANSLIT",
-                          NULL, NULL, NULL);
-        g_free (str);
-        break;
+    return encoded ? g_byte_array_new_take (encoded, encoded_size) : NULL;
+}
+
+gchar *
+mm_modem_charset_str_from_utf8 (const gchar     *utf8,
+                                MMModemCharset   charset,
+                                gboolean         translit,
+                                GError         **error)
+{
+    g_autoptr(GByteArray) bytearray = NULL;
+
+    if (charset == MM_MODEM_CHARSET_UNKNOWN) {
+        g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+                     "Cannot convert from UTF-8: unknown target charset");
+        return NULL;
     }
 
-    /* If the given charset is ASCII or UTF8, we really expect the final string
-     * already here */
-    case MM_MODEM_CHARSET_IRA:
-    case MM_MODEM_CHARSET_UTF8:
-        utf8 = str;
-        break;
+    bytearray = mm_modem_charset_bytearray_from_utf8 (utf8, charset, translit, error);
+    if (!bytearray)
+        return NULL;
+
+    switch (charset) {
+        case MM_MODEM_CHARSET_GSM:
+            /* Note: strings encoded in unpacked GSM-7 can be used as plain
+             * strings as long as the string doesn't contain character '@', which
+             * is the one encoded as 0x00. At this point, we perform transliteration
+             * of the NUL bytes in the GSM-7 bytearray, and we fail the operation
+             * if one or more replacements were done and transliteration wasn't
+             * requested */
+            if (translit_gsm_nul_byte (bytearray) && !translit) {
+                g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+                             "Cannot convert to GSM-7 string: transliteration required for embedded '@'");
+                return NULL;
+            }
+            /* fall through */
+        case MM_MODEM_CHARSET_IRA:
+        case MM_MODEM_CHARSET_8859_1:
+        case MM_MODEM_CHARSET_UTF8:
+        case MM_MODEM_CHARSET_PCCP437:
+        case MM_MODEM_CHARSET_PCDN:
+            return (gchar *) g_byte_array_free (g_steal_pointer (&bytearray), FALSE);
+        case MM_MODEM_CHARSET_UCS2:
+        case MM_MODEM_CHARSET_UTF16:
+            return mm_utils_bin2hexstr (bytearray->data, bytearray->len);
+        default:
+        case MM_MODEM_CHARSET_UNKNOWN:
+            g_assert_not_reached ();
     }
+}
 
-    /* Validate UTF-8 always before returning. This result will be exposed in DBus
-     * very likely... */
-    if (utf8 && !g_utf8_validate (utf8, -1, NULL)) {
-        /* Better return NULL than an invalid UTF-8 string */
-        g_free (utf8);
-        utf8 = NULL;
+/* Decodes len bytes of data from the settings->iconv_name encoding into a
+ * newly allocated UTF-8 string. A strict g_convert() is tried first; only when
+ * translit is set is g_convert_with_fallback() tried next. NULL on failure. */
+static gchar *
+charset_iconv_to_utf8 (const guint8           *data,
+                       guint32                 len,
+                       const CharsetSettings  *settings,
+                       gboolean                translit,
+                       GError                **error)
+{
+    const gchar       *input = (const gchar *) data;
+    g_autoptr(GError)  strict_error = NULL;
+    gchar             *converted;
+
+    converted = g_convert (input, len, "UTF-8", settings->iconv_name, NULL, NULL, &strict_error);
+    if (converted)
+        return converted;
+
+    /* Without transliteration, the strict failure is what the caller gets */
+    if (!translit) {
+        g_propagate_prefixed_error (error, g_steal_pointer (&strict_error),
+                                    "Couldn't convert from %s to UTF-8: ", settings->gsm_name);
+        return NULL;
     }
 
-    return utf8;
+    /* Second attempt, passing translit_fallback as the fallback string */
+    converted = g_convert_with_fallback (input, len, "UTF-8", settings->iconv_name,
+                                         translit_fallback, NULL, NULL, error);
+    if (!converted)
+        g_prefix_error (error, "Couldn't convert from %s to UTF-8 with translit: ", settings->gsm_name);
+    return converted;
 }
 
-/* We do all our best to convert the given string, which comes in UTF-8, to the
- * specified charset. It may be that the output string needs to be the hex
- * representation of the charset-encoded string, so we need to cope with that
- * case. */
 gchar *
-mm_utf8_take_and_convert_to_charset (gchar *str,
-                                     MMModemCharset charset)
+mm_modem_charset_bytearray_to_utf8 (GByteArray      *bytearray,
+                                    MMModemCharset   charset,
+                                    gboolean         translit,
+                                    GError         **error)
 {
-    gchar *encoded = NULL;
+    const CharsetSettings *settings;
+    g_autofree gchar      *utf8 = NULL;
 
-    if (!str)
+    if (charset == MM_MODEM_CHARSET_UNKNOWN) {
+        g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+                     "Cannot convert from UTF-8: unknown target charset");
         return NULL;
+    }
+
+    settings = lookup_charset_settings (charset);
+
+    switch (charset) {
+        case MM_MODEM_CHARSET_GSM:
+            utf8 = (gchar *) charset_gsm_unpacked_to_utf8 (bytearray->data,
+                                                           bytearray->len,
+                                                           translit,
+                                                           error);
+            break;
+        case MM_MODEM_CHARSET_IRA:
+        case MM_MODEM_CHARSET_UTF8:
+        case MM_MODEM_CHARSET_8859_1:
+        case MM_MODEM_CHARSET_PCCP437:
+        case MM_MODEM_CHARSET_PCDN:
+        case MM_MODEM_CHARSET_UCS2:
+        case MM_MODEM_CHARSET_UTF16:
+            utf8 = charset_iconv_to_utf8 (bytearray->data,
+                                          bytearray->len,
+                                          settings,
+                                          translit,
+                                          error);
+            break;
+        case MM_MODEM_CHARSET_UNKNOWN:
+        default:
+            g_assert_not_reached ();
+    }
 
-    /* Validate UTF-8 always before converting */
-    if (!g_utf8_validate (str, -1, NULL)) {
-        /* Better return NULL than an invalid encoded string */
-        g_free (str);
+    if (utf8 && g_utf8_validate (utf8, -1, NULL))
+        return g_steal_pointer (&utf8);
+
+    g_prefix_error (error, "Invalid conversion from %s to UTF-8: ", settings->gsm_name);
+    return NULL;
+}
+
+/* Converts str (len bytes, or NUL-terminated when len is negative) from charset
+ * into a newly allocated UTF-8 string. UCS2 and UTF16 input is a hex string that
+ * is decoded to binary first. Returns NULL on failure. */
+gchar *
+mm_modem_charset_str_to_utf8 (const gchar     *str,
+                              gssize           len,
+                              MMModemCharset   charset,
+                              gboolean         translit,
+                              GError         **error)
+{
+    g_autoptr(GByteArray) bytearray = NULL;
+
+    if (charset == MM_MODEM_CHARSET_UNKNOWN) {
+        g_set_error (error, MM_CORE_ERROR, MM_CORE_ERROR_INVALID_ARGS,
+                     "Cannot convert to UTF-8: unknown source charset");
         return NULL;
     }
 
+    /* Note: GSM-7 encodes '@' as 0x00, so strlen() stops early on such input.
+     * With GSM-7, give a proper len value whenever possible or otherwise use
+     * the bytearray_to_utf8() method instead. */
+    if (len < 0)
+        len = strlen (str);
+
     switch (charset) {
-    case MM_MODEM_CHARSET_UNKNOWN:
-        g_warn_if_reached ();
-        encoded = str;
-        break;
-
-    case MM_MODEM_CHARSET_HEX:
-        /* FIXME: What encoding is this? */
-        g_warn_if_reached ();
-        encoded = str;
-        break;
-
-    case MM_MODEM_CHARSET_GSM:
-    case MM_MODEM_CHARSET_8859_1:
-    case MM_MODEM_CHARSET_PCCP437:
-    case MM_MODEM_CHARSET_PCDN: {
-        const gchar *iconv_to;
-        GError *error = NULL;
-
-        iconv_to = charset_iconv_from (charset);
-        encoded = g_convert (str, strlen (str),
-                             iconv_to, "UTF-8",
-                             NULL, NULL, &error);
-        if (!encoded || error) {
-            g_clear_error (&error);
-            encoded = NULL;
-        }
+        case MM_MODEM_CHARSET_GSM:
+        case MM_MODEM_CHARSET_IRA:
+        case MM_MODEM_CHARSET_8859_1:
+        case MM_MODEM_CHARSET_UTF8:
+        case MM_MODEM_CHARSET_PCCP437:
+        case MM_MODEM_CHARSET_PCDN:
+            bytearray = g_byte_array_sized_new (len);
+            g_byte_array_append (bytearray, (const guint8 *)str, len);
+            break;
+        case MM_MODEM_CHARSET_UCS2:
+        case MM_MODEM_CHARSET_UTF16: {
+            guint8 *bin = NULL;
+            gsize   bin_len;
 
-        g_free (str);
-        break;
+            bin = (guint8 *) mm_utils_hexstr2bin (str, len, &bin_len, error);
+            if (!bin)
+                return NULL;
+            bytearray = g_byte_array_new_take (bin, bin_len);
+            break;
+        }
+        case MM_MODEM_CHARSET_UNKNOWN:
+        default:
+            g_assert_not_reached ();
     }
 
-    case MM_MODEM_CHARSET_UCS2: {
-        const gchar *iconv_to;
-        gsize encoded_len = 0;
-        GError *error = NULL;
-        gchar *hex;
-
-        iconv_to = charset_iconv_from (charset);
-        encoded = g_convert (str, strlen (str),
-                             iconv_to, "UTF-8",
-                             NULL, &encoded_len, &error);
-        if (!encoded || error) {
-            g_clear_error (&error);
-            encoded = NULL;
+    return mm_modem_charset_bytearray_to_utf8 (bytearray, charset, translit, error);
+}
+
+/******************************************************************************/
+/* Runtime charset support via iconv() */
+
+/* Probes, for every charset_settings entry with an iconv name, whether the
+ * test string can be converted to that charset and back to UTF-8, and reports
+ * each outcome through debug logs only. */
+void
+mm_modem_charsets_init (void)
+{
+    /* As test string, something we can convert to/from all the encodings */
+    static const gchar *default_test_str = "ModemManager";
+    guint               i;
+
+    mm_obj_dbg (NULL, "[charsets] detecting platform iconv() support...");
+    for (i = 0; i < G_N_ELEMENTS (charset_settings); i++) {
+        g_autofree guint8 *enc = NULL;
+        guint              enc_size;
+        g_autofree gchar  *dec = NULL;
+
+        /* Entries without an iconv name have nothing to probe */
+        if (!charset_settings[i].iconv_name)
+            continue;
+
+        /* A NULL GError is passed: only success or failure matters here */
+        enc = charset_iconv_from_utf8 (default_test_str, &charset_settings[i],
+                                       FALSE, &enc_size, NULL);
+        if (!enc) {
+            mm_obj_dbg (NULL, "[charsets]   %s: iconv conversion to charset not supported", charset_settings[i].iconv_name);
+            continue;
         }
 
-        /* Get hex representation of the string */
-        hex = mm_utils_bin2hexstr ((guint8 *)encoded, encoded_len);
-        g_free (encoded);
-        encoded = hex;
-        g_free (str);
-        break;
-    }
+        dec = charset_iconv_to_utf8 (enc, enc_size, &charset_settings[i],
+                                     FALSE, NULL);
+        /* Check the decoded result, not enc, which is known non-NULL here */
+        if (!dec) {
+            mm_obj_dbg (NULL, "[charsets]   %s: iconv conversion from charset not supported", charset_settings[i].iconv_name);
+            continue;
+        }
 
-    /* If the given charset is ASCII or UTF8, we really expect the final string
-     * already here. */
-    case MM_MODEM_CHARSET_IRA:
-    case MM_MODEM_CHARSET_UTF8:
-        encoded = str;
-        break;
+        mm_obj_dbg (NULL, "[charsets]   %s: iconv conversion to/from charset is supported", charset_settings[i].iconv_name);
     }
-
-    return encoded;
 }