summaryrefslogtreecommitdiff
path: root/glib/glib/gcharset.c
diff options
context:
space:
mode:
Diffstat (limited to 'glib/glib/gcharset.c')
-rw-r--r--glib/glib/gcharset.c592
1 files changed, 592 insertions, 0 deletions
diff --git a/glib/glib/gcharset.c b/glib/glib/gcharset.c
new file mode 100644
index 0000000..4f52ab4
--- /dev/null
+++ b/glib/glib/gcharset.c
@@ -0,0 +1,592 @@
+/* gcharset.c - Charset information
+ *
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include "config.h"
+
+#include "gcharset.h"
+
+#include "garray.h"
+#include "genviron.h"
+#include "ghash.h"
+#include "gmessages.h"
+#include "gstrfuncs.h"
+#include "gthread.h"
+#ifdef G_OS_WIN32
+#include "gwin32.h"
+#endif
+
+#include "libcharset/libcharset.h"
+
+#include <string.h>
+#include <stdio.h>
+
+G_LOCK_DEFINE_STATIC (aliases);
+
+/* Lazily builds, and afterwards simply returns, the table that maps each
+ * canonical charset name to a NULL-terminated array of its aliases.
+ *
+ * The data comes from _g_locale_get_charset_aliases(), which is walked as
+ * a sequence of NUL-terminated "alias", "canonical" string pairs ending
+ * at an empty string. Keys and array entries point straight into that
+ * buffer; nothing is copied. Construction happens under the "aliases"
+ * lock.
+ */
+static GHashTable *
+get_alias_hash (void)
+{
+  static GHashTable *alias_hash = NULL;
+
+  G_LOCK (aliases);
+
+  if (alias_hash == NULL)
+    {
+      const char *cursor;
+
+      alias_hash = g_hash_table_new (g_str_hash, g_str_equal);
+
+      cursor = _g_locale_get_charset_aliases ();
+      while (*cursor != '\0')
+        {
+          const char *alias_name = cursor;
+          const char *canonical_name = alias_name + strlen (alias_name) + 1;
+          const char **list;
+          int n_known;
+
+          /* Step over both strings of the pair. */
+          cursor = canonical_name + strlen (canonical_name) + 1;
+
+          /* Count the aliases already recorded for this canonical name. */
+          list = g_hash_table_lookup (alias_hash, canonical_name);
+          for (n_known = 0; list != NULL && list[n_known] != NULL; n_known++)
+            ;
+
+          /* Grow by one entry plus the terminator, then store the array
+           * back since g_renew() may have moved it.
+           */
+          list = g_renew (const char *, list, n_known + 2);
+          list[n_known] = alias_name;
+          list[n_known + 1] = NULL;
+
+          g_hash_table_insert (alias_hash, (char *) canonical_name, list);
+        }
+    }
+
+  G_UNLOCK (aliases);
+
+  return alias_hash;
+}
+
+/* A slight abuse of the alias table: looks up @canonical_name and hands
+ * back the NULL-terminated array of charset names recorded as aliases
+ * for it, or NULL when the table has no entry for that name.
+ */
+G_GNUC_INTERNAL const char **
+_g_charset_get_aliases (const char *canonical_name)
+{
+  return g_hash_table_lookup (get_alias_hash (), canonical_name);
+}
+
+/* Works out the charset name to report and stores it in *@a.
+ *
+ * Sources are tried in this order: a non-empty CHARSET environment
+ * variable, then the result of _g_locale_charset_unalias() applied to
+ * @raw_data, and finally the hard-coded fallback "US-ASCII".
+ *
+ * Returns TRUE when the chosen name contains the substring "UTF-8";
+ * the fallback always yields FALSE.
+ */
+static gboolean
+g_utf8_get_charset_internal (const char  *raw_data,
+                             const char **a)
+{
+  const char *name;
+
+  name = g_getenv ("CHARSET");
+  if (name == NULL || *name == '\0')
+    {
+      /* The libcharset code tries to be thread-safe without
+       * a lock, but has a memory leak and a missing memory
+       * barrier, so we lock for it
+       */
+      G_LOCK (aliases);
+      name = _g_locale_charset_unalias (raw_data);
+      G_UNLOCK (aliases);
+    }
+
+  if (name == NULL || *name == '\0')
+    {
+      /* Assume this for compatibility at present. */
+      *a = "US-ASCII";
+      return FALSE;
+    }
+
+  *a = name;
+
+  return strstr (name, "UTF-8") != NULL;
+}
+
+/* Cache entry used by g_get_charset(), which keeps one instance per
+ * thread in a GPrivate and frees it with charset_cache_free().
+ */
+typedef struct _GCharsetCache GCharsetCache;
+
+struct _GCharsetCache {
+ /* Result of g_utf8_get_charset_internal() for @raw. */
+ gboolean is_utf8;
+ /* Owned copy of the raw locale charset the entry was computed from. */
+ gchar *raw;
+ /* Owned copy of the charset name handed out to callers. */
+ gchar *charset;
+};
+
+/* Destroy notify for a GCharsetCache: releases both owned strings and
+ * then the structure itself.
+ */
+static void
+charset_cache_free (gpointer data)
+{
+  GCharsetCache *entry = (GCharsetCache *) data;
+
+  g_free (entry->raw);
+  g_free (entry->charset);
+  g_free (entry);
+}
+
+/**
+ * g_get_charset:
+ * @charset: return location for character set name
+ *
+ * Obtains the character set for the <link linkend="setlocale">current
+ * locale</link>. The name can be passed to g_convert() to convert
+ * between the locale's encoding and some other encoding, although
+ * g_locale_to_utf8() and g_locale_from_utf8() are often convenient
+ * shortcuts for that.
+ *
+ * On Windows the character set returned by this function is the
+ * so-called system default ANSI code-page. That is the character set
+ * used by the "narrow" versions of C library and Win32 functions that
+ * handle file names. It might be different from the character set
+ * used by the C library's current locale.
+ *
+ * The return value is %TRUE if the locale's encoding is UTF-8; in that
+ * case you can perhaps avoid calling g_convert().
+ *
+ * The string returned in @charset is not allocated for the caller and
+ * should not be freed. @charset may be %NULL if only the return value
+ * is of interest.
+ *
+ * Return value: %TRUE if the returned charset is UTF-8
+ */
+gboolean
+g_get_charset (const char **charset)
+{
+  static GPrivate cache_private = G_PRIVATE_INIT (charset_cache_free);
+  GCharsetCache *entry;
+  const gchar *current_raw;
+
+  /* Every thread keeps its own cache entry, created on first use. */
+  entry = g_private_get (&cache_private);
+  if (entry == NULL)
+    {
+      entry = g_new0 (GCharsetCache, 1);
+      g_private_set (&cache_private, entry);
+    }
+
+  G_LOCK (aliases);
+  current_raw = _g_locale_charset_raw ();
+  G_UNLOCK (aliases);
+
+  /* Resolve again only when the raw charset differs from the cached one. */
+  if (entry->raw == NULL || strcmp (entry->raw, current_raw) != 0)
+    {
+      const gchar *resolved;
+
+      g_free (entry->raw);
+      g_free (entry->charset);
+      entry->raw = g_strdup (current_raw);
+      entry->is_utf8 = g_utf8_get_charset_internal (current_raw, &resolved);
+      entry->charset = g_strdup (resolved);
+    }
+
+  if (charset != NULL)
+    *charset = entry->charset;
+
+  return entry->is_utf8;
+}
+
+/**
+ * g_get_codeset:
+ *
+ * Gets the character set for the current locale.
+ *
+ * Return value: a newly allocated copy of the character set name
+ * reported by g_get_charset(). This string must be freed with g_free().
+ */
+gchar *
+g_get_codeset (void)
+{
+  const gchar *locale_charset;
+
+  /* Only the name is needed here; the UTF-8 flag is ignored. */
+  (void) g_get_charset (&locale_charset);
+
+  return g_strdup (locale_charset);
+}
+
+#ifndef G_OS_WIN32
+
+/* Locale alias table: alias name -> replacement locale name, both stored
+ * as heap-allocated copies. It is filled once by read_aliases() and only
+ * read afterwards.
+ */
+static GHashTable *alias_table = NULL;
+
+/* Serialises the one-time construction of alias_table. */
+G_LOCK_DEFINE_STATIC (alias_table);
+
+/* Reads the locale alias file @file into alias_table, creating the table
+ * first if it does not exist yet.
+ *
+ * For each line, the first column ends at the first tab, space or ':';
+ * any tabs and spaces that follow are skipped, and the second column runs
+ * up to the next tab or space. Empty lines, lines starting with '#' and
+ * lines with a single column are ignored. The first definition of an
+ * alias wins. If @file cannot be opened the table is left as it is.
+ *
+ * The caller must hold the alias_table lock.
+ */
+static void
+read_aliases (const gchar *file)
+{
+  FILE *fp;
+  char buf[256];
+
+  if (!alias_table)
+    alias_table = g_hash_table_new (g_str_hash, g_str_equal);
+
+  fp = fopen (file, "r");
+  if (!fp)
+    return;
+
+  while (fgets (buf, sizeof buf, fp))
+    {
+      char *p, *q;
+      size_t len = strlen (buf);
+
+      /* A full buffer without a trailing newline means the line did not
+       * fit. Throw the remainder away so that the next fgets() starts on
+       * a fresh line instead of parsing the tail as an entry of its own.
+       */
+      if (len == sizeof buf - 1 && buf[len - 1] != '\n')
+        {
+          int c;
+
+          do
+            c = fgetc (fp);
+          while (c != EOF && c != '\n');
+        }
+
+      g_strstrip (buf);
+
+      /* Line is a comment */
+      if ((buf[0] == '#') || (buf[0] == '\0'))
+        continue;
+
+      /* Reads first column */
+      for (p = buf, q = NULL; *p; p++)
+        {
+          if ((*p == '\t') || (*p == ' ') || (*p == ':'))
+            {
+              *p = '\0';
+              q = p + 1;
+              while ((*q == '\t') || (*q == ' '))
+                q++;
+              break;
+            }
+        }
+
+      /* The line only had one column */
+      if (!q || *q == '\0')
+        continue;
+
+      /* Read second column */
+      for (p = q; *p; p++)
+        {
+          if ((*p == '\t') || (*p == ' '))
+            {
+              *p = '\0';
+              break;
+            }
+        }
+
+      /* Add to alias table if necessary */
+      if (!g_hash_table_lookup (alias_table, buf))
+        g_hash_table_insert (alias_table, g_strdup (buf), g_strdup (q));
+    }
+
+  fclose (fp);
+}
+
+#endif
+
+/* Resolves @lang through the locale alias table, following chains of
+ * aliases until a name is reached that has no alias or is an alias for
+ * itself. After 31 replacements the search gives up, logs a warning
+ * (once per process) and returns the name reached so far.
+ *
+ * The result is either @lang itself or a string owned by the alias
+ * table, so the caller must not free it on the strength of this call.
+ * On Windows no alias table is consulted and @lang is returned as is.
+ */
+static char *
+unalias_lang (char *lang)
+{
+#ifndef G_OS_WIN32
+  char *p;
+  int i;
+
+  /* Build the table under a lock so that concurrent first calls neither
+   * create it twice nor look things up while it is still being filled.
+   * Once built it is never modified, so the lookups below are lock-free.
+   */
+  G_LOCK (alias_table);
+  if (!alias_table)
+    read_aliases ("/usr/share/locale/locale.alias");
+  G_UNLOCK (alias_table);
+
+  i = 0;
+  while ((p = g_hash_table_lookup (alias_table, lang)) && (strcmp (p, lang) != 0))
+    {
+      lang = p;
+      if (i++ == 30)
+        {
+          static gboolean said_before = FALSE;
+          if (!said_before)
+            g_warning ("Too many alias levels for a locale, "
+                       "may indicate a loop");
+          said_before = TRUE;
+          return lang;
+        }
+    }
+#endif
+  return lang;
+}
+
+/* Mask for components of locale spec. The ordering here is from
+ * least significant to most significant.
+ *
+ * explode_locale() returns the OR of the flags for the components it
+ * found, and append_locale_variants() uses the same flags to decide
+ * which components go into each generated variant.
+ */
+enum
+{
+ /* A ".codeset" part is present. */
+ COMPONENT_CODESET = 1 << 0,
+ /* A "_territory" part is present. */
+ COMPONENT_TERRITORY = 1 << 1,
+ /* An "@modifier" part is present. */
+ COMPONENT_MODIFIER = 1 << 2
+};
+
+/* Splits an X/Open style locale specification,
+ * language[_territory][.codeset][@modifier], into its components.
+ *
+ * *@language always receives a newly allocated string. Every optional
+ * component that is present is stored in its out parameter as a newly
+ * allocated string that still carries its leading separator ("_", "."
+ * or "@"). Out parameters of absent components are left untouched.
+ *
+ * Returns the OR of the COMPONENT_* flags for the components found.
+ */
+static guint
+explode_locale (const gchar *locale,
+                gchar      **language,
+                gchar      **territory,
+                gchar      **codeset,
+                gchar      **modifier)
+{
+  const gchar *territory_start;
+  const gchar *codeset_start;
+  const gchar *modifier_start;
+  const gchar *scan_from = locale;
+  const gchar *component_end;
+  guint found = 0;
+
+  /* Each separator is searched for from the position of the previous
+   * one that was found, so they are only recognised in '_', '.', '@'
+   * order.
+   */
+  territory_start = strchr (scan_from, '_');
+  if (territory_start != NULL)
+    scan_from = territory_start;
+
+  codeset_start = strchr (scan_from, '.');
+  if (codeset_start != NULL)
+    scan_from = codeset_start;
+
+  modifier_start = strchr (scan_from, '@');
+
+  /* Peel the components off from the right; component_end always marks
+   * where the next component to the left stops.
+   */
+  component_end = locale + strlen (locale);
+
+  if (modifier_start != NULL)
+    {
+      found |= COMPONENT_MODIFIER;
+      *modifier = g_strdup (modifier_start);
+      component_end = modifier_start;
+    }
+
+  if (codeset_start != NULL)
+    {
+      found |= COMPONENT_CODESET;
+      *codeset = g_strndup (codeset_start, component_end - codeset_start);
+      component_end = codeset_start;
+    }
+
+  if (territory_start != NULL)
+    {
+      found |= COMPONENT_TERRITORY;
+      *territory = g_strndup (territory_start, component_end - territory_start);
+      component_end = territory_start;
+    }
+
+  *language = g_strndup (locale, component_end - locale);
+
+  return found;
+}
+
+/*
+ * Appends to @array every variant of @locale that results from
+ * dropping some subset of its optional components. Each variant is
+ * a newly allocated string.
+ *
+ * For simplicity, we assume that the locale is in
+ * X/Open format: language[_territory][.codeset][@modifier]
+ *
+ * TODO: Extend this to handle the CEN format (see the GNUlibc docs)
+ * as well. We could just copy the code from glibc wholesale
+ * but it is big, ugly, and complicated, so I'm reluctant
+ * to do so when this should handle 99% of the time...
+ */
+static void
+append_locale_variants (GPtrArray   *array,
+                        const gchar *locale)
+{
+  gchar *language = NULL;
+  gchar *territory = NULL;
+  gchar *codeset = NULL;
+  gchar *modifier = NULL;
+  guint present;
+  guint subset;
+
+  g_return_if_fail (locale != NULL);
+
+  present = explode_locale (locale, &language, &territory, &codeset, &modifier);
+
+  /* Walk the component masks in descending numeric order, so the
+   * variant carrying every available component is added first and
+   * the bare language last.
+   */
+  for (subset = present + 1; subset-- > 0; )
+    {
+      /* Skip masks that ask for a component this locale does not have. */
+      if ((subset & ~present) != 0)
+        continue;
+
+      g_ptr_array_add (array,
+                       g_strconcat (language,
+                                    (subset & COMPONENT_TERRITORY) ? territory : "",
+                                    (subset & COMPONENT_CODESET) ? codeset : "",
+                                    (subset & COMPONENT_MODIFIER) ? modifier : "",
+                                    NULL));
+    }
+
+  /* Absent components are still NULL here, which g_free() accepts. */
+  g_free (language);
+  g_free (codeset);
+  g_free (territory);
+  g_free (modifier);
+}
+
+/**
+ * g_get_locale_variants:
+ * @locale: a locale identifier
+ *
+ * Returns a list of derived variants of @locale, which can be used to
+ * e.g. construct locale-dependent filenames or search paths. The returned
+ * list is sorted from most desirable to least desirable.
+ * Territory, charset and extra locale modifiers are all taken into
+ * account.
+ *
+ * For example, if @locale is "fr_BE", then the returned list
+ * is "fr_BE", "fr".
+ *
+ * If you need the list of variants for the <emphasis>current locale</emphasis>,
+ * use g_get_language_names().
+ *
+ * Returns: (transfer full) (array zero-terminated=1) (element-type utf8): a newly
+ * allocated array of newly allocated strings with the locale variants. Free with
+ * g_strfreev().
+ *
+ * Since: 2.28
+ */
+gchar **
+g_get_locale_variants (const gchar *locale)
+{
+  GPtrArray *variants;
+  gpointer *strv;
+
+  g_return_val_if_fail (locale != NULL, NULL);
+
+  variants = g_ptr_array_sized_new (8);
+  append_locale_variants (variants, locale);
+
+  /* NULL-terminate, then keep the element storage and drop only the
+   * GPtrArray wrapper.
+   */
+  g_ptr_array_add (variants, NULL);
+  strv = g_ptr_array_free (variants, FALSE);
+
+  return (gchar **) strv;
+}
+
+/* The following is (partly) taken from the gettext package.
+ Copyright (C) 1995, 1996, 1997, 1998 Free Software Foundation, Inc. */
+
+/* Returns the first non-empty value among the environment variables that
+ * select the locale for @category_name, or NULL when none of them is set
+ * (on G_PLATFORM_WIN32 the interned result of g_win32_getlocale() is
+ * returned instead of NULL).
+ */
+static const gchar *
+guess_category_value (const gchar *category_name)
+{
+  /* Variables to consult, highest priority first:
+   *  - `LANGUAGE', a GNU extension;
+   *  - `LC_ALL', whose setting overwrites all other;
+   *  - the variable named after the desired category;
+   *  - `LANG', the last possibility.
+   * The last three are the POSIX methods; on some systems `setlocale'
+   * can do that part itself.
+   */
+  const gchar *variables[4];
+  guint idx;
+
+  variables[0] = "LANGUAGE";
+  variables[1] = "LC_ALL";
+  variables[2] = category_name;
+  variables[3] = "LANG";
+
+  for (idx = 0; idx < sizeof variables / sizeof variables[0]; idx++)
+    {
+      const gchar *setting = g_getenv (variables[idx]);
+
+      if (setting != NULL && setting[0] != '\0')
+        return setting;
+    }
+
+#ifdef G_PLATFORM_WIN32
+  /* g_win32_getlocale() first checks for LC_ALL, LC_MESSAGES and
+   * LANG, which we already did above. Oh well. The main point of
+   * calling g_win32_getlocale() is to get the thread's locale as used
+   * by Windows and the Microsoft C runtime (in the "English_United
+   * States" format) translated into the Unixish format.
+   */
+  {
+    char *locale = g_win32_getlocale ();
+    const gchar *interned = g_intern_string (locale);
+
+    g_free (locale);
+    return interned;
+  }
+#endif
+
+  return NULL;
+}
+
+/* Cache entry used by g_get_language_names(), which keeps one instance
+ * per thread in a GPrivate and frees it with language_names_cache_free().
+ */
+typedef struct _GLanguageNamesCache GLanguageNamesCache;
+
+struct _GLanguageNamesCache {
+ /* Owned copy of the locale setting the list below was built from. */
+ gchar *languages;
+ /* Owned, NULL-terminated array of the resulting language names. */
+ gchar **language_names;
+};
+
+/* Destroy notify for a GLanguageNamesCache: releases the remembered
+ * setting, the string array built from it, and the structure itself.
+ */
+static void
+language_names_cache_free (gpointer data)
+{
+  GLanguageNamesCache *entry = (GLanguageNamesCache *) data;
+
+  g_free (entry->languages);
+  g_strfreev (entry->language_names);
+  g_free (entry);
+}
+
+/**
+ * g_get_language_names:
+ *
+ * Computes a list of applicable locale names, which can be used to
+ * e.g. construct locale-dependent filenames or search paths. The returned
+ * list is sorted from most desirable to least desirable and always contains
+ * the default locale "C".
+ *
+ * For example, if LANGUAGE=de:en_US, then the returned list is
+ * "de", "en_US", "en", "C".
+ *
+ * The list of locales specified by the user is taken from the
+ * environment variables <envar>LANGUAGE</envar>, <envar>LC_ALL</envar>,
+ * <envar>LC_MESSAGES</envar> and <envar>LANG</envar>.
+ *
+ * Return value: (array zero-terminated=1) (transfer none): a %NULL-terminated array of strings owned by GLib
+ * that must not be modified or freed.
+ *
+ * Since: 2.6
+ **/
+const gchar * const *
+g_get_language_names (void)
+{
+  static GPrivate cache_private = G_PRIVATE_INIT (language_names_cache_free);
+  GLanguageNamesCache *entry;
+  const gchar *setting;
+  GPtrArray *names;
+  gchar **parts;
+  gchar **part;
+
+  /* Every thread keeps its own cache entry, created on first use. */
+  entry = g_private_get (&cache_private);
+  if (entry == NULL)
+    {
+      entry = g_new0 (GLanguageNamesCache, 1);
+      g_private_set (&cache_private, entry);
+    }
+
+  setting = guess_category_value ("LC_MESSAGES");
+  if (setting == NULL)
+    setting = "C";
+
+  /* Nothing to do when the setting matches the one the cached list was
+   * built from.
+   */
+  if (entry->languages != NULL && strcmp (entry->languages, setting) == 0)
+    return (const gchar * const *) entry->language_names;
+
+  g_free (entry->languages);
+  g_strfreev (entry->language_names);
+  entry->languages = g_strdup (setting);
+
+  names = g_ptr_array_sized_new (8);
+
+  /* The setting is a ':'-separated list; expand every element, after
+   * alias resolution, into its variants.
+   */
+  parts = g_strsplit (setting, ":", 0);
+  for (part = parts; *part != NULL; part++)
+    append_locale_variants (names, unalias_lang (*part));
+  g_strfreev (parts);
+
+  g_ptr_array_add (names, g_strdup ("C"));
+  g_ptr_array_add (names, NULL);
+
+  entry->language_names = (gchar **) g_ptr_array_free (names, FALSE);
+
+  return (const gchar * const *) entry->language_names;
+}