move ulfconv to l10ntools

This allows us to drop dependency on setup_native everywhere. Change-Id: Ib033f8d5953682379c6c2ab53d5cf221e9d8cfec
author: David Tardon <dtardon@redhat.com> 2012-07-15 11:38:07 +0200
committer: David Tardon <dtardon@redhat.com> 2012-07-17 14:06:52 +0200
commit: 4b592ce754e578a347490341caecc1bc45f67242 (patch)
tree: a25dce68615e277f12827d061e79da086522cf15 /l10ntools
parent: 4fea92fe5389ba4de593f5e991870cf595b516e1 (diff)
5 files changed, 550 insertions, 0 deletions
diff --git a/l10ntools/Executable_ulfconv.mk b/l10ntools/Executable_ulfconv.mk
new file mode 100644
index 000000000000..8ed5bb0e5f2c
--- /dev/null
+++ b/l10ntools/Executable_ulfconv.mk
@@ -0,0 +1,21 @@
+# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
+#
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+$(eval $(call gb_Executable_Executable,ulfconv))
+# ulfconv.cxx includes <sal/...> and <rtl/ustring.hxx>; presumably that is why sal is linked.
+$(eval $(call gb_Executable_use_libraries,ulfconv,\
+    sal \
+	$(gb_STDLIBS) \
+))
+# The tool consists of a single source file, ulfconv.cxx (listed without its extension).
+$(eval $(call gb_Executable_add_exception_objects,ulfconv,\
+    l10ntools/source/ulfconv/ulfconv \
+))
+
+# vim: set noet sw=4 ts=4:
diff --git a/l10ntools/Module_l10ntools.mk b/l10ntools/Module_l10ntools.mk
index c149db078c77..e3c11344a0c9 100644
--- a/l10ntools/Module_l10ntools.mk
+++ b/l10ntools/Module_l10ntools.mk
@@ -29,6 +29,7 @@ $(eval $(call gb_Module_Module,l10ntools))
 $(eval $(call gb_Module_add_targets,l10ntools,\
     Executable_helpex \
     Executable_idxdict \
+    Executable_ulfconv \
     Executable_ulfex \
     Executable_gsicheck \
     Executable_cfgex \
@@ -41,6 +42,7 @@ $(eval $(call gb_Module_add_targets,l10ntools,\
     Library_helplinker \
     Package_inc \
     Package_scripts \
+    Package_ulfconv \
 ))
 
 ifneq ($(SOLAR_JAVA),)
diff --git a/l10ntools/Package_ulfconv.mk b/l10ntools/Package_ulfconv.mk
new file mode 100644
index 000000000000..41337b26cb6e
--- /dev/null
+++ b/l10ntools/Package_ulfconv.mk
@@ -0,0 +1,14 @@
+# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
+#
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+$(eval $(call gb_Package_Package,l10ntools_ulfconv,$(SRCDIR)/l10ntools/source/ulfconv))
+# Ships the language/codepage/LCID table; presumably "bin/..." is the delivery path and the last argument the file name below the package source dir -- TODO confirm against gbuild.
+$(eval $(call gb_Package_add_file,l10ntools_ulfconv,bin/msi-encodinglist.txt,msi-encodinglist.txt))
+
+# vim: set noet sw=4 ts=4:
diff --git a/l10ntools/source/ulfconv/msi-encodinglist.txt b/l10ntools/source/ulfconv/msi-encodinglist.txt
new file mode 100644
index 000000000000..1fd3cb96e70d
--- /dev/null
+++ b/l10ntools/source/ulfconv/msi-encodinglist.txt
@@ -0,0 +1,152 @@
+# Syntax: language ANSI-Codepage LCID
+# comment lines begin with hash
+af    1252  1078   # Afrikaans
+am       0  1118   # Amharic
+ar    1256  1025
+ar-SA 1256  1025
+as       0  1101   # Assamese
+as-IN    0  1101   # Assamese
+ast   1252  1610
+be    1251  1059   # Belarusian
+be-BY 1251  1059
+bg    1251  1026   # Bulgarian
+bn       0  2117   # Bengali
+bn-BD    0  2117   # Bengali Bangladesh
+bn-IN    0  1093   # Bengali India
+bo       0  2121   
+br    1252  1150   # Breton
+brx      0  1603   # Bodo (India)
+bs       0  5146   # Bosnian
+ca    1252  1027   # Catalan
+ca-XV   1252  32771  # Catalan Valencian
+cs    1250  1029   # Czech
+cy    1252  1106   # Welsh
+da    1252  1030
+de    1252  1031
+dgo      0  1604   # Dogri (India)
+dz       0  2129   # Dzongkha (same ID as Tibetan (Bhutan), see i40713)
+el    1253  1032
+en-GB 1252  2057
+en-US 1252  1033
+en-ZA 1252  7177
+eo       0  1553   # Esperanto
+es    1252  1034
+et    1257  1061
+eu    1252  1069   # Basque
+fa       0  1065   # Farsi
+fi    1252  1035
+fo    1252  1080   # Faroese
+fr    1252  1036
+fr-CA 1252  3084
+fur      0  1585
+ga       0  2108   # Irish
+gd       0  1084   # Gaelic (Scotland)
+gl    1252  1110   # Galician
+gu       0  1095   # Gujarati
+gu-IN    0  1095   # Gujarati
+he    1255  1037
+hi       0  1081
+hr    1250  1050   # Croatian
+ht    1252  1626   # Haitian
+hu    1250  1038
+hy       0  1067   # Armenian
+id    1252  1057   # Indonesian
+is    1252  1039   # Icelandic
+it    1252  1040
+ja     932  1041
+jbo      0  1624
+ka       0  1079   # Georgian
+kab      0  1625
+kk       0  1087
+km       0  1107   # Khmer
+kn       0  1099   # Kannada
+kn-IN    0  1099   # Kannada
+ko     949  1042
+kok      0  1111   # Konkani
+ks       0  1120   # Kashmiri
+ku       0  1574
+ky       0  1088   # Kyrgyz
+ky-CN    0  1640   # Kyrgyz (China)
+lb    1252  1134
+lo       0  1108   # Lao
+lt    1257  1063   # Lithuanian
+lv    1257  1062   # Latvian
+mai      0  1605   # Maithili (India)
+mk    1251  1071   # Macedonian
+ml       0  1100
+ml-IN    0  1100
+mn       0  1104   # Mongolian
+mni      0  1112   # Manipuri
+mn-TR    0  2128   # Mongolian Classical/traditional
+mr       0  1102   # Marathi
+mr-IN    0  1102
+ms       0  1086   # Malay (Malaysian)
+mt       0  1082   # Maltese
+my       0  1109   # Burmese
+nb    1252  1044
+ne       0  1121   # Nepali
+nl    1252  1043
+nn    1252  2068
+no    1252  1044
+nr       0  1580   # Ndebele South
+nso      0  1132
+ny       0  1598
+oc    1252  1154   # Occitan-lengadocian
+om       0  2162
+or       0  1096   # Oriya
+or-IN    0  1096
+pa-IN    0  1094   # Punjabi
+pap      0  2171 
+pl    1250  1045
+ps       0  2171
+pt    1252  2070
+pt-BR 1252  1046
+pt-PT 1252  2070
+qtz   1252  1033   # key id pseudo language
+rm       0  1047   # Raeto-Romance
+ro    1250  1048   # Romanian
+ru    1251  1049
+rw       0  1569   # Kinyarwanda
+sa-IN    0  1103   # Sanskrit
+sat      0  1606   # Santali
+sb       0  1070   # Sorbian
+sc       0  3047
+sd       0  1113   # Sindhi
+sh    1250  2074   # Serbian Latin
+si       0  2133
+sk    1250  1051   # Slovak
+sl    1250  1060   # Slovenian
+sq    1250  1052   # Albanian
+sr    1251  3098   # Serbian Cyrillic
+sr-SP 1251  3098   # Serbian Cyrillic
+ss       0  1579   # Swazi
+st       0  1072   # Southern Sotho, Sutu
+sv    1252  1053
+sw    1252  1089   # Swahili
+sw-TZ 1252  1089   # Swahili
+so       0  1143  
+ta       0  1097   # Tamil
+ta-IN    0  1097   # Tamil
+te       0  1098
+te-IN    0  1098
+tg       0  1064   # Tajik
+th     874  1054
+ti       0  1139   # Tigrinya
+ti-ER    0  1139   # Tigrinya
+tn       0  1074   # Setsuana
+tr    1254  1055   # Turkish
+ts       0  1073   # Tsonga
+tk       0  1090
+tt    1251  1092   # Tatar
+ug       0  1152
+uk    1251  1058   # Ukrainian
+ur    1256  1056   # Urdu
+ur-IN    0  2080
+uz       0  1091   # Uzbek (Latin)
+ve       0  1075   # Venda
+vi    1258  1066   # Vietnamese
+xh       0  1076   # Xhosa
+yi       0  1085   # Yiddish
+zh-CN  936  2052
+zh-TW  950  1028
+zu       0  1077   # Zulu
diff --git a/l10ntools/source/ulfconv/ulfconv.cxx b/l10ntools/source/ulfconv/ulfconv.cxx
new file mode 100644
index 000000000000..1643b330d776
--- /dev/null
+++ b/l10ntools/source/ulfconv/ulfconv.cxx
@@ -0,0 +1,361 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*************************************************************************
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * Copyright 2000, 2010 Oracle and/or its affiliates.
+ *
+ * OpenOffice.org - a multi-platform office productivity suite
+ *
+ * This file is part of OpenOffice.org.
+ *
+ * OpenOffice.org is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 3
+ * only, as published by the Free Software Foundation.
+ *
+ * OpenOffice.org is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License version 3 for more details
+ * (a copy is included in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * version 3 along with OpenOffice.org.  If not, see
+ * <http://www.openoffice.org/license.html>
+ * for a copy of the LGPLv3 License.
+ *
+ ************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <sal/alloca.h>
+#include <sal/macros.h>
+
+#include <rtl/ustring.hxx>
+
+#include <map>
+#include <string>
+
+/*****************************************************************************
+ * typedefs
+ *****************************************************************************/
+/* Maps a language string read from the encoding table to the text encoding found for its codepage. */
+typedef std::map< const std::string, rtl_TextEncoding > EncodingMap;
+/* One row of a lookup table that is searched with _pair_search(). */
+struct _pair {
+    const char *key;            /* lookup key; a codepage number string in _ms_encoding_list */
+    rtl_TextEncoding value;     /* text encoding selected by that key */
+};
+/* Lookup helpers; both are defined further down in this file. */
+static int _pair_compare (const char *key, const _pair *pair);
+static const _pair* _pair_search (const char *key, const _pair *base, unsigned int member );
+/* Codepage column of the encoding table -> rtl text encoding. _pair_search() runs a binary search with a
+ * string compare over this table, so the keys must stay in string order ("1258" sorts before "874"). */
+const _pair _ms_encoding_list[] = {
+    { "0",       RTL_TEXTENCODING_UTF8        },
+    { "1250",    RTL_TEXTENCODING_MS_1250     },
+    { "1251",    RTL_TEXTENCODING_MS_1251     },
+    { "1252",    RTL_TEXTENCODING_MS_1252     },
+    { "1253",    RTL_TEXTENCODING_MS_1253     },
+    { "1254",    RTL_TEXTENCODING_MS_1254     },
+    { "1255",    RTL_TEXTENCODING_MS_1255     },
+    { "1256",    RTL_TEXTENCODING_MS_1256     },
+    { "1257",    RTL_TEXTENCODING_MS_1257     },
+    { "1258",    RTL_TEXTENCODING_MS_1258     },
+    { "874",     RTL_TEXTENCODING_MS_874      },
+    { "932",     RTL_TEXTENCODING_MS_932      },
+    { "936",     RTL_TEXTENCODING_MS_936      },
+    { "949",     RTL_TEXTENCODING_MS_949      },
+    { "950",     RTL_TEXTENCODING_MS_950      }
+};
+
+
+/*****************************************************************************
+ * fgets that works with unix line ends on Windows
+ *****************************************************************************/
+
+/**
+ * Read at most n-1 characters from fp into s, stopping after the first
+ * newline (which is kept) or at end of input, and NUL-terminate the result.
+ *
+ * @param s   destination buffer with room for at least n characters
+ * @param n   capacity of s, including the terminating NUL
+ * @param fp  stream to read from
+ * @return s, or NULL when not a single character could be stored
+ */
+char * my_fgets(char *s, int n, FILE *fp)
+{
+    int used = 0;
+    while( used < n-1 )
+    {
+        const int ch = getc(fp);
+        if( ch == EOF )
+            break;
+        s[used++] = (char) ch;
+        if( s[used-1] == '\n' )
+            break;
+    }
+
+    /* nothing stored: getc() reported EOF right away, or n was too small */
+    if( used == 0 )
+        return NULL;
+
+    s[used] = '\0';
+    return s;
+}
+
+/*****************************************************************************
+ * compare function for binary search: orders key against pair->key with an
+ * ASCII case-insensitive string compare; _pair_search() reads the result as
+ * negative / zero / positive for "before" / "equal" / "after"
+ *****************************************************************************/
+static int _pair_compare (const char *key, const _pair *pair)
+{
+    const char * const entry_key = pair->key;
+    return rtl_str_compareIgnoreAsciiCase( key, entry_key );
+}
+
+/*****************************************************************************
+ * binary search on encoding tables
+ *****************************************************************************/
+
+/**
+ * Look up key in the table base[0 .. member-1], which has to be sorted in
+ * the order defined by _pair_compare().
+ *
+ * @return the matching element; NULL if nothing matches, if key or base is
+ *         NULL, or if member is 0
+ */
+static const _pair*
+_pair_search (const char *key, const _pair *base, unsigned int member )
+{
+    if ( !key || !base || !member )
+        return NULL;
+    /* the remaining candidates are always base[first .. last-1] */
+    unsigned int first = 0;
+    for ( unsigned int last = member; first < last; )
+    {
+        const unsigned int middle = (first + last) / 2;
+        const _pair * const probe = base + middle;
+        const int order = _pair_compare( key, probe );
+        if ( order == 0 )
+            return probe;
+        if ( order < 0 )
+            last = middle;
+        else
+            first = middle + 1;
+    }
+    return NULL;
+}
+
+
+/************************************************************************
+ * read_encoding_table
+ * Adds one language -> encoding entry to aEncodingMap for every line
+ * "<language> <codepage> ..." of the table `file`. Lines starting with
+ * '#', lines without a codepage field and codepages missing from
+ * _ms_encoding_list are skipped. Exits with status 2 if `file` cannot
+ * be opened.
+ ************************************************************************/
+
+void read_encoding_table(char * file, EncodingMap& aEncodingMap)
+{
+    FILE * fp = fopen(file, "r");
+    if ( ! fp  ) {
+        fprintf(stderr, "ulfconv: %s %s\n", file, strerror(errno));
+        exit(2);
+    }
+
+    char buffer[512];
+    while ( NULL != my_fgets(buffer, sizeof(buffer), fp) ) {
+        if ( buffer[0] == '#' )     // comment line
+            continue;
+        // Find the end of the language string. Every scan stops at the
+        // terminating NUL, so blank or unterminated lines cannot run past
+        // the buffer; isspace() needs an unsigned char value.
+        char * cp = buffer;
+        while ( *cp != '\0' && ! isspace((unsigned char) *cp) )
+            ++cp;
+        if ( *cp == '\0' )          // nothing follows the language
+            continue;
+        *cp = '\0';
+
+        // find start and end of codepage string
+        for ( ++cp; isspace((unsigned char) *cp); ++cp )
+            ;
+        char * codepage = cp;
+        while ( *cp != '\0' && ! isspace((unsigned char) *cp) )
+            ++cp;
+        *cp = '\0';
+        // find the correct mapping for codepage (an empty one matches nothing)
+        const _pair *encoding = _pair_search(
+            codepage, _ms_encoding_list, SAL_N_ELEMENTS( _ms_encoding_list ) );
+        if ( encoding != NULL )
+            aEncodingMap.insert( EncodingMap::value_type(std::string(buffer), encoding->value) );
+    }
+
+    fclose(fp);
+}
+
+/************************************************************************
+ * print_legacy_mixed
+ *
+ * Writes aString to ostream, converted to the encoding registered for
+ * `language` in aEncodingMap. If the language has no entry, nothing is
+ * written to ostream and a warning goes to stderr instead.
+ ************************************************************************/
+void print_legacy_mixed(
+    FILE * ostream, const rtl::OUString& aString,
+    const std::string& language, EncodingMap& aEncodingMap)
+{
+    const EncodingMap::iterator entry = aEncodingMap.find(language);
+    if ( entry == aEncodingMap.end() ) {
+        fprintf(stderr, "ulfconv: WARNING: no legacy encoding found for %s\n", language.c_str());
+        return;
+    }
+    fputs(OUStringToOString(aString, entry->second).getStr(), ostream);
+}
+
+/************************************************************************
+ * print_java_style
+ * Writes every sal_Unicode unit of aString to ostream: values below 128 as
+ * the character itself, all others as a Java-style escape (backslash, 'u',
+ * four lower-case hex digits).
+ ************************************************************************/
+void print_java_style(FILE * ostream, const rtl::OUString& aString)
+{
+    for (int pos = 0, len = aString.getLength(); pos < len; ++pos) {
+        const sal_Unicode unit = aString.getStr()[pos];
+        if ( unit >= 128 )
+            fprintf(ostream, "\\u%2.2x%2.2x", unit >> 8, unit & 0xFF );
+        else
+            fprintf(ostream, "%c", (char) unit);
+    }
+}
+
+/************************************************************************
+ * main
+ *
+ * Usage: ulfconv [-o <output file>] [-t <encoding table>] [<ulf file>]
+ *
+ * Copies the input (stdin by default) to the output (stdout by default).
+ * In every line containing ` = "` the text between that marker and the
+ * last double quote is decoded as UTF-8 and re-written: as Java-style
+ * escapes when no encoding table was loaded, otherwise in the legacy
+ * encoding of the language that starts the line. Exits with status 2 on
+ * usage errors, when a file cannot be opened or the output not be closed.
+ ************************************************************************/
+
+int main( int argc, char * const argv[] )
+{
+    EncodingMap aEncodingMap;
+
+    FILE *istream = stdin;
+    FILE *ostream = stdout;
+
+    char *outfile = NULL;
+
+    int errflg = 0;
+    int argi;
+
+    for( argi=1; argi < argc; argi++ )
+    {
+        /* options are exactly two characters long; testing [1] before [2]
+           keeps a lone "-" from being read past its terminator */
+        if( argv[argi][0] != '-' || argv[argi][1] == '\0' || argv[argi][2] != '\0' )
+            break;
+
+        const char option = argv[argi][1];
+        if( option != 'o' && option != 't' )
+        {
+            fprintf(stderr, "Unrecognized option: -%c\n", option);
+            errflg++;
+        }
+        else if (argi+1 >= argc || argv[argi+1][0] == '-')
+        {
+            fprintf(stderr, "Option -%c requires an operand\n", option);
+            errflg++;
+        }
+        else if( option == 'o' )
+            outfile = argv[++argi];
+        else
+            read_encoding_table(argv[++argi], aEncodingMap);
+    }
+
+    if (errflg) {
+      fprintf(stderr, "Usage: ulfconv [-o <output file>] [-t <encoding table>] [<ulf file>]\n");
+      exit(2);
+    }
+
+    /* read from the named input file instead of stdin */
+    if ( argi < argc )
+    {
+        istream = fopen(argv[argi], "r");
+        if ( istream  == NULL ) {
+            fprintf(stderr, "ulfconv: %s : %s\n", argv[argi], strerror(errno));
+            exit(2);
+        }
+    }
+
+    /* open output file if any */
+    if ( outfile )
+    {
+        ostream = fopen(outfile, "w");
+        if ( ostream == NULL ) {
+            fprintf(stderr, "ulfconv: %s : %s\n", outfile, strerror(errno));
+            fclose(istream);
+            exit(2);
+        }
+    }
+
+    /* read line by line from the input */
+    char buffer[65536];
+    while ( NULL != fgets(buffer, sizeof(buffer), istream) ) {
+        /* only handle lines containing ` = "` followed by a closing quote;
+           everything else is copied unchanged (without the quote strrchr()
+           returns NULL and the length computed from it would be invalid) */
+        char * cp = strstr(buffer, " = \"");
+        char * end = cp ? strrchr(cp + 4, '\"') : NULL;
+        if ( end == NULL ) {
+            fputs(buffer, ostream);
+            continue;
+        }
+        cp += 4;
+
+        /* find end of lang string; the " = " found above guarantees that
+           there is white space, and isspace() needs an unsigned char value */
+        int n;
+        for ( n=0; ! isspace((unsigned char) buffer[n]); n++ )
+            ;
+        const std::string lang(buffer, n);
+
+        rtl::OUString aString;
+        rtl_string2UString( &aString.pData, cp, end - cp,
+            RTL_TEXTENCODING_UTF8, OSTRING_TO_OUSTRING_CVTFLAGS );
+
+        fprintf(ostream, "%s = \"", lang.c_str());
+        if ( aEncodingMap.empty() ) {
+            print_java_style(ostream, aString);
+        } else {
+            print_legacy_mixed(ostream, aString, lang, aEncodingMap);
+        }
+
+        fprintf(ostream, "\"\n");
+    }
+
+    /* fclose() flushes, so this is where a failed write shows up */
+    if ( fclose(ostream) != 0 ) {
+        fprintf(stderr, "ulfconv: %s : %s\n", outfile ? outfile : "(stdout)", strerror(errno));
+        fclose(istream);
+        exit(2);
+    }
+    fclose(istream);
+    return 0;
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
author	David Tardon <dtardon@redhat.com>	2012-07-15 11:38:07 +0200
committer	David Tardon <dtardon@redhat.com>	2012-07-17 14:06:52 +0200
commit	4b592ce754e578a347490341caecc1bc45f67242 (patch)
tree	a25dce68615e277f12827d061e79da086522cf15 /l10ntools
parent	4fea92fe5389ba4de593f5e991870cf595b516e1 (diff)