diff options
Diffstat (limited to 'i18npool/source/breakiterator/gendict.cxx')
-rw-r--r-- | i18npool/source/breakiterator/gendict.cxx | 305 |
1 files changed, 174 insertions, 131 deletions
diff --git a/i18npool/source/breakiterator/gendict.cxx b/i18npool/source/breakiterator/gendict.cxx index fe2758602ee4..ab181be73836 100644 --- a/i18npool/source/breakiterator/gendict.cxx +++ b/i18npool/source/breakiterator/gendict.cxx @@ -1,3 +1,4 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /************************************************************************* * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. @@ -35,171 +36,213 @@ #include <sal/types.h> #include <rtl/strbuf.hxx> #include <rtl/ustring.hxx> +#include <osl/diagnose.h> +#include <vector> +using std::vector; using namespace ::rtl; -/* Main Procedure */ +/* Utility gendict: -SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv) -{ - FILE *sfp, *cfp; + "BreakIterator_CJK provides input string caching and dictionary searching for + longest matching. You can provide a sorted dictionary (the encoding must be + UTF-8) by creating the following file: + i18npool/source/breakiterator/data/<language>.dict. - if (argc < 3) exit(-1); + The utility gendict will convert the file to C code, which will be compiled + into a shared library for dynamic loading. - sfp = fopen(argv[1], "rb"); // open the source file for read; - if (sfp == NULL) - { - printf("Open the dictionary source file failed."); - return -1; - } + All dictionary searching and loading is performed in the xdictionary class. + The only thing you need to do is to derive your class from BreakIterator_CJK + and create an instance of the xdictionary with the language name and + pass it to the parent class." (from http://wiki.services.openoffice.org/wiki/ + /Documentation/DevGuide/OfficeDev/Implementing_a_New_Locale - 27/01/2011) +*/ - // create the C source file to write - cfp = fopen(argv[2], "wb"); - if (cfp == NULL) { - fclose(sfp); - printf("Can't create the C source file."); - return -1; - } +// C-standard garantees that static variables are automatically initialized to 0 +static sal_uInt8 exists[0x2000]; +static sal_uInt32 charArray[0x10000]; - fprintf(cfp, "/*\n"); - fprintf(cfp, " * Copyright(c) 1999 - 2000, Sun Microsystems, Inc.\n"); - fprintf(cfp, " * All Rights Reserved.\n"); - fprintf(cfp, " */\n\n"); - fprintf(cfp, "/* !!!The file is generated automatically. DONOT edit the file manually!!! */\n\n"); - fprintf(cfp, "#include <sal/types.h>\n\n"); - fprintf(cfp, "extern \"C\" {\n"); - - sal_Int32 count, i, j; - sal_Int32 lenArrayCurr = 0, lenArrayCount = 0, lenArrayLen = 0, *lenArray = NULL, charArray[0x10000]; - sal_Bool exist[0x10000]; - for (i = 0; i < 0x10000; i++) { - exist[i] = sal_False; - charArray[i] = 0; - } +static inline void set_exists(sal_uInt32 index) +{ + exists[index>>3] |= 1 << (index & 0x07); +} + +static inline void printIncludes(FILE* source_fp) +{ + fputs("/* !!!The file is generated automatically. DO NOT edit the file manually!!! */\n\n", source_fp); + fputs("#include <sal/types.h>\n\n", source_fp); +} +static inline void printFunctions(FILE* source_fp) +{ + fputs ("\tconst sal_uInt8* getExistMark() { return existMark; }\n", source_fp); + fputs ("\tconst sal_Int16* getIndex1() { return index1; }\n", source_fp); + fputs ("\tconst sal_Int32* getIndex2() { return index2; }\n", source_fp); + fputs ("\tconst sal_Int32* getLenArray() { return lenArray; }\n", source_fp); + fputs ("\tconst sal_Unicode* getDataArea() { return dataArea; }\n", source_fp); +} + +static inline void printDataArea(FILE *dictionary_fp, FILE *source_fp, vector<sal_uInt32>& lenArray) +{ // generate main dict. data array - fprintf(cfp, "static const sal_Unicode dataArea[] = {"); + fputs("static const sal_Unicode dataArea[] = {\n\t", source_fp); sal_Char str[1024]; + sal_uInt32 lenArrayCurr = 0; sal_Unicode current = 0; - count = 0; - while (fgets(str, 1024, sfp)) { + + while (fgets(str, 1024, dictionary_fp)) { // input file is in UTF-8 encoding // don't convert last new line character to Ostr. OUString Ostr((const sal_Char *)str, strlen(str) - 1, RTL_TEXTENCODING_UTF8); const sal_Unicode *u = Ostr.getStr(); - sal_Int32 len = Ostr.getLength(); + const sal_Int32 len = Ostr.getLength(); - i=0; + sal_Int32 i=0; Ostr.iterateCodePoints(&i, 1); - if (len == i) continue; // skip one character word - - if (*u != current) { - if (*u < current) - printf("u %x, current %x, count %d, lenArrayCount %d\n", *u, current, - sal::static_int_cast<int>(count), sal::static_int_cast<int>(lenArrayCount)); - current = *u; - charArray[current] = lenArrayCount; + if (len == i) + continue; // skip one character word + + if (u[0] != current) { + OSL_ENSURE( (u[0] > current), "Dictionary file should be sorted"); + current = u[0]; + charArray[current] = lenArray.size(); } - if (lenArrayLen <= lenArrayCount+1) - lenArray = (sal_Int32*) realloc(lenArray, (lenArrayLen += 1000) * sizeof(sal_Int32)); - lenArray[lenArrayCount++] = lenArrayCurr; - - exist[u[0]] = sal_True; - for (i = 1; i < len; i++) { // start from second character, - exist[u[i]] = sal_True; // since the first character is captured in charArray. - lenArrayCurr++; - if ((count++) % 0x10 == 0) - fprintf(cfp, "\n\t"); - fprintf(cfp, "0x%04x, ", u[i]); + lenArray.push_back(lenArrayCurr); + + set_exists(u[0]); + // first character is stored in charArray, so start from second + for (i = 1; i < len; i++, lenArrayCurr++) { + set_exists(u[i]); + fprintf(source_fp, "0x%04x, ", u[i]); + if ((lenArrayCurr & 0x0f) == 0x0f) + fputs("\n\t", source_fp); } } - lenArray[lenArrayCount++] = lenArrayCurr; // store last ending pointer - charArray[current+1] = lenArrayCount; - fprintf(cfp, "\n};\n"); - - // generate lenArray - fprintf(cfp, "static const sal_Int32 lenArray[] = {\n\t"); - count = 1; - fprintf(cfp, "0x%x, ", 0); // insert one slat for skipping 0 in index2 array. - for (i = 0; i < lenArrayCount; i++) { - fprintf(cfp, "0x%lx, ", static_cast<long unsigned int>(lenArray[i])); - if (count == 0xf) { - count = 0; - fprintf(cfp, "\n\t"); - } else count++; - } - fprintf(cfp, "\n};\n"); + lenArray.push_back( lenArrayCurr ); // store last ending pointer + charArray[current+1] = lenArray.size(); + fputs("\n};\n", source_fp); +} - free(lenArray); +static inline void printLenArray(FILE* source_fp, const vector<sal_uInt32>& lenArray) +{ + fprintf(source_fp, "static const sal_Int32 lenArray[] = {\n\t"); + fprintf(source_fp, "0x%x, ", 0); // insert one slat for skipping 0 in index2 array. + for (size_t k = 0; k < lenArray.size(); k++) + { + if( !(k & 0xf) ) + fputs("\n\t", source_fp); - // generate index1 array - fprintf (cfp, "static const sal_Int16 index1[] = {\n\t"); - sal_Int16 set[0x100]; - count = 0; - for (i = 0; i < 0x100; i++) { - for (j = 0; j < 0x100; j++) - if (charArray[(i*0x100) + j] != 0) - break; - - fprintf(cfp, "0x%02x, ", set[i] = (j < 0x100 ? sal::static_int_cast<sal_Int16>(count++) : 0xff)); - if ((i+1) % 0x10 == 0) - fprintf (cfp, "\n\t"); + fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(lenArray[k])); + } + fputs("\n};\n", source_fp ); +} + +/* FIXME?: what happens if in every range i there is at least one charArray != 0 + => this will make index1[] = {0x00, 0x01, 0x02,... 0xfe, 0xff } + => then in index2, the last range will be ignored incorrectly */ +static inline void printIndex1(FILE *source_fp, sal_Int16 *set) +{ + fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t"); + sal_Int16 count = 0; + for (sal_Int32 i = 0; i < 0x100; i++) { + sal_Int32 j = 0; + while( j < 0x100 && charArray[(i<<8) + j] == 0) + j++; + + fprintf(source_fp, "0x%02x, ", set[i] = (j < 0x100 ? count++ : 0xff)); + if ((i & 0x0f) == 0x0f) + fputs ("\n\t", source_fp); } - fprintf (cfp, "};\n"); + fputs("};\n", source_fp); +} - // generate index2 array - fprintf (cfp, "static const sal_Int32 index2[] = {\n\t"); +static inline void printIndex2(FILE *source_fp, sal_Int16 *set) +{ + fputs ("static const sal_Int32 index2[] = {\n\t", source_fp); sal_Int32 prev = 0; - for (i = 0; i < 0x100; i++) { + for (sal_Int32 i = 0; i < 0x100; i++) { if (set[i] != 0xff) { - for (j = 0; j < 0x100; j++) { - sal_Int32 k = (i*0x100) + j; - if (prev != 0 && charArray[k] == 0) { - for (k++; k < 0x10000; k++) - if (charArray[k] != 0) - break; + for (sal_Int32 j = 0; j < 0x100; j++) { + sal_Int32 k = (i<<8) + j; + if (prev != 0 ) + while( charArray[k] == 0 && k < 0x10000 ) + k++; + + prev = charArray[(i<<8) + j]; + fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(k < 0x10000 ? charArray[k] + 1 : 0)); + if ((j & 0x0f) == 0x0f) + fputs ("\n\t", source_fp); } - prev = charArray[(i*0x100) + j]; - fprintf( - cfp, "0x%lx, ", - sal::static_int_cast< unsigned long >( - k < 0x10000 ? charArray[k] + 1 : 0)); - if ((j+1) % 0x10 == 0) - fprintf (cfp, "\n\t"); - } - fprintf (cfp, "\n\t"); + fputs ("\n\t", source_fp); } } - fprintf (cfp, "\n};\n"); - - // generate existMark array - count = 0; - fprintf (cfp, "static const sal_uInt8 existMark[] = {\n\t"); - for (i = 0; i < 0x1FFF; i++) { - sal_uInt8 bit = 0; - for (j = 0; j < 8; j++) - if (exist[i * 8 + j]) - bit |= 1 << j; - fprintf(cfp, "0x%02x, ", bit); - if (count == 0xf) { - count = 0; - fprintf(cfp, "\n\t"); - } else count++; + fputs ("\n};\n", source_fp); +} + +/* Generates a bitmask for the existance of sal_Unicode values in dictionary; + it packs 8 sal_Bool values in 1 sal_uInt8 */ +static inline void printExistsMask(FILE *source_fp) +{ + fprintf (source_fp, "static const sal_uInt8 existMark[] = {\n\t"); + for (unsigned int i = 0; i < 0x2000; i++) + { + fprintf(source_fp, "0x%02x, ", exists[i]); + if ( (i & 0xf) == 0xf ) + fputs("\n\t", source_fp); } - fprintf (cfp, "\n};\n"); + fputs("\n};\n", source_fp); +} - // create function to return arrays - fprintf (cfp, "\tconst sal_uInt8* getExistMark() { return existMark; }\n"); - fprintf (cfp, "\tconst sal_Int16* getIndex1() { return index1; }\n"); - fprintf (cfp, "\tconst sal_Int32* getIndex2() { return index2; }\n"); - fprintf (cfp, "\tconst sal_Int32* getLenArray() { return lenArray; }\n"); - fprintf (cfp, "\tconst sal_Unicode* getDataArea() { return dataArea; }\n"); - fprintf (cfp, "}\n"); +SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv) +{ + FILE *dictionary_fp, *source_fp; - fclose(sfp); - fclose(cfp); + if (argc == 1 || argc > 3) + { + fputs("2 arguments required: dictionary_file_name source_file_name", stderr); + exit(-1); + } + + dictionary_fp = fopen(argv[1], "rb"); // open the source file for read; + if (dictionary_fp == NULL) + { + printf("Open the dictionary source file failed."); + return -1; + } + + if(argc == 2) + source_fp = stdout; + else + { + // create the C source file to write + source_fp = fopen(argv[2], "wb"); + if (source_fp == NULL) { + fclose(dictionary_fp); + printf("Can't create the C source file."); + return -1; + } + } + + vector<sal_uInt32> lenArray; // stores the word boundaries in DataArea + sal_Int16 set[0x100]; + + printIncludes(source_fp); + fputs("extern \"C\" {\n", source_fp); + printDataArea(dictionary_fp, source_fp, lenArray); + printLenArray(source_fp, lenArray); + printIndex1(source_fp, set); + printIndex2(source_fp, set); + printExistsMask(source_fp); + printFunctions(source_fp); + fputs("}\n", source_fp); + + fclose(dictionary_fp); + fclose(source_fp); return 0; -} // End of main +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |