From fabad007c60958f2ff87e8f636ff6a798ad1f963 Mon Sep 17 00:00:00 2001 From: Eike Rathke Date: Sat, 22 Apr 2017 01:24:19 +0200 Subject: Upgrade to ICU 59.1 Also regenerated all patches using make icu.genpatch (hence the .1 suffix that indicates the path level) as some hunks did not apply anyway and all now have the correct offset. Using genpatch may have the future benefit to yield smaller diffs between different versions of patches. Also prefixed all patch names with icu4c- for a cleaner listing. New patches introduced are prefixed with icu4c-59-... Change-Id: Ia83754b0823839887fce1a1d4ed04f8375b113c2 Reviewed-on: https://gerrit.libreoffice.org/36809 Tested-by: Jenkins Reviewed-by: Eike Rathke --- Makefile.fetch | 1 + config_host.mk.in | 1 + configure.ac | 12 +- download.lst | 6 +- external/firebird/ExternalProject_firebird.mk | 1 + external/harfbuzz/ExternalProject_harfbuzz.mk | 4 +- external/icu/UnpackedTarball_icu.mk | 37 +- external/icu/clang-cl.patch.0 | 26 - external/icu/icu-ubsan.patch.0 | 163 --- external/icu/icu4c-59-werror-shadow.patch.1 | 74 ++ external/icu/icu4c-59-werror-undef.patch.1 | 12 + external/icu/icu4c-aix.patch | 141 --- external/icu/icu4c-aix.patch.1 | 143 +++ external/icu/icu4c-android.patch | 57 - external/icu/icu4c-android.patch.1 | 58 + external/icu/icu4c-build.patch | 88 -- external/icu/icu4c-build.patch.1 | 91 ++ external/icu/icu4c-clang-cl.patch.1 | 28 + external/icu/icu4c-icu11100.patch.1 | 9 +- external/icu/icu4c-icudata-stdlibs.diff | 15 - external/icu/icu4c-icudata-stdlibs.patch.1 | 14 + external/icu/icu4c-khmerbreakengine.patch.1 | 1114 ++++++++++++++++++++ external/icu/icu4c-macosx.patch | 20 - external/icu/icu4c-macosx.patch.1 | 20 + external/icu/icu4c-mkdir.patch | 10 - external/icu/icu4c-mkdir.patch.1 | 11 + external/icu/icu4c-rpath.patch | 35 - external/icu/icu4c-rpath.patch.1 | 36 + external/icu/icu4c-rtti.patch.1 | 12 + external/icu/icu4c-scriptrun.patch | 58 - external/icu/icu4c-scriptrun.patch.1 | 60 ++ 
external/icu/icu4c-solarisgcc.patch | 12 - external/icu/icu4c-solarisgcc.patch.1 | 12 + external/icu/icu4c-ubsan.patch.1 | 171 +++ external/icu/icu4c-warnings.patch | 10 - external/icu/icu4c-warnings.patch.1 | 11 + external/icu/khmerbreakengine.patch | 1110 ------------------- external/icu/rtti.patch.0 | 11 - external/libcdr/ExternalProject_libcdr.mk | 2 +- external/libebook/ExternalProject_libebook.mk | 2 +- .../libfreehand/ExternalProject_libfreehand.mk | 1 + external/libmspub/ExternalProject_libmspub.mk | 2 +- external/libvisio/ExternalProject_libvisio.mk | 2 +- 43 files changed, 1921 insertions(+), 1782 deletions(-) delete mode 100644 external/icu/clang-cl.patch.0 delete mode 100644 external/icu/icu-ubsan.patch.0 create mode 100644 external/icu/icu4c-59-werror-shadow.patch.1 create mode 100644 external/icu/icu4c-59-werror-undef.patch.1 delete mode 100644 external/icu/icu4c-aix.patch create mode 100644 external/icu/icu4c-aix.patch.1 delete mode 100644 external/icu/icu4c-android.patch create mode 100644 external/icu/icu4c-android.patch.1 delete mode 100644 external/icu/icu4c-build.patch create mode 100644 external/icu/icu4c-build.patch.1 create mode 100644 external/icu/icu4c-clang-cl.patch.1 delete mode 100644 external/icu/icu4c-icudata-stdlibs.diff create mode 100644 external/icu/icu4c-icudata-stdlibs.patch.1 create mode 100644 external/icu/icu4c-khmerbreakengine.patch.1 delete mode 100644 external/icu/icu4c-macosx.patch create mode 100644 external/icu/icu4c-macosx.patch.1 delete mode 100644 external/icu/icu4c-mkdir.patch create mode 100644 external/icu/icu4c-mkdir.patch.1 delete mode 100644 external/icu/icu4c-rpath.patch create mode 100644 external/icu/icu4c-rpath.patch.1 create mode 100644 external/icu/icu4c-rtti.patch.1 delete mode 100644 external/icu/icu4c-scriptrun.patch create mode 100644 external/icu/icu4c-scriptrun.patch.1 delete mode 100644 external/icu/icu4c-solarisgcc.patch create mode 100644 external/icu/icu4c-solarisgcc.patch.1 create mode 100644 
external/icu/icu4c-ubsan.patch.1 delete mode 100644 external/icu/icu4c-warnings.patch create mode 100644 external/icu/icu4c-warnings.patch.1 delete mode 100644 external/icu/khmerbreakengine.patch delete mode 100644 external/icu/rtti.patch.0 diff --git a/Makefile.fetch b/Makefile.fetch index 1191a82a83dc..5958db10db09 100644 --- a/Makefile.fetch +++ b/Makefile.fetch @@ -135,6 +135,7 @@ $(WORKDIR)/download: $(BUILDDIR)/config_host.mk $(SRCDIR)/download.lst $(SRCDIR) $(call fetch_Optional,HUNSPELL,HUNSPELL_TARBALL) \ $(call fetch_Optional,HYPHEN,HYPHEN_TARBALL) \ $(call fetch_Optional,ICU,ICU_TARBALL) \ + $(call fetch_Optional,ICU,ICU_DATA_TARBALL) \ $(call fetch_Optional,JFREEREPORT,JFREEREPORT_FLOW_ENGINE_TARBALL) \ $(call fetch_Optional,JFREEREPORT,JFREEREPORT_FLUTE_TARBALL) \ $(call fetch_Optional,JFREEREPORT,JFREEREPORT_LIBBASE_TARBALL) \ diff --git a/config_host.mk.in b/config_host.mk.in index 480e42fde112..6f2280d83d72 100644 --- a/config_host.mk.in +++ b/config_host.mk.in @@ -263,6 +263,7 @@ export ICU_MINOR=@ICU_MINOR@ export ICU_RECLASSIFIED_CONDITIONAL_JAPANESE_STARTER=@ICU_RECLASSIFIED_CONDITIONAL_JAPANESE_STARTER@ export ICU_RECLASSIFIED_HEBREW_LETTER=@ICU_RECLASSIFIED_HEBREW_LETTER@ export ICU_RECLASSIFIED_PREPEND_SET_EMPTY=@ICU_RECLASSIFIED_PREPEND_SET_EMPTY@ +export ICU_UCHAR_TYPE=@ICU_UCHAR_TYPE@ export INTROSPECTION_SCANNER=@INTROSPECTION_SCANNER@ export ILIB=@ILIB@ export INSTALLDIR=@INSTALLDIR@ diff --git a/configure.ac b/configure.ac index ca5d0cd6b0b7..beb41ddc7d1b 100644 --- a/configure.ac +++ b/configure.ac @@ -9014,7 +9014,7 @@ SYSTEM_GENBRK= SYSTEM_GENCCODE= SYSTEM_GENCMN= -ICU_MAJOR=58 +ICU_MAJOR=59 ICU_MINOR=1 ICU_RECLASSIFIED_PREPEND_SET_EMPTY="TRUE" ICU_RECLASSIFIED_CONDITIONAL_JAPANESE_STARTER="TRUE" @@ -9102,6 +9102,15 @@ else ICU_CFLAGS="-I${WORKDIR}/UnpackedTarball/icu/source/i18n -I${WORKDIR}/UnpackedTarball/icu/source/common" ICU_LIBS="-L${WORKDIR}/UnpackedTarball/icu/source/lib" fi +if test "$ICU_MAJOR" -ge "59"; then + # As of ICU 
59 it defaults to typedef char16_t UChar; which is available + # with -std=c++11 but not all external libraries can be built with that, + # for those use a bit-compatible typedef uint16_t UChar; see + # icu/source/common/unicode/umachine.h + ICU_UCHAR_TYPE="-DUCHAR_TYPE=uint16_t" +else + ICU_UCHAR_TYPE="" +fi AC_SUBST(SYSTEM_ICU) AC_SUBST(SYSTEM_GENBRK) AC_SUBST(SYSTEM_GENCCODE) @@ -9113,6 +9122,7 @@ AC_SUBST(ICU_RECLASSIFIED_CONDITIONAL_JAPANESE_STARTER) AC_SUBST(ICU_RECLASSIFIED_HEBREW_LETTER) AC_SUBST(ICU_CFLAGS) AC_SUBST(ICU_LIBS) +AC_SUBST(ICU_UCHAR_TYPE) dnl ================================================================== dnl Breakpad diff --git a/download.lst b/download.lst index 7f9658d85512..1361055c90b9 100644 --- a/download.lst +++ b/download.lst @@ -89,8 +89,10 @@ export HUNSPELL_SHA256SUM := 512e7d2ee69dad0b35ca011076405e56e0f10963a02d4859dbc export HUNSPELL_TARBALL := 047c3feb121261b76dc16cdb62f54483-hunspell-1.6.0.tar.gz export HYPHEN_SHA256SUM := 304636d4eccd81a14b6914d07b84c79ebb815288c76fe027b9ebff6ff24d5705 export HYPHEN_TARBALL := 5ade6ae2a99bc1e9e57031ca88d36dad-hyphen-2.8.8.tar.gz -export ICU_SHA256SUM := 0eb46ba3746a9c2092c8ad347a29b1a1b4941144772d13a88667a7b11ea30309 -export ICU_TARBALL := 1901302aaff1c1633ef81862663d2917-icu4c-58_1-src.tgz +export ICU_SHA256SUM := 7132fdaf9379429d004005217f10e00b7d2319d0fea22bdfddef8991c45b75fe +export ICU_TARBALL := icu4c-59_1-src.tgz +export ICU_DATA_SHA256SUM := 38a84c1289c32a96924ff286a2f33edc97671b4cc90e8c99708553a6b5a9e70a +export ICU_DATA_TARBALL := icu4c-59_1-data.zip export JFREEREPORT_FLOW_ENGINE_SHA256SUM := 233f66e8d25c5dd971716d4200203a612a407649686ef3b52075d04b4c9df0dd export JFREEREPORT_FLOW_ENGINE_TARBALL := ba2930200c9f019c2d93a8c88c651a0f-flow-engine-0.9.4.zip export JFREEREPORT_FLUTE_SHA256SUM := 1b5b24f7bc543c0362b667692f78db8bab4ed6dafc6172f104d0bd3757d8a133 diff --git a/external/firebird/ExternalProject_firebird.mk b/external/firebird/ExternalProject_firebird.mk index 
361fffc86c64..988a24b5ed74 100644 --- a/external/firebird/ExternalProject_firebird.mk +++ b/external/firebird/ExternalProject_firebird.mk @@ -66,6 +66,7 @@ $(call gb_ExternalProject_get_state_target,firebird,build): -I$(call gb_UnpackedTarball_get_dir,icu)/source/i18n \ -I$(call gb_UnpackedTarball_get_dir,icu)/source/common \ ) \ + $(ICU_UCHAR_TYPE) \ $(if $(SYSTEM_LIBTOMMATH),$(LIBTOMMATH_CFLAGS), \ -L$(call gb_UnpackedTarball_get_dir,libtommath) \ ) \ diff --git a/external/harfbuzz/ExternalProject_harfbuzz.mk b/external/harfbuzz/ExternalProject_harfbuzz.mk index 53490abc2eaa..070c58ea80a7 100644 --- a/external/harfbuzz/ExternalProject_harfbuzz.mk +++ b/external/harfbuzz/ExternalProject_harfbuzz.mk @@ -47,7 +47,9 @@ $(call gb_ExternalProject_get_state_target,harfbuzz,build) : $(gb_COMPILERNOOPTFLAGS) $(gb_DEBUG_CFLAGS) \ $(gb_DEBUG_CXXFLAGS), \ $(gb_COMPILEROPTFLAGS)) \ - $(CXXFLAGS) $(if $(filter LINUX,$(OS)),-fvisibility=hidden)' \ + $(CXXFLAGS) \ + $(ICU_UCHAR_TYPE) \ + $(if $(filter LINUX,$(OS)),-fvisibility=hidden)' \ && (cd $(EXTERNAL_WORKDIR)/src && $(MAKE) lib) \ ) diff --git a/external/icu/UnpackedTarball_icu.mk b/external/icu/UnpackedTarball_icu.mk index fae37f5dd922..992001186d5f 100644 --- a/external/icu/UnpackedTarball_icu.mk +++ b/external/icu/UnpackedTarball_icu.mk @@ -11,22 +11,31 @@ $(eval $(call gb_UnpackedTarball_UnpackedTarball,icu)) $(eval $(call gb_UnpackedTarball_set_tarball,icu,$(ICU_TARBALL))) +# Data zip contains data/... and needs to end up in icu/source/data/... +# Some files are overwritten with identical content. +# -a to convert line endings from CrLf to LF (hopefully no binary identified as +# text ...) so existing patches still apply and can be generated. 
+$(eval $(call gb_UnpackedTarball_set_pre_action,icu,\ + unzip -q -a -d source -o $(gb_UnpackedTarget_TARFILE_LOCATION)/$(ICU_DATA_TARBALL) \ +)) + $(eval $(call gb_UnpackedTarball_add_patches,icu,\ - external/icu/icu4c-build.patch \ - external/icu/icu4c-aix.patch \ - external/icu/icu4c-warnings.patch \ - external/icu/icu4c-macosx.patch \ - external/icu/icu4c-solarisgcc.patch \ - external/icu/icu4c-mkdir.patch \ - external/icu/icu4c-$(if $(filter ANDROID,$(OS)),android,rpath).patch \ - external/icu/icu-ubsan.patch.0 \ + external/icu/icu4c-build.patch.1 \ + external/icu/icu4c-aix.patch.1 \ + external/icu/icu4c-warnings.patch.1 \ + external/icu/icu4c-macosx.patch.1 \ + external/icu/icu4c-solarisgcc.patch.1 \ + external/icu/icu4c-mkdir.patch.1 \ + external/icu/icu4c-$(if $(filter ANDROID,$(OS)),android,rpath).patch.1 \ + external/icu/icu4c-ubsan.patch.1 \ external/icu/icu4c-icu11100.patch.1 \ - external/icu/icu4c-scriptrun.patch \ - external/icu/rtti.patch.0 \ - external/icu/clang-cl.patch.0 \ - $(if $(filter-out ANDROID,$(OS)),external/icu/icu4c-icudata-stdlibs.diff) \ - external/icu/khmerbreakengine.patch \ - external/icu/icu4c-changeset-39671.patch.1 \ + external/icu/icu4c-scriptrun.patch.1 \ + external/icu/icu4c-rtti.patch.1 \ + external/icu/icu4c-clang-cl.patch.1 \ + $(if $(filter-out ANDROID,$(OS)),external/icu/icu4c-icudata-stdlibs.patch.1) \ + external/icu/icu4c-khmerbreakengine.patch.1 \ + external/icu/icu4c-59-werror-undef.patch.1 \ + external/icu/icu4c-59-werror-shadow.patch.1 \ )) $(eval $(call gb_UnpackedTarball_add_file,icu,source/data/brkitr/khmerdict.dict,external/icu/khmerdict.dict)) diff --git a/external/icu/clang-cl.patch.0 b/external/icu/clang-cl.patch.0 deleted file mode 100644 index 4df5d0f56c83..000000000000 --- a/external/icu/clang-cl.patch.0 +++ /dev/null @@ -1,26 +0,0 @@ ---- source/config/mh-cygwin-msvc -+++ source/config/mh-cygwin-msvc -@@ -51,8 +51,8 @@ - LDFLAGS+=-nologo - - # Commands to compile --COMPILE.c= $(CC) $(CPPFLAGS) $(DEFS) 
$(CFLAGS) -c -+COMPILE.c= true && $(CC) $(CPPFLAGS) $(DEFS) $(CFLAGS) -c --COMPILE.cc= $(CXX) $(CPPFLAGS) $(DEFS) $(CXXFLAGS) -c -+COMPILE.cc= true && $(CXX) $(CPPFLAGS) $(DEFS) $(CXXFLAGS) -c - - # Commands to link - LINK.c= LINK.EXE -subsystem:console $(LDFLAGS) ---- source/runConfigureICU -+++ source/runConfigureICU -@@ -259,8 +259,8 @@ - Cygwin/MSVC) - THE_OS="Windows with Cygwin" - THE_COMP="Microsoft Visual C++" -- CC=cl; export CC -- CXX=cl; export CXX -+ CC=${CC-cl}; export CC -+ CXX=${CXX-cl}; export CXX - RELEASE_CFLAGS='-Gy -MD' - RELEASE_CXXFLAGS='-Gy -MD' - DEBUG_CFLAGS='-Zi -MDd' diff --git a/external/icu/icu-ubsan.patch.0 b/external/icu/icu-ubsan.patch.0 deleted file mode 100644 index f16503eeb4ac..000000000000 --- a/external/icu/icu-ubsan.patch.0 +++ /dev/null @@ -1,163 +0,0 @@ ---- source/common/rbbidata.h -+++ source/common/rbbidata.h -@@ -113,7 +113,7 @@ - /* StatusTable of the set of matching */ - /* tags (rule status values) */ - int16_t fReserved; -- uint16_t fNextState[2]; /* Next State, indexed by char category. */ -+ uint16_t fNextState[1]; /* Next State, indexed by char category. */ - /* This array does not have two elements */ - /* Array Size is actually fData->fHeader->fCatCount */ - /* CAUTION: see RBBITableBuilder::getTableSize() */ -@@ -126,7 +126,7 @@ - uint32_t fRowLen; /* Length of a state table row, in bytes. */ - uint32_t fFlags; /* Option Flags for this state table */ - uint32_t fReserved; /* reserved */ -- char fTableData[4]; /* First RBBIStateTableRow begins here. */ -+ char fTableData[1]; /* First RBBIStateTableRow begins here. */ - /* (making it char[] simplifies ugly address */ - /* arithmetic for indexing variable length rows.) */ - }; ---- source/common/rbbitblb.cpp -+++ source/common/rbbitblb.cpp -@@ -1067,15 +1067,15 @@ - return 0; - } - -- size = sizeof(RBBIStateTable) - 4; // The header, with no rows to the table. -+ size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table. 
- - numRows = fDStates->size(); - numCols = fRB->fSetBuilder->getNumCharCategories(); - -- // Note The declaration of RBBIStateTableRow is for a table of two columns. -- // Therefore we subtract two from numCols when determining -+ // Note The declaration of RBBIStateTableRow is for a table of one columns. -+ // Therefore we subtract one from numCols when determining - // how much storage to add to a row for the total columns. -- rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2); -+ rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-1); - size += numRows * rowSize; - return size; - } -@@ -1105,7 +1105,7 @@ - } - - table->fRowLen = sizeof(RBBIStateTableRow) + -- sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2); -+ sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 1); - table->fNumStates = fDStates->size(); - table->fFlags = 0; - if (fRB->fLookAheadHardBreak) { ---- source/common/ubidiimp.h -+++ source/common/ubidiimp.h -@@ -198,8 +198,8 @@ - /* in a Run, logicalStart will get this bit set if the run level is odd */ - #define INDEX_ODD_BIT (1UL<<31) - --#define MAKE_INDEX_ODD_PAIR(index, level) ((index)|((int32_t)(level)<<31)) --#define ADD_ODD_BIT_FROM_LEVEL(x, level) ((x)|=((int32_t)(level)<<31)) -+#define MAKE_INDEX_ODD_PAIR(index, level) ((index)|((uint32_t)(level)<<31)) -+#define ADD_ODD_BIT_FROM_LEVEL(x, level) ((x)|=((uint32_t)(level)<<31)) - #define REMOVE_ODD_BIT(x) ((x)&=~INDEX_ODD_BIT) - - #define GET_INDEX(x) ((x)&~INDEX_ODD_BIT) ---- source/common/ucharstriebuilder.cpp -+++ source/common/ucharstriebuilder.cpp -@@ -285,7 +285,7 @@ - - UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode) - : LinearMatchNode(len, nextNode), s(units) { -- hash=hash*37+ustr_hashUCharsN(units, len); -+ hash=hash*37U+ustr_hashUCharsN(units, len); - } - - UBool ---- source/common/ucmndata.c -+++ source/common/ucmndata.c -@@ -75,7 +75,7 @@ - typedef struct { - 
uint32_t count; - uint32_t reserved; -- PointerTOCEntry entry[2]; /* Actual size is from count. */ -+ PointerTOCEntry entry[1]; /* Actual size is from count. */ - } PointerTOC; - - ---- source/common/ucmndata.h -+++ source/common/ucmndata.h -@@ -50,7 +50,7 @@ - - typedef struct { - uint32_t count; -- UDataOffsetTOCEntry entry[2]; /* Actual size of array is from count. */ -+ UDataOffsetTOCEntry entry[1]; /* Actual size of array is from count. */ - } UDataOffsetTOC; - - /** ---- source/common/unicode/stringtriebuilder.h -+++ source/common/unicode/stringtriebuilder.h -@@ -269,7 +269,7 @@ - void setValue(int32_t v) { - hasValue=TRUE; - value=v; -- hash=hash*37+v; -+ hash=hash*37U+v; - } - protected: - UBool hasValue; -@@ -296,7 +296,7 @@ - class LinearMatchNode : public ValueNode { - public: - LinearMatchNode(int32_t len, Node *nextNode) -- : ValueNode((0x333333*37+len)*37+hashCode(nextNode)), -+ : ValueNode((0x333333U*37+len)*37+hashCode(nextNode)), - length(len), next(nextNode) {} - virtual UBool operator==(const Node &other) const; - virtual int32_t markRightEdgesFirst(int32_t edgeNumber); -@@ -330,7 +330,7 @@ - equal[length]=NULL; - values[length]=value; - ++length; -- hash=(hash*37+c)*37+value; -+ hash=(hash*37U+c)*37+value; - } - // Adds a unit which leads to another match node. - void add(int32_t c, Node *node) { -@@ -338,7 +338,7 @@ - equal[length]=node; - values[length]=0; - ++length; -- hash=(hash*37+c)*37+hashCode(node); -+ hash=(hash*37U+c)*37+hashCode(node); - } - protected: - Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value". 
-@@ -353,7 +353,7 @@ - class SplitBranchNode : public BranchNode { - public: - SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode) -- : BranchNode(((0x555555*37+middleUnit)*37+ -+ : BranchNode(((0x555555U*37+middleUnit)*37+ - hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)), - unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {} - virtual UBool operator==(const Node &other) const; -@@ -370,7 +370,7 @@ - class BranchHeadNode : public ValueNode { - public: - BranchHeadNode(int32_t len, Node *subNode) -- : ValueNode((0x666666*37+len)*37+hashCode(subNode)), -+ : ValueNode((0x666666U*37+len)*37+hashCode(subNode)), - length(len), next(subNode) {} - virtual UBool operator==(const Node &other) const; - virtual int32_t markRightEdgesFirst(int32_t edgeNumber); ---- source/i18n/collationdatareader.cpp -+++ source/i18n/collationdatareader.cpp -@@ -419,6 +419,7 @@ - tailoring.data, ts, fastLatinPrimaries, UPRV_LENGTHOF(fastLatinPrimaries)); - if(options == ts.options && ts.variableTop != 0 && - reorderCodesLength == ts.reorderCodesLength && -+ reorderCodesLength != 0 && - uprv_memcmp(reorderCodes, ts.reorderCodes, reorderCodesLength * 4) == 0 && - fastLatinOptions == ts.fastLatinOptions && - (fastLatinOptions < 0 || diff --git a/external/icu/icu4c-59-werror-shadow.patch.1 b/external/icu/icu4c-59-werror-shadow.patch.1 new file mode 100644 index 000000000000..8d09e44025e3 --- /dev/null +++ b/external/icu/icu4c-59-werror-shadow.patch.1 @@ -0,0 +1,74 @@ +diff -ur icu.org/source/common/unicode/char16ptr.h icu/source/common/unicode/char16ptr.h +--- icu.org/source/common/unicode/char16ptr.h 2017-03-29 06:44:37.000000000 +0200 ++++ icu/source/common/unicode/char16ptr.h 2017-04-24 11:56:47.707572355 +0200 +@@ -107,14 +107,14 @@ + + #ifdef U_ALIASING_BARRIER + +-Char16Ptr::Char16Ptr(char16_t *p) : p(p) {} ++Char16Ptr::Char16Ptr(char16_t *p_) : p(p_) {} + #if !U_CHAR16_IS_TYPEDEF +-Char16Ptr::Char16Ptr(uint16_t *p) : 
p(cast(p)) {} ++Char16Ptr::Char16Ptr(uint16_t *p_) : p(cast(p_)) {} + #endif + #if U_SIZEOF_WCHAR_T==2 +-Char16Ptr::Char16Ptr(wchar_t *p) : p(cast(p)) {} ++Char16Ptr::Char16Ptr(wchar_t *p_) : p(cast(p_)) {} + #endif +-Char16Ptr::Char16Ptr(std::nullptr_t p) : p(p) {} ++Char16Ptr::Char16Ptr(std::nullptr_t p_) : p(p_) {} + Char16Ptr::~Char16Ptr() { + U_ALIASING_BARRIER(p); + } +@@ -215,14 +215,14 @@ + + #ifdef U_ALIASING_BARRIER + +-ConstChar16Ptr::ConstChar16Ptr(const char16_t *p) : p(p) {} ++ConstChar16Ptr::ConstChar16Ptr(const char16_t *p_) : p(p_) {} + #if !U_CHAR16_IS_TYPEDEF +-ConstChar16Ptr::ConstChar16Ptr(const uint16_t *p) : p(cast(p)) {} ++ConstChar16Ptr::ConstChar16Ptr(const uint16_t *p_) : p(cast(p_)) {} + #endif + #if U_SIZEOF_WCHAR_T==2 +-ConstChar16Ptr::ConstChar16Ptr(const wchar_t *p) : p(cast(p)) {} ++ConstChar16Ptr::ConstChar16Ptr(const wchar_t *p_) : p(cast(p_)) {} + #endif +-ConstChar16Ptr::ConstChar16Ptr(const std::nullptr_t p) : p(p) {} ++ConstChar16Ptr::ConstChar16Ptr(const std::nullptr_t p_) : p(p_) {} + ConstChar16Ptr::~ConstChar16Ptr() { + U_ALIASING_BARRIER(p); + } +diff -ur icu.org/source/common/unicode/unistr.h icu/source/common/unicode/unistr.h +--- icu.org/source/common/unicode/unistr.h 2017-03-29 06:44:37.000000000 +0200 ++++ icu/source/common/unicode/unistr.h 2017-04-24 11:59:51.782076511 +0200 +@@ -3080,11 +3080,11 @@ + * uint16_t * constructor. + * Delegates to UnicodeString(const char16_t *, int32_t). + * @param text UTF-16 string +- * @param length string length ++ * @param length_ string length + * @draft ICU 59 + */ +- UnicodeString(const uint16_t *text, int32_t length) : +- UnicodeString(ConstChar16Ptr(text), length) {} ++ UnicodeString(const uint16_t *text, int32_t length_) : ++ UnicodeString(ConstChar16Ptr(text), length_) {} + #endif + + /* +@@ -3097,11 +3097,11 @@ + * (Only defined if U_SIZEOF_WCHAR_T==2.) + * Delegates to UnicodeString(const char16_t *, int32_t). 
+ * @param text NUL-terminated UTF-16 string +- * @param length string length ++ * @param length_ string length + * @draft ICU 59 + */ +- UnicodeString(const wchar_t *text, int32_t length) : +- UnicodeString(ConstChar16Ptr(text), length) {} ++ UnicodeString(const wchar_t *text, int32_t length_) : ++ UnicodeString(ConstChar16Ptr(text), length_) {} + #endif + + /* diff --git a/external/icu/icu4c-59-werror-undef.patch.1 b/external/icu/icu4c-59-werror-undef.patch.1 new file mode 100644 index 000000000000..196d78984a18 --- /dev/null +++ b/external/icu/icu4c-59-werror-undef.patch.1 @@ -0,0 +1,12 @@ +diff -ur icu.org/source/common/unicode/uvernum.h icu/source/common/unicode/uvernum.h +--- icu.org/source/common/unicode/uvernum.h 2017-03-21 02:03:49.000000000 +0100 ++++ icu/source/common/unicode/uvernum.h 2017-04-21 23:44:55.123597927 +0200 +@@ -125,7 +125,7 @@ + * This value will change in the subsequent releases of ICU + * @stable ICU 2.6 + */ +-#if U_PLATFORM_HAS_WINUWP_API == 0 ++#if !defined(U_PLATFORM_HAS_WINUWP_API) || (U_PLATFORM_HAS_WINUWP_API == 0) + #define U_ICU_VERSION_SHORT "59" + #else + // U_DISABLE_RENAMING does not impact dat file name diff --git a/external/icu/icu4c-aix.patch b/external/icu/icu4c-aix.patch deleted file mode 100644 index f4a449f9b423..000000000000 --- a/external/icu/icu4c-aix.patch +++ /dev/null @@ -1,141 +0,0 @@ ---- misc/icu/source/config/mh-aix-gcc 2010-06-24 20:58:16.000000000 -0500 -+++ misc/build/icu/source/config/mh-aix-gcc 2013-10-31 20:34:16.607982310 +0700 -@@ -13,84 +13,29 @@ - GEN_DEPS.c= $(CC) -E -MM $(DEFS) $(CPPFLAGS) - GEN_DEPS.cc= $(CXX) -E -MM $(DEFS) $(CPPFLAGS) - --## Commands to link --## We need to use the C++ linker, even when linking C programs, since --## our libraries contain C++ code (C++ static init not called) --LINK.c= $(AIX_PREDELETE) $(CXX) $(CXXFLAGS) $(LDFLAGS) --LINK.cc= $(AIX_PREDELETE) $(CXX) $(CXXFLAGS) $(LDFLAGS) -- --## Shared library options --LD_SOOPTIONS= -Wl,-bsymbolic -- --## Commands to make a 
shared library --SHLIB.c= $(AIX_PREDELETE) $(CC) $(CFLAGS) $(LDFLAGS) -shared -Wl,-bexpall $(LD_SOOPTIONS) --SHLIB.cc= $(AIX_PREDELETE) $(CXX) $(CXXFLAGS) $(LDFLAGS) -shared -Wl,-bexpall $(LD_SOOPTIONS) -- --## Compiler switch to embed a runtime search path --LD_RPATH= -I --LD_RPATH_PRE= -+## Flags for position independent code -+SHAREDLIBCFLAGS = -fPIC -+SHAREDLIBCXXFLAGS = -fPIC -+SHAREDLIBCPPFLAGS = -DPIC -+ -+## Additional flags when building libraries and with threads -+THREADSCPPFLAGS = -D_REENTRANT -D_THREAD_SAFE -+LIBCPPFLAGS = - --## enable the shared lib loader --LDFLAGS += -Wl,-bbigtoc -+LD_RPATH= -+LD_RPATH_PRE= - - ## These are the library specific LDFLAGS - LDFLAGSICUDT=-nodefaultlibs -nostdlib - --## We need to delete things prior to linking, or else we'll get --## SEVERE ERROR: output file in use .. on AIX. --## But, shell script version should NOT delete target as we don't --## have $@ in that context. (SH = only shell script, icu-config) --AIX_PREDELETE=rm -f $@ ; --#SH# AIX_PREDELETE= -- - ## Environment variable to set a runtime search path - LDLIBRARYPATH_ENVVAR = LIBPATH - --## Override Versioned target for a shared library. --FINAL_SO_TARGET= $(basename $(SO_TARGET))$(SO_TARGET_VERSION).$(SO) --MIDDLE_SO_TARGET= $(basename $(SO_TARGET))$(SO_TARGET_VERSION_MAJOR).$(SO) --SHARED_OBJECT = $(notdir $(FINAL_SO_TARGET:.$(SO)=.$(SOBJ))) --SHARED_OBJECT_NO_VERSION = $(basename $(SO_TARGET)).$(SOBJ) -- --# The following is for Makefile.inc's use. --ICULIBSUFFIX_VERSION = $(LIB_VERSION_MAJOR) -- --# this one is for icudefs.mk's use --ifeq ($(ENABLE_SHARED),YES) --SO_TARGET_VERSION_SUFFIX = $(SO_TARGET_VERSION_MAJOR) --endif -- --## Compiler switch to embed a library name. Not present on AIX. --LD_SONAME = -- --## The type of assembly needed when pkgdata is used for generating shared libraries. --GENCCODE_ASSEMBLY=-a xlc -- - ## Shared object suffix --SOBJ= so --# without the -brtl option, the library names use .a. AIX is funny that way. 
--SO= a --A= a -+SO= so - - ## Non-shared intermediate object suffix --STATIC_O = o -- --## Special AIX rules -- --## Build archive from shared object --%.a : %.so -- ln -f $< $(SHARED_OBJECT_NO_VERSION) -- $(AR) $(ARFLAGS) $@ $(SHARED_OBJECT_NO_VERSION) -- rm -f $(SHARED_OBJECT_NO_VERSION) --$(LIBDIR)/%.a : %.so -- ln -f $< $(SHARED_OBJECT_NO_VERSION) -- $(AR) $(ARFLAGS) $@ $(SHARED_OBJECT_NO_VERSION) -- rm -f $(SHARED_OBJECT_NO_VERSION) -- --## Build import list from export list --%.e : %.exp -- @echo "Building an import list for $<" -- @$(SHELL) -ec "echo '#! $*.a($*.so)' | cat - $< > $@" -+STATIC_O = ao - - ## Compilation rules - %.$(STATIC_O): $(srcdir)/%.c -@@ -118,10 +63,10 @@ - [ -s $@ ] || rm -f $@' - - ## Versioned libraries rules --%$(SO_TARGET_VERSION_MAJOR).$(SO): %$(SO_TARGET_VERSION).$(SO) -- $(RM) $@ && ln -s ${*F}$(SO_TARGET_VERSION).$(SO) $@ --%.$(SO): %$(SO_TARGET_VERSION).$(SO) -- $(RM) $@ && ln -s ${*F}$(SO_TARGET_VERSION).$(SO) $@ -+%.$(SO).$(SO_TARGET_VERSION_MAJOR): %.$(SO).$(SO_TARGET_VERSION) -+ $(RM) $@ && ln -s ${ $@" ++STATIC_O = ao + + ## Compilation rules + %.$(STATIC_O): $(srcdir)/%.c +@@ -123,10 +68,10 @@ + [ -s $@ ] || rm -f $@' + + ## Versioned libraries rules +-%$(SO_TARGET_VERSION_MAJOR).$(SO): %$(SO_TARGET_VERSION).$(SO) +- $(RM) $@ && ln -s ${*F}$(SO_TARGET_VERSION).$(SO) $@ +-%.$(SO): %$(SO_TARGET_VERSION).$(SO) +- $(RM) $@ && ln -s ${*F}$(SO_TARGET_VERSION).$(SO) $@ ++%.$(SO).$(SO_TARGET_VERSION_MAJOR): %.$(SO).$(SO_TARGET_VERSION) ++ $(RM) $@ && ln -s ${= 0x550) \ - || (defined(__SUNPRO_C) && __SUNPRO_C >= 0x550) - # define U_EXPORT __global - diff --git a/external/icu/icu4c-android.patch.1 b/external/icu/icu4c-android.patch.1 new file mode 100644 index 000000000000..c62cf54e5936 --- /dev/null +++ b/external/icu/icu4c-android.patch.1 @@ -0,0 +1,58 @@ +diff -ur icu.org/source/common/unicode/platform.h icu/source/common/unicode/platform.h +--- icu.org/source/common/unicode/platform.h 2017-03-22 20:06:26.000000000 +0100 ++++ 
icu/source/common/unicode/platform.h 2017-04-21 22:32:31.656693147 +0200 +@@ -818,7 +818,7 @@ + #elif defined(U_STATIC_IMPLEMENTATION) + # define U_EXPORT + #elif defined(__GNUC__) +-# define U_EXPORT __attribute__((visibility("default"))) ++# define U_EXPORT + #elif (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x550) \ + || (defined(__SUNPRO_C) && __SUNPRO_C >= 0x550) + # define U_EXPORT __global +diff -ur icu.org/source/config/mh-linux icu/source/config/mh-linux +--- icu.org/source/config/mh-linux 2016-06-15 20:58:17.000000000 +0200 ++++ icu/source/config/mh-linux 2017-04-21 22:32:31.653693154 +0200 +@@ -27,7 +27,7 @@ + + ## Compiler switch to embed a library name + # The initial tab in the next line is to prevent icu-config from reading it. +- LD_SONAME = -Wl,-soname -Wl,$(notdir $(MIDDLE_SO_TARGET)) ++ #LD_SONAME = -Wl,-soname -Wl,$(notdir $(MIDDLE_SO_TARGET)) + #SH# # We can't depend on MIDDLE_SO_TARGET being set. + #SH# LD_SONAME= + +diff -ur icu.org/source/configure icu/source/configure +--- icu.org/source/configure 2017-04-07 09:40:30.000000000 +0200 ++++ icu/source/configure 2017-04-21 22:32:31.656693147 +0200 +@@ -5134,7 +5134,7 @@ + else + icu_cv_host_frag=mh-linux-va + fi ;; +-*-*-linux*|*-*-gnu|*-*-k*bsd*-gnu|*-*-kopensolaris*-gnu) icu_cv_host_frag=mh-linux ;; ++*-*-linux*|*-*-gnu|*-*-k*bsd*-gnu|*-*-kopensolaris*-gnu|*-*-*-androideabi*) icu_cv_host_frag=mh-linux ;; + i[34567]86-*-cygwin) + if test "$GCC" = yes; then + icu_cv_host_frag=mh-cygwin +@@ -6358,6 +6358,10 @@ + # Check to see if genccode can generate simple assembly. 
+ GENCCODE_ASSEMBLY= + case "${host}" in ++arm-*-linux-androideabi) ++ if test "$GCC" = yes; then ++ GENCCODE_ASSEMBLY="-a gcc-android-arm" ++ fi ;; + *-linux*|*-kfreebsd*-gnu*|i*86-*-*bsd*|i*86-pc-gnu) + if test "$GCC" = yes; then + # We're using gcc, and the simple -a gcc command line works for genccode +@@ -7445,6 +7449,10 @@ + # wchar_t can be used + CHECK_UTF16_STRING_RESULT="available" + ;; ++*-*-*-androideabi|mips-unknown-linux-android) ++ # no UTF-16 strings thanks, I think, this is to avoid the -std=c++0x which causes trouble with uint64_t ++ CHECK_UTF16_STRING_RESULT="nope" ++ ;; + *) + ;; + esac diff --git a/external/icu/icu4c-build.patch b/external/icu/icu4c-build.patch deleted file mode 100644 index 103e9aea5b64..000000000000 --- a/external/icu/icu4c-build.patch +++ /dev/null @@ -1,88 +0,0 @@ ---- misc/icu/source/tools/toolutil/pkg_genc.h -+++ misc/build/icu/source/tools/toolutil/pkg_genc.h -@@ -58,7 +58,7 @@ - #endif - - #define LARGE_BUFFER_MAX_SIZE 2048 --#define SMALL_BUFFER_MAX_SIZE 512 -+#define SMALL_BUFFER_MAX_SIZE 2048 - #define SMALL_BUFFER_FLAG_NAMES 32 - #define BUFFER_PADDING_SIZE 20 - ---- misc/icu/source/tools/toolutil/pkg_genc.c -+++ misc/build/icu/source/tools/toolutil/pkg_genc.c -@@ -152,6 +152,28 @@ - - ".long ","",HEX_0X - }, -+ {"gcc-android-arm", -+ "\t.arch armv5te\n" -+ "\t.fpu softvfp\n" -+ "\t.eabi_attribute 20, 1\n" -+ "\t.eabi_attribute 21, 1\n" -+ "\t.eabi_attribute 23, 3\n" -+ "\t.eabi_attribute 24, 1\n" -+ "\t.eabi_attribute 25, 1\n" -+ "\t.eabi_attribute 26, 2\n" -+ "\t.eabi_attribute 30, 6\n" -+ "\t.eabi_attribute 18, 4\n" -+ "\t.file \"%s.s\"\n" -+ "\t.global %s\n" -+ "\t.section .rodata\n" -+ "\t.align 2\n" -+ "\t.type %s, %%object\n" -+ "%s:\n", -+ -+ "\t.word ", -+ "\t.section .note.GNU-stack,\"\",%%progbits\n", -+ HEX_0X -+ }, - /* 16 bytes alignment. 
*/ - /* http://docs.oracle.com/cd/E19641-01/802-1947/802-1947.pdf */ - {"sun", ---- misc/icu/source/config/mh-darwin 2010-09-29 20:37:36.000000000 +0200 -+++ misc/build/icu/source/config/mh-darwin 2011-03-15 10:56:26.653056004 +0100 -@@ -28,11 +28,7 @@ - SHLIB.cc= $(CXX) -dynamiclib -dynamic $(CXXFLAGS) $(LDFLAGS) $(LD_SOOPTIONS) - - ## Compiler switches to embed a library name and version information --ifeq ($(ENABLE_RPATH),YES) --LD_SONAME = -Wl,-compatibility_version -Wl,$(SO_TARGET_VERSION_MAJOR) -Wl,-current_version -Wl,$(SO_TARGET_VERSION) -install_name $(libdir)/$(notdir $(MIDDLE_SO_TARGET)) --else --LD_SONAME = -Wl,-compatibility_version -Wl,$(SO_TARGET_VERSION_MAJOR) -Wl,-current_version -Wl,$(SO_TARGET_VERSION) -install_name $(notdir $(MIDDLE_SO_TARGET)) --endif -+LD_SONAME = -Wl,-compatibility_version -Wl,$(SO_TARGET_VERSION_MAJOR) -Wl,-current_version -Wl,$(SO_TARGET_VERSION) -install_name @__________________________________________________URELIB/$(notdir $(MIDDLE_SO_TARGET)) - - ## Compiler switch to embed a runtime search path - LD_RPATH= -@@ -48,10 +44,6 @@ - ## Non-shared intermediate object suffix - STATIC_O = ao - --## Override Versioned target for a shared library. 
--FINAL_SO_TARGET= $(basename $(SO_TARGET)).$(SO_TARGET_VERSION).$(SO) --MIDDLE_SO_TARGET= $(basename $(SO_TARGET)).$(SO_TARGET_VERSION_MAJOR).$(SO) -- - ## Compilation and dependency rules - %.$(STATIC_O): $(srcdir)/%.c - $(call SILENT_COMPILE,$(strip $(COMPILE.c) $(STATICCPPFLAGS) $(STATICCFLAGS)) -MMD -MT "$*.d $*.o $*.$(STATIC_O)" -o $@ $<) -@@ -65,16 +57,10 @@ - - ## Versioned libraries rules - --%.$(SO_TARGET_VERSION_MAJOR).$(SO): %.$(SO_TARGET_VERSION).$(SO) -+%.$(SO).$(SO_TARGET_VERSION_MAJOR): %.$(SO).$(SO_TARGET_VERSION) - $(RM) $@ && ln -s ${start&&(DIRPROP_FLAG(dirProps[i])&MASK_BN_EXPLICIT); i--); dirProp=dirProps[i]; diff --git a/external/icu/icu4c-icudata-stdlibs.diff b/external/icu/icu4c-icudata-stdlibs.diff deleted file mode 100644 index 16bea7b4a995..000000000000 --- a/external/icu/icu4c-icudata-stdlibs.diff +++ /dev/null @@ -1,15 +0,0 @@ -Index: icu-52~m1/source/config/mh-linux -=================================================================== ---- build/icu-52~m1.orig/source/config/mh-linux 2013-09-14 18:53:23.284040467 -0400 -+++ build/icu-52~m1/source/config/mh-linux 2013-09-14 18:53:23.284040467 -0400 -@@ -21,7 +21,9 @@ - RPATHLDFLAGS=${LD_RPATH_PRE}'$$ORIGIN' - - ## These are the library specific LDFLAGS --LDFLAGSICUDT=-nodefaultlibs -nostdlib -+#LDFLAGSICUDT=-nodefaultlibs -nostdlib -+# Debian change: linking icudata as data only causes too many problems. -+LDFLAGSICUDT= - - ## Compiler switch to embed a library name - # The initial tab in the next line is to prevent icu-config from reading it. 
diff --git a/external/icu/icu4c-icudata-stdlibs.patch.1 b/external/icu/icu4c-icudata-stdlibs.patch.1 new file mode 100644 index 000000000000..c8d66c6ed06f --- /dev/null +++ b/external/icu/icu4c-icudata-stdlibs.patch.1 @@ -0,0 +1,14 @@ +diff -ur icu.org/source/config/mh-linux icu/source/config/mh-linux +--- icu.org/source/config/mh-linux 2017-04-21 23:09:57.588533707 +0200 ++++ icu/source/config/mh-linux 2017-04-21 23:11:38.075292226 +0200 +@@ -27,7 +27,9 @@ + RPATHLDFLAGS=${LD_RPATH_PRE}'$$ORIGIN' + + ## These are the library specific LDFLAGS +-LDFLAGSICUDT=-nodefaultlibs -nostdlib ++#LDFLAGSICUDT=-nodefaultlibs -nostdlib ++# Debian change: linking icudata as data only causes too many problems. ++LDFLAGSICUDT= + + ## Compiler switch to embed a library name + # The initial tab in the next line is to prevent icu-config from reading it. diff --git a/external/icu/icu4c-khmerbreakengine.patch.1 b/external/icu/icu4c-khmerbreakengine.patch.1 new file mode 100644 index 000000000000..74f60f866257 --- /dev/null +++ b/external/icu/icu4c-khmerbreakengine.patch.1 @@ -0,0 +1,1114 @@ +diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp +--- icu.org/source/common/dictbe.cpp 2017-01-20 01:20:31.000000000 +0100 ++++ icu/source/common/dictbe.cpp 2017-04-21 23:14:23.845894374 +0200 +@@ -29,8 +29,17 @@ + ****************************************************************** + */ + +-DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) { ++DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) : ++ clusterLimit(3) ++{ ++ UErrorCode status = U_ZERO_ERROR; + fTypes = breakTypes; ++ fViramaSet.applyPattern(UNICODE_STRING_SIMPLE("[[:ccc=VR:]]"), status); ++ ++ // note Skip Sets contain fIgnoreSet characters too. 
++ fSkipStartSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=OP:][:lb=QU:]\\u200C\\u200D\\u2060]"), status); ++ fSkipEndSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CP:][:lb=QU:][:lb=EX:][:lb=CL:]\\u200C\\u200D\\u2060]"), status); ++ fNBeforeSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CR:][:lb=LF:][:lb=NL:][:lb=SP:][:lb=ZW:][:lb=IS:][:lb=BA:][:lb=NS:]]"), status); + } + + DictionaryBreakEngine::~DictionaryBreakEngine() { +@@ -92,7 +101,7 @@ + result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks); + utext_setNativeIndex(text, current); + } +- ++ + return result; + } + +@@ -103,6 +112,169 @@ + fSet.compact(); + } + ++bool ++DictionaryBreakEngine::scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const { ++ UErrorCode status = U_ZERO_ERROR; ++ UText* ut = utext_clone(NULL, text, false, true, &status); ++ utext_setNativeIndex(ut, start); ++ UChar32 c = utext_current32(ut); ++ bool res = false; ++ doBreak = true; ++ while (start >= 0) { ++ if (!fSkipStartSet.contains(c)) { ++ res = (c == ZWSP); ++ break; ++ } ++ --start; ++ c = utext_previous32(ut); ++ doBreak = false; ++ } ++ utext_close(ut); ++ return res; ++} ++ ++bool ++DictionaryBreakEngine::scanAfterEnd(UText *text, int32_t textEnd, int32_t& end, bool &doBreak) const { ++ UErrorCode status = U_ZERO_ERROR; ++ UText* ut = utext_clone(NULL, text, false, true, &status); ++ utext_setNativeIndex(ut, end); ++ UChar32 c = utext_current32(ut); ++ bool res = false; ++ doBreak = !fNBeforeSet.contains(c); ++ while (end < textEnd) { ++ if (!fSkipEndSet.contains(c)) { ++ res = (c == ZWSP); ++ break; ++ } ++ ++end; ++ c = utext_next32(ut); ++ doBreak = false; ++ } ++ utext_close(ut); ++ return res; ++} ++ ++void ++DictionaryBreakEngine::scanBackClusters(UText *text, int32_t textStart, int32_t& start) const { ++ UChar32 c = 0; ++ start = utext_getNativeIndex(text); ++ while (start > textStart) { ++ c = utext_previous32(text); ++ --start; ++ if (!fSkipEndSet.contains(c)) ++ break; ++ } ++ for (int i 
= 0; i < clusterLimit; ++i) { // scan backwards clusterLimit clusters ++ while (start > textStart) { ++ while (fIgnoreSet.contains(c)) ++ c = utext_previous32(text); ++ if (!fMarkSet.contains(c)) { ++ if (fBaseSet.contains(c)) { ++ c = utext_previous32(text); ++ if (!fViramaSet.contains(c)) { // Virama (e.g. coeng) preceding base. Treat sequence as a mark ++ utext_next32(text); ++ c = utext_current32(text); ++ break; ++ } else { ++ --start; ++ } ++ } else { ++ break; ++ } ++ } ++ c = utext_previous32(text); ++ --start; ++ } ++ if (!fBaseSet.contains(c) || start < textStart) { // not a cluster start so finish ++ break; ++ } ++ c = utext_previous32(text); ++ --start; // go round again ++ } // ignore hitting previous inhibitor since scanning for it should have found us! ++ ++start; // counteract --before ++} ++ ++void ++DictionaryBreakEngine::scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const { ++ UChar32 c = utext_current32(text); ++ end = utext_getNativeIndex(text); ++ while (end < textEnd) { ++ if (!fSkipStartSet.contains(c)) ++ break; ++ utext_next32(text); ++ c = utext_current32(text); ++ ++end; ++ } ++ for (int i = 0; i < clusterLimit; ++i) { // scan forwards clusterLimit clusters ++ while (fIgnoreSet.contains(c)) { ++ utext_next32(text); ++ c = utext_current32(text); ++ } ++ if (fBaseSet.contains(c)) { ++ while (end < textEnd) { ++ utext_next32(text); ++ c = utext_current32(text); ++ ++end; ++ if (!fMarkSet.contains(c)) ++ break; ++ else if (fViramaSet.contains(c)) { // handle coeng + base as mark ++ utext_next32(text); ++ c = utext_current32(text); ++ ++end; ++ if (!fBaseSet.contains(c)) ++ break; ++ } ++ } ++ } else { ++ --end; // bad char so break after char before it ++ break; ++ } ++ } ++} ++ ++bool ++DictionaryBreakEngine::scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const { ++ UErrorCode status = U_ZERO_ERROR; ++ UText* ut = utext_clone(NULL, text, false, true, &status); ++ int32_t nat = start; ++ 
utext_setNativeIndex(ut, nat); ++ bool foundFirst = true; ++ int32_t curr = start; ++ while (nat < end) { ++ UChar32 c = utext_current32(ut); ++ if (c == ZWSP || c == WJ) { ++ curr = nat + 1; ++ if (foundFirst) // only scan backwards for first inhibitor ++ scanBackClusters(ut, start, before); ++ foundFirst = false; // don't scan backwards if we go around again. Also marks found something ++ ++ utext_next32(ut); ++ scanFwdClusters(ut, end, after); ++ nat = after + 1; ++ ++ if (c == ZWSP || c == WJ) { // did we hit another one? ++ continue; ++ } else { ++ break; ++ } ++ } ++ ++ ++nat; // keep hunting ++ utext_next32(ut); ++ } ++ ++ utext_close(ut); ++ ++ if (nat >= end && foundFirst) { ++ start = before = after = nat; ++ return false; // failed to find anything ++ } ++ else { ++ start = curr; ++ } ++ return true; // yup hit one ++} ++ + /* + ****************************************************************** + * PossibleWord +@@ -130,35 +302,35 @@ + public: + PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {}; + ~PossibleWord() {}; +- ++ + // Fill the list of candidates if needed, select the longest, and return the number found +- int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ); +- ++ int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet = NULL, int32_t minLength = 0 ); ++ + // Select the currently marked candidate, point after it in the text, and invalidate self + int32_t acceptMarked( UText *text ); +- ++ + // Back up from the current candidate to the next shorter one; return TRUE if that exists + // and point the text after it + UBool backUp( UText *text ); +- ++ + // Return the longest prefix this candidate location shares with a dictionary word + // Return value is in code points. 
+ int32_t longestPrefix() { return prefix; }; +- ++ + // Mark the current candidate as the one we like + void markCurrent() { mark = current; }; +- ++ + // Get length in code points of the marked word. + int32_t markedCPLength() { return cpLengths[mark]; }; + }; + + +-int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) { ++int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet, int32_t minLength) { + // TODO: If getIndex is too slow, use offset < 0 and add discardAll() + int32_t start = (int32_t)utext_getNativeIndex(text); + if (start != offset) { + offset = start; +- count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix); ++ count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix, ignoreSet, minLength); + // Dictionary leaves text after longest prefix, not longest word. Back up. + if (count <= 0) { + utext_setNativeIndex(text, start); +@@ -830,51 +1002,28 @@ + * KhmerBreakEngine + */ + +-// How many words in a row are "good enough"? 
+-static const int32_t KHMER_LOOKAHEAD = 3; +- +-// Will not combine a non-word with a preceding dictionary word longer than this +-static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3; +- +-// Will not combine a non-word that shares at least this much prefix with a +-// dictionary word, with a preceding word +-static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3; +- +-// Minimum word size +-static const int32_t KHMER_MIN_WORD = 2; +- +-// Minimum number of characters for two words +-static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2; +- + KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) + : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)), + fDictionary(adoptDictionary) + { +- fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status); ++ ++ clusterLimit = 3; ++ ++ fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]\\u2060\\u200C\\u200D]"), status); + if (U_SUCCESS(status)) { + setCharacters(fKhmerWordSet); + } + fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); +- fMarkSet.add(0x0020); +- fEndWordSet = fKhmerWordSet; +- fBeginWordSet.add(0x1780, 0x17B3); +- //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels +- //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word +- //fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word +- fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters +- //fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels +-// fEndWordSet.remove(0x0E31); // MAI HAN-AKAT +-// fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI +-// fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK +-// fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI +-// fSuffixSet.add(THAI_PAIYANNOI); +-// fSuffixSet.add(THAI_MAIYAMOK); ++ fIgnoreSet.add(0x2060); // WJ ++ 
fIgnoreSet.add(0x200C, 0x200D); // ZWJ, ZWNJ ++ fBaseSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:lb=SA:]&[:^M:]]"), status); ++ fPuncSet.applyPattern(UNICODE_STRING_SIMPLE("[\\u17D4\\u17D5\\u17D6\\u17D7\\u17D9:]"), status); + + // Compact for caching. + fMarkSet.compact(); +- fEndWordSet.compact(); +- fBeginWordSet.compact(); +-// fSuffixSet.compact(); ++ fIgnoreSet.compact(); ++ fBaseSet.compact(); ++ fPuncSet.compact(); + } + + KhmerBreakEngine::~KhmerBreakEngine() { +@@ -886,180 +1035,204 @@ + int32_t rangeStart, + int32_t rangeEnd, + UStack &foundBreaks ) const { +- if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { +- return 0; // Not enough characters for two words +- } +- +- uint32_t wordsFound = 0; +- int32_t cpWordLength = 0; +- int32_t cuWordLength = 0; +- int32_t current; ++ uint32_t wordsFound = foundBreaks.size(); + UErrorCode status = U_ZERO_ERROR; +- PossibleWord words[KHMER_LOOKAHEAD]; +- ++ int32_t before = 0; ++ int32_t after = 0; ++ int32_t finalBefore = 0; ++ int32_t initAfter = 0; ++ int32_t scanStart = rangeStart; ++ int32_t scanEnd = rangeEnd; ++ ++ bool startZwsp = false; ++ bool breakStart = false; ++ bool breakEnd = false; ++ ++ if (rangeStart > 0) { ++ --scanStart; ++ startZwsp = scanBeforeStart(text, scanStart, breakStart); ++ } + utext_setNativeIndex(text, rangeStart); ++ scanFwdClusters(text, rangeEnd, initAfter); ++ bool endZwsp = scanAfterEnd(text, utext_nativeLength(text), scanEnd, breakEnd); ++ utext_setNativeIndex(text, rangeEnd - 1); ++ scanBackClusters(text, rangeStart, finalBefore); ++ if (finalBefore < initAfter) { // the whole run is tented so no breaks ++ if (breakStart || fTypes < UBRK_LINE) ++ foundBreaks.push(rangeStart, status); ++ if (breakEnd || fTypes < UBRK_LINE) ++ foundBreaks.push(rangeEnd, status); ++ return foundBreaks.size() - wordsFound; ++ } ++ ++ scanStart = rangeStart; ++ scanWJ(text, scanStart, rangeEnd, before, after); ++ if (startZwsp || initAfter >= before) { ++ after = initAfter; ++ before = 
0; ++ } ++ if (!endZwsp && after > finalBefore && after < rangeEnd) ++ endZwsp = true; ++ if (endZwsp && before > finalBefore) ++ before = finalBefore; + +- while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { +- cuWordLength = 0; +- cpWordLength = 0; +- +- // Look for candidate words at the current position +- int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); +- +- // If we found exactly one, use that +- if (candidates == 1) { +- cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text); +- cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength(); +- wordsFound += 1; +- } +- +- // If there was more than one, see which one can take us forward the most words +- else if (candidates > 1) { +- // If we're already at the end of the range, we're done +- if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { +- goto foundBest; +- } +- do { +- int32_t wordsMatched = 1; +- if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { +- if (wordsMatched < 2) { +- // Followed by another dictionary word; mark first word as a good candidate +- words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); +- wordsMatched = 2; +- } +- +- // If we're already at the end of the range, we're done +- if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { +- goto foundBest; +- } +- +- // See if any of the possible second words is followed by a third word +- do { +- // If we find a third word, stop right away +- if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { +- words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); +- goto foundBest; +- } +- } +- while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text)); +- } ++ utext_setNativeIndex(text, rangeStart); ++ int32_t numCodePts = rangeEnd - rangeStart; ++ // bestSnlp[i] is the snlp of the best segmentation of the first i ++ // code points in the range to be matched. 
++ UVector32 bestSnlp(numCodePts + 1, status); ++ bestSnlp.addElement(0, status); ++ for(int32_t i = 1; i <= numCodePts; i++) { ++ bestSnlp.addElement(kuint32max, status); ++ } ++ ++ // prev[i] is the index of the last code point in the previous word in ++ // the best segmentation of the first i characters. Note negative implies ++ // that the code point is part of an unknown word. ++ UVector32 prev(numCodePts + 1, status); ++ for(int32_t i = 0; i <= numCodePts; i++) { ++ prev.addElement(kuint32max, status); ++ } ++ ++ const int32_t maxWordSize = 20; ++ UVector32 values(maxWordSize, status); ++ values.setSize(maxWordSize); ++ UVector32 lengths(maxWordSize, status); ++ lengths.setSize(maxWordSize); ++ ++ // Dynamic programming to find the best segmentation. ++ ++ // In outer loop, i is the code point index, ++ // ix is the corresponding string (code unit) index. ++ // They differ when the string contains supplementary characters. ++ int32_t ix = rangeStart; ++ for (int32_t i = 0; i < numCodePts; ++i, utext_setNativeIndex(text, ++ix)) { ++ if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) { ++ continue; ++ } ++ ++ int32_t count; ++ count = fDictionary->matches(text, numCodePts - i, maxWordSize, ++ NULL, lengths.getBuffer(), values.getBuffer(), NULL, &fIgnoreSet, 2); ++ // Note: lengths is filled with code point lengths ++ // The NULL parameter is the ignored code unit lengths. 
++ ++ for (int32_t j = 0; j < count; j++) { ++ int32_t ln = lengths.elementAti(j); ++ if (ln + i >= numCodePts) ++ continue; ++ utext_setNativeIndex(text, ln+ix); ++ int32_t c = utext_current32(text); ++ if (fMarkSet.contains(c) || c == 0x17D2) { // Coeng ++ lengths.removeElementAt(j); ++ values.removeElementAt(j); ++ --j; ++ --count; + } +- while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text)); +-foundBest: +- cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text); +- cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength(); +- wordsFound += 1; + } +- +- // We come here after having either found a word or not. We look ahead to the +- // next word. If it's not a dictionary word, we will combine it with the word we +- // just found (if there is one), but only if the preceding word does not exceed +- // the threshold. +- // The text iterator should now be positioned at the end of the word we found. +- if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) { +- // if it is a dictionary word, do nothing. 
If it isn't, then if there is +- // no preceding word, or the non-word shares less than the minimum threshold +- // of characters with a dictionary word, then scan to resynchronize +- if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 +- && (cuWordLength == 0 +- || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) { +- // Look for a plausible word boundary +- int32_t remaining = rangeEnd - (current+cuWordLength); +- UChar32 pc; +- UChar32 uc; +- int32_t chars = 0; +- for (;;) { +- int32_t pcIndex = (int32_t)utext_getNativeIndex(text); +- pc = utext_next32(text); +- int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex; +- chars += pcSize; +- remaining -= pcSize; +- if (remaining <= 0) { ++ if (count == 0) { ++ utext_setNativeIndex(text, ix); ++ int32_t c = utext_current32(text); ++ if (fPuncSet.contains(c) || fIgnoreSet.contains(c) || c == ZWSP) { ++ values.setElementAt(0, count); ++ lengths.setElementAt(1, count++); ++ } else if (fBaseSet.contains(c)) { ++ int32_t currix = utext_getNativeIndex(text); ++ do { ++ utext_next32(text); ++ c = utext_current32(text); ++ if (utext_getNativeIndex(text) >= rangeEnd) + break; +- } +- uc = utext_current32(text); +- if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { +- // Maybe. See if it's in the dictionary. 
+- int32_t candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); +- utext_setNativeIndex(text, current+cuWordLength+chars); +- if (candidates > 0) { ++ if (c == 0x17D2) { // Coeng ++ utext_next32(text); ++ c = utext_current32(text); ++ if (!fBaseSet.contains(c) || utext_getNativeIndex(text) >= rangeEnd) { + break; ++ } else { ++ utext_next32(text); ++ c = utext_current32(text); ++ if (utext_getNativeIndex(text) >= rangeEnd) ++ break; + } + } +- } +- +- // Bump the word count if there wasn't already one +- if (cuWordLength <= 0) { +- wordsFound += 1; +- } +- +- // Update the length with the passed-over characters +- cuWordLength += chars; +- } +- else { +- // Back up to where we were for next iteration +- utext_setNativeIndex(text, current+cuWordLength); ++ } while (fMarkSet.contains(c) || fIgnoreSet.contains(c)); ++ values.setElementAt(BADSNLP, count); ++ lengths.setElementAt(utext_getNativeIndex(text) - currix, count++); ++ } else { ++ values.setElementAt(BADSNLP, count); ++ lengths.setElementAt(1, count++); + } + } + +- // Never stop before a combining mark. +- int32_t currPos; +- while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { +- utext_next32(text); +- cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos; ++ for (int32_t j = 0; j < count; j++) { ++ uint32_t v = values.elementAti(j); ++ int32_t newSnlp = bestSnlp.elementAti(i) + v; ++ int32_t ln = lengths.elementAti(j); ++ utext_setNativeIndex(text, ln+ix); ++ int32_t c = utext_current32(text); ++ while ((fPuncSet.contains(c) || fIgnoreSet.contains(c)) && ln + i < numCodePts) { ++ ++ln; ++ utext_next32(text); ++ c = utext_current32(text); ++ } ++ int32_t ln_j_i = ln + i; // yes really i! 
++ if (newSnlp < bestSnlp.elementAti(ln_j_i)) { ++ if (v == BADSNLP) { ++ int32_t p = prev.elementAti(i); ++ if (p < 0) ++ prev.setElementAt(p, ln_j_i); ++ else ++ prev.setElementAt(-i, ln_j_i); ++ } ++ else ++ prev.setElementAt(i, ln_j_i); ++ bestSnlp.setElementAt(newSnlp, ln_j_i); ++ } + } +- +- // Look ahead for possible suffixes if a dictionary word does not follow. +- // We do this in code rather than using a rule so that the heuristic +- // resynch continues to function. For example, one of the suffix characters +- // could be a typo in the middle of a word. +-// if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { +-// if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 +-// && fSuffixSet.contains(uc = utext_current32(text))) { +-// if (uc == KHMER_PAIYANNOI) { +-// if (!fSuffixSet.contains(utext_previous32(text))) { +-// // Skip over previous end and PAIYANNOI +-// utext_next32(text); +-// utext_next32(text); +-// wordLength += 1; // Add PAIYANNOI to word +-// uc = utext_current32(text); // Fetch next character +-// } +-// else { +-// // Restore prior position +-// utext_next32(text); +-// } +-// } +-// if (uc == KHMER_MAIYAMOK) { +-// if (utext_previous32(text) != KHMER_MAIYAMOK) { +-// // Skip over previous end and MAIYAMOK +-// utext_next32(text); +-// utext_next32(text); +-// wordLength += 1; // Add MAIYAMOK to word +-// } +-// else { +-// // Restore prior position +-// utext_next32(text); +-// } +-// } +-// } +-// else { +-// utext_setNativeIndex(text, current+wordLength); +-// } +-// } +- +- // Did we find a word on this iteration? If so, push it on the break stack +- if (cuWordLength > 0) { +- foundBreaks.push((current+cuWordLength), status); ++ } ++ // Start pushing the optimal offset index into t_boundary (t for tentative). ++ // prev[numCodePts] is guaranteed to be meaningful. ++ // We'll first push in the reverse order, i.e., ++ // t_boundary[0] = numCodePts, and afterwards do a swap. 
++ UVector32 t_boundary(numCodePts+1, status); ++ ++ int32_t numBreaks = 0; ++ // No segmentation found, set boundary to end of range ++ while (numCodePts >= 0 && (uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) { ++ --numCodePts; ++ } ++ if (numCodePts < 0) { ++ t_boundary.addElement(numCodePts, status); ++ numBreaks++; ++ } else { ++ for (int32_t i = numCodePts; (uint32_t)i != kuint32max; i = prev.elementAti(i)) { ++ if (i < 0) i = -i; ++ t_boundary.addElement(i, status); ++ numBreaks++; ++ } ++ U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0); ++ } ++ ++ // Now that we're done, convert positions in t_boundary[] (indices in ++ // the normalized input string) back to indices in the original input UText ++ // while reversing t_boundary and pushing values to foundBreaks. ++ for (int32_t i = numBreaks-1; i >= 0; i--) { ++ int32_t cpPos = t_boundary.elementAti(i); ++ if (cpPos == 0 && !breakStart && fTypes >= UBRK_LINE) continue; ++ int32_t utextPos = cpPos + rangeStart; ++ while (utextPos > after && scanWJ(text, utextPos, scanEnd, before, after)); ++ if (utextPos < before) { ++ // Boundaries are added to foundBreaks output in ascending order. ++ U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos); ++ foundBreaks.push(utextPos, status); + } + } +- ++ + // Don't return a break for the end of the dictionary range if there is one there. +- if (foundBreaks.peeki() >= rangeEnd) { ++ if (!breakEnd && fTypes >= UBRK_LINE && foundBreaks.peeki() >= rangeEnd) { + (void) foundBreaks.popi(); +- wordsFound -= 1; + } +- +- return wordsFound; ++ return foundBreaks.size() - wordsFound; + } + + #if !UCONFIG_NO_NORMALIZATION +diff -ur icu.org/source/common/dictbe.h icu/source/common/dictbe.h +--- icu.org/source/common/dictbe.h 2017-01-20 01:20:31.000000000 +0100 ++++ icu/source/common/dictbe.h 2017-04-21 23:14:23.845894374 +0200 +@@ -34,6 +34,15 @@ + */ + class DictionaryBreakEngine : public LanguageBreakEngine { + private: ++ ++ /** ++ *

Default constructor.

++ * ++ */ ++ DictionaryBreakEngine(); ++ ++ protected: ++ + /** + * The set of characters handled by this engine + * @internal +@@ -48,11 +57,63 @@ + + uint32_t fTypes; + ++ const int32_t WJ = 0x2060; ++ const int32_t ZWSP = 0x200B; ++ + /** +- *

Default constructor.

+- * ++ * A Unicode set of all viramas ++ * @internal + */ +- DictionaryBreakEngine(); ++ UnicodeSet fViramaSet; ++ ++ /** ++ * A Unicode set of all base characters ++ * @internal ++ */ ++ UnicodeSet fBaseSet; ++ ++ /** ++ * A Unicode set of all marks ++ * @internal ++ */ ++ UnicodeSet fMarkSet; ++ ++ /** ++ * A Unicode set of all characters ignored in dictionary matching ++ * @internal ++ */ ++ UnicodeSet fIgnoreSet; ++ ++ /** ++ * A Unicode set of all characters skipped over when scanning at the start of a range ++ * @internal ++ */ ++ UnicodeSet fSkipStartSet; ++ ++ /** ++ * A Unicode set of all characters skipped over when scanning at the end of a range ++ * @internal ++ */ ++ UnicodeSet fSkipEndSet; ++ ++ /** ++ * A Unicode set of all characters that should not be broken before ++ * @internal ++ */ ++ UnicodeSet fNBeforeSet; ++ ++ /** ++ * The number of clusters within which breaks are inhibited ++ * @internal ++ */ ++ int32_t clusterLimit; ++ ++ bool scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const; ++ ++ bool scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const; ++ bool scanAfterEnd(UText *text, int32_t rangeEnd, int32_t& end, bool &doBreak) const; ++ void scanBackClusters(UText *text, int32_t textStart, int32_t& start) const; ++ void scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const; + + public: + +@@ -83,7 +144,7 @@ + *

Find any breaks within a run in the supplied text.

+ * + * @param text A UText representing the text. The iterator is left at +- * the end of the run of characters which the engine is capable of handling ++ * the end of the run of characters which the engine is capable of handling + * that starts from the first (or last) character in the range. + * @param startPos The start of the run within the supplied text. + * @param endPos The end of the run within the supplied text. +@@ -245,118 +306,120 @@ + + }; + +-/******************************************************************* +- * BurmeseBreakEngine +- */ +- +-/** +- *

BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a +- * DictionaryMatcher and heuristics to determine Burmese-specific breaks.

+- * +- *

After it is constructed a BurmeseBreakEngine may be shared between +- * threads without synchronization.

+- */ +-class BurmeseBreakEngine : public DictionaryBreakEngine { +- private: +- /** +- * The set of characters handled by this engine +- * @internal +- */ +- +- UnicodeSet fBurmeseWordSet; +- UnicodeSet fEndWordSet; +- UnicodeSet fBeginWordSet; +- UnicodeSet fMarkSet; +- DictionaryMatcher *fDictionary; +- +- public: +- +- /** +- *

Default constructor.

+- * +- * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the +- * engine is deleted. +- */ +- BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); +- +- /** +- *

Virtual destructor.

+- */ +- virtual ~BurmeseBreakEngine(); +- +- protected: +- /** +- *

Divide up a range of known dictionary characters.

+- * +- * @param text A UText representing the text +- * @param rangeStart The start of the range of dictionary characters +- * @param rangeEnd The end of the range of dictionary characters +- * @param foundBreaks Output of C array of int32_t break positions, or 0 +- * @return The number of breaks found +- */ +- virtual int32_t divideUpDictionaryRange( UText *text, +- int32_t rangeStart, +- int32_t rangeEnd, +- UStack &foundBreaks ) const; +- +-}; +- +-/******************************************************************* +- * KhmerBreakEngine +- */ +- +-/** +- *

KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a +- * DictionaryMatcher and heuristics to determine Khmer-specific breaks.

+- * +- *

After it is constructed a KhmerBreakEngine may be shared between +- * threads without synchronization.

+- */ +-class KhmerBreakEngine : public DictionaryBreakEngine { +- private: +- /** +- * The set of characters handled by this engine +- * @internal +- */ +- +- UnicodeSet fKhmerWordSet; +- UnicodeSet fEndWordSet; +- UnicodeSet fBeginWordSet; +- UnicodeSet fMarkSet; +- DictionaryMatcher *fDictionary; +- +- public: +- +- /** +- *

Default constructor.

+- * +- * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the +- * engine is deleted. +- */ +- KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); +- +- /** +- *

Virtual destructor.

+- */ +- virtual ~KhmerBreakEngine(); +- +- protected: +- /** +- *

Divide up a range of known dictionary characters.

+- * +- * @param text A UText representing the text +- * @param rangeStart The start of the range of dictionary characters +- * @param rangeEnd The end of the range of dictionary characters +- * @param foundBreaks Output of C array of int32_t break positions, or 0 +- * @return The number of breaks found +- */ +- virtual int32_t divideUpDictionaryRange( UText *text, +- int32_t rangeStart, +- int32_t rangeEnd, +- UStack &foundBreaks ) const; +- +-}; +- ++/******************************************************************* ++ * BurmeseBreakEngine ++ */ ++ ++/** ++ *

BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a ++ * DictionaryMatcher and heuristics to determine Burmese-specific breaks.

++ * ++ *

After it is constructed a BurmeseBreakEngine may be shared between ++ * threads without synchronization.

++ */ ++class BurmeseBreakEngine : public DictionaryBreakEngine { ++ private: ++ /** ++ * The set of characters handled by this engine ++ * @internal ++ */ ++ ++ UnicodeSet fBurmeseWordSet; ++ UnicodeSet fEndWordSet; ++ UnicodeSet fBeginWordSet; ++ UnicodeSet fMarkSet; ++ DictionaryMatcher *fDictionary; ++ ++ public: ++ ++ /** ++ *

Default constructor.

++ * ++ * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the ++ * engine is deleted. ++ */ ++ BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); ++ ++ /** ++ *

Virtual destructor.

++ */ ++ virtual ~BurmeseBreakEngine(); ++ ++ protected: ++ /** ++ *

Divide up a range of known dictionary characters.

++ * ++ * @param text A UText representing the text ++ * @param rangeStart The start of the range of dictionary characters ++ * @param rangeEnd The end of the range of dictionary characters ++ * @param foundBreaks Output of C array of int32_t break positions, or 0 ++ * @return The number of breaks found ++ */ ++ virtual int32_t divideUpDictionaryRange( UText *text, ++ int32_t rangeStart, ++ int32_t rangeEnd, ++ UStack &foundBreaks ) const; ++ ++}; ++ ++/******************************************************************* ++ * KhmerBreakEngine ++ */ ++ ++/** ++ *

KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a ++ * DictionaryMatcher and heuristics to determine Khmer-specific breaks.

++ * ++ *

After it is constructed a KhmerBreakEngine may be shared between ++ * threads without synchronization.

++ */ ++class KhmerBreakEngine : public DictionaryBreakEngine { ++ private: ++ /** ++ * The set of characters handled by this engine ++ * @internal ++ */ ++ ++ UnicodeSet fKhmerWordSet; ++ UnicodeSet fBeginWordSet; ++ UnicodeSet fPuncSet; ++ DictionaryMatcher *fDictionary; ++ ++ const uint32_t BADSNLP = 256 * 20; ++ const uint32_t kuint32max = 0x7FFFFFFF; ++ ++ public: ++ ++ /** ++ *

Default constructor.

++ * ++ * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the ++ * engine is deleted. ++ */ ++ KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); ++ ++ /** ++ *

<p>Virtual destructor.</p>

++ */ ++ virtual ~KhmerBreakEngine(); ++ ++ protected: ++ /** ++ *

<p>Divide up a range of known dictionary characters.</p>

++ * ++ * @param text A UText representing the text ++ * @param rangeStart The start of the range of dictionary characters ++ * @param rangeEnd The end of the range of dictionary characters ++ * @param foundBreaks Output of C array of int32_t break positions, or 0 ++ * @return The number of breaks found ++ */ ++ virtual int32_t divideUpDictionaryRange( UText *text, ++ int32_t rangeStart, ++ int32_t rangeEnd, ++ UStack &foundBreaks ) const; ++ ++}; ++ + #if !UCONFIG_NO_NORMALIZATION + + /******************************************************************* +diff -ur icu.org/source/common/dictionarydata.cpp icu/source/common/dictionarydata.cpp +--- icu.org/source/common/dictionarydata.cpp 2017-01-20 01:20:31.000000000 +0100 ++++ icu/source/common/dictionarydata.cpp 2017-04-21 23:14:23.846894372 +0200 +@@ -44,7 +44,7 @@ + + int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, + int32_t *lengths, int32_t *cpLengths, int32_t *values, +- int32_t *prefix) const { ++ int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const { + + UCharsTrie uct(characters); + int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text); +@@ -55,7 +55,13 @@ + UStringTrieResult result = (codePointsMatched == 0) ? 
uct.first(c) : uct.next(c); + int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex; + codePointsMatched += 1; ++ if (ignoreSet != NULL && ignoreSet->contains(c)) { ++ continue; ++ } + if (USTRINGTRIE_HAS_VALUE(result)) { ++ if (codePointsMatched < minLength) { ++ continue; ++ } + if (wordCount < limit) { + if (values != NULL) { + values[wordCount] = uct.getValue(); +@@ -112,7 +118,7 @@ + + int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, + int32_t *lengths, int32_t *cpLengths, int32_t *values, +- int32_t *prefix) const { ++ int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const { + BytesTrie bt(characters); + int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text); + int32_t wordCount = 0; +@@ -122,7 +128,13 @@ + UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c)); + int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex; + codePointsMatched += 1; ++ if (ignoreSet != NULL && ignoreSet->contains(c)) { ++ continue; ++ } + if (USTRINGTRIE_HAS_VALUE(result)) { ++ if (codePointsMatched < minLength) { ++ continue; ++ } + if (wordCount < limit) { + if (values != NULL) { + values[wordCount] = bt.getValue(); +diff -ur icu.org/source/common/dictionarydata.h icu/source/common/dictionarydata.h +--- icu.org/source/common/dictionarydata.h 2017-01-20 01:20:31.000000000 +0100 ++++ icu/source/common/dictionarydata.h 2017-04-21 23:14:23.846894372 +0200 +@@ -21,6 +21,7 @@ + #include "unicode/utext.h" + #include "unicode/udata.h" + #include "udataswp.h" ++#include "unicode/uniset.h" + #include "unicode/uobject.h" + #include "unicode/ustringtrie.h" + +@@ -92,7 +93,7 @@ + */ + virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit, + int32_t *lengths, int32_t *cpLengths, int32_t *values, +- int32_t *prefix) const = 0; ++ int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const = 0; + + 
/** @return DictionaryData::TRIE_TYPE_XYZ */ + virtual int32_t getType() const = 0; +@@ -107,7 +108,7 @@ + virtual ~UCharsDictionaryMatcher(); + virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit, + int32_t *lengths, int32_t *cpLengths, int32_t *values, +- int32_t *prefix) const; ++ int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const; + virtual int32_t getType() const; + private: + const UChar *characters; +@@ -125,7 +126,7 @@ + virtual ~BytesDictionaryMatcher(); + virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit, + int32_t *lengths, int32_t *cpLengths, int32_t *values, +- int32_t *prefix) const; ++ int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const; + virtual int32_t getType() const; + private: + UChar32 transform(UChar32 c) const; +diff -ur icu.org/source/data/Makefile.in icu/source/data/Makefile.in +--- icu.org/source/data/Makefile.in 2017-04-21 23:13:03.248087545 +0200 ++++ icu/source/data/Makefile.in 2017-04-21 23:14:23.846894372 +0200 +@@ -183,7 +183,7 @@ + endif + endif + +-packagedata: icupkg.inc $(PKGDATA_LIST) build-local ++packagedata: icupkg.inc $(PKGDATA_LIST) build-local $(MAINBUILDDIR)/khmerdict.stamp + ifneq ($(ENABLE_STATIC),) + ifeq ($(PKGDATA_MODE),dll) + $(PKGDATA_INVOKE) $(PKGDATA) -e $(ICUDATA_ENTRY_POINT) -T $(OUTTMPDIR) -p $(ICUDATA_NAME) $(PKGDATA_LIBSTATICNAME) -m static $(PKGDATA_VERSIONING) $(PKGDATA_LIST) +@@ -567,8 +567,14 @@ + $(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1000 -c -i $(BUILDDIR) $(DICTSRCDIR)/burmesedict.txt $(BRKBLDDIR)/burmesedict.dict + + # TODO: figure out why combining characters are here? 
+-$(BRKBLDDIR)/khmerdict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES) +- $(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(DICTSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict ++#$(BRKBLDDIR)/khmerdict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES) ++# $(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(DICTSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict ++ ++#$(MAINBUILDDIR)/khmerdict.stamp: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(BRKSRCDIR)/khmerdict.txt build-local ++# $(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(DICTSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict ++$(MAINBUILDDIR)/khmerdict.stamp: $(BRKSRCDIR)/khmerdict.dict build-local ++ cp $< $(BRKBLDDIR) ++ echo "timestamp" > $@ + + #################################################### CFU + # CFU FILES diff --git a/external/icu/icu4c-macosx.patch b/external/icu/icu4c-macosx.patch deleted file mode 100644 index 8f9f9a75507e..000000000000 --- a/external/icu/icu4c-macosx.patch +++ /dev/null @@ -1,20 +0,0 @@ ---- misc/icu/source/common/putil.cpp -+++ misc/build/icu/source/common/putil.cpp -@@ -1111,8 +1111,16 @@ - static const time_t decemberSolstice=1198332540; /*2007-12-22 06:09 UT*/ - - /* This probing will tell us when daylight savings occurs. 
*/ -+#if U_PLATFORM_IS_DARWIN_BASED -+ struct tm *tmp; -+ tmp = localtime(&juneSolstice); -+ juneSol = *tmp; -+ tmp = localtime(&decemberSolstice); -+ decemberSol = *tmp; -+#else - localtime_r(&juneSolstice, &juneSol); - localtime_r(&decemberSolstice, &decemberSol); -+#endif - if(decemberSol.tm_isdst > 0) { - daylightType = U_DAYLIGHT_DECEMBER; - } else if(juneSol.tm_isdst > 0) { - diff --git a/external/icu/icu4c-macosx.patch.1 b/external/icu/icu4c-macosx.patch.1 new file mode 100644 index 000000000000..fee08eb05771 --- /dev/null +++ b/external/icu/icu4c-macosx.patch.1 @@ -0,0 +1,20 @@ +diff -ur icu.org/source/common/putil.cpp icu/source/common/putil.cpp +--- icu.org/source/common/putil.cpp 2017-04-10 16:22:16.000000000 +0200 ++++ icu/source/common/putil.cpp 2017-04-21 22:14:09.940217733 +0200 +@@ -1198,8 +1198,16 @@ + static const time_t decemberSolstice=1198332540; /*2007-12-22 06:09 UT*/ + + /* This probing will tell us when daylight savings occurs. */ ++#if U_PLATFORM_IS_DARWIN_BASED ++ struct tm *tmp; ++ tmp = localtime(&juneSolstice); ++ juneSol = *tmp; ++ tmp = localtime(&decemberSolstice); ++ decemberSol = *tmp; ++#else + localtime_r(&juneSolstice, &juneSol); + localtime_r(&decemberSolstice, &decemberSol); ++#endif + if(decemberSol.tm_isdst > 0) { + daylightType = U_DAYLIGHT_DECEMBER; + } else if(juneSol.tm_isdst > 0) { diff --git a/external/icu/icu4c-mkdir.patch b/external/icu/icu4c-mkdir.patch deleted file mode 100644 index 094ddd5abca4..000000000000 --- a/external/icu/icu4c-mkdir.patch +++ /dev/null @@ -1,10 +0,0 @@ ---- misc/icu/source/dataMakefile.in.sav 2012-04-05 22:49:20.000000000 +0200 -+++ build/icu/source/data/Makefile.in 2012-12-04 14:24:40.548026700 +0100 -@@ -363,6 +363,7 @@ - ifeq ($(PKGDATA_MODE),dll) - SO_VERSION_DATA = $(OUTTMPDIR)/icudata.res - $(SO_VERSION_DATA) : $(MISCSRCDIR)/icudata.rc -+ mkdir -p $(OUTTMPDIR) - ifeq ($(MSYS_RC_MODE),1) - rc.exe -i$(srcdir)/../common -i$(top_builddir)/common -fo$@ $(CPPFLAGS) $< - else diff --git 
a/external/icu/icu4c-mkdir.patch.1 b/external/icu/icu4c-mkdir.patch.1 new file mode 100644 index 000000000000..3234f151b677 --- /dev/null +++ b/external/icu/icu4c-mkdir.patch.1 @@ -0,0 +1,11 @@ +diff -ur icu.org/source/data/Makefile.in icu/source/data/Makefile.in +--- icu.org/source/data/Makefile.in 2016-06-15 20:58:17.000000000 +0200 ++++ icu/source/data/Makefile.in 2017-04-21 22:29:00.747158002 +0200 +@@ -367,6 +367,7 @@ + ifeq ($(PKGDATA_MODE),dll) + SO_VERSION_DATA = $(OUTTMPDIR)/icudata.res + $(SO_VERSION_DATA) : $(MISCSRCDIR)/icudata.rc ++ mkdir -p $(OUTTMPDIR) + ifeq ($(MSYS_RC_MODE),1) + rc.exe -i$(srcdir)/../common -i$(top_builddir)/common -fo$@ $(CPPFLAGS) $< + else diff --git a/external/icu/icu4c-rpath.patch b/external/icu/icu4c-rpath.patch deleted file mode 100644 index ba1a82595aea..000000000000 --- a/external/icu/icu4c-rpath.patch +++ /dev/null @@ -1,35 +0,0 @@ ---- misc/icu/source/data/pkgdataMakefile.in 2010-12-22 23:44:02.000000000 +0100 -+++ misc/build/icu/source/data/pkgdataMakefile.in 2011-01-03 17:52:44.000000000 +0100 -@@ -15,6 +15,9 @@ include $(top_builddir)/icudefs.mk - OUTPUTFILE=icupkg.inc - MIDDLE_SO_TARGET= - -+# escape $ with \ when passing to echo; needed to preserve $ORIGIN -+SHLIB.c.shell := $(subst $$,\$$,$(SHLIB.c)) -+ - all : clean - @echo GENCCODE_ASSEMBLY_TYPE=$(GENCCODE_ASSEMBLY) >> $(OUTPUTFILE) - @echo SO=$(SO) >> $(OUTPUTFILE) -@@ -24,7 +27,7 @@ all : clean - @echo LIB_EXT_ORDER=$(FINAL_SO_TARGET) >> $(OUTPUTFILE) - @echo COMPILE="$(COMPILE.c)" >> $(OUTPUTFILE) - @echo LIBFLAGS="-I$(top_srcdir)/common -I$(top_builddir)/common $(SHAREDLIBCPPFLAGS) $(SHAREDLIBCFLAGS)" >> $(OUTPUTFILE) -- @echo GENLIB="$(SHLIB.c)" >> $(OUTPUTFILE) -+ @echo GENLIB="$(SHLIB.c.shell)" >> $(OUTPUTFILE) - @echo LDICUDTFLAGS=$(LDFLAGSICUDT) >> $(OUTPUTFILE) - @echo LD_SONAME=$(LD_SONAME) >> $(OUTPUTFILE) - @echo RPATH_FLAGS=$(RPATH_FLAGS) >> $(OUTPUTFILE) ---- misc/icu/source/config/mh-linux 2010-09-29 20:37:36.000000000 +0200 -+++ 
misc/build/icu/source/config/mh-linux 2011-03-15 10:56:26.653056004 +0100 -@@ -20,6 +20,10 @@ - LD_RPATH= -Wl,-zorigin,-rpath,'$$'ORIGIN - LD_RPATH_PRE = -Wl,-rpath, - -+## Force RPATH=$ORIGIN to locate own dependencies w/o need for LD_LIBRARY_PATH: -+ENABLE_RPATH=YES -+RPATHLDFLAGS=${LD_RPATH_PRE}'$$ORIGIN' -+ - ## These are the library specific LDFLAGS - LDFLAGSICUDT=-nodefaultlibs -nostdlib - - diff --git a/external/icu/icu4c-rpath.patch.1 b/external/icu/icu4c-rpath.patch.1 new file mode 100644 index 000000000000..debbab1a6e3d --- /dev/null +++ b/external/icu/icu4c-rpath.patch.1 @@ -0,0 +1,36 @@ +diff -ur icu.org/source/config/mh-linux icu/source/config/mh-linux +--- icu.org/source/config/mh-linux 2016-06-15 20:58:17.000000000 +0200 ++++ icu/source/config/mh-linux 2017-04-21 22:38:18.893927819 +0200 +@@ -22,6 +22,10 @@ + LD_RPATH= -Wl,-zorigin,-rpath,'$$'ORIGIN + LD_RPATH_PRE = -Wl,-rpath, + ++## Force RPATH=$ORIGIN to locate own dependencies w/o need for LD_LIBRARY_PATH: ++ENABLE_RPATH=YES ++RPATHLDFLAGS=${LD_RPATH_PRE}'$$ORIGIN' ++ + ## These are the library specific LDFLAGS + LDFLAGSICUDT=-nodefaultlibs -nostdlib + +diff -ur icu.org/source/data/pkgdataMakefile.in icu/source/data/pkgdataMakefile.in +--- icu.org/source/data/pkgdataMakefile.in 2016-06-15 20:58:17.000000000 +0200 ++++ icu/source/data/pkgdataMakefile.in 2017-04-21 22:38:18.892927822 +0200 +@@ -17,6 +17,9 @@ + OUTPUTFILE=icupkg.inc + MIDDLE_SO_TARGET= + ++# escape $ with \ when passing to echo; needed to preserve $ORIGIN ++SHLIB.c.shell := $(subst $$,\$$,$(SHLIB.c)) ++ + all : clean + @echo GENCCODE_ASSEMBLY_TYPE=$(GENCCODE_ASSEMBLY) >> $(OUTPUTFILE) + @echo SO=$(SO) >> $(OUTPUTFILE) +@@ -26,7 +29,7 @@ + @echo LIB_EXT_ORDER=$(FINAL_SO_TARGET) >> $(OUTPUTFILE) + @echo COMPILE="$(COMPILE.c)" >> $(OUTPUTFILE) + @echo LIBFLAGS="-I$(top_srcdir)/common -I$(top_builddir)/common $(SHAREDLIBCPPFLAGS) $(SHAREDLIBCFLAGS)" >> $(OUTPUTFILE) +- @echo GENLIB="$(SHLIB.c)" >> $(OUTPUTFILE) ++ @echo 
GENLIB="$(SHLIB.c.shell)" >> $(OUTPUTFILE) + @echo LDICUDTFLAGS=$(LDFLAGSICUDT) >> $(OUTPUTFILE) + @echo LD_SONAME=$(LD_SONAME) >> $(OUTPUTFILE) + @echo RPATH_FLAGS=$(RPATH_FLAGS) >> $(OUTPUTFILE) diff --git a/external/icu/icu4c-rtti.patch.1 b/external/icu/icu4c-rtti.patch.1 new file mode 100644 index 000000000000..c058c7f3c87e --- /dev/null +++ b/external/icu/icu4c-rtti.patch.1 @@ -0,0 +1,12 @@ +diff -ur icu.org/source/config/mh-linux icu/source/config/mh-linux +--- icu.org/source/config/mh-linux 2017-04-21 23:01:23.257769703 +0200 ++++ icu/source/config/mh-linux 2017-04-21 23:03:23.166481552 +0200 +@@ -36,7 +36,7 @@ + #SH# LD_SONAME= + + ## Shared library options +-LD_SOOPTIONS= -Wl,-Bsymbolic ++LD_SOOPTIONS= -Wl,-Bsymbolic-functions + + ## Shared object suffix + SO = so diff --git a/external/icu/icu4c-scriptrun.patch b/external/icu/icu4c-scriptrun.patch deleted file mode 100644 index e307811acaad..000000000000 --- a/external/icu/icu4c-scriptrun.patch +++ /dev/null @@ -1,58 +0,0 @@ ---- misc/icu/source/extra/scrptrun/scrptrun.cpp -+++ misc/build/icu/source/extra/scrptrun/scrptrun.cpp -@@ -150,7 +150,11 @@ - // characters above it on the stack will be poped. - if (pairIndex >= 0) { - if ((pairIndex & 1) == 0) { -- parenStack[++parenSP].pairIndex = pairIndex; -+ ++parenSP; -+ int32_t nVecSize = parenStack.size(); -+ if (parenSP == nVecSize) -+ parenStack.resize(nVecSize + 128); -+ parenStack[parenSP].pairIndex = pairIndex; - parenStack[parenSP].scriptCode = scriptCode; - } else if (parenSP >= 0) { - int32_t pi = pairIndex & ~1; -@@ -184,7 +188,14 @@ - // pop it from the stack - if (pairIndex >= 0 && (pairIndex & 1) != 0 && parenSP >= 0) { - parenSP -= 1; -- startSP -= 1; -+ /* decrement startSP only if it is >= 0, -+ decrementing it unnecessarily will lead to memory corruption -+ while processing the above while block. -+ e.g. 
startSP = -4 , parenSP = -1 -+ */ -+ if (startSP >= 0) { -+ startSP -= 1; -+ } - } - } else { - // if the run broke on a surrogate pair, ---- misc/icu/source/extra/scrptrun/scrptrun.h -+++ misc/build/icu/source/extra/scrptrun/scrptrun.h -@@ -17,6 +17,7 @@ - #include "unicode/utypes.h" - #include "unicode/uobject.h" - #include "unicode/uscript.h" -+#include - - struct ScriptRecord - { -@@ -79,7 +80,7 @@ - int32_t scriptEnd; - UScriptCode scriptCode; - -- ParenStackEntry parenStack[128]; -+ std::vector parenStack; - int32_t parenSP; - - static int8_t highBit(int32_t value); -@@ -133,6 +134,7 @@ - scriptEnd = charStart; - scriptCode = USCRIPT_INVALID_CODE; - parenSP = -1; -+ parenStack.resize(128); - } - - inline void ScriptRun::reset(int32_t start, int32_t length) diff --git a/external/icu/icu4c-scriptrun.patch.1 b/external/icu/icu4c-scriptrun.patch.1 new file mode 100644 index 000000000000..fe81d19c846e --- /dev/null +++ b/external/icu/icu4c-scriptrun.patch.1 @@ -0,0 +1,60 @@ +diff -ur icu.org/source/extra/scrptrun/scrptrun.cpp icu/source/extra/scrptrun/scrptrun.cpp +--- icu.org/source/extra/scrptrun/scrptrun.cpp 2017-01-20 01:20:31.000000000 +0100 ++++ icu/source/extra/scrptrun/scrptrun.cpp 2017-04-21 22:59:31.708037770 +0200 +@@ -151,7 +151,11 @@ + // characters above it on the stack will be poped. + if (pairIndex >= 0) { + if ((pairIndex & 1) == 0) { +- parenStack[++parenSP].pairIndex = pairIndex; ++ ++parenSP; ++ int32_t nVecSize = parenStack.size(); ++ if (parenSP == nVecSize) ++ parenStack.resize(nVecSize + 128); ++ parenStack[parenSP].pairIndex = pairIndex; + parenStack[parenSP].scriptCode = scriptCode; + } else if (parenSP >= 0) { + int32_t pi = pairIndex & ~1; +@@ -185,7 +189,14 @@ + // pop it from the stack + if (pairIndex >= 0 && (pairIndex & 1) != 0 && parenSP >= 0) { + parenSP -= 1; +- startSP -= 1; ++ /* decrement startSP only if it is >= 0, ++ decrementing it unnecessarily will lead to memory corruption ++ while processing the above while block. 
++ e.g. startSP = -4 , parenSP = -1 ++ */ ++ if (startSP >= 0) { ++ startSP -= 1; ++ } + } + } else { + // if the run broke on a surrogate pair, +diff -ur icu.org/source/extra/scrptrun/scrptrun.h icu/source/extra/scrptrun/scrptrun.h +--- icu.org/source/extra/scrptrun/scrptrun.h 2017-01-20 01:20:31.000000000 +0100 ++++ icu/source/extra/scrptrun/scrptrun.h 2017-04-21 22:59:31.708037770 +0200 +@@ -19,6 +19,7 @@ + #include "unicode/utypes.h" + #include "unicode/uobject.h" + #include "unicode/uscript.h" ++#include + + struct ScriptRecord + { +@@ -81,7 +82,7 @@ + int32_t scriptEnd; + UScriptCode scriptCode; + +- ParenStackEntry parenStack[128]; ++ std::vector parenStack; + int32_t parenSP; + + static int8_t highBit(int32_t value); +@@ -135,6 +136,7 @@ + scriptEnd = charStart; + scriptCode = USCRIPT_INVALID_CODE; + parenSP = -1; ++ parenStack.resize(128); + } + + inline void ScriptRun::reset(int32_t start, int32_t length) diff --git a/external/icu/icu4c-solarisgcc.patch b/external/icu/icu4c-solarisgcc.patch deleted file mode 100644 index a47d9b1e3506..000000000000 --- a/external/icu/icu4c-solarisgcc.patch +++ /dev/null @@ -1,12 +0,0 @@ ---- build/icu.old/source/common/uposixdefs.h -+++ build/icu/source/common/uposixdefs.h -@@ -52,7 +52,7 @@ - * - * z/OS needs this definition for timeval and to get usleep. 
- */ --#if !defined(_XOPEN_SOURCE_EXTENDED) -+#if !defined(_XOPEN_SOURCE_EXTENDED) && (defined(__IBMC__) || defined(__IBMCPP__)) - # define _XOPEN_SOURCE_EXTENDED 1 - #endif - - diff --git a/external/icu/icu4c-solarisgcc.patch.1 b/external/icu/icu4c-solarisgcc.patch.1 new file mode 100644 index 000000000000..6000ed0cb9e2 --- /dev/null +++ b/external/icu/icu4c-solarisgcc.patch.1 @@ -0,0 +1,12 @@ +diff -ur icu.org/source/common/uposixdefs.h icu/source/common/uposixdefs.h +--- icu.org/source/common/uposixdefs.h 2017-03-09 03:12:45.000000000 +0100 ++++ icu/source/common/uposixdefs.h 2017-04-21 22:23:11.857926971 +0200 +@@ -54,7 +54,7 @@ + * + * z/OS needs this definition for timeval and to get usleep. + */ +-#if !defined(_XOPEN_SOURCE_EXTENDED) && defined(__TOS_MVS__) ++#if !defined(_XOPEN_SOURCE_EXTENDED) && (defined(__TOS_MVS__) || defined(__IBMC__) || defined(__IBMCPP__)) + # define _XOPEN_SOURCE_EXTENDED 1 + #endif + diff --git a/external/icu/icu4c-ubsan.patch.1 b/external/icu/icu4c-ubsan.patch.1 new file mode 100644 index 000000000000..56594005e9d0 --- /dev/null +++ b/external/icu/icu4c-ubsan.patch.1 @@ -0,0 +1,171 @@ +diff -ur icu.org/source/common/rbbidata.h icu/source/common/rbbidata.h +--- icu.org/source/common/rbbidata.h 2017-02-03 19:57:23.000000000 +0100 ++++ icu/source/common/rbbidata.h 2017-04-21 22:46:25.371651160 +0200 +@@ -115,7 +115,7 @@ + /* StatusTable of the set of matching */ + /* tags (rule status values) */ + int16_t fReserved; +- uint16_t fNextState[2]; /* Next State, indexed by char category. */ ++ uint16_t fNextState[1]; /* Next State, indexed by char category. */ + /* This array does not have two elements */ + /* Array Size is actually fData->fHeader->fCatCount */ + /* CAUTION: see RBBITableBuilder::getTableSize() */ +@@ -128,7 +128,7 @@ + uint32_t fRowLen; /* Length of a state table row, in bytes. 
*/ + uint32_t fFlags; /* Option Flags for this state table */ + uint32_t fReserved; /* reserved */ +- char fTableData[4]; /* First RBBIStateTableRow begins here. */ ++ char fTableData[1]; /* First RBBIStateTableRow begins here. */ + /* (making it char[] simplifies ugly address */ + /* arithmetic for indexing variable length rows.) */ + }; +diff -ur icu.org/source/common/rbbitblb.cpp icu/source/common/rbbitblb.cpp +--- icu.org/source/common/rbbitblb.cpp 2017-01-20 01:20:31.000000000 +0100 ++++ icu/source/common/rbbitblb.cpp 2017-04-21 22:46:25.373651159 +0200 +@@ -1095,15 +1095,15 @@ + return 0; + } + +- size = sizeof(RBBIStateTable) - 4; // The header, with no rows to the table. ++ size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table. + + numRows = fDStates->size(); + numCols = fRB->fSetBuilder->getNumCharCategories(); + +- // Note The declaration of RBBIStateTableRow is for a table of two columns. +- // Therefore we subtract two from numCols when determining ++ // Note The declaration of RBBIStateTableRow is for a table of one columns. ++ // Therefore we subtract one from numCols when determining + // how much storage to add to a row for the total columns. 
+- rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2); ++ rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-1); + size += numRows * rowSize; + return size; + } +@@ -1133,7 +1133,7 @@ + } + + table->fRowLen = sizeof(RBBIStateTableRow) + +- sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2); ++ sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 1); + table->fNumStates = fDStates->size(); + table->fFlags = 0; + if (fRB->fLookAheadHardBreak) { +diff -ur icu.org/source/common/ubidiimp.h icu/source/common/ubidiimp.h +--- icu.org/source/common/ubidiimp.h 2017-02-03 19:57:23.000000000 +0100 ++++ icu/source/common/ubidiimp.h 2017-04-21 22:46:25.374651159 +0200 +@@ -198,8 +198,8 @@ + /* in a Run, logicalStart will get this bit set if the run level is odd */ + #define INDEX_ODD_BIT (1UL<<31) + +-#define MAKE_INDEX_ODD_PAIR(index, level) ((index)|((int32_t)(level)<<31)) +-#define ADD_ODD_BIT_FROM_LEVEL(x, level) ((x)|=((int32_t)(level)<<31)) ++#define MAKE_INDEX_ODD_PAIR(index, level) ((index)|((uint32_t)(level)<<31)) ++#define ADD_ODD_BIT_FROM_LEVEL(x, level) ((x)|=((uint32_t)(level)<<31)) + #define REMOVE_ODD_BIT(x) ((x)&=~INDEX_ODD_BIT) + + #define GET_INDEX(x) ((x)&~INDEX_ODD_BIT) +diff -ur icu.org/source/common/ucharstriebuilder.cpp icu/source/common/ucharstriebuilder.cpp +--- icu.org/source/common/ucharstriebuilder.cpp 2017-02-03 19:57:23.000000000 +0100 ++++ icu/source/common/ucharstriebuilder.cpp 2017-04-21 22:46:25.375651159 +0200 +@@ -287,7 +287,7 @@ + + UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode) + : LinearMatchNode(len, nextNode), s(units) { +- hash=hash*37+ustr_hashUCharsN(units, len); ++ hash=hash*37U+ustr_hashUCharsN(units, len); + } + + UBool +diff -ur icu.org/source/common/ucmndata.cpp icu/source/common/ucmndata.cpp +--- icu.org/source/common/ucmndata.cpp 2017-03-08 16:34:47.000000000 +0100 ++++ icu/source/common/ucmndata.cpp 2017-04-21 
22:46:25.376651159 +0200 +@@ -77,7 +77,7 @@ + typedef struct { + uint32_t count; + uint32_t reserved; +- PointerTOCEntry entry[2]; /* Actual size is from count. */ ++ PointerTOCEntry entry[1]; /* Actual size is from count. */ + } PointerTOC; + + +diff -ur icu.org/source/common/ucmndata.h icu/source/common/ucmndata.h +--- icu.org/source/common/ucmndata.h 2017-01-20 01:20:31.000000000 +0100 ++++ icu/source/common/ucmndata.h 2017-04-21 22:46:25.377651159 +0200 +@@ -52,7 +52,7 @@ + + typedef struct { + uint32_t count; +- UDataOffsetTOCEntry entry[2]; /* Actual size of array is from count. */ ++ UDataOffsetTOCEntry entry[1]; /* Actual size of array is from count. */ + } UDataOffsetTOC; + + /** +diff -ur icu.org/source/common/unicode/stringtriebuilder.h icu/source/common/unicode/stringtriebuilder.h +--- icu.org/source/common/unicode/stringtriebuilder.h 2017-03-10 23:01:34.000000000 +0100 ++++ icu/source/common/unicode/stringtriebuilder.h 2017-04-21 22:47:43.395634383 +0200 +@@ -276,7 +276,7 @@ + void setValue(int32_t v) { + hasValue=TRUE; + value=v; +- hash=hash*37+v; ++ hash=hash*37U+v; + } + protected: + UBool hasValue; +@@ -307,7 +307,7 @@ + class LinearMatchNode : public ValueNode { + public: + LinearMatchNode(int32_t len, Node *nextNode) +- : ValueNode((0x333333*37+len)*37+hashCode(nextNode)), ++ : ValueNode((0x333333U*37+len)*37+hashCode(nextNode)), + length(len), next(nextNode) {} + virtual UBool operator==(const Node &other) const; + virtual int32_t markRightEdgesFirst(int32_t edgeNumber); +@@ -342,7 +342,7 @@ + equal[length]=NULL; + values[length]=value; + ++length; +- hash=(hash*37+c)*37+value; ++ hash=(hash*37U+c)*37+value; + } + // Adds a unit which leads to another match node. + void add(int32_t c, Node *node) { +@@ -350,7 +350,7 @@ + equal[length]=node; + values[length]=0; + ++length; +- hash=(hash*37+c)*37+hashCode(node); ++ hash=(hash*37U+c)*37+hashCode(node); + } + protected: + Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value". 
+@@ -365,7 +365,7 @@ + class SplitBranchNode : public BranchNode { + public: + SplitBranchNode(char16_t middleUnit, Node *lessThanNode, Node *greaterOrEqualNode) +- : BranchNode(((0x555555*37+middleUnit)*37+ ++ : BranchNode(((0x555555U*37+middleUnit)*37+ + hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)), + unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {} + virtual UBool operator==(const Node &other) const; +@@ -382,7 +382,7 @@ + class BranchHeadNode : public ValueNode { + public: + BranchHeadNode(int32_t len, Node *subNode) +- : ValueNode((0x666666*37+len)*37+hashCode(subNode)), ++ : ValueNode((0x666666U*37+len)*37+hashCode(subNode)), + length(len), next(subNode) {} + virtual UBool operator==(const Node &other) const; + virtual int32_t markRightEdgesFirst(int32_t edgeNumber); +diff -ur icu.org/source/i18n/collationdatareader.cpp icu/source/i18n/collationdatareader.cpp +--- icu.org/source/i18n/collationdatareader.cpp 2017-01-20 01:20:31.000000000 +0100 ++++ icu/source/i18n/collationdatareader.cpp 2017-04-21 22:46:25.380651158 +0200 +@@ -419,6 +419,7 @@ + tailoring.data, ts, fastLatinPrimaries, UPRV_LENGTHOF(fastLatinPrimaries)); + if(options == ts.options && ts.variableTop != 0 && + reorderCodesLength == ts.reorderCodesLength && ++ reorderCodesLength != 0 && + uprv_memcmp(reorderCodes, ts.reorderCodes, reorderCodesLength * 4) == 0 && + fastLatinOptions == ts.fastLatinOptions && + (fastLatinOptions < 0 || diff --git a/external/icu/icu4c-warnings.patch b/external/icu/icu4c-warnings.patch deleted file mode 100644 index 96608d7d1ef3..000000000000 --- a/external/icu/icu4c-warnings.patch +++ /dev/null @@ -1,10 +0,0 @@ ---- misc/icu/source/common/unicode/utf16.h -+++ misc/build/icu/source/common/unicode/utf16.h -@@ -319,6 +319,7 @@ - (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ - } else /* c>0x10ffff or not enough space */ { \ - (isError)=TRUE; \ -+ (void)(isError); \ - } \ - } - diff --git a/external/icu/icu4c-warnings.patch.1 
b/external/icu/icu4c-warnings.patch.1 new file mode 100644 index 000000000000..3c39ba12ded7 --- /dev/null +++ b/external/icu/icu4c-warnings.patch.1 @@ -0,0 +1,11 @@ +diff -ur icu.org/source/common/unicode/utf16.h icu/source/common/unicode/utf16.h +--- icu.org/source/common/unicode/utf16.h 2017-02-03 19:57:23.000000000 +0100 ++++ icu/source/common/unicode/utf16.h 2017-04-21 22:05:57.414397617 +0200 +@@ -321,6 +321,7 @@ + (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ + } else /* c>0x10ffff or not enough space */ { \ + (isError)=TRUE; \ ++ (void)(isError); \ + } \ + } + diff --git a/external/icu/khmerbreakengine.patch b/external/icu/khmerbreakengine.patch deleted file mode 100644 index 8f81f315da3e..000000000000 --- a/external/icu/khmerbreakengine.patch +++ /dev/null @@ -1,1110 +0,0 @@ -diff --git a/source/common/dictbe.cpp b/source/common/dictbe.cpp -index f1c874d..3ad1b3f 100644 ---- misc/icu/source/common/dictbe.cpp -+++ build/icu/source/common/dictbe.cpp -@@ -27,8 +27,17 @@ U_NAMESPACE_BEGIN - ****************************************************************** - */ - --DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) { -+DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) : -+ clusterLimit(3) -+{ -+ UErrorCode status = U_ZERO_ERROR; - fTypes = breakTypes; -+ fViramaSet.applyPattern(UNICODE_STRING_SIMPLE("[[:ccc=VR:]]"), status); -+ -+ // note Skip Sets contain fIgnoreSet characters too. 
-+ fSkipStartSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=OP:][:lb=QU:]\\u200C\\u200D\\u2060]"), status); -+ fSkipEndSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CP:][:lb=QU:][:lb=EX:][:lb=CL:]\\u200C\\u200D\\u2060]"), status); -+ fNBeforeSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CR:][:lb=LF:][:lb=NL:][:lb=SP:][:lb=ZW:][:lb=IS:][:lb=BA:][:lb=NS:]]"), status); - } - - DictionaryBreakEngine::~DictionaryBreakEngine() { -@@ -90,7 +99,7 @@ DictionaryBreakEngine::findBreaks( UText *text, - result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks); - utext_setNativeIndex(text, current); - } -- -+ - return result; - } - -@@ -101,6 +110,169 @@ DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) { - fSet.compact(); - } - -+bool -+DictionaryBreakEngine::scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const { -+ UErrorCode status = U_ZERO_ERROR; -+ UText* ut = utext_clone(NULL, text, false, true, &status); -+ utext_setNativeIndex(ut, start); -+ UChar32 c = utext_current32(ut); -+ bool res = false; -+ doBreak = true; -+ while (start >= 0) { -+ if (!fSkipStartSet.contains(c)) { -+ res = (c == ZWSP); -+ break; -+ } -+ --start; -+ c = utext_previous32(ut); -+ doBreak = false; -+ } -+ utext_close(ut); -+ return res; -+} -+ -+bool -+DictionaryBreakEngine::scanAfterEnd(UText *text, int32_t textEnd, int32_t& end, bool &doBreak) const { -+ UErrorCode status = U_ZERO_ERROR; -+ UText* ut = utext_clone(NULL, text, false, true, &status); -+ utext_setNativeIndex(ut, end); -+ UChar32 c = utext_current32(ut); -+ bool res = false; -+ doBreak = !fNBeforeSet.contains(c); -+ while (end < textEnd) { -+ if (!fSkipEndSet.contains(c)) { -+ res = (c == ZWSP); -+ break; -+ } -+ ++end; -+ c = utext_next32(ut); -+ doBreak = false; -+ } -+ utext_close(ut); -+ return res; -+} -+ -+void -+DictionaryBreakEngine::scanBackClusters(UText *text, int32_t textStart, int32_t& start) const { -+ UChar32 c = 0; -+ start = utext_getNativeIndex(text); -+ while (start > 
textStart) { -+ c = utext_previous32(text); -+ --start; -+ if (!fSkipEndSet.contains(c)) -+ break; -+ } -+ for (int i = 0; i < clusterLimit; ++i) { // scan backwards clusterLimit clusters -+ while (start > textStart) { -+ while (fIgnoreSet.contains(c)) -+ c = utext_previous32(text); -+ if (!fMarkSet.contains(c)) { -+ if (fBaseSet.contains(c)) { -+ c = utext_previous32(text); -+ if (!fViramaSet.contains(c)) { // Virama (e.g. coeng) preceding base. Treat sequence as a mark -+ utext_next32(text); -+ c = utext_current32(text); -+ break; -+ } else { -+ --start; -+ } -+ } else { -+ break; -+ } -+ } -+ c = utext_previous32(text); -+ --start; -+ } -+ if (!fBaseSet.contains(c) || start < textStart) { // not a cluster start so finish -+ break; -+ } -+ c = utext_previous32(text); -+ --start; // go round again -+ } // ignore hitting previous inhibitor since scanning for it should have found us! -+ ++start; // counteract --before -+} -+ -+void -+DictionaryBreakEngine::scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const { -+ UChar32 c = utext_current32(text); -+ end = utext_getNativeIndex(text); -+ while (end < textEnd) { -+ if (!fSkipStartSet.contains(c)) -+ break; -+ utext_next32(text); -+ c = utext_current32(text); -+ ++end; -+ } -+ for (int i = 0; i < clusterLimit; ++i) { // scan forwards clusterLimit clusters -+ while (fIgnoreSet.contains(c)) { -+ utext_next32(text); -+ c = utext_current32(text); -+ } -+ if (fBaseSet.contains(c)) { -+ while (end < textEnd) { -+ utext_next32(text); -+ c = utext_current32(text); -+ ++end; -+ if (!fMarkSet.contains(c)) -+ break; -+ else if (fViramaSet.contains(c)) { // handle coeng + base as mark -+ utext_next32(text); -+ c = utext_current32(text); -+ ++end; -+ if (!fBaseSet.contains(c)) -+ break; -+ } -+ } -+ } else { -+ --end; // bad char so break after char before it -+ break; -+ } -+ } -+} -+ -+bool -+DictionaryBreakEngine::scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const { -+ UErrorCode 
status = U_ZERO_ERROR; -+ UText* ut = utext_clone(NULL, text, false, true, &status); -+ int32_t nat = start; -+ utext_setNativeIndex(ut, nat); -+ bool foundFirst = true; -+ int32_t curr = start; -+ while (nat < end) { -+ UChar32 c = utext_current32(ut); -+ if (c == ZWSP || c == WJ) { -+ curr = nat + 1; -+ if (foundFirst) // only scan backwards for first inhibitor -+ scanBackClusters(ut, start, before); -+ foundFirst = false; // don't scan backwards if we go around again. Also marks found something -+ -+ utext_next32(ut); -+ scanFwdClusters(ut, end, after); -+ nat = after + 1; -+ -+ if (c == ZWSP || c == WJ) { // did we hit another one? -+ continue; -+ } else { -+ break; -+ } -+ } -+ -+ ++nat; // keep hunting -+ utext_next32(ut); -+ } -+ -+ utext_close(ut); -+ -+ if (nat >= end && foundFirst) { -+ start = before = after = nat; -+ return false; // failed to find anything -+ } -+ else { -+ start = curr; -+ } -+ return true; // yup hit one -+} -+ - /* - ****************************************************************** - * PossibleWord -@@ -128,35 +302,35 @@ private: - public: - PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {}; - ~PossibleWord() {}; -- -+ - // Fill the list of candidates if needed, select the longest, and return the number found -- int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ); -- -+ int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet = NULL, int32_t minLength = 0 ); -+ - // Select the currently marked candidate, point after it in the text, and invalidate self - int32_t acceptMarked( UText *text ); -- -+ - // Back up from the current candidate to the next shorter one; return TRUE if that exists - // and point the text after it - UBool backUp( UText *text ); -- -+ - // Return the longest prefix this candidate location shares with a dictionary word - // Return value is in code points. 
- int32_t longestPrefix() { return prefix; }; -- -+ - // Mark the current candidate as the one we like - void markCurrent() { mark = current; }; -- -+ - // Get length in code points of the marked word. - int32_t markedCPLength() { return cpLengths[mark]; }; - }; - - --int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) { -+int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet, int32_t minLength) { - // TODO: If getIndex is too slow, use offset < 0 and add discardAll() - int32_t start = (int32_t)utext_getNativeIndex(text); - if (start != offset) { - offset = start; -- count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix); -+ count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix, ignoreSet, minLength); - // Dictionary leaves text after longest prefix, not longest word. Back up. - if (count <= 0) { - utext_setNativeIndex(text, start); -@@ -828,51 +1002,28 @@ foundBest: - * KhmerBreakEngine - */ - --// How many words in a row are "good enough"? 
--static const int32_t KHMER_LOOKAHEAD = 3; -- --// Will not combine a non-word with a preceding dictionary word longer than this --static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3; -- --// Will not combine a non-word that shares at least this much prefix with a --// dictionary word, with a preceding word --static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3; -- --// Minimum word size --static const int32_t KHMER_MIN_WORD = 2; -- --// Minimum number of characters for two words --static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2; -- - KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) - : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)), - fDictionary(adoptDictionary) - { -- fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status); -+ -+ clusterLimit = 3; -+ -+ fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]\\u2060\\u200C\\u200D]"), status); - if (U_SUCCESS(status)) { - setCharacters(fKhmerWordSet); - } - fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); -- fMarkSet.add(0x0020); -- fEndWordSet = fKhmerWordSet; -- fBeginWordSet.add(0x1780, 0x17B3); -- //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels -- //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word -- //fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word -- fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters -- //fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels --// fEndWordSet.remove(0x0E31); // MAI HAN-AKAT --// fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI --// fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK --// fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI --// fSuffixSet.add(THAI_PAIYANNOI); --// fSuffixSet.add(THAI_MAIYAMOK); -+ fIgnoreSet.add(0x2060); // WJ -+ 
fIgnoreSet.add(0x200C, 0x200D); // ZWJ, ZWNJ -+ fBaseSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:lb=SA:]&[:^M:]]"), status); -+ fPuncSet.applyPattern(UNICODE_STRING_SIMPLE("[\\u17D4\\u17D5\\u17D6\\u17D7\\u17D9:]"), status); - - // Compact for caching. - fMarkSet.compact(); -- fEndWordSet.compact(); -- fBeginWordSet.compact(); --// fSuffixSet.compact(); -+ fIgnoreSet.compact(); -+ fBaseSet.compact(); -+ fPuncSet.compact(); - } - - KhmerBreakEngine::~KhmerBreakEngine() { -@@ -884,180 +1036,204 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text, - int32_t rangeStart, - int32_t rangeEnd, - UStack &foundBreaks ) const { -- if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { -- return 0; // Not enough characters for two words -+ uint32_t wordsFound = foundBreaks.size(); -+ UErrorCode status = U_ZERO_ERROR; -+ int32_t before = 0; -+ int32_t after = 0; -+ int32_t finalBefore = 0; -+ int32_t initAfter = 0; -+ int32_t scanStart = rangeStart; -+ int32_t scanEnd = rangeEnd; -+ -+ bool startZwsp = false; -+ bool breakStart = false; -+ bool breakEnd = false; -+ -+ if (rangeStart > 0) { -+ --scanStart; -+ startZwsp = scanBeforeStart(text, scanStart, breakStart); -+ } -+ utext_setNativeIndex(text, rangeStart); -+ scanFwdClusters(text, rangeEnd, initAfter); -+ bool endZwsp = scanAfterEnd(text, utext_nativeLength(text), scanEnd, breakEnd); -+ utext_setNativeIndex(text, rangeEnd - 1); -+ scanBackClusters(text, rangeStart, finalBefore); -+ if (finalBefore < initAfter) { // the whole run is tented so no breaks -+ if (breakStart || fTypes < UBRK_LINE) -+ foundBreaks.push(rangeStart, status); -+ if (breakEnd || fTypes < UBRK_LINE) -+ foundBreaks.push(rangeEnd, status); -+ return foundBreaks.size() - wordsFound; - } - -- uint32_t wordsFound = 0; -- int32_t cpWordLength = 0; -- int32_t cuWordLength = 0; -- int32_t current; -- UErrorCode status = U_ZERO_ERROR; -- PossibleWord words[KHMER_LOOKAHEAD]; -+ scanStart = rangeStart; -+ scanWJ(text, scanStart, rangeEnd, before, 
after); -+ if (startZwsp || initAfter >= before) { -+ after = initAfter; -+ before = 0; -+ } -+ if (!endZwsp && after > finalBefore && after < rangeEnd) -+ endZwsp = true; -+ if (endZwsp && before > finalBefore) -+ before = finalBefore; - - utext_setNativeIndex(text, rangeStart); -+ int32_t numCodePts = rangeEnd - rangeStart; -+ // bestSnlp[i] is the snlp of the best segmentation of the first i -+ // code points in the range to be matched. -+ UVector32 bestSnlp(numCodePts + 1, status); -+ bestSnlp.addElement(0, status); -+ for(int32_t i = 1; i <= numCodePts; i++) { -+ bestSnlp.addElement(kuint32max, status); -+ } - -- while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { -- cuWordLength = 0; -- cpWordLength = 0; -+ // prev[i] is the index of the last code point in the previous word in -+ // the best segmentation of the first i characters. Note negative implies -+ // that the code point is part of an unknown word. -+ UVector32 prev(numCodePts + 1, status); -+ for(int32_t i = 0; i <= numCodePts; i++) { -+ prev.addElement(kuint32max, status); -+ } - -- // Look for candidate words at the current position -- int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); -+ const int32_t maxWordSize = 20; -+ UVector32 values(maxWordSize, status); -+ values.setSize(maxWordSize); -+ UVector32 lengths(maxWordSize, status); -+ lengths.setSize(maxWordSize); - -- // If we found exactly one, use that -- if (candidates == 1) { -- cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text); -- cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength(); -- wordsFound += 1; -- } -+ // Dynamic programming to find the best segmentation. 
- -- // If there was more than one, see which one can take us forward the most words -- else if (candidates > 1) { -- // If we're already at the end of the range, we're done -- if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { -- goto foundBest; -- } -- do { -- int32_t wordsMatched = 1; -- if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { -- if (wordsMatched < 2) { -- // Followed by another dictionary word; mark first word as a good candidate -- words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); -- wordsMatched = 2; -- } -+ // In outer loop, i is the code point index, -+ // ix is the corresponding string (code unit) index. -+ // They differ when the string contains supplementary characters. -+ int32_t ix = rangeStart; -+ for (int32_t i = 0; i < numCodePts; ++i, utext_setNativeIndex(text, ++ix)) { -+ if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) { -+ continue; -+ } - -- // If we're already at the end of the range, we're done -- if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { -- goto foundBest; -- } -+ int32_t count; -+ count = fDictionary->matches(text, numCodePts - i, maxWordSize, -+ NULL, lengths.getBuffer(), values.getBuffer(), NULL, &fIgnoreSet, 2); -+ // Note: lengths is filled with code point lengths -+ // The NULL parameter is the ignored code unit lengths. 
- -- // See if any of the possible second words is followed by a third word -- do { -- // If we find a third word, stop right away -- if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { -- words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); -- goto foundBest; -- } -- } -- while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text)); -- } -+ for (int32_t j = 0; j < count; j++) { -+ int32_t ln = lengths.elementAti(j); -+ if (ln + i >= numCodePts) -+ continue; -+ utext_setNativeIndex(text, ln+ix); -+ int32_t c = utext_current32(text); -+ if (fMarkSet.contains(c) || c == 0x17D2) { // Coeng -+ lengths.removeElementAt(j); -+ values.removeElementAt(j); -+ --j; -+ --count; - } -- while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text)); --foundBest: -- cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text); -- cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength(); -- wordsFound += 1; - } -- -- // We come here after having either found a word or not. We look ahead to the -- // next word. If it's not a dictionary word, we will combine it with the word we -- // just found (if there is one), but only if the preceding word does not exceed -- // the threshold. -- // The text iterator should now be positioned at the end of the word we found. -- if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) { -- // if it is a dictionary word, do nothing. 
If it isn't, then if there is -- // no preceding word, or the non-word shares less than the minimum threshold -- // of characters with a dictionary word, then scan to resynchronize -- if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 -- && (cuWordLength == 0 -- || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) { -- // Look for a plausible word boundary -- int32_t remaining = rangeEnd - (current+cuWordLength); -- UChar32 pc; -- UChar32 uc; -- int32_t chars = 0; -- for (;;) { -- int32_t pcIndex = (int32_t)utext_getNativeIndex(text); -- pc = utext_next32(text); -- int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex; -- chars += pcSize; -- remaining -= pcSize; -- if (remaining <= 0) { -+ if (count == 0) { -+ utext_setNativeIndex(text, ix); -+ int32_t c = utext_current32(text); -+ if (fPuncSet.contains(c) || fIgnoreSet.contains(c) || c == ZWSP) { -+ values.setElementAt(0, count); -+ lengths.setElementAt(1, count++); -+ } else if (fBaseSet.contains(c)) { -+ int32_t currix = utext_getNativeIndex(text); -+ do { -+ utext_next32(text); -+ c = utext_current32(text); -+ if (utext_getNativeIndex(text) >= rangeEnd) - break; -- } -- uc = utext_current32(text); -- if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { -- // Maybe. See if it's in the dictionary. 
-- int32_t candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); -- utext_setNativeIndex(text, current+cuWordLength+chars); -- if (candidates > 0) { -+ if (c == 0x17D2) { // Coeng -+ utext_next32(text); -+ c = utext_current32(text); -+ if (!fBaseSet.contains(c) || utext_getNativeIndex(text) >= rangeEnd) { - break; -+ } else { -+ utext_next32(text); -+ c = utext_current32(text); -+ if (utext_getNativeIndex(text) >= rangeEnd) -+ break; - } - } -- } -- -- // Bump the word count if there wasn't already one -- if (cuWordLength <= 0) { -- wordsFound += 1; -- } -+ } while (fMarkSet.contains(c) || fIgnoreSet.contains(c)); -+ values.setElementAt(BADSNLP, count); -+ lengths.setElementAt(utext_getNativeIndex(text) - currix, count++); -+ } else { -+ values.setElementAt(BADSNLP, count); -+ lengths.setElementAt(1, count++); -+ } -+ } - -- // Update the length with the passed-over characters -- cuWordLength += chars; -+ for (int32_t j = 0; j < count; j++) { -+ uint32_t v = values.elementAti(j); -+ int32_t newSnlp = bestSnlp.elementAti(i) + v; -+ int32_t ln = lengths.elementAti(j); -+ utext_setNativeIndex(text, ln+ix); -+ int32_t c = utext_current32(text); -+ while ((fPuncSet.contains(c) || fIgnoreSet.contains(c)) && ln + i < numCodePts) { -+ ++ln; -+ utext_next32(text); -+ c = utext_current32(text); - } -- else { -- // Back up to where we were for next iteration -- utext_setNativeIndex(text, current+cuWordLength); -+ int32_t ln_j_i = ln + i; // yes really i! -+ if (newSnlp < bestSnlp.elementAti(ln_j_i)) { -+ if (v == BADSNLP) { -+ int32_t p = prev.elementAti(i); -+ if (p < 0) -+ prev.setElementAt(p, ln_j_i); -+ else -+ prev.setElementAt(-i, ln_j_i); -+ } -+ else -+ prev.setElementAt(i, ln_j_i); -+ bestSnlp.setElementAt(newSnlp, ln_j_i); - } - } -+ } -+ // Start pushing the optimal offset index into t_boundary (t for tentative). -+ // prev[numCodePts] is guaranteed to be meaningful. 
-+ // We'll first push in the reverse order, i.e., -+ // t_boundary[0] = numCodePts, and afterwards do a swap. -+ UVector32 t_boundary(numCodePts+1, status); - -- // Never stop before a combining mark. -- int32_t currPos; -- while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { -- utext_next32(text); -- cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos; -+ int32_t numBreaks = 0; -+ // No segmentation found, set boundary to end of range -+ while (numCodePts >= 0 && (uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) { -+ --numCodePts; -+ } -+ if (numCodePts < 0) { -+ t_boundary.addElement(numCodePts, status); -+ numBreaks++; -+ } else { -+ for (int32_t i = numCodePts; (uint32_t)i != kuint32max; i = prev.elementAti(i)) { -+ if (i < 0) i = -i; -+ t_boundary.addElement(i, status); -+ numBreaks++; - } -+ U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0); -+ } - -- // Look ahead for possible suffixes if a dictionary word does not follow. -- // We do this in code rather than using a rule so that the heuristic -- // resynch continues to function. For example, one of the suffix characters -- // could be a typo in the middle of a word. 
--// if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { --// if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 --// && fSuffixSet.contains(uc = utext_current32(text))) { --// if (uc == KHMER_PAIYANNOI) { --// if (!fSuffixSet.contains(utext_previous32(text))) { --// // Skip over previous end and PAIYANNOI --// utext_next32(text); --// utext_next32(text); --// wordLength += 1; // Add PAIYANNOI to word --// uc = utext_current32(text); // Fetch next character --// } --// else { --// // Restore prior position --// utext_next32(text); --// } --// } --// if (uc == KHMER_MAIYAMOK) { --// if (utext_previous32(text) != KHMER_MAIYAMOK) { --// // Skip over previous end and MAIYAMOK --// utext_next32(text); --// utext_next32(text); --// wordLength += 1; // Add MAIYAMOK to word --// } --// else { --// // Restore prior position --// utext_next32(text); --// } --// } --// } --// else { --// utext_setNativeIndex(text, current+wordLength); --// } --// } -- -- // Did we find a word on this iteration? If so, push it on the break stack -- if (cuWordLength > 0) { -- foundBreaks.push((current+cuWordLength), status); -+ // Now that we're done, convert positions in t_boundary[] (indices in -+ // the normalized input string) back to indices in the original input UText -+ // while reversing t_boundary and pushing values to foundBreaks. -+ for (int32_t i = numBreaks-1; i >= 0; i--) { -+ int32_t cpPos = t_boundary.elementAti(i); -+ if (cpPos == 0 && !breakStart && fTypes >= UBRK_LINE) continue; -+ int32_t utextPos = cpPos + rangeStart; -+ while (utextPos > after && scanWJ(text, utextPos, scanEnd, before, after)); -+ if (utextPos < before) { -+ // Boundaries are added to foundBreaks output in ascending order. -+ U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos); -+ foundBreaks.push(utextPos, status); - } - } -- -+ - // Don't return a break for the end of the dictionary range if there is one there. 
-- if (foundBreaks.peeki() >= rangeEnd) { -+ if (!breakEnd && fTypes >= UBRK_LINE && foundBreaks.peeki() >= rangeEnd) { - (void) foundBreaks.popi(); -- wordsFound -= 1; - } -- -- return wordsFound; -+ return foundBreaks.size() - wordsFound; - } - - #if !UCONFIG_NO_NORMALIZATION -diff --git a/source/common/dictbe.h b/source/common/dictbe.h -index d3488cd..26caa75 100644 ---- misc/icu/source/common/dictbe.h -+++ build/icu/source/common/dictbe.h -@@ -32,6 +32,15 @@ class Normalizer2; - */ - class DictionaryBreakEngine : public LanguageBreakEngine { - private: -+ -+ /** -+ *

Default constructor.

-+ * -+ */ -+ DictionaryBreakEngine(); -+ -+ protected: -+ - /** - * The set of characters handled by this engine - * @internal -@@ -46,11 +55,63 @@ class DictionaryBreakEngine : public LanguageBreakEngine { - - uint32_t fTypes; - -+ const int32_t WJ = 0x2060; -+ const int32_t ZWSP = 0x200B; -+ - /** -- *

Default constructor.

-- * -+ * A Unicode set of all viramas -+ * @internal - */ -- DictionaryBreakEngine(); -+ UnicodeSet fViramaSet; -+ -+ /** -+ * A Unicode set of all base characters -+ * @internal -+ */ -+ UnicodeSet fBaseSet; -+ -+ /** -+ * A Unicode set of all marks -+ * @internal -+ */ -+ UnicodeSet fMarkSet; -+ -+ /** -+ * A Unicode set of all characters ignored ignored in dictionary matching -+ * @internal -+ */ -+ UnicodeSet fIgnoreSet; -+ -+ /** -+ * A Unicode set of all characters ignored ignored in dictionary matching -+ * @internal -+ */ -+ UnicodeSet fSkipStartSet; -+ -+ /** -+ * A Unicode set of all characters ignored ignored in dictionary matching -+ * @internal -+ */ -+ UnicodeSet fSkipEndSet; -+ -+ /** -+ * A Unicode set of all characters that should not be broken before -+ * @internal -+ */ -+ UnicodeSet fNBeforeSet; -+ -+ /** -+ * The number of clusters within which breaks are inhibited -+ * @internal -+ */ -+ int32_t clusterLimit; -+ -+ bool scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const; -+ -+ bool scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const; -+ bool scanAfterEnd(UText *text, int32_t rangeEnd, int32_t& end, bool &doBreak) const; -+ void scanBackClusters(UText *text, int32_t textStart, int32_t& start) const; -+ void scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const; - - public: - -@@ -81,7 +142,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine { - *

Find any breaks within a run in the supplied text.

- * - * @param text A UText representing the text. The iterator is left at -- * the end of the run of characters which the engine is capable of handling -+ * the end of the run of characters which the engine is capable of handling - * that starts from the first (or last) character in the range. - * @param startPos The start of the run within the supplied text. - * @param endPos The end of the run within the supplied text. -@@ -243,118 +304,120 @@ class LaoBreakEngine : public DictionaryBreakEngine { - - }; - --/******************************************************************* -- * BurmeseBreakEngine -- */ -- --/** -- *

BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a -- * DictionaryMatcher and heuristics to determine Burmese-specific breaks.

-- * -- *

After it is constructed a BurmeseBreakEngine may be shared between -- * threads without synchronization.

-- */ --class BurmeseBreakEngine : public DictionaryBreakEngine { -- private: -- /** -- * The set of characters handled by this engine -- * @internal -- */ -- -- UnicodeSet fBurmeseWordSet; -- UnicodeSet fEndWordSet; -- UnicodeSet fBeginWordSet; -- UnicodeSet fMarkSet; -- DictionaryMatcher *fDictionary; -- -- public: -- -- /** -- *

Default constructor.

-- * -- * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the -- * engine is deleted. -- */ -- BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); -- -- /** -- *

Virtual destructor.

-- */ -- virtual ~BurmeseBreakEngine(); -- -- protected: -- /** -- *

Divide up a range of known dictionary characters.

-- * -- * @param text A UText representing the text -- * @param rangeStart The start of the range of dictionary characters -- * @param rangeEnd The end of the range of dictionary characters -- * @param foundBreaks Output of C array of int32_t break positions, or 0 -- * @return The number of breaks found -- */ -- virtual int32_t divideUpDictionaryRange( UText *text, -- int32_t rangeStart, -- int32_t rangeEnd, -- UStack &foundBreaks ) const; -- --}; -- --/******************************************************************* -- * KhmerBreakEngine -- */ -- --/** -- *

KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a -- * DictionaryMatcher and heuristics to determine Khmer-specific breaks.

-- * -- *

After it is constructed a KhmerBreakEngine may be shared between -- * threads without synchronization.

-- */ --class KhmerBreakEngine : public DictionaryBreakEngine { -- private: -- /** -- * The set of characters handled by this engine -- * @internal -- */ -- -- UnicodeSet fKhmerWordSet; -- UnicodeSet fEndWordSet; -- UnicodeSet fBeginWordSet; -- UnicodeSet fMarkSet; -- DictionaryMatcher *fDictionary; -- -- public: -- -- /** -- *

Default constructor.

-- * -- * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the -- * engine is deleted. -- */ -- KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); -- -- /** -- *

Virtual destructor.

-- */ -- virtual ~KhmerBreakEngine(); -- -- protected: -- /** -- *

Divide up a range of known dictionary characters.

-- * -- * @param text A UText representing the text -- * @param rangeStart The start of the range of dictionary characters -- * @param rangeEnd The end of the range of dictionary characters -- * @param foundBreaks Output of C array of int32_t break positions, or 0 -- * @return The number of breaks found -- */ -- virtual int32_t divideUpDictionaryRange( UText *text, -- int32_t rangeStart, -- int32_t rangeEnd, -- UStack &foundBreaks ) const; -- --}; -- -+/******************************************************************* -+ * BurmeseBreakEngine -+ */ -+ -+/** -+ *

BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a -+ * DictionaryMatcher and heuristics to determine Burmese-specific breaks.

-+ * -+ *

After it is constructed a BurmeseBreakEngine may be shared between -+ * threads without synchronization.

-+ */ -+class BurmeseBreakEngine : public DictionaryBreakEngine { -+ private: -+ /** -+ * The set of characters handled by this engine -+ * @internal -+ */ -+ -+ UnicodeSet fBurmeseWordSet; -+ UnicodeSet fEndWordSet; -+ UnicodeSet fBeginWordSet; -+ UnicodeSet fMarkSet; -+ DictionaryMatcher *fDictionary; -+ -+ public: -+ -+ /** -+ *

Default constructor.

-+ * -+ * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the -+ * engine is deleted. -+ */ -+ BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); -+ -+ /** -+ *

Virtual destructor.

-+ */ -+ virtual ~BurmeseBreakEngine(); -+ -+ protected: -+ /** -+ *

Divide up a range of known dictionary characters.

-+ * -+ * @param text A UText representing the text -+ * @param rangeStart The start of the range of dictionary characters -+ * @param rangeEnd The end of the range of dictionary characters -+ * @param foundBreaks Output of C array of int32_t break positions, or 0 -+ * @return The number of breaks found -+ */ -+ virtual int32_t divideUpDictionaryRange( UText *text, -+ int32_t rangeStart, -+ int32_t rangeEnd, -+ UStack &foundBreaks ) const; -+ -+}; -+ -+/******************************************************************* -+ * KhmerBreakEngine -+ */ -+ -+/** -+ *

KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a -+ * DictionaryMatcher and heuristics to determine Khmer-specific breaks.

-+ * -+ *

After it is constructed a KhmerBreakEngine may be shared between -+ * threads without synchronization.

-+ */ -+class KhmerBreakEngine : public DictionaryBreakEngine { -+ private: -+ /** -+ * The set of characters handled by this engine -+ * @internal -+ */ -+ -+ UnicodeSet fKhmerWordSet; -+ UnicodeSet fBeginWordSet; -+ UnicodeSet fPuncSet; -+ DictionaryMatcher *fDictionary; -+ -+ const uint32_t BADSNLP = 256 * 20; -+ const uint32_t kuint32max = 0x7FFFFFFF; -+ -+ public: -+ -+ /** -+ *

Default constructor.

-+ * -+ * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the -+ * engine is deleted. -+ */ -+ KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); -+ -+ /** -+ *

Virtual destructor.

-+ */ -+ virtual ~KhmerBreakEngine(); -+ -+ protected: -+ /** -+ *

Divide up a range of known dictionary characters.

-+ * -+ * @param text A UText representing the text -+ * @param rangeStart The start of the range of dictionary characters -+ * @param rangeEnd The end of the range of dictionary characters -+ * @param foundBreaks Output of C array of int32_t break positions, or 0 -+ * @return The number of breaks found -+ */ -+ virtual int32_t divideUpDictionaryRange( UText *text, -+ int32_t rangeStart, -+ int32_t rangeEnd, -+ UStack &foundBreaks ) const; -+ -+}; -+ - #if !UCONFIG_NO_NORMALIZATION - - /******************************************************************* -diff --git a/source/common/dictionarydata.cpp b/source/common/dictionarydata.cpp -index cb594c6..82f2e77 100644 ---- misc/icu/source/common/dictionarydata.cpp -+++ build/icu/source/common/dictionarydata.cpp -@@ -42,7 +42,7 @@ int32_t UCharsDictionaryMatcher::getType() const { - - int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, - int32_t *lengths, int32_t *cpLengths, int32_t *values, -- int32_t *prefix) const { -+ int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const { - - UCharsTrie uct(characters); - int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text); -@@ -53,7 +53,13 @@ int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t - UStringTrieResult result = (codePointsMatched == 0) ? 
uct.first(c) : uct.next(c); - int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex; - codePointsMatched += 1; -+ if (ignoreSet != NULL && ignoreSet->contains(c)) { -+ continue; -+ } - if (USTRINGTRIE_HAS_VALUE(result)) { -+ if (codePointsMatched < minLength) { -+ continue; -+ } - if (wordCount < limit) { - if (values != NULL) { - values[wordCount] = uct.getValue(); -@@ -110,7 +116,7 @@ int32_t BytesDictionaryMatcher::getType() const { - - int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, - int32_t *lengths, int32_t *cpLengths, int32_t *values, -- int32_t *prefix) const { -+ int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const { - BytesTrie bt(characters); - int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text); - int32_t wordCount = 0; -@@ -120,7 +126,13 @@ int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t - UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c)); - int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex; - codePointsMatched += 1; -+ if (ignoreSet != NULL && ignoreSet->contains(c)) { -+ continue; -+ } - if (USTRINGTRIE_HAS_VALUE(result)) { -+ if (codePointsMatched < minLength) { -+ continue; -+ } - if (wordCount < limit) { - if (values != NULL) { - values[wordCount] = bt.getValue(); -diff --git a/source/common/dictionarydata.h b/source/common/dictionarydata.h -index 0216ab0..ee9e571 100644 ---- misc/icu/source/common/dictionarydata.h -+++ build/icu/source/common/dictionarydata.h -@@ -19,6 +19,7 @@ - #include "unicode/utext.h" - #include "unicode/udata.h" - #include "udataswp.h" -+#include "unicode/uniset.h" - #include "unicode/uobject.h" - #include "unicode/ustringtrie.h" - -@@ -90,7 +91,7 @@ public: - */ - virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit, - int32_t *lengths, int32_t *cpLengths, int32_t *values, -- int32_t *prefix) const = 
0; -+ int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const = 0; - - /** @return DictionaryData::TRIE_TYPE_XYZ */ - virtual int32_t getType() const = 0; -@@ -105,7 +106,7 @@ public: - virtual ~UCharsDictionaryMatcher(); - virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit, - int32_t *lengths, int32_t *cpLengths, int32_t *values, -- int32_t *prefix) const; -+ int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const; - virtual int32_t getType() const; - private: - const UChar *characters; -@@ -123,7 +124,7 @@ public: - virtual ~BytesDictionaryMatcher(); - virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit, - int32_t *lengths, int32_t *cpLengths, int32_t *values, -- int32_t *prefix) const; -+ int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const; - virtual int32_t getType() const; - private: - UChar32 transform(UChar32 c) const; -diff --git a/source/data/Makefile.in b/source/data/Makefile.in -index 816c82d..c637d70 100644 ---- misc/icu/source/data/Makefile.in -+++ build/icu/source/data/Makefile.in -@@ -181,7 +181,7 @@ endif - endif - endif - --packagedata: icupkg.inc $(PKGDATA_LIST) build-local -+packagedata: icupkg.inc $(PKGDATA_LIST) build-local $(MAINBUILDDIR)/khmerdict.stamp - ifneq ($(ENABLE_STATIC),) - ifeq ($(PKGDATA_MODE),dll) - $(PKGDATA_INVOKE) $(PKGDATA) -e $(ICUDATA_ENTRY_POINT) -T $(OUTTMPDIR) -p $(ICUDATA_NAME) $(PKGDATA_LIBSTATICNAME) -m static $(PKGDATA_VERSIONING) $(PKGDATA_LIST) -@@ -564,8 +564,14 @@ $(BRKBLDDIR)/burmesedict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES) - $(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1000 -c -i $(BUILDDIR) $(DICTSRCDIR)/burmesedict.txt $(BRKBLDDIR)/burmesedict.dict - - # TODO: figure out why combining characters are here? 
--$(BRKBLDDIR)/khmerdict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES) -- $(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(DICTSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict -+#$(BRKBLDDIR)/khmerdict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES) -+# $(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(DICTSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict -+ -+#$(MAINBUILDDIR)/khmerdict.stamp: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(BRKSRCDIR)/khmerdict.txt build-local -+# $(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(DICTSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict -+$(MAINBUILDDIR)/khmerdict.stamp: $(BRKSRCDIR)/khmerdict.dict build-local -+ cp $< $(BRKBLDDIR) -+ echo "timestamp" > $@ - - #################################################### CFU - # CFU FILES - diff --git a/external/icu/rtti.patch.0 b/external/icu/rtti.patch.0 deleted file mode 100644 index 2af9622ec9ad..000000000000 --- a/external/icu/rtti.patch.0 +++ /dev/null @@ -1,11 +0,0 @@ ---- source/config/mh-linux -+++ source/config/mh-linux -@@ -35,7 +35,7 @@ - #SH# LD_SONAME= - - ## Shared library options --LD_SOOPTIONS= -Wl,-Bsymbolic -+LD_SOOPTIONS= -Wl,-Bsymbolic-functions - - ## Shared object suffix - SO = so diff --git a/external/libcdr/ExternalProject_libcdr.mk b/external/libcdr/ExternalProject_libcdr.mk index 86f93daf1b90..2cd41db242cd 100644 --- a/external/libcdr/ExternalProject_libcdr.mk +++ b/external/libcdr/ExternalProject_libcdr.mk @@ -36,7 +36,7 @@ $(call gb_ExternalProject_get_state_target,libcdr,build) : --disable-werror \ --disable-weffc \ $(if $(verbose),--disable-silent-rules,--enable-silent-rules) \ - CXXFLAGS="$(CXXFLAGS) $(BOOST_CPPFLAGS) \ + CXXFLAGS="$(CXXFLAGS) $(ICU_UCHAR_TYPE) $(BOOST_CPPFLAGS) \ -DBOOST_ERROR_CODE_HEADER_ONLY -DBOOST_SYSTEM_NO_DEPRECATED" \ $(if $(CROSS_COMPILING),--build=$(BUILD_PLATFORM) --host=$(HOST_PLATFORM)) \ && $(MAKE) \ diff 
--git a/external/libebook/ExternalProject_libebook.mk b/external/libebook/ExternalProject_libebook.mk index 45b15f2f5eb7..17065829014b 100644 --- a/external/libebook/ExternalProject_libebook.mk +++ b/external/libebook/ExternalProject_libebook.mk @@ -38,7 +38,7 @@ $(call gb_ExternalProject_get_state_target,libebook,build) : $(if $(verbose),--disable-silent-rules,--enable-silent-rules) \ --disable-werror \ --disable-weffc \ - CXXFLAGS="$(CXXFLAGS) $(BOOST_CPPFLAGS) \ + CXXFLAGS="$(CXXFLAGS) $(ICU_UCHAR_TYPE) $(BOOST_CPPFLAGS) \ -DBOOST_ERROR_CODE_HEADER_ONLY -DBOOST_SYSTEM_NO_DEPRECATED" \ XML_CFLAGS="$(LIBXML_CFLAGS)" \ XML_LIBS="$(LIBXML_LIBS)" \ diff --git a/external/libfreehand/ExternalProject_libfreehand.mk b/external/libfreehand/ExternalProject_libfreehand.mk index 15a17072e872..f1bbacbb06e4 100644 --- a/external/libfreehand/ExternalProject_libfreehand.mk +++ b/external/libfreehand/ExternalProject_libfreehand.mk @@ -25,6 +25,7 @@ $(eval $(call gb_ExternalProject_use_externals,libfreehand,\ $(call gb_ExternalProject_get_state_target,libfreehand,build) : $(call gb_ExternalProject_run,build,\ export PKG_CONFIG="" \ + && export CXXFLAGS="$(CXXFLAGS) $(ICU_UCHAR_TYPE)" \ && MAKE=$(MAKE) ./configure \ --with-pic \ --enable-static \ diff --git a/external/libmspub/ExternalProject_libmspub.mk b/external/libmspub/ExternalProject_libmspub.mk index a168a62c636f..808d74b2f571 100644 --- a/external/libmspub/ExternalProject_libmspub.mk +++ b/external/libmspub/ExternalProject_libmspub.mk @@ -35,7 +35,7 @@ $(call gb_ExternalProject_get_state_target,libmspub,build) : --disable-werror \ --disable-weffc \ $(if $(verbose),--disable-silent-rules,--enable-silent-rules) \ - CXXFLAGS="$(CXXFLAGS) $(BOOST_CPPFLAGS) \ + CXXFLAGS="$(CXXFLAGS) $(ICU_UCHAR_TYPE) $(BOOST_CPPFLAGS) \ -DBOOST_ERROR_CODE_HEADER_ONLY -DBOOST_SYSTEM_NO_DEPRECATED" \ $(if $(CROSS_COMPILING),--build=$(BUILD_PLATFORM) --host=$(HOST_PLATFORM)) \ && $(MAKE) \ diff --git a/external/libvisio/ExternalProject_libvisio.mk 
b/external/libvisio/ExternalProject_libvisio.mk index 884fde820d00..2f212ceac249 100644 --- a/external/libvisio/ExternalProject_libvisio.mk +++ b/external/libvisio/ExternalProject_libvisio.mk @@ -35,7 +35,7 @@ $(call gb_ExternalProject_get_state_target,libvisio,build) : $(if $(ENABLE_DEBUG),--enable-debug,--disable-debug) \ --disable-werror \ $(if $(verbose),--disable-silent-rules,--enable-silent-rules) \ - CXXFLAGS="$(CXXFLAGS) $(BOOST_CPPFLAGS)" \ + CXXFLAGS="$(CXXFLAGS) $(ICU_UCHAR_TYPE) $(BOOST_CPPFLAGS)" \ $(if $(CROSS_COMPILING),--build=$(BUILD_PLATFORM) --host=$(HOST_PLATFORM)) \ && $(MAKE) \ ) -- cgit v1.2.3