summary | refs | log | tree | commit | diff
path: root/external/hunspell/0003-hoist-string-lowering-from-ngram-to-ngsuggest.patch
diff options
context:
space:
mode:
Diffstat (limited to 'external/hunspell/0003-hoist-string-lowering-from-ngram-to-ngsuggest.patch')
-rw-r--r-- external/hunspell/0003-hoist-string-lowering-from-ngram-to-ngsuggest.patch 264
1 files changed, 264 insertions, 0 deletions
diff --git a/external/hunspell/0003-hoist-string-lowering-from-ngram-to-ngsuggest.patch b/external/hunspell/0003-hoist-string-lowering-from-ngram-to-ngsuggest.patch
new file mode 100644
index 000000000000..ff2530cfe23d
--- /dev/null
+++ b/external/hunspell/0003-hoist-string-lowering-from-ngram-to-ngsuggest.patch
@@ -0,0 +1,264 @@
+From cf0967951a25a2daa10a636092193af5c5497aa2 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Caol=C3=A1n=20McNamara?= <caolanm@redhat.com>
+Date: Fri, 10 Feb 2017 16:36:27 +0000
+Subject: [PATCH 3/4] hoist string lowering from ngram to ngsuggest
+
+Only lowercase a string when we have to, and reuse the existing scratch
+buffers as the destination of the lowercasing.
+
+kcachegrind reports 830,529,143 -> 779,887,690 (before -> after) when running
+
+echo Hollo | valgrind --tool=callgrind ./src/tools/.libs/hunspell -d nl_NL
+---
+ src/hunspell/suggestmgr.cxx | 143 +++++++++++++++++++++++++++++---------------
+ 1 file changed, 95 insertions(+), 48 deletions(-)
+
+diff --git a/src/hunspell/suggestmgr.cxx b/src/hunspell/suggestmgr.cxx
+index 54a474f..ea52707 100644
+--- a/src/hunspell/suggestmgr.cxx
++++ b/src/hunspell/suggestmgr.cxx
+@@ -1075,10 +1075,8 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
+ u8_u16(w_target, target);
+ }
+
+- std::vector<w_char> w_entry;
+ std::string f;
+ std::vector<w_char> w_f;
+- std::vector<w_char> w_target2;
+
+ for (size_t i = 0; i < rHMgr.size(); ++i) {
+ while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) {
+@@ -1091,13 +1089,24 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
+ continue;
+
+ if (utf8) {
+- w_entry.clear();
+- u8_u16(w_entry, HENTRY_WORD(hp));
+- sc = ngram(3, w_word, w_entry, NGRAM_LONGER_WORSE + low) +
+- leftcommonsubstring(w_word, w_entry);
++ w_f.clear();
++ u8_u16(w_f, HENTRY_WORD(hp));
++
++ int leftcommon = leftcommonsubstring(w_word, w_f);
++ if (low) {
++ // lowering dictionary word
++ mkallsmall_utf(w_f, langnum);
++ }
++ sc = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE) + leftcommon;
+ } else {
+- sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) +
+- leftcommonsubstring(word, HENTRY_WORD(hp));
++ f.assign(HENTRY_WORD(hp));
++
++ int leftcommon = leftcommonsubstring(word, f.c_str());
++ if (low) {
++ // lowering dictionary word
++ mkallsmall(f, csconv);
++ }
++ sc = ngram(3, word, f, NGRAM_LONGER_WORSE) + leftcommon;
+ }
+
+ // check special pronounciation
+@@ -1108,11 +1117,20 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
+ if (utf8) {
+ w_f.clear();
+ u8_u16(w_f, f);
+- sc2 = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE + low) +
+- leftcommonsubstring(w_word, w_f);
++
++ int leftcommon = leftcommonsubstring(w_word, w_f);
++ if (low) {
++ // lowering dictionary word
++ mkallsmall_utf(w_f, langnum);
++ }
++ sc2 = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE) + leftcommon;
+ } else {
+- sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) +
+- leftcommonsubstring(word, f.c_str());
++ int leftcommon = leftcommonsubstring(word, f.c_str());
++ if (low) {
++ // lowering dictionary word
++ mkallsmall(f, csconv);
++ }
++ sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE) + leftcommon;
+ }
+ if (sc2 > sc)
+ sc = sc2;
+@@ -1129,14 +1147,14 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
+ candidate = HENTRY_WORD(hp);
+ mkallcap(candidate, csconv);
+ }
+- std::string target2 = phonet(candidate, *ph);
+- w_target2.clear();
++ f = phonet(candidate, *ph);
++ w_f.clear();
+ if (utf8) {
+- u8_u16(w_target2, target2);
+- scphon = 2 * ngram(3, w_target, w_target2,
++ u8_u16(w_f, f);
++ scphon = 2 * ngram(3, w_target, w_f,
+ NGRAM_LONGER_WORSE);
+ } else {
+- scphon = 2 * ngram(3, target, target2,
++ scphon = 2 * ngram(3, target, f,
+ NGRAM_LONGER_WORSE);
+ }
+ }
+@@ -1177,12 +1195,24 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
+ w_mw[k].l = '*';
+ w_mw[k].h = 0;
+ }
+- thresh += ngram(n, w_word, w_mw, NGRAM_ANY_MISMATCH + low);
++
++ if (low) {
++ // lowering dictionary word
++ mkallsmall_utf(w_mw, langnum);
++ }
++
++ thresh += ngram(n, w_word, w_mw, NGRAM_ANY_MISMATCH);
+ } else {
+ std::string mw = word;
+ for (int k = sp; k < n; k += 4)
+ mw[k] = '*';
+- thresh += ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
++
++ if (low) {
++ // lowering dictionary word
++ mkallsmall(mw, csconv);
++ }
++
++ thresh += ngram(n, word, mw, NGRAM_ANY_MISMATCH);
+ }
+ }
+ thresh = thresh / 3;
+@@ -1210,7 +1240,6 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
+ return;
+ }
+
+- std::vector<w_char> w_glst_word;
+ for (int i = 0; i < MAX_ROOTS; i++) {
+ if (roots[i]) {
+ struct hentry* rp = roots[i];
+@@ -1225,15 +1254,26 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
+
+ for (int k = 0; k < nw; k++) {
+ if (utf8) {
+- w_glst_word.clear();
+- u8_u16(w_glst_word, glst[k].word);
+- sc = ngram(n, w_word, w_glst_word,
+- NGRAM_ANY_MISMATCH + low) +
+- leftcommonsubstring(w_word, w_glst_word);
++ w_f.clear();
++ u8_u16(w_f, glst[k].word);
++
++ int leftcommon = leftcommonsubstring(w_word, w_f);
++ if (low) {
++ // lowering dictionary word
++ mkallsmall_utf(w_f, langnum);
++ }
++
++ sc = ngram(n, w_word, w_f, NGRAM_ANY_MISMATCH) + leftcommon;
+ } else {
+- sc = ngram(n, word, glst[k].word,
+- NGRAM_ANY_MISMATCH + low) +
+- leftcommonsubstring(word, glst[k].word);
++ f = glst[k].word;
++
++ int leftcommon = leftcommonsubstring(word, f.c_str());
++ if (low) {
++ // lowering dictionary word
++ mkallsmall(f, csconv);
++ }
++
++ sc = ngram(n, word, f, NGRAM_ANY_MISMATCH) + leftcommon;
+ }
+
+ if (sc > thresh) {
+@@ -1318,19 +1358,37 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
+ w_gl.clear();
+ if (utf8) {
+ u8_u16(w_gl, gl);
+- re = ngram(2, w_word, w_gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +
+- ngram(2, w_gl, w_word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);
++ //w_gl is lowercase already at this point
++ re = ngram(2, w_word, w_gl, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
++ if (low) {
++ w_f = w_word;
++ // lowering dictionary word
++ mkallsmall_utf(w_f, langnum);
++ re += ngram(2, w_gl, w_f, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
++ } else {
++ re += ngram(2, w_gl, w_word, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
++ }
+ } else {
+- re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +
+- ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);
++ //gl is lowercase already at this point
++ re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
++ if (low) {
++ f = word;
++ // lowering dictionary word
++ mkallsmall(f, csconv);
++ re += ngram(2, gl, f, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
++ } else {
++ re += ngram(2, gl, word, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED);
++ }
+ }
+
+ int ngram_score, leftcommon_score;
+ if (utf8) {
+- ngram_score = ngram(4, w_word, w_gl, NGRAM_ANY_MISMATCH + low);
++ //w_gl is lowercase already at this point
++ ngram_score = ngram(4, w_word, w_gl, NGRAM_ANY_MISMATCH);
+ leftcommon_score = leftcommonsubstring(w_word, w_gl);
+ } else {
+- ngram_score = ngram(4, word, gl, NGRAM_ANY_MISMATCH + low);
++ //gl is lowercase already at this point
++ ngram_score = ngram(4, word, gl, NGRAM_ANY_MISMATCH);
+ leftcommon_score = leftcommonsubstring(word, gl.c_str());
+ }
+ gscore[i] =
+@@ -1802,14 +1860,6 @@ int SuggestMgr::ngram(int n,
+ l2 = su2.size();
+ if (l2 == 0)
+ return 0;
+- // lowering dictionary word
+- const std::vector<w_char>* p_su2 = &su2;
+- std::vector<w_char> su2_copy;
+- if (opt & NGRAM_LOWERING) {
+- su2_copy = su2;
+- mkallsmall_utf(su2_copy, langnum);
+- p_su2 = &su2_copy;
+- }
+ for (int j = 1; j <= n; j++) {
+ ns = 0;
+ for (int i = 0; i <= (l1 - j); i++) {
+@@ -1817,7 +1867,7 @@ int SuggestMgr::ngram(int n,
+ for (int l = 0; l <= (l2 - j); l++) {
+ for (k = 0; k < j; k++) {
+ const w_char& c1 = su1[i + k];
+- const w_char& c2 = (*p_su2)[l + k];
++ const w_char& c2 = su2[l + k];
+ if ((c1.l != c2.l) || (c1.h != c2.h))
+ break;
+ }
+@@ -1862,14 +1912,11 @@ int SuggestMgr::ngram(int n,
+ if (l2 == 0)
+ return 0;
+ l1 = s1.size();
+- std::string t(s2);
+- if (opt & NGRAM_LOWERING)
+- mkallsmall(t, csconv);
+ for (int j = 1; j <= n; j++) {
+ ns = 0;
+ for (int i = 0; i <= (l1 - j); i++) {
+- //t is haystack, s1[i..i+j) is needle
+- if (t.find(s1.c_str()+i, 0, j) != std::string::npos) {
++ //s2 is haystack, s1[i..i+j) is needle
++ if (s2.find(s1.c_str()+i, 0, j) != std::string::npos) {
+ ns++;
+ } else if (opt & NGRAM_WEIGHTED) {
+ ns--;
+--
+2.9.3
+