From cf0967951a25a2daa10a636092193af5c5497aa2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Caol=C3=A1n=20McNamara?= Date: Fri, 10 Feb 2017 16:36:27 +0000 Subject: [PATCH 3/4] hoist string lowering from ngram to ngsuggest only lower when we have to and reuse scratch buffers as tolower destination kcachegrind reports 830,529,143 -> 779,887,690 on echo Hollo | valgrind --tool=callgrind ./src/tools/.libs/hunspell -d nl_NL --- src/hunspell/suggestmgr.cxx | 143 +++++++++++++++++++++++++++++--------------- 1 file changed, 95 insertions(+), 48 deletions(-) diff --git a/src/hunspell/suggestmgr.cxx b/src/hunspell/suggestmgr.cxx index 54a474f..ea52707 100644 --- a/src/hunspell/suggestmgr.cxx +++ b/src/hunspell/suggestmgr.cxx @@ -1075,10 +1075,8 @@ void SuggestMgr::ngsuggest(std::vector& wlst, u8_u16(w_target, target); } - std::vector w_entry; std::string f; std::vector w_f; - std::vector w_target2; for (size_t i = 0; i < rHMgr.size(); ++i) { while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) { @@ -1091,13 +1089,24 @@ void SuggestMgr::ngsuggest(std::vector& wlst, continue; if (utf8) { - w_entry.clear(); - u8_u16(w_entry, HENTRY_WORD(hp)); - sc = ngram(3, w_word, w_entry, NGRAM_LONGER_WORSE + low) + - leftcommonsubstring(w_word, w_entry); + w_f.clear(); + u8_u16(w_f, HENTRY_WORD(hp)); + + int leftcommon = leftcommonsubstring(w_word, w_f); + if (low) { + // lowering dictionary word + mkallsmall_utf(w_f, langnum); + } + sc = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE) + leftcommon; } else { - sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) + - leftcommonsubstring(word, HENTRY_WORD(hp)); + f.assign(HENTRY_WORD(hp)); + + int leftcommon = leftcommonsubstring(word, f.c_str()); + if (low) { + // lowering dictionary word + mkallsmall(f, csconv); + } + sc = ngram(3, word, f, NGRAM_LONGER_WORSE) + leftcommon; } // check special pronounciation @@ -1108,11 +1117,20 @@ void SuggestMgr::ngsuggest(std::vector& wlst, if (utf8) { w_f.clear(); u8_u16(w_f, f); - sc2 = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE + low) + - leftcommonsubstring(w_word, w_f); + + int leftcommon = leftcommonsubstring(w_word, w_f); + if (low) { + // lowering dictionary word + mkallsmall_utf(w_f, langnum); + } + sc2 = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE) + leftcommon; } else { - sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) + - leftcommonsubstring(word, f.c_str()); + int leftcommon = leftcommonsubstring(word, f.c_str()); + if (low) { + // lowering dictionary word + mkallsmall(f, csconv); + } + sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE) + leftcommon; } if (sc2 > sc) sc = sc2; @@ -1129,14 +1147,14 @@ void SuggestMgr::ngsuggest(std::vector& wlst, candidate = HENTRY_WORD(hp); mkallcap(candidate, csconv); } - std::string target2 = phonet(candidate, *ph); - w_target2.clear(); + f = phonet(candidate, *ph); + w_f.clear(); if (utf8) { - u8_u16(w_target2, target2); - scphon = 2 * ngram(3, w_target, w_target2, + u8_u16(w_f, f); + scphon = 2 * ngram(3, w_target, w_f, NGRAM_LONGER_WORSE); } else { - scphon = 2 * ngram(3, target, target2, + scphon = 2 * ngram(3, target, f, NGRAM_LONGER_WORSE); } } @@ -1177,12 +1195,24 @@ void SuggestMgr::ngsuggest(std::vector& wlst, w_mw[k].l = '*'; w_mw[k].h = 0; } - thresh += ngram(n, w_word, w_mw, NGRAM_ANY_MISMATCH + low); + + if (low) { + // lowering dictionary word + mkallsmall_utf(w_mw, langnum); + } + + thresh += ngram(n, w_word, w_mw, NGRAM_ANY_MISMATCH); } else { std::string mw = word; for (int k = sp; k < n; k += 4) mw[k] = '*'; - thresh += ngram(n, word, mw, NGRAM_ANY_MISMATCH + low); + + if (low) { + // lowering dictionary word + mkallsmall(mw, csconv); + } + + thresh += ngram(n, word, mw, NGRAM_ANY_MISMATCH); } } thresh = thresh / 3; @@ -1210,7 +1240,6 @@ void SuggestMgr::ngsuggest(std::vector& wlst, return; } - std::vector w_glst_word; for (int i = 0; i < MAX_ROOTS; i++) { if (roots[i]) { struct hentry* rp = roots[i]; @@ -1225,15 +1254,26 @@ void SuggestMgr::ngsuggest(std::vector& wlst, for (int k = 0; k < nw; k++) { if (utf8) { - w_glst_word.clear(); - u8_u16(w_glst_word, glst[k].word); - sc = ngram(n, w_word, w_glst_word, - NGRAM_ANY_MISMATCH + low) + - leftcommonsubstring(w_word, w_glst_word); + w_f.clear(); + u8_u16(w_f, glst[k].word); + + int leftcommon = leftcommonsubstring(w_word, w_f); + if (low) { + // lowering dictionary word + mkallsmall_utf(w_f, langnum); + } + + sc = ngram(n, w_word, w_f, NGRAM_ANY_MISMATCH) + leftcommon; } else { - sc = ngram(n, word, glst[k].word, - NGRAM_ANY_MISMATCH + low) + - leftcommonsubstring(word, glst[k].word); + f = glst[k].word; + + int leftcommon = leftcommonsubstring(word, f.c_str()); + if (low) { + // lowering dictionary word + mkallsmall(f, csconv); + } + + sc = ngram(n, word, f, NGRAM_ANY_MISMATCH) + leftcommon; } if (sc > thresh) { @@ -1318,19 +1358,37 @@ void SuggestMgr::ngsuggest(std::vector& wlst, w_gl.clear(); if (utf8) { u8_u16(w_gl, gl); - re = ngram(2, w_word, w_gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) + - ngram(2, w_gl, w_word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED); + //w_gl is lowercase already at this point + re = ngram(2, w_word, w_gl, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED); + if (low) { + w_f = w_word; + // lowering dictionary word + mkallsmall_utf(w_f, langnum); + re += ngram(2, w_gl, w_f, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED); + } else { + re += ngram(2, w_gl, w_word, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED); + } } else { - re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) + - ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED); + //gl is lowercase already at this point + re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED); + if (low) { + f = word; + // lowering dictionary word + mkallsmall(f, csconv); + re += ngram(2, gl, f, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED); + } else { + re += ngram(2, gl, word, NGRAM_ANY_MISMATCH + NGRAM_WEIGHTED); + } } int ngram_score, leftcommon_score; if (utf8) { - ngram_score = ngram(4, w_word, w_gl, NGRAM_ANY_MISMATCH + low); + //w_gl is lowercase already at this point + ngram_score = ngram(4, w_word, w_gl, NGRAM_ANY_MISMATCH); leftcommon_score = leftcommonsubstring(w_word, w_gl); } else { - ngram_score = ngram(4, word, gl, NGRAM_ANY_MISMATCH + low); + //gl is lowercase already at this point + ngram_score = ngram(4, word, gl, NGRAM_ANY_MISMATCH); leftcommon_score = leftcommonsubstring(word, gl.c_str()); } gscore[i] = @@ -1802,14 +1860,6 @@ int SuggestMgr::ngram(int n, l2 = su2.size(); if (l2 == 0) return 0; - // lowering dictionary word - const std::vector* p_su2 = &su2; - std::vector su2_copy; - if (opt & NGRAM_LOWERING) { - su2_copy = su2; - mkallsmall_utf(su2_copy, langnum); - p_su2 = &su2_copy; - } for (int j = 1; j <= n; j++) { ns = 0; for (int i = 0; i <= (l1 - j); i++) { @@ -1817,7 +1867,7 @@ int SuggestMgr::ngram(int n, for (int l = 0; l <= (l2 - j); l++) { for (k = 0; k < j; k++) { const w_char& c1 = su1[i + k]; - const w_char& c2 = (*p_su2)[l + k]; + const w_char& c2 = su2[l + k]; if ((c1.l != c2.l) || (c1.h != c2.h)) break; } @@ -1862,14 +1912,11 @@ int SuggestMgr::ngram(int n, if (l2 == 0) return 0; l1 = s1.size(); - std::string t(s2); - if (opt & NGRAM_LOWERING) - mkallsmall(t, csconv); for (int j = 1; j <= n; j++) { ns = 0; for (int i = 0; i <= (l1 - j); i++) { - //t is haystack, s1[i..i+j) is needle - if (t.find(s1.c_str()+i, 0, j) != std::string::npos) { + //s2 is haystack, s1[i..i+j) is needle + if (s2.find(s1.c_str()+i, 0, j) != std::string::npos) { ns++; } else if (opt & NGRAM_WEIGHTED) { ns--; -- 2.9.3