summaryrefslogtreecommitdiff
path: root/external/hunspell/0001-Recent-Hunspell-fixes-and-improvements.patch
diff options
context:
space:
mode:
authorLászló Németh <nemeth@numbertext.org>2018-11-07 13:18:36 +0100
committerLászló Németh <nemeth@numbertext.org>2018-11-12 23:30:20 +0100
commita20a2d7e0d28658f2d9089da076961a599833a28 (patch)
tree2aeccf5d2c75ed63f5b0ece0624701713229f83e /external/hunspell/0001-Recent-Hunspell-fixes-and-improvements.patch
parentc4ebcb211c715ce31dfede0ca96f2eb592efadc5 (diff)
bump hunspell to 1.7libreoffice-6-2-branch-point
Change-Id: Ia8d1f4831e651b3a8d5115f78e5a5239b56c71c4 Reviewed-on: https://gerrit.libreoffice.org/63015 Tested-by: Jenkins Reviewed-by: László Németh <nemeth@numbertext.org>
Diffstat (limited to 'external/hunspell/0001-Recent-Hunspell-fixes-and-improvements.patch')
-rw-r--r--external/hunspell/0001-Recent-Hunspell-fixes-and-improvements.patch1605
1 files changed, 0 insertions, 1605 deletions
diff --git a/external/hunspell/0001-Recent-Hunspell-fixes-and-improvements.patch b/external/hunspell/0001-Recent-Hunspell-fixes-and-improvements.patch
deleted file mode 100644
index eb48c283b38c..000000000000
--- a/external/hunspell/0001-Recent-Hunspell-fixes-and-improvements.patch
+++ /dev/null
@@ -1,1605 +0,0 @@
-From 9ad1696fb13d65e5d569b7106749dd4014877c15 Mon Sep 17 00:00:00 2001
-From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= <nemeth@numbertext.org>
-Date: Wed, 13 Dec 2017 19:27:30 +0100
-Subject: [PATCH] Recent Hunspell fixes and improvements
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Containing the following up-stream patches:
-
-commit 7ba5beb517310a942bafd7d6d08fc92beae0e439
-Author: László Németh <nemeth@numbertext.org>
-Date: Wed Dec 13 19:01:35 2017 +0100
-
- Support dictionary based REP replacements
-
- using the following syntax in the dic file:
-
- word ph:pattern->replacement
-
-commit 711466a276d5d9f3a5f6e9089bb3262894196fbc
-Author: László Németh <nemeth@numbertext.org>
-Date: Tue Dec 12 15:09:36 2017 +0100
-
- fix compiler warnings
-
-commit db142a3addc87bbbdd9a76bc519c69e8ad95af73
-Author: László Németh <nemeth@numbertext.org>
-Date: Fri Dec 1 17:24:17 2017 +0100
-
- Fix regression in Hungarian "moving rule"
-
- from commit eb97eb789cec47a172f6e9a01db3e6cf3b8dc81d.
-
- Dictionary words with COMPOUNDFORBIDFLAG are removed
- from the beginning and middle of compound words,
- overriding the effect of COMPOUNDPERMITFLAG,
- except in Hungarian "moving rule".
-
- Add test example.
-
-commit 05082b4e8a917cfbddefbc5fd2d543895b27f4c1
-Author: László Németh <nemeth@numbertext.org>
-Date: Fri Dec 1 16:11:20 2017 +0100
-
- BREAK: keep also break-at-first-break-point breaking
-
- to handle the case of suffixes with dashes in compounds.
-
- Add also test example.
-
-commit caa24d60f1a4514d4e0ef48fa14105e85eb6514c
-Author: László Németh <nemeth@numbertext.org>
-Date: Fri Dec 1 11:16:35 2017 +0100
-
- Improve ph: usage for capitalization and Unicode
-
- - at capitalized dictionary words, add lowercase ph: patterns
- to the REP rules in a capitalized form, too, to get correct
- suggestions for lowercase and capitalized misspellings:
-
- Wednesday ph:wendsay (in dic file) results
-
- both wendsay and Wendsay -> Wednesday suggestions.
-
- For German and Hungarian:
-
- add also lowercase pattern -> lowercase dictionary word
- replacement to the REP rules, supporting lowercasing
- by compound word generation or derivational suffixes.
-
- - fix UTF-8 support of starred ph: fields
-
- - test examples
-
-commit 8912f2ade54cdc186fe0580471063d92d99eb572
-Author: László Németh <nemeth@numbertext.org>
-Date: Fri Dec 1 10:26:07 2017 +0100
-
- Allow suggestion search for prefix + *two suffixes*
-
- Remove artificial performance limit to get correct
- suggestions for relatively simple misspellings in
- Hungarian, etc., when the word form contains prefix
- and both derivative and inflectional suffixes, too:
-
- lefikszálása -> lefixálása
-
-commit eb97eb789cec47a172f6e9a01db3e6cf3b8dc81d
-Author: László Németh <nemeth@numbertext.org>
-Date: Fri Dec 1 08:03:38 2017 +0100
-
- Dictionary words with COMPOUNDFORBIDFLAG are removed
-
- from the beginning and middle of compound words,
- overriding the effect of COMPOUNDPERMITFLAG.
-
-commit 526f600e194aacbc3817df26f01d8c95c38bf582
-Author: László Németh <nemeth@numbertext.org>
-Date: Wed Nov 29 14:58:46 2017 +0100
-
- skip empty ph: field and support character stripping
-
- at replacement rule creation.
-
- When the ph: field ends with the character *,
- strip the last character of the replacement (the correct word)
- and the last two characters of the field (the * and the last
- character of the matching pattern) in the replacement rule.
-
- For example,
-
- pretty ph:prity*
-
- dictionary item results in a "prit -> prett" REP replacement
- rule instead of "prity -> pretty", to get
- "prity -> pretty" and "pritiest -> prettiest" suggestions.
-
-commit ebdd308463a0e8432f56f12804976ba7029a95c4
-Author: László Németh <nemeth@numbertext.org>
-Date: Wed Nov 29 13:13:21 2017 +0100
-
- clean-up suggestion
-
- - no ngram and compound word suggestions, if "good" suggestion
- exists, ie. uppercase, REP, ph: or dictionary word pair suggestions
-
- - word pairs are always suggested, if they exist in the dic file
-
- - word pairs have top priority in suggestions, and
- these are the only suggestions if there is no other good suggestion.
-
- - dictionary word pairs separated by a dash instead of a space
- are also handled specially in two-word suggestions (depending on the
- language)
-
-commit 066704985ae474999345f309c71b4929eff1ca95
-Author: László Németh <nemeth@numbertext.org>
-Date: Tue Nov 28 12:55:35 2017 +0100
-
- check dictionary word pairs to filter compound word overgeneration
-
- Now it's possible to filter bad compound words by listing
- the correct word pairs with space in the dictionary.
-
-commit bbf2eb4ad0c589c38d03321c8b126826d2284a3f
-Author: László Németh <nemeth@numbertext.org>
-Date: Tue Nov 28 11:25:01 2017 +0100
-
- word pairs of the dic file get highest suggestion priority
-
- when the words are written without space.
-
- Instead of using REP replacements, now it's enough to add
-
- a lot
-
- to the English dic file (like in a traditional spelling
- dictionary) to get suggestions for "alot" in the requested
- order:
-
- alot
- & alot 7 0: a lot, alto, slot, alt, lot...
-
- (without using word pairs or the REP replacements, the order was
-
- alot
- & alot 7 0: alto, slot, alt, lot, a lot...)
-
-commit 90cb55f8f1a21c7f62539baf8f3cf6f062080afd
-Author: László Németh <nemeth@numbertext.org>
-Date: Tue Nov 28 09:57:23 2017 +0100
-
- Clean-up ngram suggestions for lowercase words
-
- don't suggest capitalized dictionary words for lower
- case misspellings in ngram suggestions, except
- - PHONE usage, or
- - in the case of German, where not only proper
- nouns are capitalized, or
- - the capitalized word has special pronunciation
-
- - fix typos and comments
-
-commit e80685c83d591b834c61866295577a9e214969cb
-Author: László Németh <nemeth@numbertext.org>
-Date: Mon Nov 27 18:26:42 2017 +0100
-
- Remove SUBSTANDARD dictionary roots from suggestions.
-
-commit 89a8ec6ce47ac4442992f4f6ed606012b1a2b799
-Author: László Németh <nemeth@numbertext.org>
-Date: Mon Nov 27 08:52:24 2017 +0100
-
- Optimize condition order in walk_hashtable loop
-
-commit 4e4106fc64bc26df10f8dc24e0e578abb70025c7
-Author: László Németh <nemeth@numbertext.org>
-Date: Sat Nov 25 01:37:52 2017 +0100
-
- Reduce strange ngram suggestions
-
- - don't suggest proper names for lowercase
- misspellings, except in German
-
- - length difference of misspellings and
- suggestions must be less than 5 characters
-
- Other: search capitalized suggestions for lowercase misspellings
- without ngram suggestions, too.
-
-commit 0b8a4d8851c94485dcc13cf8b8688c8d3fb9a783
-Author: László Németh <nemeth@numbertext.org>
-Date: Fri Nov 24 20:01:09 2017 +0100
-
- Use only middle replentries for compound word checking
-
- allowing compound filtering for compound stems and affixed
- forms in every language.
-
- This replaces the partial fix for the CHECKCOMPOUNDREP regression
- in commit 1fada01663b29b57c010a9c274e45a5cf9ecf222.
-
-commit 957950b792fb0fda8fa95983434be265729bb75b
-Author: László Németh <nemeth@numbertext.org>
-Date: Fri Nov 24 10:56:13 2017 +0100
-
- Spelling dictionary should be a real spelling dictionary
-
- Listing common misspellings of words and *word sequences*
- is the new recommended method to fix missing, incomplete or
- verbose suggestions. Combined with CHECKCOMPOUNDREP,
- this method can limit overgeneration of compound words
- in important cases, too.
-
- For example, the following line in the dic file
-
- a lot ph:alot
-
- will put the best suggestion ("a lot") for the misspelled "alot"
- in first place in the suggestion list.
-
- Use for:
-
- - give correct suggestions (wendsay or wensday -> Wednesday)
-
- Wednesday ph:wendsay ph:wensday
-
- - set priority of good suggestions (eg. wich -> which, witch, winch)
-
- which ph:wich
- witch ph:wich
-
- - suggest with one or *more* spaces (eg. inspite->in spite)
-
- in spite ph:inspite
- Oh, my gosh! ph:omg
-
- - switch off ngram suggestions for a common misspelling
-
- - better suggestion during affixation and compounding
-
- With CHECKCOMPOUNDREP
-
- - forbid bad compound words
-
- Implementation details:
-
- REP reptable is created from REP definitions of the aff file and from
- "ph:" fields of the dic file (reptable contains phonetic and other
- common misspellings of letters, letter groups, morphemes and words
- for better suggestions). REP suggestions have greater priority in
- the suggestion list, and they switch off ngram suggestion
- search, avoiding overgeneration of suggestions.
-
-commit 4a8921bd65b39e24344ef38c396e797384b74677
-Author: László Németh <nemeth@numbertext.org>
-Date: Wed Nov 22 23:27:00 2017 +0100
-
- BREAK tries to break at the second word break
-
- to recognize dictionary words with word break characters
- (at the beginning of the compound word).
-
- This fixes the problems with the new Hungarian orthography
- about compounding of words with n-dash.
-
- Example:
-
- The Hungarian compound word "e-mail-cím" (e-mail address)
- will break into "e-mail" (dictionary word) and "cím", instead
- of "e" and "mail-cím" ("mail" is not a dictionary word) at
- first level of recursive word breaking.
----
- src/hunspell/affixmgr.cxx | 183 +++++++++++-----------------------
- src/hunspell/affixmgr.hxx | 5 +-
- src/hunspell/csutil.hxx | 6 +-
- src/hunspell/hashmgr.cxx | 236 +++++++++++++++++++++++++++++++++++++++++---
- src/hunspell/hashmgr.hxx | 15 ++-
- src/hunspell/htypes.hxx | 9 +-
- src/hunspell/hunspell.cxx | 75 ++++++++++----
- src/hunspell/suggestmgr.cxx | 200 ++++++++++++++++++++++++-------------
- src/hunspell/suggestmgr.hxx | 7 +-
- 9 files changed, 503 insertions(+), 233 deletions(-)
-
-diff --git a/src/hunspell/affixmgr.cxx b/src/hunspell/affixmgr.cxx
-index ffce7bb..a98071a 100644
---- a/src/hunspell/affixmgr.cxx
-+++ b/src/hunspell/affixmgr.cxx
-@@ -96,7 +96,6 @@ AffixMgr::AffixMgr(const char* affpath,
- complexprefixes = 0;
- parsedmaptable = false;
- parsedbreaktable = false;
-- parsedrep = false;
- iconvtable = NULL;
- oconvtable = NULL;
- // allow simplified compound forms (see 3rd field of CHECKCOMPOUNDPATTERN)
-@@ -529,14 +528,6 @@ int AffixMgr::parse_file(const char* affpath, const char* key) {
- }
- }
-
-- /* parse in the typical fault correcting table */
-- if (line.compare(0, 3, "REP", 3) == 0) {
-- if (!parse_reptable(line, afflst)) {
-- finishFileMgr(afflst);
-- return 1;
-- }
-- }
--
- /* parse in the input conversion table */
- if (line.compare(0, 5, "ICONV", 5) == 0) {
- if (!parse_convtable(line, afflst, &iconvtable, "ICONV")) {
-@@ -1278,22 +1269,41 @@ std::string AffixMgr::prefix_check_twosfx_morph(const char* word,
- // Is word a non compound with a REP substitution (see checkcompoundrep)?
- int AffixMgr::cpdrep_check(const char* word, int wl) {
-
-- if ((wl < 2) || reptable.empty())
-+ if ((wl < 2) || get_reptable().empty())
- return 0;
-
-- for (size_t i = 0; i < reptable.size(); ++i) {
-- const char* r = word;
-- const size_t lenp = reptable[i].pattern.size();
-- // search every occurence of the pattern in the word
-- while ((r = strstr(r, reptable[i].pattern.c_str())) != NULL) {
-- std::string candidate(word);
-- size_t type = r == word && langnum != LANG_hu ? 1 : 0;
-- if (r - word + reptable[i].pattern.size() == lenp && langnum != LANG_hu)
-- type += 2;
-- candidate.replace(r - word, lenp, reptable[i].outstrings[type]);
-+ for (size_t i = 0; i < get_reptable().size(); ++i) {
-+ // use only available mid patterns
-+ if (!get_reptable()[i].outstrings[0].empty()) {
-+ const char* r = word;
-+ const size_t lenp = get_reptable()[i].pattern.size();
-+ // search every occurence of the pattern in the word
-+ while ((r = strstr(r, get_reptable()[i].pattern.c_str())) != NULL) {
-+ std::string candidate(word);
-+ candidate.replace(r - word, lenp, get_reptable()[i].outstrings[0]);
-+ if (candidate_check(candidate.c_str(), candidate.size()))
-+ return 1;
-+ ++r; // search for the next letter
-+ }
-+ }
-+ }
-+
-+ return 0;
-+}
-+
-+// forbid compound words, if they are in the dictionary as a
-+// word pair separated by space
-+int AffixMgr::cpdwordpair_check(const char * word, int wl) {
-+ if (wl > 2) {
-+ std::string candidate(word);
-+ for (size_t i = 1; i < candidate.size(); i++) {
-+ // go to end of the UTF-8 character
-+ if (utf8 && ((word[i] & 0xc0) == 0x80))
-+ continue;
-+ candidate.insert(i, 1, ' ');
- if (candidate_check(candidate.c_str(), candidate.size()))
- return 1;
-- ++r; // search for the next letter
-+ candidate.erase(i, 1);
- }
- }
-
-@@ -1647,6 +1657,12 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
- affixed = 1;
- rv = lookup(st.c_str()); // perhaps without prefix
-
-+ // forbid dictionary stems with COMPOUNDFORBIDFLAG in
-+ // compound words, overriding the effect of COMPOUNDPERMITFLAG
-+ if ((rv) && compoundforbidflag &&
-+ TESTAFF(rv->astr, compoundforbidflag, rv->alen) && !hu_mov_rule)
-+ continue;
-+
- // search homonym with compound flag
- while ((rv) && !hu_mov_rule &&
- ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
-@@ -1911,7 +1927,8 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
- TESTAFF(rv->astr, checkcpdtable[scpd - 1].cond2, rv->alen))) {
- // forbid compound word, if it is a non compound word with typical
- // fault
-- if (checkcompoundrep && cpdrep_check(word.c_str(), len))
-+ if ((checkcompoundrep && cpdrep_check(word.c_str(), len)) ||
-+ cpdwordpair_check(word.c_str(), len))
- return NULL;
- return rv_first;
- }
-@@ -2035,7 +2052,8 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
- ((!checkcompounddup || (rv != rv_first)))) {
- // forbid compound word, if it is a non compound word with typical
- // fault
-- if (checkcompoundrep && cpdrep_check(word.c_str(), len))
-+ if ((checkcompoundrep && cpdrep_check(word.c_str(), len)) ||
-+ cpdwordpair_check(word.c_str(), len))
- return NULL;
- return rv_first;
- }
-@@ -2060,7 +2078,11 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
- }
- if (rv) {
- // forbid compound word, if it is a non compound word with typical
-- // fault
-+ // fault, or a dictionary word pair
-+
-+ if (cpdwordpair_check(word.c_str(), len))
-+ return NULL;
-+
- if (checkcompoundrep || forbiddenword) {
-
- if (checkcompoundrep && cpdrep_check(word.c_str(), len))
-@@ -2071,7 +2093,8 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
- char r = st[i + rv->blen];
- st[i + rv->blen] = '\0';
-
-- if (checkcompoundrep && cpdrep_check(st.c_str(), i + rv->blen)) {
-+ if ((checkcompoundrep && cpdrep_check(st.c_str(), i + rv->blen)) ||
-+ cpdwordpair_check(st.c_str(), i + rv->blen)) {
- st[ + i + rv->blen] = r;
- continue;
- }
-@@ -2198,6 +2221,12 @@ int AffixMgr::compound_check_morph(const char* word,
-
- rv = lookup(st.c_str()); // perhaps without prefix
-
-+ // forbid dictionary stems with COMPOUNDFORBIDFLAG in
-+ // compound words, overriding the effect of COMPOUNDPERMITFLAG
-+ if ((rv) && compoundforbidflag &&
-+ TESTAFF(rv->astr, compoundforbidflag, rv->alen) && !hu_mov_rule)
-+ continue;
-+
- // search homonym with compound flag
- while ((rv) && !hu_mov_rule &&
- ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
-@@ -3414,7 +3443,7 @@ int AffixMgr::expand_rootword(struct guessword* wlst,
-
- // return replacing table
- const std::vector<replentry>& AffixMgr::get_reptable() const {
-- return reptable;
-+ return pHMgr->get_reptable();
- }
-
- // return iconv table
-@@ -3554,6 +3583,11 @@ FLAG AffixMgr::get_nongramsuggest() const {
- return nongramsuggest;
- }
-
-+// return the substandard root/affix control flag
-+FLAG AffixMgr::get_substandard() const {
-+ return substandard;
-+}
-+
- // return the forbidden words flag modify flag
- FLAG AffixMgr::get_needaffix() const {
- return needaffix;
-@@ -3692,103 +3726,6 @@ bool AffixMgr::parse_cpdsyllable(const std::string& line, FileMgr* af) {
- return true;
- }
-
--/* parse in the typical fault correcting table */
--bool AffixMgr::parse_reptable(const std::string& line, FileMgr* af) {
-- if (parsedrep) {
-- HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
-- af->getlinenum());
-- return false;
-- }
-- parsedrep = true;
-- int numrep = -1;
-- int i = 0;
-- int np = 0;
-- std::string::const_iterator iter = line.begin();
-- std::string::const_iterator start_piece = mystrsep(line, iter);
-- while (start_piece != line.end()) {
-- switch (i) {
-- case 0: {
-- np++;
-- break;
-- }
-- case 1: {
-- numrep = atoi(std::string(start_piece, iter).c_str());
-- if (numrep < 1) {
-- HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n",
-- af->getlinenum());
-- return false;
-- }
-- reptable.reserve(numrep);
-- np++;
-- break;
-- }
-- default:
-- break;
-- }
-- ++i;
-- start_piece = mystrsep(line, iter);
-- }
-- if (np != 2) {
-- HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
-- af->getlinenum());
-- return false;
-- }
--
-- /* now parse the numrep lines to read in the remainder of the table */
-- for (int j = 0; j < numrep; ++j) {
-- std::string nl;
-- if (!af->getline(nl))
-- return false;
-- mychomp(nl);
-- reptable.push_back(replentry());
-- iter = nl.begin();
-- i = 0;
-- int type = 0;
-- start_piece = mystrsep(nl, iter);
-- while (start_piece != nl.end()) {
-- switch (i) {
-- case 0: {
-- if (nl.compare(start_piece - nl.begin(), 3, "REP", 3) != 0) {
-- HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
-- af->getlinenum());
-- reptable.clear();
-- return false;
-- }
-- break;
-- }
-- case 1: {
-- if (*start_piece == '^')
-- type = 1;
-- reptable.back().pattern.assign(start_piece + type, iter);
-- mystrrep(reptable.back().pattern, "_", " ");
-- if (!reptable.back().pattern.empty() && reptable.back().pattern[reptable.back().pattern.size() - 1] == '$') {
-- type += 2;
-- reptable.back().pattern.resize(reptable.back().pattern.size() - 1);
-- }
-- break;
-- }
-- case 2: {
-- reptable.back().outstrings[type].assign(start_piece, iter);
-- mystrrep(reptable.back().outstrings[type], "_", " ");
-- break;
-- }
-- default:
-- break;
-- }
-- ++i;
-- start_piece = mystrsep(nl, iter);
-- }
-- if (reptable.back().pattern.empty() || reptable.back().outstrings[type].empty()) {
-- HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
-- af->getlinenum());
-- reptable.clear();
-- return false;
-- }
-- }
-- return true;
--}
--
--/* parse in the typical fault correcting table */
- bool AffixMgr::parse_convtable(const std::string& line,
- FileMgr* af,
- RepList** rl,
-diff --git a/src/hunspell/affixmgr.hxx b/src/hunspell/affixmgr.hxx
-index d41e69c..38842a3 100644
---- a/src/hunspell/affixmgr.hxx
-+++ b/src/hunspell/affixmgr.hxx
-@@ -120,8 +120,6 @@ class AffixMgr {
- FLAG nongramsuggest;
- FLAG needaffix;
- int cpdmin;
-- bool parsedrep;
-- std::vector<replentry> reptable;
- RepList* iconvtable;
- RepList* oconvtable;
- bool parsedmaptable;
-@@ -251,6 +249,7 @@ class AffixMgr {
-
- short get_syllable(const std::string& word);
- int cpdrep_check(const char* word, int len);
-+ int cpdwordpair_check(const char * word, int len);
- int cpdpat_check(const char* word,
- int len,
- hentry* r1,
-@@ -311,6 +310,7 @@ class AffixMgr {
- FLAG get_forbiddenword() const;
- FLAG get_nosuggest() const;
- FLAG get_nongramsuggest() const;
-+ FLAG get_substandard() const;
- FLAG get_needaffix() const;
- FLAG get_onlyincompound() const;
- const char* get_derived() const;
-@@ -338,7 +338,6 @@ class AffixMgr {
- bool parse_flag(const std::string& line, unsigned short* out, FileMgr* af);
- bool parse_num(const std::string& line, int* out, FileMgr* af);
- bool parse_cpdsyllable(const std::string& line, FileMgr* af);
-- bool parse_reptable(const std::string& line, FileMgr* af);
- bool parse_convtable(const std::string& line,
- FileMgr* af,
- RepList** rl,
-diff --git a/src/hunspell/csutil.hxx b/src/hunspell/csutil.hxx
-index 5d83f80..01c0a24 100644
---- a/src/hunspell/csutil.hxx
-+++ b/src/hunspell/csutil.hxx
-@@ -272,7 +272,7 @@ LIBHUNSPELL_DLL_EXPORTED char* get_stored_pointer(const char* s);
- // hash entry macros
- LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_DATA(struct hentry* h) {
- char* ret;
-- if (!h->var)
-+ if (!(h->var & H_OPT))
- ret = NULL;
- else if (h->var & H_OPT_ALIASM)
- ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
-@@ -284,7 +284,7 @@ LIBHUNSPELL_DLL_EXPORTED inline char* HENTRY_DATA(struct hentry* h) {
- LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA(
- const struct hentry* h) {
- const char* ret;
-- if (!h->var)
-+ if (!(h->var & H_OPT))
- ret = NULL;
- else if (h->var & H_OPT_ALIASM)
- ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
-@@ -297,7 +297,7 @@ LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA(
- LIBHUNSPELL_DLL_EXPORTED inline const char* HENTRY_DATA2(
- const struct hentry* h) {
- const char* ret;
-- if (!h->var)
-+ if (!(h->var & H_OPT))
- ret = "";
- else if (h->var & H_OPT_ALIASM)
- ret = get_stored_pointer(HENTRY_WORD(h) + h->blen + 1);
-diff --git a/src/hunspell/hashmgr.cxx b/src/hunspell/hashmgr.cxx
-index 23421b5..ec3803b 100644
---- a/src/hunspell/hashmgr.cxx
-+++ b/src/hunspell/hashmgr.cxx
-@@ -78,6 +78,7 @@
- #include "hashmgr.hxx"
- #include "csutil.hxx"
- #include "atypes.hxx"
-+#include "langnum.hxx"
-
- // build a hash table from a munched word list
-
-@@ -182,7 +183,8 @@ int HashMgr::add_word(const std::string& in_word,
- unsigned short* aff,
- int al,
- const std::string* in_desc,
-- bool onlyupcase) {
-+ bool onlyupcase,
-+ int captype) {
- const std::string* word = &in_word;
- const std::string* desc = in_desc;
-
-@@ -243,20 +245,119 @@ int HashMgr::add_word(const std::string& in_word,
- hp->astr = aff;
- hp->next = NULL;
- hp->next_homonym = NULL;
-+ hp->var = (captype == INITCAP) ? H_OPT_INITCAP : 0;
-
- // store the description string or its pointer
- if (desc) {
-- hp->var = H_OPT;
-+ hp->var += H_OPT;
- if (aliasm) {
- hp->var += H_OPT_ALIASM;
- store_pointer(hpw + word->size() + 1, get_aliasm(atoi(desc->c_str())));
- } else {
- strcpy(hpw + word->size() + 1, desc->c_str());
- }
-- if (strstr(HENTRY_DATA(hp), MORPH_PHON))
-+ if (strstr(HENTRY_DATA(hp), MORPH_PHON)) {
- hp->var += H_OPT_PHON;
-- } else
-- hp->var = 0;
-+ // store ph: fields (pronounciation, misspellings, old orthography etc.)
-+ // of a morphological description in reptable to use in REP replacements.
-+ if (reptable.capacity() < (unsigned int)(tablesize/MORPH_PHON_RATIO))
-+ reptable.reserve(tablesize/MORPH_PHON_RATIO);
-+ std::string fields = HENTRY_DATA(hp);
-+ std::string::const_iterator iter = fields.begin();
-+ std::string::const_iterator start_piece = mystrsep(fields, iter);
-+ while (start_piece != fields.end()) {
-+ if (std::string(start_piece, iter).find(MORPH_PHON) == 0) {
-+ std::string ph = std::string(start_piece, iter).substr(sizeof MORPH_PHON - 1);
-+ if (ph.size() > 0) {
-+ std::vector<w_char> w;
-+ size_t strippatt;
-+ std::string wordpart;
-+ // dictionary based REP replacement, separated by "->"
-+ // for example "pretty ph:prity ph:priti->pretti" to handle
-+ // both prity -> pretty and pritier -> prettiest suggestions.
-+ if (((strippatt = ph.find("->")) != std::string::npos) &&
-+ (strippatt > 0) && (strippatt < ph.size() - 2)) {
-+ wordpart = ph.substr(strippatt + 2);
-+ ph.erase(ph.begin() + strippatt, ph.end());
-+ } else
-+ wordpart = in_word;
-+ // when the ph: field ends with the character *,
-+ // strip last character of the pattern and the replacement
-+ // to match in REP suggestions also at character changes,
-+ // for example, "pretty ph:prity*" results "prit->prett"
-+ // REP replacement instead of "prity->pretty", to get
-+ // prity->pretty and pritiest->prettiest suggestions.
-+ if (ph.at(ph.size()-1) == '*') {
-+ strippatt = 1;
-+ size_t stripword = 0;
-+ if (utf8) {
-+ while ((strippatt < ph.size()) &&
-+ ((ph.at(ph.size()-strippatt-1) & 0xc0) == 0x80))
-+ ++strippatt;
-+ while ((stripword < wordpart.size()) &&
-+ ((wordpart.at(wordpart.size()-stripword-1) & 0xc0) == 0x80))
-+ ++stripword;
-+ }
-+ ++strippatt;
-+ ++stripword;
-+ if ((ph.size() > strippatt) && (wordpart.size() > stripword)) {
-+ ph.erase(ph.size()-strippatt, strippatt);
-+ wordpart.erase(in_word.size()-stripword, stripword);
-+ }
-+ }
-+ // capitalize lowercase pattern for capitalized words to support
-+ // good suggestions also for capitalized misspellings, eg.
-+ // Wednesday ph:wendsay
-+ // results wendsay -> Wednesday and Wendsay -> Wednesday, too.
-+ if (captype==INITCAP) {
-+ std::string ph_capitalized;
-+ if (utf8) {
-+ u8_u16(w, ph);
-+ if (get_captype_utf8(w, langnum) == NOCAP) {
-+ mkinitcap_utf(w, langnum);
-+ u16_u8(ph_capitalized, w);
-+ }
-+ } else if (get_captype(ph, csconv) == NOCAP)
-+ mkinitcap(ph_capitalized, csconv);
-+
-+ if (ph_capitalized.size() > 0) {
-+ // add also lowercase word in the case of German or
-+ // Hungarian to support lowercase suggestions lowercased by
-+ // compound word generation or derivational suffixes
-+ // (for example by adjectival suffix "-i" of geographical
-+ // names in Hungarian:
-+ // Massachusetts ph:messzecsuzec
-+ // messzecsuzeci -> massachusettsi (adjective)
-+ // For lowercasing by conditional PFX rules, see
-+ // tests/germancompounding test example or the
-+ // Hungarian dictionary.)
-+ if (langnum == LANG_de || langnum == LANG_hu) {
-+ std::string wordpart_lower(wordpart);
-+ if (utf8) {
-+ u8_u16(w, wordpart_lower);
-+ mkallsmall_utf(w, langnum);
-+ u16_u8(wordpart_lower, w);
-+ } else {
-+ mkallsmall(wordpart_lower, csconv);
-+ }
-+ reptable.push_back(replentry());
-+ reptable.back().pattern.assign(ph);
-+ reptable.back().outstrings[0].assign(wordpart_lower);
-+ }
-+ reptable.push_back(replentry());
-+ reptable.back().pattern.assign(ph_capitalized);
-+ reptable.back().outstrings[0].assign(wordpart);
-+ }
-+ }
-+ reptable.push_back(replentry());
-+ reptable.back().pattern.assign(ph);
-+ reptable.back().outstrings[0].assign(wordpart);
-+ }
-+ }
-+ start_piece = mystrsep(fields, iter);
-+ }
-+ }
-+ }
-
- struct hentry* dp = tableptr[i];
- if (!dp) {
-@@ -347,12 +448,12 @@ int HashMgr::add_hidden_capitalized_word(const std::string& word,
- mkallsmall_utf(w, langnum);
- mkinitcap_utf(w, langnum);
- u16_u8(st, w);
-- return add_word(st, wcl, flags2, flagslen + 1, dp, true);
-+ return add_word(st, wcl, flags2, flagslen + 1, dp, true, INITCAP);
- } else {
- std::string new_word(word);
- mkallsmall(new_word, csconv);
- mkinitcap(new_word, csconv);
-- int ret = add_word(new_word, wcl, flags2, flagslen + 1, dp, true);
-+ int ret = add_word(new_word, wcl, flags2, flagslen + 1, dp, true, INITCAP);
- return ret;
- }
- }
-@@ -435,7 +536,7 @@ int HashMgr::add(const std::string& word) {
- int al = 0;
- unsigned short* flags = NULL;
- int wcl = get_clen_and_captype(word, &captype);
-- add_word(word, wcl, flags, al, NULL, false);
-+ add_word(word, wcl, flags, al, NULL, false, captype);
- return add_hidden_capitalized_word(word, wcl, flags, al, NULL,
- captype);
- }
-@@ -450,14 +551,14 @@ int HashMgr::add_with_affix(const std::string& word, const std::string& example)
- int captype;
- int wcl = get_clen_and_captype(word, &captype);
- if (aliasf) {
-- add_word(word, wcl, dp->astr, dp->alen, NULL, false);
-+ add_word(word, wcl, dp->astr, dp->alen, NULL, false, captype);
- } else {
- unsigned short* flags =
- (unsigned short*)malloc(dp->alen * sizeof(unsigned short));
- if (flags) {
- memcpy((void*)flags, (void*)dp->astr,
- dp->alen * sizeof(unsigned short));
-- add_word(word, wcl, flags, dp->alen, NULL, false);
-+ add_word(word, wcl, flags, dp->alen, NULL, false, captype);
- } else
- return 1;
- }
-@@ -605,7 +706,7 @@ int HashMgr::load_tables(const char* tpath, const char* key) {
- int wcl = get_clen_and_captype(ts, &captype, workbuf);
- const std::string *dp_str = dp.empty() ? NULL : &dp;
- // add the word and its index plus its capitalized form optionally
-- if (add_word(ts, wcl, flags, al, dp_str, false) ||
-+ if (add_word(ts, wcl, flags, al, dp_str, false, captype) ||
- add_hidden_capitalized_word(ts, wcl, flags, al, dp_str, captype)) {
- delete dict;
- return 5;
-@@ -940,8 +1041,19 @@ int HashMgr::load_config(const char* affpath, const char* key) {
- if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0)
- complexprefixes = 1;
-
-+ /* parse in the typical fault correcting table */
-+ if (line.compare(0, 3, "REP", 3) == 0) {
-+ if (!parse_reptable(line, afflst)) {
-+ delete afflst;
-+ return 1;
-+ }
-+ }
-+
-+ // don't check the full affix file, yet
- if (((line.compare(0, 3, "SFX", 3) == 0) ||
-- (line.compare(0, 3, "PFX", 3) == 0)) && line.size() > 3 && isspace(line[3]))
-+ (line.compare(0, 3, "PFX", 3) == 0)) &&
-+ line.size() > 3 && isspace(line[3]) &&
-+ !reptable.empty()) // (REP table is in the end of Afrikaans aff file)
- break;
- }
-
-@@ -1191,3 +1303,103 @@ char* HashMgr::get_aliasm(int index) const {
- HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index);
- return NULL;
- }
-+
-+/* Parse the typical fault correcting (REP) table into the reptable member: "line" is the table header whose 2nd field is the entry count, the entries themselves are then read line by line from "af"; returns true on success and false on any error. */
-+bool HashMgr::parse_reptable(const std::string& line, FileMgr* af) {
-+ if (!reptable.empty()) { // a table was already loaded: refuse a second definition
-+ HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
-+ af->getlinenum());
-+ return false;
-+ }
-+ int numrep = -1; // entry count announced by the header line
-+ int i = 0; // index of the piece currently being examined
-+ int np = 0; // number of required header pieces seen so far (2 expected)
-+ std::string::const_iterator iter = line.begin();
-+ std::string::const_iterator start_piece = mystrsep(line, iter); // [start_piece, iter) is used as the current piece; presumably whitespace separated -- confirm in mystrsep
-+ while (start_piece != line.end()) {
-+ switch (i) {
-+ case 0: { // 1st piece: only counted, its text is not inspected here
-+ np++;
-+ break;
-+ }
-+ case 1: { // 2nd piece: number of entries that follow
-+ numrep = atoi(std::string(start_piece, iter).c_str());
-+ if (numrep < 1) {
-+ HUNSPELL_WARNING(stderr, "error: line %d: incorrect entry number\n",
-+ af->getlinenum());
-+ return false;
-+ }
-+ reptable.reserve(numrep); // NOTE(review): numrep comes straight from the file and has no upper bound here
-+ np++;
-+ break;
-+ }
-+ default: // any further pieces on the header line are ignored
-+ break;
-+ }
-+ ++i;
-+ start_piece = mystrsep(line, iter);
-+ }
-+ if (np != 2) { // header did not contain both required pieces
-+ HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
-+ af->getlinenum());
-+ return false;
-+ }
-+
-+ /* now parse the numrep lines to read in the remainder of the table; each line yields one replentry */
-+ for (int j = 0; j < numrep; ++j) {
-+ std::string nl;
-+ if (!af->getline(nl)) // input ended early
-+ return false; // NOTE(review): unlike the other error paths, entries parsed so far are left in reptable
-+ mychomp(nl); // presumably strips the trailing line ending -- confirm in mychomp
-+ reptable.push_back(replentry()); // the entry filled in below is always reptable.back()
-+ iter = nl.begin();
-+ i = 0;
-+ int type = 0; // index into outstrings: +1 for a leading '^', +2 for a trailing '$' in the pattern (0..3)
-+ start_piece = mystrsep(nl, iter);
-+ while (start_piece != nl.end()) {
-+ switch (i) {
-+ case 0: { // 1st piece must begin with the 3 characters "REP"
-+ if (nl.compare(start_piece - nl.begin(), 3, "REP", 3) != 0) {
-+ HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
-+ af->getlinenum());
-+ reptable.clear();
-+ return false;
-+ }
-+ break;
-+ }
-+ case 1: { // 2nd piece: the pattern to be replaced
-+ if (*start_piece == '^') // leading '^' marker (presumably "start of word" -- see REP in hunspell(5))
-+ type = 1;
-+ reptable.back().pattern.assign(start_piece + type, iter); // "+ type" skips the '^' when it is present
-+ mystrrep(reptable.back().pattern, "_", " "); // "_" in the table stands for a space
-+ if (!reptable.back().pattern.empty() && reptable.back().pattern[reptable.back().pattern.size() - 1] == '$') {
-+ type += 2; // trailing '$' marker (presumably "end of word")
-+ reptable.back().pattern.resize(reptable.back().pattern.size() - 1); // drop the '$' from the stored pattern
-+ }
-+ break;
-+ }
-+ case 2: { // 3rd piece: the replacement, stored in the slot selected by type
-+ reptable.back().outstrings[type].assign(start_piece, iter);
-+ mystrrep(reptable.back().outstrings[type], "_", " "); // "_" in the table stands for a space
-+ break;
-+ }
-+ default: // extra pieces on an entry line are ignored
-+ break;
-+ }
-+ ++i;
-+ start_piece = mystrsep(nl, iter);
-+ }
-+ if (reptable.back().pattern.empty() || reptable.back().outstrings[type].empty()) { // entry lacks a pattern or a replacement
-+ HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
-+ af->getlinenum());
-+ reptable.clear();
-+ return false;
-+ }
-+ }
-+ return true;
-+}
-+
-+// Return a const reference to the replacement table stored in the reptable member (no copy is made).
-+const std::vector<replentry>& HashMgr::get_reptable() const {
-+ return reptable;
-+}
-diff --git a/src/hunspell/hashmgr.hxx b/src/hunspell/hashmgr.hxx
-index da485d7..b6eaddd 100644
---- a/src/hunspell/hashmgr.hxx
-+++ b/src/hunspell/hashmgr.hxx
-@@ -81,6 +81,12 @@
-
- enum flag { FLAG_CHAR, FLAG_LONG, FLAG_NUM, FLAG_UNI };
-
-+// morphological description of a dictionary item can contain
-+// an arbitrary number of "ph:" (MORPH_PHON) fields to store typical
-+// phonetic or other misspellings of that word.
-+// ratio of lines/lines with "ph:" in the dic file: 1/MORPH_PHON_RATIO
-+#define MORPH_PHON_RATIO 500
-+
- class HashMgr {
- int tablesize;
- struct hentry** tableptr;
-@@ -99,6 +105,10 @@ class HashMgr {
- unsigned short* aliasflen;
- int numaliasm; // morphological desciption `compression' with aliases
- char** aliasm;
-+ // reptable created from REP table of aff file and from "ph:" fields
-+ // of the dic file. It contains phonetic and other common misspellings
-+ // (letters, letter groups and words) for better suggestions
-+ std::vector<replentry> reptable;
-
- public:
- HashMgr(const char* tpath, const char* apath, const char* key = NULL);
-@@ -119,6 +129,7 @@ class HashMgr {
- int get_aliasf(int index, unsigned short** fvec, FileMgr* af) const;
- int is_aliasm() const;
- char* get_aliasm(int index) const;
-+ const std::vector<replentry>& get_reptable() const;
-
- private:
- int get_clen_and_captype(const std::string& word, int* captype);
-@@ -129,7 +140,8 @@ class HashMgr {
- unsigned short* ap,
- int al,
- const std::string* desc,
-- bool onlyupcase);
-+ bool onlyupcase,
-+ int captype);
- int load_config(const char* affpath, const char* key);
- bool parse_aliasf(const std::string& line, FileMgr* af);
- int add_hidden_capitalized_word(const std::string& word,
-@@ -139,6 +151,7 @@ class HashMgr {
- const std::string* dp,
- int captype);
- bool parse_aliasm(const std::string& line, FileMgr* af);
-+ bool parse_reptable(const std::string& line, FileMgr* af);
- int remove_forbidden_flag(const std::string& word);
- };
-
-diff --git a/src/hunspell/htypes.hxx b/src/hunspell/htypes.hxx
-index 8f66a00..76228c4 100644
---- a/src/hunspell/htypes.hxx
-+++ b/src/hunspell/htypes.hxx
-@@ -44,9 +44,10 @@
- (v) = ((v) << (q)) | (((v) >> (32 - q)) & ((1 << (q)) - 1));
-
- // hentry options
--#define H_OPT (1 << 0)
--#define H_OPT_ALIASM (1 << 1)
--#define H_OPT_PHON (1 << 2)
-+#define H_OPT (1 << 0) // is there optional morphological data?
-+#define H_OPT_ALIASM (1 << 1) // using alias compression?
-+#define H_OPT_PHON (1 << 2) // is there ph: field in the morphological data?
-+#define H_OPT_INITCAP (1 << 3) // is dictionary word capitalized?
-
- // see also csutil.hxx
- #define HENTRY_WORD(h) &(h->word[0])
-@@ -61,7 +62,7 @@ struct hentry {
- unsigned short* astr; // affix flag vector
- struct hentry* next; // next word with same hash code
- struct hentry* next_homonym; // next homonym word (with same hash code)
-- char var; // variable fields (only for special pronounciation yet)
-+ char var; // bit vector of H_OPT hentry options
- char word[1]; // variable-length word (8-bit or UTF-8 encoding)
- };
-
-diff --git a/src/hunspell/hunspell.cxx b/src/hunspell/hunspell.cxx
-index 1ef11df..6c5aeb6 100644
---- a/src/hunspell/hunspell.cxx
-+++ b/src/hunspell/hunspell.cxx
-@@ -666,6 +666,37 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
- size_t plen = wordbreak[j].size();
- size_t found = scw.find(wordbreak[j]);
- if ((found > 0) && (found < wl - plen)) {
-+ size_t found2 = scw.find(wordbreak[j], found + 1);
-+ // try to break at the second occurrence
-+ // to recognize dictionary words with wordbreak
-+ if (found2 > 0 && (found2 < wl - plen))
-+ found = found2;
-+ if (!spell(scw.substr(found + plen)))
-+ continue;
-+ std::string suffix(scw.substr(found));
-+ scw.resize(found);
-+ // examine 2 sides of the break point
-+ if (spell(scw))
-+ return true;
-+ scw.append(suffix);
-+
-+ // LANG_hu: spec. dash rule
-+ if (langnum == LANG_hu && wordbreak[j] == "-") {
-+ suffix = scw.substr(found + 1);
-+ scw.resize(found + 1);
-+ if (spell(scw))
-+ return true; // check the first part with dash
-+ scw.append(suffix);
-+ }
-+ // end of LANG specific region
-+ }
-+ }
-+
-+ // other patterns (break at first break point)
-+ for (size_t j = 0; j < wordbreak.size(); ++j) {
-+ size_t plen = wordbreak[j].size();
-+ size_t found = scw.find(wordbreak[j]);
-+ if ((found > 0) && (found < wl - plen)) {
- if (!spell(scw.substr(found + plen)))
- continue;
- std::string suffix(scw.substr(found));
-@@ -870,6 +901,7 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
- }
-
- int capwords = 0;
-+ bool good = false;
-
- // check capitalized form for FORCEUCASE
- if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) {
-@@ -884,22 +916,27 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
-
- switch (captype) {
- case NOCAP: {
-- pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
-+ good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
-+ if (abbv) {
-+ std::string wspace(scw);
-+ wspace.push_back('.');
-+ good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
-+ }
- break;
- }
-
- case INITCAP: {
- capwords = 1;
-- pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
-+ good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
- std::string wspace(scw);
- mkallsmall2(wspace, sunicw);
-- pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
-+ good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
- break;
- }
- case HUHINITCAP:
- capwords = 1;
- case HUHCAP: {
-- pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
-+ good |= pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
- // something.The -> something. The
- size_t dot_pos = scw.find('.');
- if (dot_pos != std::string::npos) {
-@@ -925,19 +962,19 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
- // TheOpenOffice.org -> The OpenOffice.org
- wspace = scw;
- mkinitsmall2(wspace, sunicw);
-- pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
-+ good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
- }
- wspace = scw;
- mkallsmall2(wspace, sunicw);
- if (spell(wspace.c_str()))
- insert_sug(slst, wspace);
- size_t prevns = slst.size();
-- pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
-+ good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
- if (captype == HUHINITCAP) {
- mkinitcap2(wspace, sunicw);
- if (spell(wspace.c_str()))
- insert_sug(slst, wspace);
-- pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
-+ good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
- }
- // aNew -> "a New" (instead of "a new")
- for (size_t j = prevns; j < slst.size(); ++j) {
-@@ -964,11 +1001,11 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
- case ALLCAP: {
- std::string wspace(scw);
- mkallsmall2(wspace, sunicw);
-- pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
-+ good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
- if (pAMgr && pAMgr->get_keepcase() && spell(wspace.c_str()))
- insert_sug(slst, wspace);
- mkinitcap2(wspace, sunicw);
-- pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
-+ good |= pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
- for (size_t j = 0; j < slst.size(); ++j) {
- mkallcap(slst[j]);
- if (pAMgr && pAMgr->get_checksharps()) {
-@@ -1000,12 +1037,11 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
- }
- }
- // END OF LANG_hu section
--
-- // try ngram approach since found nothing or only compound words
-- if (pAMgr && (slst.empty() || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0)) {
-+ // try ngram approach since no good suggestion was found
-+ if (!good && pAMgr && (slst.empty() || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0)) {
- switch (captype) {
- case NOCAP: {
-- pSMgr->ngsuggest(slst, scw.c_str(), m_HMgrs);
-+ pSMgr->ngsuggest(slst, scw.c_str(), m_HMgrs, NOCAP);
- break;
- }
- case HUHINITCAP:
-@@ -1013,21 +1049,21 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
- case HUHCAP: {
- std::string wspace(scw);
- mkallsmall2(wspace, sunicw);
-- pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs);
-+ pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, HUHCAP);
- break;
- }
- case INITCAP: {
- capwords = 1;
- std::string wspace(scw);
- mkallsmall2(wspace, sunicw);
-- pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs);
-+ pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, INITCAP);
- break;
- }
- case ALLCAP: {
- std::string wspace(scw);
- mkallsmall2(wspace, sunicw);
- size_t oldns = slst.size();
-- pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs);
-+ pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs, ALLCAP);
- for (size_t j = oldns; j < slst.size(); ++j) {
- mkallcap(slst[j]);
- }
-@@ -1037,6 +1073,11 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
- }
-
- // try dash suggestion (Afo-American -> Afro-American)
-+ // Note: LibreOffice was modified to treat dashes as word
-+ // characters to check "scot-free" etc. word forms, but
-+ // we need to handle suggestions for "Afo-American", etc.,
-+ // while "Afro-American" is missing from the dictionary.
-+ // TODO avoid possible overgeneration
- size_t dash_pos = scw.find('-');
- if (dash_pos != std::string::npos) {
- int nodashsug = 1;
-@@ -1048,7 +1089,7 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
- size_t prev_pos = 0;
- bool last = false;
-
-- while (nodashsug && !last) {
-+ while (!good && nodashsug && !last) {
- if (dash_pos == scw.size())
- last = 1;
- std::string chunk = scw.substr(prev_pos, dash_pos - prev_pos);
-diff --git a/src/hunspell/suggestmgr.cxx b/src/hunspell/suggestmgr.cxx
-index 73ea91e..ade85af 100644
---- a/src/hunspell/suggestmgr.cxx
-+++ b/src/hunspell/suggestmgr.cxx
-@@ -132,6 +132,11 @@ SuggestMgr::SuggestMgr(const char* tryme, unsigned int maxn, AffixMgr* aptr) {
- ctryl = u8_u16(ctry_utf, tryme);
- }
- }
-+
-+ // language with possible dash usage
-+ // (latin letters or dash in TRY characters)
-+ lang_with_dash_usage = (ctry &&
-+ ((strchr(ctry, '-') != NULL) || (strchr(ctry, 'a') != NULL)));
- }
-
- SuggestMgr::~SuggestMgr() {
-@@ -169,10 +174,13 @@ void SuggestMgr::testsug(std::vector<std::string>& wlst,
- }
- }
-
--// generate suggestions for a misspelled word
--// pass in address of array of char * pointers
--// onlycompoundsug: probably bad suggestions (need for ngram sugs, too)
--void SuggestMgr::suggest(std::vector<std::string>& slst,
-+/* generate suggestions for a misspelled word
-+ * pass in address of array of char * pointers
-+ * onlycompoundsug: probably bad suggestions (need for ngram sugs, too)
-+ * return value: true, if there is a good suggestion
-+ * (REP, ph: or a dictionary word pair)
-+ */
-+bool SuggestMgr::suggest(std::vector<std::string>& slst,
- const char* w,
- int* onlycompoundsug) {
- int nocompoundtwowords = 0;
-@@ -182,6 +190,7 @@ void SuggestMgr::suggest(std::vector<std::string>& slst,
- std::string w2;
- const char* word = w;
- size_t oldSug = 0;
-+ bool good_suggestion = false;
-
- // word reversing wrapper for complex prefixes
- if (complexprefixes) {
-@@ -196,11 +205,11 @@ void SuggestMgr::suggest(std::vector<std::string>& slst,
- if (utf8) {
- wl = u8_u16(word_utf, word);
- if (wl == -1) {
-- return;
-+ return false;
- }
- }
-
-- for (int cpdsuggest = 0; (cpdsuggest < 2) && (nocompoundtwowords == 0);
-+ for (int cpdsuggest = 0; (cpdsuggest < 2) && (nocompoundtwowords == 0) && !good_suggestion;
- cpdsuggest++) {
- // limit compound suggestion
- if (cpdsuggest > 0)
-@@ -208,15 +217,21 @@ void SuggestMgr::suggest(std::vector<std::string>& slst,
-
- // suggestions for an uppercase word (html -> HTML)
- if (slst.size() < maxSug) {
-+ size_t i = slst.size();
- if (utf8)
- capchars_utf(slst, &word_utf[0], wl, cpdsuggest);
- else
- capchars(slst, word, cpdsuggest);
-+ if (slst.size() > i)
-+ good_suggestion = true;
- }
-
- // perhaps we made a typical fault of spelling
- if ((slst.size() < maxSug) && (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
-+ size_t i = slst.size();
- replchars(slst, word, cpdsuggest);
-+ if (slst.size() > i)
-+ good_suggestion = true;
- }
-
- // perhaps we made chose the wrong char from a related set
-@@ -294,15 +309,19 @@ void SuggestMgr::suggest(std::vector<std::string>& slst,
- }
-
- // perhaps we forgot to hit space and two words ran together
-- if (!nosplitsugs && (slst.size() < maxSug) &&
-- (!cpdsuggest || (slst.size() < oldSug + maxcpdsugs))) {
-- twowords(slst, word, cpdsuggest);
-+ // (dictionary word pairs have top priority here, so
-+ // we always suggest them, in spite of nosplitsugs, and
-+ // drop compound word and other suggestions)
-+ if (!cpdsuggest || (!nosplitsugs && slst.size() < oldSug + maxcpdsugs)) {
-+ good_suggestion = twowords(slst, word, cpdsuggest, good_suggestion);
- }
-
- } // repeating ``for'' statement compounding support
-
- if (!nocompoundtwowords && (!slst.empty()) && onlycompoundsug)
- *onlycompoundsug = 1;
-+
-+ return good_suggestion;
- }
-
- // suggestions for an uppercase word (html -> HTML)
-@@ -721,17 +740,22 @@ int SuggestMgr::forgotchar_utf(std::vector<std::string>& wlst,
- return wlst.size();
- }
-
--/* error is should have been two words */
--int SuggestMgr::twowords(std::vector<std::string>& wlst,
-+/* error is should have been two words
-+ * return value is true, if there is a dictionary word pair,
-+ * or there was already a good suggestion before calling
-+ * this function.
-+ */
-+bool SuggestMgr::twowords(std::vector<std::string>& wlst,
- const char* word,
-- int cpdsuggest) {
-+ int cpdsuggest,
-+ bool good) {
- int c2;
- int forbidden = 0;
- int cwrd;
-
- int wl = strlen(word);
- if (wl < 3)
-- return wlst.size();
-+ return false;
-
- if (langnum == LANG_hu)
- forbidden = check_forbidden(word, wl);
-@@ -750,63 +774,87 @@ int SuggestMgr::twowords(std::vector<std::string>& wlst,
- }
- if (utf8 && p[1] == '\0')
- break; // last UTF-8 character
-- *p = '\0';
-- int c1 = checkword(candidate, cpdsuggest, NULL, NULL);
-- if (c1) {
-- c2 = checkword((p + 1), cpdsuggest, NULL, NULL);
-- if (c2) {
-- *p = ' ';
--
-- // spec. Hungarian code (need a better compound word support)
-- if ((langnum == LANG_hu) && !forbidden &&
-- // if 3 repeating letter, use - instead of space
-- (((p[-1] == p[1]) &&
-- (((p > candidate + 1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) ||
-- // or multiple compounding, with more, than 6 syllables
-- ((c1 == 3) && (c2 >= 2))))
-- *p = '-';
--
-- cwrd = 1;
-- for (size_t k = 0; k < wlst.size(); ++k) {
-- if (wlst[k] == candidate) {
-- cwrd = 0;
-- break;
-- }
-- }
-- if (wlst.size() < maxSug) {
-- if (cwrd) {
-- wlst.push_back(candidate);
-- }
-- } else {
-- free(candidate);
-- return wlst.size();
-+
-+ // Suggest only word pairs, if they are listed in the dictionary.
-+ // For example, adding "a lot" to the English dic file will
-+ // result only "alot" -> "a lot" suggestion instead of
-+ // "alto, slot, alt, lot, allot, aloft, aloe, clot, plot, blot, a lot".
-+ // Note: using "ph:alot" keeps the other suggestions:
-+ // a lot ph:alot
-+ // alot -> a lot, alto, slot...
-+ *p = ' ';
-+ if (!cpdsuggest && checkword(candidate, cpdsuggest, NULL, NULL)) {
-+ // remove not word pair suggestions
-+ if (!good) {
-+ good = true;
-+ wlst.clear();
-+ }
-+ wlst.insert(wlst.begin(), candidate);
-+ }
-+
-+ // word pairs with dash?
-+ if (lang_with_dash_usage) {
-+ *p = '-';
-+
-+ if (!cpdsuggest && checkword(candidate, cpdsuggest, NULL, NULL)) {
-+ // remove not word pair suggestions
-+ if (!good) {
-+ good = true;
-+ wlst.clear();
- }
-- // add two word suggestion with dash, if TRY string contains
-- // "a" or "-"
-- // NOTE: cwrd doesn't modified for REP twoword sugg.
-- if (ctry && (strchr(ctry, 'a') || strchr(ctry, '-')) &&
-- mystrlen(p + 1) > 1 && mystrlen(candidate) - mystrlen(p) > 1) {
-- *p = '-';
-+ wlst.insert(wlst.begin(), candidate);
-+ }
-+ }
-+
-+ if (wlst.size() < maxSug && !nosplitsugs && !good) {
-+ *p = '\0';
-+ int c1 = checkword(candidate, cpdsuggest, NULL, NULL);
-+ if (c1) {
-+ c2 = checkword((p + 1), cpdsuggest, NULL, NULL);
-+ if (c2) {
-+ // spec. Hungarian code (TODO need a better compound word support)
-+ if ((langnum == LANG_hu) && !forbidden &&
-+ // if 3 repeating letter, use - instead of space
-+ (((p[-1] == p[1]) &&
-+ (((p > candidate + 1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) ||
-+ // or multiple compounding, with more, than 6 syllables
-+ ((c1 == 3) && (c2 >= 2))))
-+ *p = '-';
-+ else
-+ *p = ' ';
-+
-+ cwrd = 1;
- for (size_t k = 0; k < wlst.size(); ++k) {
- if (wlst[k] == candidate) {
- cwrd = 0;
- break;
- }
- }
-- if (wlst.size() < maxSug) {
-- if (cwrd) {
-+
-+ if (cwrd && (wlst.size() < maxSug))
- wlst.push_back(candidate);
-+
-+ // add two word suggestion with dash, depending on the language
-+ // Note that cwrd isn't modified for REP two-word suggestions.
-+ if ( !nosplitsugs && lang_with_dash_usage &&
-+ mystrlen(p + 1) > 1 && mystrlen(candidate) - mystrlen(p) > 1) {
-+ *p = '-';
-+ for (size_t k = 0; k < wlst.size(); ++k) {
-+ if (wlst[k] == candidate) {
-+ cwrd = 0;
-+ break;
-+ }
- }
-- } else {
-- free(candidate);
-- return wlst.size();
-+
-+ if ((wlst.size() < maxSug) && cwrd)
-+ wlst.push_back(candidate);
- }
- }
- }
- }
- }
- free(candidate);
-- return wlst.size();
-+ return good;
- }
-
- // error is adjacent letter were swapped
-@@ -994,7 +1042,8 @@ int SuggestMgr::movechar_utf(std::vector<std::string>& wlst,
- // generate a set of suggestions for very poorly spelled words
- void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
- const char* w,
-- const std::vector<HashMgr*>& rHMgr) {
-+ const std::vector<HashMgr*>& rHMgr,
-+ int captype) {
- int lval;
- int sc;
- int lp, lpphon;
-@@ -1071,18 +1120,34 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
- u8_u16(w_word, word);
- u8_u16(w_target, target);
- }
--
-+
- std::string f;
- std::vector<w_char> w_f;
--
-+
- for (size_t i = 0; i < rHMgr.size(); ++i) {
- while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) {
-- if ((hp->astr) && (pAMgr) &&
-- (TESTAFF(hp->astr, forbiddenword, hp->alen) ||
-- TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) ||
-- TESTAFF(hp->astr, nosuggest, hp->alen) ||
-- TESTAFF(hp->astr, nongramsuggest, hp->alen) ||
-- TESTAFF(hp->astr, onlyincompound, hp->alen)))
-+ // skip exceptions
-+ if (
-+ // skip it, if the word length differs by 5 or
-+ // more characters (to avoid strange suggestions)
-+ // (except Unicode characters over BMP)
-+ (((abs(n - hp->clen) > 4) && !nonbmp)) ||
-+ // don't suggest capitalized dictionary words for
-+ // lower case misspellings in ngram suggestions, except
-+ // - PHONE usage, or
-+ // - in the case of German, where not only proper
-+ // nouns are capitalized, or
-+ // - the capitalized word has special pronunciation
-+ ((captype == NOCAP) && (hp->var & H_OPT_INITCAP) &&
-+ !ph && (langnum != LANG_de) && !(hp->var & H_OPT_PHON)) ||
-+ // or it has one of the following special flags
-+ ((hp->astr) && (pAMgr) &&
-+ (TESTAFF(hp->astr, forbiddenword, hp->alen) ||
-+ TESTAFF(hp->astr, ONLYUPCASEFLAG, hp->alen) ||
-+ TESTAFF(hp->astr, nosuggest, hp->alen) ||
-+ TESTAFF(hp->astr, nongramsuggest, hp->alen) ||
-+ TESTAFF(hp->astr, onlyincompound, hp->alen)))
-+ )
- continue;
-
- if (utf8) {
-@@ -1105,7 +1170,7 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
- sc = ngram(3, word, f, NGRAM_LONGER_WORSE) + leftcommon;
- }
-
-- // check special pronounciation
-+ // check special pronunciation
- f.clear();
- if ((hp->var & H_OPT_PHON) &&
- copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) {
-@@ -1559,7 +1624,8 @@ int SuggestMgr::checkword(const std::string& word,
- if (rv) {
- if ((rv->astr) &&
- (TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) ||
-- TESTAFF(rv->astr, pAMgr->get_nosuggest(), rv->alen)))
-+ TESTAFF(rv->astr, pAMgr->get_nosuggest(), rv->alen) ||
-+ TESTAFF(rv->astr, pAMgr->get_substandard(), rv->alen)))
- return 0;
- while (rv) {
- if (rv->astr &&
-@@ -1584,7 +1650,7 @@ int SuggestMgr::checkword(const std::string& word,
- if (!rv && pAMgr->have_contclass()) {
- rv = pAMgr->suffix_check_twosfx(word.c_str(), word.size(), 0, NULL, FLAG_NULL);
- if (!rv)
-- rv = pAMgr->prefix_check_twosfx(word.c_str(), word.size(), 1, FLAG_NULL);
-+ rv = pAMgr->prefix_check_twosfx(word.c_str(), word.size(), 0, FLAG_NULL);
- }
-
- // check forbidden words
-diff --git a/src/hunspell/suggestmgr.hxx b/src/hunspell/suggestmgr.hxx
-index 19ffc03..f0daf23 100644
---- a/src/hunspell/suggestmgr.hxx
-+++ b/src/hunspell/suggestmgr.hxx
-@@ -109,6 +109,7 @@ class SuggestMgr {
- char* ctry;
- size_t ctryl;
- std::vector<w_char> ctry_utf;
-+ bool lang_with_dash_usage;
-
- AffixMgr* pAMgr;
- unsigned int maxSug;
-@@ -124,8 +125,8 @@ class SuggestMgr {
- SuggestMgr(const char* tryme, unsigned int maxn, AffixMgr* aptr);
- ~SuggestMgr();
-
-- void suggest(std::vector<std::string>& slst, const char* word, int* onlycmpdsug);
-- void ngsuggest(std::vector<std::string>& slst, const char* word, const std::vector<HashMgr*>& rHMgr);
-+ bool suggest(std::vector<std::string>& slst, const char* word, int* onlycmpdsug);
-+ void ngsuggest(std::vector<std::string>& slst, const char* word, const std::vector<HashMgr*>& rHMgr, int captype);
-
- std::string suggest_morph(const std::string& word);
- std::string suggest_gen(const std::vector<std::string>& pl, const std::string& pattern);
-@@ -149,7 +150,7 @@ class SuggestMgr {
- int extrachar(std::vector<std::string>&, const char*, int);
- int badcharkey(std::vector<std::string>&, const char*, int);
- int badchar(std::vector<std::string>&, const char*, int);
-- int twowords(std::vector<std::string>&, const char*, int);
-+ bool twowords(std::vector<std::string>&, const char*, int, bool);
-
- void capchars_utf(std::vector<std::string>&, const w_char*, int wl, int);
- int doubletwochars_utf(std::vector<std::string>&, const w_char*, int wl, int);
---
-2.7.4
-