fix spell checking issues using recent Hunspell patches

Test: the English words "Ian" and "item" are no longer accepted as "İan" and "İtem". Patch list with commit ids in Hunspell repository: commit 66badb7449c2053c89456f11a7f71f3f5916b550 Extend dotless i and dotted I rules to Crimean Tatar language commit 88cf975c295e3ec808efb77bb1a2a031d77f0c89 Allow dotted I in dictionary, and disable bad capitalization commit 39b785a6b03b35cc8a27f43f6005dcaa432694e1 FORBIDDENWORD precedes BREAK commit 0f691abe68788d0a58e72ab66877a9f670cd2741 Remove forbidden words from dash suggestion list commit 15b2cde4f01706f0a648518a5cfc57394d015448 tdf#95024 fix compound handling for new Hungarian orthography commit de3ae6844af62300e473f7b7b66a56e54153b4b9 fix compound word part "pa:" Change-Id: Id12b5629b0c975464072b5b144743cbe40fe45a3 Reviewed-on: https://gerrit.libreoffice.org/44200 Tested-by: Jenkins <ci@libreoffice.org> Reviewed-by: Andras Timar <andras.timar@collabora.com>
author: László Németh <nemeth@numbertext.org> 2017-11-02 09:51:36 +0100
committer: Andras Timar <andras.timar@collabora.com> 2017-11-06 17:50:04 +0100
commit: f037207675010fdff2c1968a67fae5b0c2c34331 (patch)
tree: da0e0d233368adc67063e024a72b2391db8a3d03 /external
parent: a7cd63df37144eba8544f6b10b83737fa0496461 (diff)
7 files changed, 252 insertions, 0 deletions
diff --git a/external/hunspell/0001-Allow-dotted-I-in-dictionary-and-disable-bad-capital.patch b/external/hunspell/0001-Allow-dotted-I-in-dictionary-and-disable-bad-capital.patch
new file mode 100644
index 000000000000..b4b04385c935
--- /dev/null
+++ b/external/hunspell/0001-Allow-dotted-I-in-dictionary-and-disable-bad-capital.patch
@@ -0,0 +1,55 @@
+From 88cf975c295e3ec808efb77bb1a2a031d77f0c89 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?=
+ <laszlo.nemeth@collabora.com>
+Date: Thu, 5 Oct 2017 12:24:02 +0200
+Subject: [PATCH] Allow dotted I in dictionary, and disable bad capitalization
+ of i.
+
+Dictionary words weren't recognized with dotted I, but dictionary
+words with the letter i were recognized with dotted I, too.
+---
+ src/hunspell/hunspell.cxx | 18 +++++++++++++-----
+ 1 file changed, 13 insertions(+), 5 deletions(-)
+
+diff --git a/src/hunspell/hunspell.cxx b/src/hunspell/hunspell.cxx
+index 1ef11df..5c98f8a 100644
+--- a/src/hunspell/hunspell.cxx
++++ b/src/hunspell/hunspell.cxx
+@@ -562,11 +562,15 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
+       }
+     }
+     case INITCAP: {
+-
++      // handle special capitalization of dotted I
++      bool Idot = (utf8 && (unsigned char) scw[0] == 0xc4 && (unsigned char) scw[1] == 0xb0);
+       *info += SPELL_ORIGCAP;
+-      mkallsmall2(scw, sunicw);
+-      std::string u8buffer(scw);
+-      mkinitcap2(scw, sunicw);
++      if (captype == ALLCAP) {
++          mkallsmall2(scw, sunicw);
++          mkinitcap2(scw, sunicw);
++          if (Idot)
++             scw.replace(0, 1, "\xc4\xb0");
++      }
+       if (captype == INITCAP)
+         *info += SPELL_INITCAP;
+       rv = checkword(scw, info, root);
+@@ -581,9 +585,13 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
+       }
+       if (rv && is_keepcase(rv) && (captype == ALLCAP))
+         rv = NULL;
+-      if (rv)
++      if (rv || (Idot && langnum != LANG_az && langnum != LANG_tr && langnum != LANG_crh))
+         break;
+ 
++      mkallsmall2(scw, sunicw);
++      std::string u8buffer(scw);
++      mkinitcap2(scw, sunicw);
++
+       rv = checkword(u8buffer, info, root);
+       if (abbv && !rv) {
+         u8buffer.push_back('.');
+-- 
+1.9.1
+
diff --git a/external/hunspell/0001-Extend-dotless-i-and-dotted-I-rules-to-Crimean-Tatar.patch b/external/hunspell/0001-Extend-dotless-i-and-dotted-I-rules-to-Crimean-Tatar.patch
new file mode 100644
index 000000000000..66cc78188521
--- /dev/null
+++ b/external/hunspell/0001-Extend-dotless-i-and-dotted-I-rules-to-Crimean-Tatar.patch
@@ -0,0 +1,66 @@
+From 66badb7449c2053c89456f11a7f71f3f5916b550 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?=
+ <laszlo.nemeth@collabora.com>
+Date: Thu, 5 Oct 2017 11:13:28 +0200
+Subject: [PATCH] Extend dotless i and dotted I rules to Crimean Tatar language
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+to support its special casing of ı/I, i/İ.
+
+(Use
+
+LANG crh
+
+in the affix file to use this feature.)
+---
+ src/hunspell/csutil.cxx  | 5 +++--
+ src/hunspell/langnum.hxx | 1 +
+ 2 files changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/src/hunspell/csutil.cxx b/src/hunspell/csutil.cxx
+index df97b57..2980da7 100644
+--- a/src/hunspell/csutil.cxx
++++ b/src/hunspell/csutil.cxx
+@@ -2401,6 +2401,7 @@ static struct lang_map lang2enc[] =
+     {{"ar", LANG_ar},    {"az", LANG_az},
+      {"az_AZ", LANG_az},  // for back-compatibility
+      {"bg", LANG_bg},    {"ca", LANG_ca},
++     {"crh", LANG_crh},
+      {"cs", LANG_cs},    {"da", LANG_da},
+      {"de", LANG_de},    {"el", LANG_el},
+      {"en", LANG_en},    {"es", LANG_es},
+@@ -2458,7 +2459,7 @@ unsigned short unicodetoupper(unsigned short c, int langnum) {
+   // In Azeri and Turkish, I and i dictinct letters:
+   // There are a dotless lower case i pair of upper `I',
+   // and an upper I with dot pair of lower `i'.
+-  if (c == 0x0069 && ((langnum == LANG_az) || (langnum == LANG_tr)))
++  if (c == 0x0069 && ((langnum == LANG_az) || (langnum == LANG_tr) || (langnum == LANG_crh)))
+     return 0x0130;
+ #ifdef OPENOFFICEORG
+   return static_cast<unsigned short>(u_toupper(c));
+@@ -2475,7 +2476,7 @@ unsigned short unicodetolower(unsigned short c, int langnum) {
+   // In Azeri and Turkish, I and i dictinct letters:
+   // There are a dotless lower case i pair of upper `I',
+   // and an upper I with dot pair of lower `i'.
+-  if (c == 0x0049 && ((langnum == LANG_az) || (langnum == LANG_tr)))
++  if (c == 0x0049 && ((langnum == LANG_az) || (langnum == LANG_tr) || (langnum == LANG_crh)))
+     return 0x0131;
+ #ifdef OPENOFFICEORG
+   return static_cast<unsigned short>(u_tolower(c));
+diff --git a/src/hunspell/langnum.hxx b/src/hunspell/langnum.hxx
+index a64d3d7..f09de40 100644
+--- a/src/hunspell/langnum.hxx
++++ b/src/hunspell/langnum.hxx
+@@ -48,6 +48,7 @@ enum {
+   LANG_az = 100,  // custom number
+   LANG_bg = 41,
+   LANG_ca = 37,
++  LANG_crh = 102, // custom number
+   LANG_cs = 42,
+   LANG_da = 45,
+   LANG_de = 49,
+-- 
+1.9.1
+
diff --git a/external/hunspell/0001-FORBIDDENWORD-precedes-BREAK.patch b/external/hunspell/0001-FORBIDDENWORD-precedes-BREAK.patch
new file mode 100644
index 000000000000..6cad45d8a8bf
--- /dev/null
+++ b/external/hunspell/0001-FORBIDDENWORD-precedes-BREAK.patch
@@ -0,0 +1,27 @@
+From 39b785a6b03b35cc8a27f43f6005dcaa432694e1 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= <nemeth@numbertext.org>
+Date: Mon, 9 Oct 2017 13:02:39 +0200
+Subject: [PATCH] FORBIDDENWORD precedes BREAK
+
+Now it's possible to forbid compound forms recognized by
+BREAK word breaking.
+---
+ src/hunspell/hunspell.cxx | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/src/hunspell/hunspell.cxx b/src/hunspell/hunspell.cxx
+index 5c98f8a..3fd0d16 100644
+--- a/src/hunspell/hunspell.cxx
++++ b/src/hunspell/hunspell.cxx
+@@ -633,7 +633,7 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
+   }
+ 
+   // recursive breaking at break points
+-  if (!wordbreak.empty()) {
++  if (!wordbreak.empty() && !(*info & SPELL_FORBIDDEN)) {
+ 
+     int nbr = 0;
+     wl = scw.size();
+-- 
+1.9.1
+
diff --git a/external/hunspell/0001-Remove-forbidden-words-from-dash-suggestion-list.patch b/external/hunspell/0001-Remove-forbidden-words-from-dash-suggestion-list.patch
new file mode 100644
index 000000000000..b0f8563371ed
--- /dev/null
+++ b/external/hunspell/0001-Remove-forbidden-words-from-dash-suggestion-list.patch
@@ -0,0 +1,29 @@
+From 0f691abe68788d0a58e72ab66877a9f670cd2741 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= <nemeth@numbertext.org>
+Date: Tue, 10 Oct 2017 11:58:43 +0200
+Subject: [PATCH] Remove forbidden words from dash suggestion list
+
+---
+ src/hunspell/hunspell.cxx | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/src/hunspell/hunspell.cxx b/src/hunspell/hunspell.cxx
+index 3fd0d16..76e61b1 100644
+--- a/src/hunspell/hunspell.cxx
++++ b/src/hunspell/hunspell.cxx
+@@ -1069,7 +1069,11 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
+             wspace.append("-");
+             wspace.append(scw.substr(dash_pos + 1));
+           }
+-          insert_sug(slst, wspace);
++          int info = 0;
++          if (pAMgr && pAMgr->get_forbiddenword())
++            checkword(wspace, &info, NULL);
++          if (!(info & SPELL_FORBIDDEN))
++            insert_sug(slst, wspace);
+         }
+         nodashsug = 0;
+       }
+-- 
+1.9.1
+
diff --git a/external/hunspell/0001-fix-compound-handling-for-new-Hungarian-orthography.patch b/external/hunspell/0001-fix-compound-handling-for-new-Hungarian-orthography.patch
new file mode 100644
index 000000000000..0bf52bdd95d4
--- /dev/null
+++ b/external/hunspell/0001-fix-compound-handling-for-new-Hungarian-orthography.patch
@@ -0,0 +1,43 @@
+From 15b2cde4f01706f0a648518a5cfc57394d015448 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= <nemeth@numbertext.org>
+Date: Thu, 12 Oct 2017 16:47:57 +0200
+Subject: [PATCH] fix compound handling for new Hungarian orthography
+
+Extend partial fix in commit 42807f970ac2d65f0d13a7c57eb454b210e92240.
+---
+ src/hunspell/affixmgr.cxx | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/src/hunspell/affixmgr.cxx b/src/hunspell/affixmgr.cxx
+index ffce7bb..ea0f0fc 100644
+--- a/src/hunspell/affixmgr.cxx
++++ b/src/hunspell/affixmgr.cxx
+@@ -1990,6 +1990,8 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
+                 std::string tmp(sfxappnd);
+                 reverseword(tmp);
+                 numsyllable -= get_syllable(tmp) + sfxextra;
++              } else {
++                numsyllable -= sfxextra;
+               }
+ 
+               // + 1 word, if syllable number of the prefix > 1 (hungarian
+@@ -2024,7 +2026,6 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
+                 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
+               wordnum++;
+             }
+-
+             // second word is acceptable, as a word with prefix or/and suffix?
+             // hungarian conventions: compounding is acceptable,
+             // when compound forms consist 2 word, otherwise
+@@ -2553,6 +2554,8 @@ int AffixMgr::compound_check_morph(const char* word,
+             std::string tmp(sfxappnd);
+             reverseword(tmp);
+             numsyllable -= get_syllable(tmp) + sfxextra;
++          } else {
++            numsyllable -= sfxextra;
+           }
+ 
+           // + 1 word, if syllable number of the prefix > 1 (hungarian
+-- 
+1.9.1
+
diff --git a/external/hunspell/0001-fix-compound-word-part-pa.patch b/external/hunspell/0001-fix-compound-word-part-pa.patch
new file mode 100644
index 000000000000..152a9ff58a14
--- /dev/null
+++ b/external/hunspell/0001-fix-compound-word-part-pa.patch
@@ -0,0 +1,26 @@
+From de3ae6844af62300e473f7b7b66a56e54153b4b9 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= <nemeth@numbertext.org>
+Date: Mon, 16 Oct 2017 23:00:23 +0200
+Subject: [PATCH] fix compound word part "pa:"
+
+(regression in morphological analysis)
+---
+ src/hunspell/affixmgr.cxx | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/src/hunspell/affixmgr.cxx b/src/hunspell/affixmgr.cxx
+index ea0f0fc..52c7fa5 100644
+--- a/src/hunspell/affixmgr.cxx
++++ b/src/hunspell/affixmgr.cxx
+@@ -2608,7 +2608,7 @@ int AffixMgr::compound_check_morph(const char* word,
+           if (!m.empty()) {
+             result.push_back(MSEP_FLD);
+             result.append(MORPH_PART);
+-            result.append(word + 1);
++            result.append(word + i);
+             line_uniq_app(m, MSEP_REC);
+             result.append(m);
+           }
+-- 
+1.9.1
+
diff --git a/external/hunspell/UnpackedTarball_hunspell.mk b/external/hunspell/UnpackedTarball_hunspell.mk
index 3bb7e5e42dc7..23d3aca47131 100644
--- a/external/hunspell/UnpackedTarball_hunspell.mk
+++ b/external/hunspell/UnpackedTarball_hunspell.mk
@@ -21,6 +21,12 @@ $(eval $(call gb_UnpackedTarball_set_patchlevel,hunspell,1))
 
 $(eval $(call gb_UnpackedTarball_add_patches,hunspell, \
 	external/hunspell/0001-Revert-Remove-autotools-autogenerated-files.patch \
+	external/hunspell/0001-Extend-dotless-i-and-dotted-I-rules-to-Crimean-Tatar.patch \
+	external/hunspell/0001-Allow-dotted-I-in-dictionary-and-disable-bad-capital.patch \
+	external/hunspell/0001-FORBIDDENWORD-precedes-BREAK.patch \
+	external/hunspell/0001-Remove-forbidden-words-from-dash-suggestion-list.patch \
+	external/hunspell/0001-fix-compound-handling-for-new-Hungarian-orthography.patch \
+	external/hunspell/0001-fix-compound-word-part-pa.patch \
 ))
 
 # vim: set noet sw=4 ts=4:
author	László Németh <nemeth@numbertext.org>	2017-11-02 09:51:36 +0100
committer	Andras Timar <andras.timar@collabora.com>	2017-11-06 17:50:04 +0100
commit	f037207675010fdff2c1968a67fae5b0c2c34331 (patch)
tree	da0e0d233368adc67063e024a72b2391db8a3d03 /external
parent	a7cd63df37144eba8544f6b10b83737fa0496461 (diff)