diff options
Diffstat (limited to 'libtextcat')
78 files changed, 32256 insertions, 0 deletions
diff --git a/libtextcat/data/new_fingerprints/LICENSE b/libtextcat/data/new_fingerprints/LICENSE new file mode 100644 index 000000000000..6d883704c525 --- /dev/null +++ b/libtextcat/data/new_fingerprints/LICENSE @@ -0,0 +1,30 @@ +Copyright (c) 2003, WiseGuys Internet B.V. + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +- Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +- Neither the name of the WiseGuys Internet B.V. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/libtextcat/data/new_fingerprints/fpdb.conf b/libtextcat/data/new_fingerprints/fpdb.conf new file mode 100644 index 000000000000..df56f9e270ef --- /dev/null +++ b/libtextcat/data/new_fingerprints/fpdb.conf @@ -0,0 +1,85 @@ +# +# A sample config file for the language models +# provided with Gertjan van Noords language guesser +# (http://odur.let.rug.nl/~vannoord/TextCat/) +# +# Notes: +# - You may consider eliminating a couple of small languages from this +# list because they cause false positives with big languages and are +# bad for performance. (Do you really want to recognize Drents?) +# - Putting the most probable languages at the top of the list +# improves performance, because this will raise the threshold for +# likely candidates more quickly. +# + +# this file have been modified (to OOo by Jocelyn MERAND joc.mer@gmail.com) to include country and encoding +# guess strings are made as following : language-country-encoding + +afrikaans.lm af--utf8 +albanian.lm sq--utf8 +amharic_utf.lm am--utf8 +arabic.lm ar--utf8 +basque.lm eu--utf8 +belarus.lm be--utf8 +bosnian.lm bs--utf8 +breton.lm br--utf8 +catalan.lm ca--utf8 +chinese_simplified.lm zh-CN-utf8 +chinese_traditional.lm zh-TW-utf8 +croatian.lm hr--utf8 +czech.lm cs--utf8 +danish.lm da--utf8 +dutch.lm nl--utf8 +english.lm en--utf8 +esperanto.lm eo--utf8 +estonian.lm et--utf8 +finnish.lm fi--utf8 +french.lm fr--utf8 +frisian.lm fy--utf8 +georgian.lm ka--utf8 +german.lm de--utf8 +greek.lm el--utf8 +hebrew.lm he--utf8 +hindi.lm hi--utf8 +hungarian.lm hu--utf8 +icelandic.lm is--utf8 +indonesian.lm id--utf8 +irish_gaelic.lm ga--utf8 +italian.lm it--utf8 +japanese.lm ja--utf8 +korean.lm ko--utf8 +latin.lm la--utf8 +latvian.lm lv--utf8 +lithuanian.lm lt--utf8 +luxembourgish.lm lb--utf8 +malay.lm ms--utf8 +manx_gaelic.lm gv--utf8 +marathi.lm mr--utf8 +mongolian_cyrillic.lm mn--utf8 +nepali.lm ne--utf8 +norwegian.lm nb--utf8 # Norwegian (Bokmal) +persian.lm fa--utf8 # Farsi +polish.lm pl--utf8 +portuguese.lm pt-PT-utf8 +quechua.lm qu--utf8 +romanian.lm ro--utf8 +romansh.lm rm--utf8 +russian.lm ru--utf8 +sanskrit.lm sa--utf8 +scots.lm sco--utf8 +scots_gaelic.lm gd--utf8 +serbian_ascii.lm sh-YU-utf8 +slovak_ascii.lm sk-SK-utf8 +slovenian.lm sl--utf8 +spanish.lm es--utf8 +swahili.lm sw--utf8 +swedish.lm sv--utf8 +tagalog.lm tl--utf8 +tamil.lm ta--utf8 +thai.lm th--utf8 +turkish.lm tr--utf8 +ukrainian.lm uk--utf8 +vietnamese.lm vi--utf8 +welsh.lm cy--utf8 +yiddish_utf.lm yi--utf8 +zulu.lm zu--utf8 diff --git a/libtextcat/data/new_fingerprints/lm/afrikaans.lm b/libtextcat/data/new_fingerprints/lm/afrikaans.lm new file mode 100644 index 000000000000..c110f154b664 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/afrikaans.lm @@ -0,0 +1,400 @@ +_ 23602 +e 8036 +a 4087 +n 3782 +i 3726 +o 3314 +r 2951 +s 2885 +t 2749 +d 2479 +e_ 2118 +l 1854 +k 1741 +ie 1670 +g 1601 +n_ 1447 +m 1440 +_d 1219 +t_ 1143 +er 1124 +h 1124 +u 1110 +ie_ 1079 +y 1048 +w 986 +s_ 982 +_s 969 +_h 956 +di 924 +an 922 +r_ 912 +aa 882 +v 876 +en 807 +_di 807 +. 790 +y_ 747 +_v 709 +et 706 +._ 694 +die 691 +die_ 667 +_n 666 +_die 651 +p 639 +_m 634 +_die_ 633 +_w 632 +ee 607 +ge 606 +_o 598 +b 586 +te 568 +, 560 +in 555 +k_ 550 +_e 550 +,_ 548 +oo 516 +et_ 511 +de 509 +el 489 +_g 486 +f 461 +ar 451 +ni 450 +nd 442 +an_ 440 +en_ 437 +_i 426 +he 423 +g_ 418 +_t 412 +oe 410 +at 406 +er_ 400 +om 381 +wa 378 +_a 378 +_b 377 +_k 371 +nie 371 +_he 370 +aar 355 +_ge 351 +es 351 +_ni 348 +da 346 +m_ 342 +ou 338 +it 335 +_nie 335 +d_ 332 +l_ 330 +_wa 329 +or 327 +le 326 +we 326 +ek 324 +het 321 +me 319 +_het 319 +is 318 +j 315 +at_ 311 +on 309 +se 308 +_en 298 +ma 294 +st 291 +as 280 +va 277 +_en_ 270 +re 270 +" 269 +' 265 +het_ 261 +_het_ 260 +om_ 254 +al 252 +ar_ 250 +li 248 +te_ 247 +aar_ 247 +_da 245 +u_ 242 +nde 241 +ou_ 237 +_l 231 +be 229 +_' 226 +rd 224 +_va 224 +ig 223 +ng 222 +ns 221 +ve 220 +it_ 218 +_j 216 +_me 216 +sy 215 +ke 213 +_sy 212 +aan 212 +van 212 +_in 210 +is_ 210 +in_ 208 +sy_ 206 +_sy_ 206 +'n 205 +ro 205 +ko 204 +_'n 203 +ra 203 +'n_ 203 +_'n_ 202 +so 202 +D 202 +ho 201 +rs 200 +eer 200 +ik 199 +la 198 +_te 196 +_van 196 +_ma 195 +as_ 194 +ui 194 +ver 192 +e. 192 +der 191 +to 188 +op 187 +van_ 184 +ag 184 +_ve 182 +and 180 +_van_ 178 +ha 178 +f_ 176 +ka 176 +ne 175 +_is 175 +sk 174 +e._ 174 +oor 174 +_ver 170 +ek_ 170 +_hy 170 +hy 170 +p_ 168 +_be 168 +ri 168 +ur 167 +nie_ 165 +_so 165 +_D 164 +si 164 +ll 164 +no 164 +_in_ 163 +_hy_ 162 +hy_ 162 +ed 161 +ers 160 +_r 156 +ak 156 +_ho 155 +_nie_ 153 +eg 153 +nt 152 +de_ 152 +_p 151 +_we 148 +_is_ 148 +ei 147 +es_ 142 +maa 142 +wee 142 +na 141 +nder 139 +a_ 138 +ing 138 +ew 138 +S 135 +lle 135 +_om 135 +_te_ 134 +eu 134 +ie. 134 +wo 132 +em 132 +wat 131 +_no 130 +_" 130 +vo 130 +E 129 +H 128 +_wat 127 +ti 126 +mo 126 +A 126 +e, 126 +_ha 125 +vi 125 +el_ 125 +ter 125 +e,_ 124 +dat 124 +eer_ 124 +wat_ 124 +le_ 124 +ta 124 +Di 123 +dat_ 123 +_wat_ 122 +ie._ 122 +was 121 +ste 121 +_H 121 +_se 121 +se_ 120 +ul 120 +al_ 120 +_was 120 +_om_ 119 +_st 119 +lik 118 +"_ 118 +_ko 118 +_maa 118 +lo 117 +_to 117 +ns_ 115 +aan_ 115 +nie. 114 +_vi 114 +met 114 +_nie. 111 +nk 110 +_Di 110 +- 110 +_op 109 +_oo 109 +_on 108 +ir 108 +ord 108 +uit 106 +ens 105 +_was_ 105 +was_ 105 +een 105 +_met 105 +os 105 +_S 104 +nie._ 104 +ig_ 103 +_sk 102 +op_ 101 +_ek 101 +_wee 101 +ir_ 101 +met_ 100 +_met_ 100 +rt 100 +ik_ 99 +end 99 +nd_ 99 +gt 99 +ond 98 +ot 98 +_aa 97 +og 97 +vir_ 95 +vir 95 +_ka 94 +hu 94 +_mo 94 +_vir_ 94 +_vir 94 +_dit 93 +kr 93 +am 93 +ol 93 +dit 93 +_ek_ 93 +ki 93 +sa 93 +_aan 92 +man 92 +jy 92 +ng_ 92 +aak 92 +lle_ 91 +_hu 91 +_na 91 +_vo 90 +ewe 90 +of 90 +jy_ 90 +_dit_ 90 +dit_ 90 +_jy 89 +der_ 89 +jo 89 +_f 88 +_u 88 +sie 87 +_dat 87 +_jy_ 87 +daa 87 +do 87 +vr 87 +wi 86 +ry 86 +_dat_ 86 +eur 86 +rs_ 85 +_jo 85 +_wo 84 +_ne 84 +jie 84 +ji 84 +pe 83 +moe 83 +my 82 +ull 82 +Die 81 +maar 81 +_hom 81 +ulle 81 +_maar 81 +hom 81 +_uit 80 +_ui 80 +ges 80 +raa 80 +or_ 80 +ies 80 +jou 79 +_la 79 +maar_ 79 +ulle_ 79 +_daa 79 +Die_ 79 +daar 78 +_daar 78 +ien 78 +_my 78 +_jou 78 +ok 78 +il 78 +lik_ 77 +sta 77 +_Die 77 +ur_ 77 +ga 77 +ag_ 77 +kan 77 diff --git a/libtextcat/data/new_fingerprints/lm/albanian.lm b/libtextcat/data/new_fingerprints/lm/albanian.lm new file mode 100644 index 000000000000..0665a962d018 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/albanian.lm @@ -0,0 +1,400 @@ +_ 19480 +ë 4099 +e 4082 +t 3635 +i 3134 +a 2893 +r 2820 +n 2610 +s 2380 +h 2060 +ë_ 2055 +e_ 1825 +j 1677 +u 1489 +d 1381 +o 1370 +m 1318 +k 1264 +të 1091 +p 1072 +_t 1068 +sh 998 +l 936 +_n 876 +a_ 822 +, 816 +,_ 808 +të_ 795 +i_ 770 +_p 739 +_m 702 +_s 700 +te 653 +ër 620 +_d 613 +_e 607 +g 602 +_k 601 +_të 593 +. 575 +_të_ 574 +v 567 +_e_ 554 +r_ 525 +._ 523 +ht 503 +n_ 480 +he 473 +në 462 +sht 461 +te_ 457 +q 454 +nd 436 +ri 432 +is 414 +et 403 +b 402 +je 401 +me 395 +in 391 +it 381 +rë 374 +_a 374 +t_ 359 +ur 353 +_i 346 +ar 342 +ës 339 +er 338 +në_ 338 +ën 338 +dh 337 +en 336 +pë 334 +f 328 +_v 323 +jë 318 +nj 313 +ish 312 +për 294 +y 285 +z 282 +es 281 +at 274 +_me 273 +_q 273 +gj 269 +ra 261 +as 258 +_në 256 +ku 256 +j_ 250 +ta 249 +re 246 +një 245 +o_ 243 +ni 243 +_pë 240 +hte 240 +_nj 239 +on 239 +isht 236 +pa 234 +th 233 +shte 233 +_për 232 +se 228 +_g 223 +ve 221 +in_ 220 +s_ 219 +_në_ 219 +do 218 +hte_ 218 +më 216 +ti 215 +aj 212 +shte_ 212 +ej 212 +u_ 211 +që 211 +_sh 210 +nt 207 +jë_ 206 +_b 205 +_një 203 +di 202 +_pa 201 +_i_ 201 +ll 199 +_f 199 +kë 198 +me_ 197 +dhe 195 +ishte 195 +si 194 +hi 191 +he_ 188 +- 187 +ja 187 +_që 187 +ua 186 +il 184 +_dh 184 +ur_ 183 +ër_ 182 +or 180 +se_ 179 +që_ 178 +S 176 +ç 175 +_h 173 +an 172 +një_ 172 +ng 170 +nte 170 +_që_ 169 +_S 169 +rë_ 166 +dhe_ 165 +_me_ 164 +ka 162 +im 159 +hë 158 +mi 157 +to 156 +tu 156 +ën_ 155 +_një_ 154 +ha 153 +nte_ 150 +tr 148 +sa 148 +ët 148 +_gj 148 +un 147 +rr 147 +ë, 147 +_dhe 147 +ej_ 147 +ki 146 +ë,_ 146 +_ku 145 +_- 144 +_ng 142 +ik 141 +_nd 140 +end 138 +uk 137 +etë 135 +ko 135 +_dhe_ 135 +_ve 132 +va 131 +_l 131 +për_ 131 +shi 131 +erë 129 +ke 127 +kis 127 +së 126 +jo 125 +li 124 +ga 124 +kish 123 +_ki 122 +po 122 +_se 122 +' 121 +du 120 +mb 120 +_më 119 +Si 115 +më_ 115 +esh 115 +_si 114 +qe 114 +lë 114 +_kis 113 +oh 113 +_kish 113 +_Si 113 +pr 112 +_u 112 +uar 111 +de 111 +hu 111 +_th 111 +al 111 +ta_ 109 +ilv 108 +Sil 108 +Silv 108 +lv 108 +k_ 108 +e, 108 +ji 107 +e,_ 106 +_Sil 106 +_Silv 106 +_r 105 +os 104 +_se_ 104 +kisht 102 +_di 102 +st 101 +_për_ 101 +bë 101 +tj 100 +_nga 99 +nga 99 +_du 98 +ra_ 98 +vë 98 +gji 98 +_ish 96 +rt 96 +_is 96 +ro 95 +ir 94 +ga_ 94 +ësh 94 +ont 93 +c 93 +t, 93 +t,_ 93 +hin 92 +a, 92 +_at 92 +und 92 +jt 91 +_mb 91 +a,_ 91 +tje 90 +_nga_ 90 +_do 90 +_pr 90 +rit 90 +men 90 +nga_ 90 +ri_ 89 +N 89 +ma 89 +it_ 88 +_kë 88 +-_ 88 +m_ 87 +jo_ 87 +onte 87 +atë 87 +la 87 +ëri 87 +ilva 86 +shin 86 +ë. 86 +Silva 86 +lva 86 +së_ 85 +jer 85 +et_ 85 +_po 85 +ës_ 84 +kur 84 +ru 84 +nin 83 +ot 83 +hin_ 83 +_N 83 +her 83 +htë 82 +ap 82 +shin_ 82 +mo 81 +ash 81 +tha 81 +_ç 81 +ë._ 81 +ëm 81 +jit 80 +_ta 80 +ul 80 +le 80 +ho 80 +_z 79 +dr 78 +jet 78 +nin_ 78 +_më_ 78 +gjit 78 +A 78 +hk 78 +onte_ 78 +oni 77 +lo 77 +ba 77 +herë 77 +ndo 76 +shk 76 +mend 75 +_vë 75 +ha_ 75 +dë 75 +tur 74 +_A 74 +el 74 +bi 74 +_ko 74 +uk_ 73 +erë_ 73 +si_ 73 +_sa 73 +ar_ 72 +P 72 +rs 72 +pas 72 +ith 72 +uar_ 71 +_isht 71 +ai 70 +e. 70 +_vet 70 +vet 70 +_bë 70 +zi 70 +d_ 70 +jith 70 +da 70 +gjith 69 +duk 69 +na 69 +hej 69 +tër 68 +_men 68 +_ka 68 +am 68 +nd_ 68 +_c 67 +_pas 67 +_duk 67 +jes 67 +ak 67 +s, 67 +e._ 67 +s,_ 67 +K 67 +ësht 67 +mu 66 +kur_ 66 +yr 66 +em 65 +_së 65 +tha_ 65 +imi 65 +ie 65 +hej_ 64 +_së_ 64 +_u_ 64 +? 64 +fu 64 +_P 64 diff --git a/libtextcat/data/new_fingerprints/lm/amharic_utf.lm b/libtextcat/data/new_fingerprints/lm/amharic_utf.lm new file mode 100644 index 000000000000..0c5bc813e663 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/amharic_utf.lm @@ -0,0 +1,400 @@ +á 21403 +_ 10092 +ˆ 7734 +ሠ6558 +_á 5003 +‹ 4717 +‰ 4401 +በ4274 +á‹ 4176 +Š 4054 +አ3868 + 2728 +Œ 1656 +ጠ1591 +µ 1579 + 1425 + á 1402 +_አ1261 +_á‹ 1231 +¨ 1217 +á 1187 +¨á 1183 +_ሠ1160 +• 1145 +ˆá 1123 + 1097 +ን 1043 +Š• 1043 +° 1041 +°á 1004 +_በ991 +á 936 +« 880 +‹ 855 +á‹ 855 +¥ 849 +µá 805 +‰µ 783 +ት 783 +µ_ 763 +«á 709 +¥á 704 +‰ 682 +በ682 +á 679 +˜ 670 +•á 667 +በá 666 +‰ á 666 +˜á 658 + 643 +የ 637 +‹¨ 637 +‹¨á 627 +የá 627 +ለ 614 +ˆˆ 614 +ˆ 611 +ሠ611 +_ 588 +‰µ_ 583 +ት_ 583 +_የ 577 +_የá 574 +ለá 573 +ˆˆá 573 +ንá 570 +Š•á 570 +መ 563 +ˆ˜ 563 +ˆ˜á 557 +መá 557 +Š 554 +አ554 +አá 553 +ተ 553 +Š á 553 +‰° 553 +ተá 547 +‰°á 547 +ሠ534 +ˆ 534 + ሠ532 +- 531 +ስ 525 +ˆµ 525 +-- 521 +ሠ515 +ˆ 515 +--- 512 +---- 503 +_በ499 +----- 494 +_በá 487 +‹á 479 +¨áˆ 477 +•_ 473 +‹á 469 +á‹á 469 +ን_ 468 +Š•_ 468 +¢ 465 +³ 464 +á‹« 457 +‹« 457 +°áˆ 444 +_አ424 +_አá 424 +ስá 423 +ˆµá 423 +_ 415 +³á 402 +½ 401 + 390 +ˆáˆ 389 +› 382 +‹_ 378 +á‹_ 378 +á 365 +‹«á 364 +á‹«á 364 +“ 363 +áˆá 357 +ˆá 357 +£ 356 +¢_ 351 +‰½ 347 +ች 347 +Š 341 +አ341 +á‹ 337 +¢ 337 +ᢠ337 +‹ 337 +¢_ 337 +á¢_ 337 +á 336 +… 334 +Š¥ 320 +እ 320 +£á 320 +ሠ320 +Š¥á 318 +እá 318 +ˆ_ 314 +áˆ_ 314 +•á‹ 312 +›á 311 +¨á‰ 301 +ና 300 +ገ 300 +Š“ 300 +Œˆ 300 + በ299 +˜áˆ 297 +áˆá 294 +Œ 294 +ጠ294 +ˆá 294 +š 293 +ˆá 291 +ከ 291 +Œˆá 291 +Šá 291 +Š¨ 291 +áˆá 291 +ገá 291 +áŠá 291 +á‹° 288 +‹° 288 +_እ 285 +_እá 283 +® 279 +Š¨á 279 +ከá 279 +‰ ሠ279 +በሠ279 +ንዠ276 +šá 276 +Š•á‹ 276 +_ጠ272 +‰¥ 270 +ብ 270 +_ 269 +áŒá 264 +Œá 264 +ˆ˜áˆ 262 +¥áˆ 262 +መሠ262 +¥áŠ 262 +á‹á 261 +‹á 261 +ˆ› 260 +ማ 260 +á‹°á 259 +‹°á 259 +ራ 254 +‰£ 254 +ባ 254 +ˆ« 254 +€ 253 +®á 249 +á 247 +€á 245 +µáˆ 244 +ላ 242 +ˆ‹ 242 +የሠ242 +‹¨áˆ 242 +ማá 238 +ˆ›á 238 + አ237 +ረ 237 +ˆ¨ 237 +‰°áˆ 236 +ተሠ236 +áˆ_ 235 +ˆ_ 235 +ˆá‹ 234 +ሠ233 +«áˆ 233 +‰£á 230 +ˆš 230 +ባá 230 +ሚ 230 +ድ 228 +‹µ 228 +_መ 227 +በ227 +_መá 226 +ˆ¨á 225 +ረá 225 +² 225 +ᢠ222 +á¢_ 222 + á‹ 216 +እአ214 +ሚá 214 +Š¥áŠ 214 +ˆšá 214 +…á 213 +²á 212 +«á‹ 210 +ˆ‹á 209 +ላá 209 +© 208 +ˆá‰ 207 +‹áˆ 206 +½_ 206 +‰¥á 205 +ብá 205 +አ202 +ˆ° 200 +ታ 200 +‰³ 200 +ሰ 200 +ˆ«á 199 +ራá 199 +ሰá 198 +ˆ°á 198 +ትá 195 +ወ 195 +‰µá 195 +‹ˆ 195 +‹ˆá 194 +ወá 194 +½á 191 +Ž 191 +‰½_ 189 +ች_ 189 +“á 188 +¸á 186 +¸ 186 +_ለ 184 +_ለá 183 +ለሠ183 +ˆˆáˆ 183 +¥áŠ• 180 +‹¨á‰ 179 +የበ179 +Žá 178 +± 177 +¥áŠ•á 177 +_ከ 175 +Œ¥ 174 +ጥ 174 +Š ሠ172 +አሠ172 +_ከá 170 +† 170 +«_ 169 +Š¥áŠ• 169 +†á 166 +°á‹ 166 +áˆá 165 +“_ 165 +ˆá 165 +ና_ 163 +Š“_ 163 +‰¸ 160 +ቸ 160 +ቸá 160 +‰¸á 160 +ˆáŠ 160 +¨áŠ 159 +¸á‹ 159 +‰€ 158 +ቀ 158 +በ158 +£áˆ 156 +ቀá 155 +‰€á 155 +ችá 154 +‰¸á‹ 154 +áˆ_ 154 +‰½á 154 +¸á‹ 154 +ቸዠ154 +ˆ_ 154 +µá‰ 152 +³á‹ 151 +‰¸á‹ 151 +Šá 150 +_አ150 +¶ 150 +_áŠá 150 +ƒ 150 +_á‹á 150 +_á‹ 150 +‰³á 149 +ታá 149 +ˆá¢ 148 +‰ በ147 +ደሠ147 +‹°áˆ 147 +በበ147 + 146 +_ተ 146 +_ተá 146 +ለዠ145 +ˆˆá‹ 145 +ድá 144 +µáŠ 144 +›áˆ 144 +‹µá 144 +ቅ 143 +‰… 143 +¥_ 143 +áŒáˆ 142 +Œáˆ 142 +‹³ 141 +ዳ 141 +Žá‰ 139 +ህ 138 +ˆ… 138 +Š“á 137 +ናá 137 +«á‰ 137 +ን 136 +ጠ136 +አዠ135 +Š á‹ 135 +†áŠ 135 +ስበ134 +ˆµá‰ 134 +ጠ133 +ዳá 133 +Œ 133 +ሆá 133 +ˆ†á 133 +ሆ 133 +‹³á 133 +ˆ† 133 +ሆአ132 +ˆ†áŠ 132 +¨á‰° 131 +ያሠ131 +¨á‰°á 131 +‹«áˆ 131 +á‹ 129 +µá‹ 128 +Š 128 +አ128 +á 128 + 128 +°á‰ 127 +Œ á 127 +ጠá 127 +Š« 126 +ካ 126 +á 124 +በአ123 +°áŒ 123 +á‹«á‹ 123 +‹«á‹ 123 +‰ አ123 +Š¨áˆ 122 +ከሠ122 +Œˆáˆ 121 +ገሠ121 diff --git a/libtextcat/data/new_fingerprints/lm/arabic.lm b/libtextcat/data/new_fingerprints/lm/arabic.lm new file mode 100644 index 000000000000..85f701965e2e --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/arabic.lm @@ -0,0 +1,400 @@ +_ +ا +Ù„ +Ùˆ +ال +_ا +ÙŠ +Ù† +Ù… +_ال +ر +ب +. +ت +د +ع +Ù‡ +_Ùˆ +Ù†_ +Ù +ا_ +Ùƒ +ج +.. +Ø© +Ø +Ø£ +س +_Ù… +._ +Ù‚ +Ø©_ +Ù‡_ +لا +Ù’ +_Ø£ +ان +_Ù +Ù +_ب +ÙŽ +لم +د_ +ول +ÙŠ_ +Ù‰ +Ù‰_ +... +وج +_Ù„ +_ع +Ù„_ +وا +جو +Ù’. +ص +الم +_الم +..._ +.._ +Ø« +ود +Ø° +Ø´ +من +وجو +ÙŽ_ +ÙÙŠ +لا_ +جود +ر_ +لى_ +لى +ان_ +وجود +لو +Ù…_ +_ت +Ù +_من +Ù’... +_وا +لع +الو +عل +Ù’..._ +Ù’.. +ين +الع +_ÙÙŠ +ز +ات +_ÙŠ +_الع +Ù_ +_Ùƒ +_الو +من_ +_ان +مر +Ø¡ +ÙÙŠ_ +يا +ب_ +را +ØŒ_ +Ù_ +ØŒ +ض +_ÙÙŠ_ +تب +_من_ +لوج +كا +لي +ت_ +لوجو +Ù‘ +ون +الوج +اء +جود_ +Ø£Ø +_Ø£Ø +الوجو +له +ود_ +ها +Øا +ذا +_ر +على_ +وجود_ +على +رب +لوجود +عر +_ان_ +او +اول +Ø· +رت +لت +بْ +Ø£Øا +_الوج +Ø£Øاو +با +وال +_ول +اد +_وال +Øاول +_Ø£Øاو +_Ø£Øا +Ø£Øاول +_ØŒ_ +Øاو +_ØŒ +ني +بي +_عل +لن +ته +ما +-_ +- +مرتب +نا +_. +ها_ +مرت +_._ +_- +_-_ +بة +ول_ +_Ø +رتب +دا +له_ +Ø¡_ +Ùƒ_ +قي +تبة +اول_ +مرتبة +ية +بل +ور +ده +الت +Ø® +رتبة +الا +رتبة_ +ين_ +عرب +ير +بة_ +تبة_ +قد +ربْ +لعربْ +لعر +العر +أن +لك +Øد +ون_ +لعرب +_على_ +_العر +ت٠+عن +بْ. +_لا +Øاول_ +ذات +العرب +_على +ية_ +عربْ +Ø¥ +اب +ئ +سا +نو +كو +المر +لل +يت +_Ø´ +لم_ +_المر +اع +مو +لمر +_الا +ته_ +اج +Ù +_Ù‚ +س_ +ائ +جب +ام +اجب_ +كون +واجب_ +Ù„ÙŽ +_لا_ +اني +سي +واج +سم +Ù„ÙŽ_ +يس +ال_ +_ولا +عي +وص +عا +جب_ +اس +ير_ +_مر +واجب +اجب +_بل +الن +ولا +_بال +وأ +أع +اك +وق +بلاد +نت +Ù†Ù +ضا +نه +كون_ +بْ.. +ثل +كل +ولا_ +_ذا +ذاته +المرت +دة +ذاته_ +ور_ +بال +بْ... +_ولا_ +_الت +يه +_الل +_س +اء_ +ات_ +بلا +_وأ +_Ø° +صو +ربْ. +_بلاد +لاد +_بلا +غ +لمرتب +_Ù‡ +بن +لمرت +عربْ. +_Ù† +_ذات +اته_ +لله +Ù’._ +_با +اته +_Ø¥ +وم +الل +الوا +موج +_الله +نْ +Ù„Ù +ا٠+_يكو +لر +قا +عين +ست +يكون +موجو +ليس +ده_ +Ù„Ù_ +_وج +_وص +دي +ØÙ… +الواج +بين +_الر +_يك +مس +Ù…Ù +لله_ +Ù_ +عد +يل +_الن +عق +اش +يكو +يق +الر +تÙ_ +_كا +شي +_يكون +لوا +ار +موجود +يك +هْ +_ذاته +ع_ +جا +الله +ÙÙˆ +وب +_عي +رس +دة_ +لواجب +يكون_ +لواج +رك +Ù_ +كان +لص +لش +لث +زا +ياء +ساء +لعق +انت +علم +العق +ما_ +قد_ +Ù„Ù +الله_ diff --git a/libtextcat/data/new_fingerprints/lm/armenian.lm b/libtextcat/data/new_fingerprints/lm/armenian.lm new file mode 100644 index 000000000000..e69de29bb2d1 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/armenian.lm diff --git a/libtextcat/data/new_fingerprints/lm/basque.lm b/libtextcat/data/new_fingerprints/lm/basque.lm new file mode 100644 index 000000000000..32522a7eca20 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/basque.lm @@ -0,0 +1,400 @@ +_ 12114 +a 5537 +e 4040 +i 3103 +t 2567 +n 2474 +r 2406 +o 1937 +k 1761 +z 1627 +u 1561 +d 1367 +l 1075 +s 1048 +b 995 +n_ 958 +a_ 957 +en 802 +g 772 +_e 736 +er 680 +ra 669 +. 625 +at 610 +_b 594 +ar 589 +an 585 +_d 580 +, 569 +,_ 569 +h 563 +ta 563 +te 555 +._ 520 +ko 513 +m 503 +o_ 486 +al 481 +en_ 471 +tz 458 +re 435 +_a 423 +ik 416 +in 409 +it 392 +za 391 +ak 388 +ba 358 +et 357 +da 352 +di 351 +ka 350 +ze 349 +ai 347 +p 340 +ri 335 +la 329 +an_ 327 +de 322 +ez 318 +na 314 +ti 307 +be 305 +i_ 302 +iz 294 +k_ 293 +tu 289 +eta 287 +( 280 +) 280 +_i 279 +or 276 +_( 270 +ko_ 269 +ek 265 +rr 264 +_h 263 +e_ 260 +es 260 +_ba 258 +nt 258 +ha 255 +_g 247 +era 246 +ia 240 +_be 234 +oa 229 +un 229 +ta_ 218 +ma 212 +on 211 +z_ 209 +du 207 +(_ 205 +_(_ 205 +: 204 +tze 204 +:_ 204 +us 191 +u_ 191 +_da 188 +ren 183 +az 183 +_) 183 +ur 182 +st 182 +ea 181 +_et 181 +eta_ 179 +zi 178 +si 177 +zen 177 +as 176 +_n 175 +go 175 +_eta 172 +il 169 +tik 169 +_z 167 +_eta_ 167 +ne 167 +bi 166 +zk 166 +sk 165 +ag 163 +t_ 162 +le 160 +rt 160 +ke 158 +- 150 +gu 148 +au 148 +tza 147 +_de 147 +ati 146 +ut 146 +A 145 +li 145 +hi 145 +E 144 +bat 144 +_o 143 +a, 142 +a,_ 142 +_m 141 +ntz 141 +ad 140 +ren_ 139 +_er 138 +io 138 +)_ 138 +_ez 138 +ir 137 +eh 137 +ab 135 +ate 135 +ak_ 134 +sa 134 +ra_ 134 +_bat 133 +nd 133 +_l 133 +ki 132 +ere 131 +ika 125 +ku 125 +f 124 +ga 123 +ld 122 +_di 121 +arr 120 +el 120 +eg 119 +uz 119 +are 116 +ng 116 +os 116 +_E 115 +gi 113 +_p 113 +_du 113 +mo 113 +zen_ 112 +ib 111 +_k 110 +ber 109 +ako 108 +_iz 107 +iza 107 +ala 107 +itz 107 +har 106 +eko 106 +adi 105 +l_ 105 +ie 105 +ste 105 +atu 104 +am 104 +ska 104 +ah 104 +_. 104 +r_ 104 +zt 103 +rri 103 +ait 103 +ua 103 +tzen 103 +ald 103 +usk 102 +aren 102 +_._ 102 +ro 102 +id 100 +la_ 99 +_s 98 +ue 98 +tek 97 +uska 97 +atz 96 +aren_ 96 +_ha 95 +rre 95 +" 94 +atik 94 +tzen_ 94 +ara 94 +in_ 93 +ni 93 +j 92 +ge 91 +ez_ 91 +ain 91 +od 91 +no 91 +na_ 91 +ri_ 90 +tan 90 +mat 90 +do 89 +_mo 88 +ho 88 +iko 88 +beh 88 +_ad 88 +al_ 87 +_iza 87 +I 87 +zan 87 +_ze 87 +_adi 87 +_A 86 +uk 86 +eha 85 +dit 85 +ru 85 +pe 85 +eko_ 84 +ela 84 +ed 84 +kar 84 +n,_ 83 +ari 83 +bil 83 +_beh 83 +bai 83 +tu_ 83 +n, 83 +ehar 83 +_izan 82 +itu 82 +_ho 82 +is 82 +ean 82 +ama 82 +izan 82 +_)_ 81 +ik_ 81 +B 80 +koa 80 +ot 80 +_na 79 +zu 79 +beha 78 +behar 78 +_t 78 +H 78 +_ko 78 +dir 77 +mati 76 +_j 75 +at_ 75 +em 74 +tika 74 +K 74 +_B 74 +zte 74 +ten 74 +_beha 74 +ter 74 +matik 73 +egi 73 +_dir 73 +a. 72 +ago 72 +kal 72 +ram 72 +ena 72 +unt 71 +lt 71 +_H 71 +su 71 +mod 71 +bo 71 +G 70 +_ber 70 +lde 70 +c 69 +te_ 69 +ar_ 69 +me 69 +ina 69 +dal 69 +ako_ 68 +L 68 +rik 68 +a._ 68 +ori 68 +ite 68 +raz 67 +alde 67 +)._ 67 +). 67 +zio 66 +_ez_ 66 +tan_ 66 +amat 66 +atika 66 +est 66 +ntza 65 +dut 65 +izk 65 +_ga 65 +ia_ 65 +s_ 65 +ita 65 +tea 64 +ei 64 +hal 64 +_mod 64 +pa 64 +bu 64 +re_ 63 +eu 63 +ert 63 +oa_ 63 +_era 63 +ten_ 63 +_G 63 +_eg 63 +rama 63 +odal 63 +ramat 63 +oda 63 +amati 62 +atze 62 +gr 62 +untz 62 +eza 62 +gra 62 +_ge 61 +gram 61 +kat 61 +abi 61 +_gr 60 +_gram 60 +_gra 60 +int 60 +rd 60 +_in 60 +k, 59 +_hi 59 +zko 59 +k,_ 59 +pr 59 +rab 59 +da_ 59 +ide 58 +_)._ 57 +tat 57 +ing 57 +ira 57 +tak 57 +x 57 +_). 57 +_ed 57 +_es 57 +_bi 57 +rak 56 +_- 56 diff --git a/libtextcat/data/new_fingerprints/lm/belarus.lm b/libtextcat/data/new_fingerprints/lm/belarus.lm new file mode 100644 index 000000000000..7d58602e51c3 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/belarus.lm @@ -0,0 +1,400 @@ +_ +а +н +Ñ€ +е +i +к +Ñ‹ +л +у +Ñ +Ñ +Ñ‚ +д +а_ +о +м +в +ц +з +. +на +г +п +ка +._ +ра +ч +i_ +Ñ +ал +у_ +ар +ан +Ñ_ +та +, +б +,_ +ва +Ñ‹_ +ш +_Ñ +ла +ÑŒ +й +_д +ле +га +_а +ад +_п +ры +да +_у +ен +Ñ‚Ñ‹ +й_ +е_ +_г +ны +_н +на_ +зе +_в +- +_б +Ð°Ñ +ам +_i +ав +ро +аг +_у_ +_з +па +нн +Ñк +Ñ… +ÑŽ +_м +не +дз +_i_ +_к +Ð +Ñ‹Ñ +пр +ÑÑ‚ +ак +ын +iк +ль +Ð½Ñ +нi +лi +за +аз +ру +ет +ж +Ñн +ку +Ð»Ñ +ай +_Ðœ +_Ð +_Ñ +Ð½Ñ +ага +ат +ай_ +Ðœ +але +га_ +ага_ +ме +_на +ала +" +_па +Ñ‚Ñ€ +ер +кал +кi +м_ +кт +ава +вi +дзе +нÑк +ана +Ð’ +го +ац +Ñц +а. +) +о_ +iн +_ад +цы +ца +а._ +тар +ма +цi +ау +ÑŒ_ +Ñка +ча +_ка +Ñа +Ñ€_ +_._ +Ñк +_га +_Ñ€ +_Ñ‚ +_пр +_за +ве +ÑÑ +ны_ +да_ +др +мi +бы +_. +Ñ€Ñ +ара +он +оу +_да +ек +и +лек +_У +ым +ыл +Ð°Ñ +ло +нк +нт +пра +)_ +пi +ары +_на_ +та_ +рн +Ñ…_ +У +Ñi +шы +кр +аÑ_ +ел +ван +ец +Ñта +чы +_бы +ае +I +_Ñ +_П +iка +Ñу +Ñд +( +iм +газе +_газ +_газе +_I +азе +_) +Ð²Ñ +_( +iч +тро +нÑ_ +аш +ор +газ +_)_ +ов +аван +П +ран +ун +лÑ_ +ар_ +дак +вар +_да_ +Ñ‹Ñн +Ñв +черн +_Я +ык +ктрон +трон +Ñл +' +ект +ектро +но +нага_ +нна +-- +iл +ку_ +нÑу +-_ +Ðл +Ве +был +Ñ€Ñ‹Ñ +онн +_- +ронн +iÑ +Ñдак +ктр +кта +Ñан +лi_ +_" +д_ +Ñлект +ап +ктро +_Ñл +чер +Ñлек +зе_ +лектр +мп +ерн +_Ñле +_кал +нÑка +ктар +Ñцi +Вечер +ечер +ач +ечерн +Я +ам_ +ектр +Вече +ÐºÑ +_--_ +дзе_ +тронн +наг +Ð +Ñда +Веч +Б +Ñ‚Ñ‹_ +зет +ече +лект +_Ðл +еч +ук +ада +рон +_ва +ла_ +_Ñлек +_был +ÑƒÑ +--_ +нага +че +вары +Ñле +_-- +рав +уку +каг +_ш +кай +ний_ +Ðа +ий_ +ка. +_ч +ннаг +ыц +_Ð +Ñ€ÑƒÑ +_Ð’ +_Ð +ÑÑ€ +Ñн +Ñм +_Б +кра +черни +у,_ +нд +ни +дакт +лару +нц +кага_ +_Мин +Ð½Ñ‹Ñ +ÐœÐ¸Ð½Ñ +нь +акта +лар +_Ð´Ð»Ñ +.. +_вар +нiк +i, +_пра +i. +Ñнт +Ñна +о, +Ñу +(_ +бо +Ð½Ð½Ñ +ць +рни +цц +це +Ñнта_ +дл +_ÐœÐ¸Ð½Ñ +Ð»Ð°Ñ€ÑƒÑ +Ñнта +_го +Ñ‚Ñ +к_ +_Ñ€Ñ +ерни +вы +цца +ана_ +ронна +не_ +ндр +зеты_ +_Ñк +Мин +длÑ_ +_длÑ_ +елар +Бел +рыÑн diff --git a/libtextcat/data/new_fingerprints/lm/bosnian.lm b/libtextcat/data/new_fingerprints/lm/bosnian.lm new file mode 100644 index 000000000000..cf6b8a41ce67 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/bosnian.lm @@ -0,0 +1,400 @@ +_ 9464 +a 2787 +i 2108 +e 2077 +o 2018 +j 1396 +n 1328 +s 1170 +u 1010 +r 988 +d 957 +a_ 889 +e_ 833 +t 774 +je 771 +k 756 +l 743 +m 702 +v 685 +p 604 +c 538 +i_ 538 +_s 522 +u_ 476 +z 463 +_p 450 +o_ 433 +,_ 381 +, 381 +_i 369 +_n 358 +b 349 +_d 349 +. 346 +na 341 +je_ 337 +._ 321 +_j 307 +g 299 +ra 292 +st 283 +ko 278 +_je 274 +ij 254 +_o 245 +ni 240 +_k 236 +an 228 +oj 227 +da 226 +_u 222 +pr 221 +no 219 +ma 219 +la 211 +ri 206 +_je_ 203 +po 203 +ci 196 +_pr 191 +os 190 +od 187 +ka 186 +im 185 +ti 184 +li 182 +vo 178 +_po 174 +ja 171 +_i_ 169 +ov 169 +al 168 +re 167 +ne 167 +m_ 164 +ta 160 +na_ 158 +ed 157 +_m 157 +_na 157 +se 156 +_u_ 154 +en 153 +ic 151 +sa 151 +_b 149 +ak 141 +va 140 +ad 137 +h 136 +ju 135 +su 134 +dj 133 +ije 133 +ar 127 +ca 127 +_z 126 +nj 125 +ji 124 +da_ 124 +_ko 123 +_da 122 +il 121 +" 120 +av 120 +_t 118 +aj 116 +ob 115 +ro 114 +am 114 +vi 114 +_su 113 +om 112 +dje 112 +za 112 +at 111 +le 111 +di 110 +su_ 107 +iz 107 +ve 107 +lj 106 +_se 105 +ev 105 +is 105 +es 103 +se_ 103 +do 101 +ih 100 +a, 99 +_su_ 99 +a,_ 99 +on 98 +bi 98 +in 97 +voj 97 +az 96 +ac 95 +_da_ 95 +la_ 95 +_r 93 +_g 93 +jev 92 +ma_ 92 +er 92 +or 91 +h_ 90 +_sa 90 +e, 90 +e,_ 90 +ba 90 +ima 89 +a. 88 +ol 88 +_do 87 +dn 87 +it 87 +ko_ 86 +ne_ 86 +ost 85 +ek 85 +to 85 +d_ 84 +as 84 +ju_ 84 +ao 84 +ih_ 84 +a._ 84 +te 83 +evo 83 +koj 83 +pri 82 +jevo 82 +ce 81 +_se_ 81 +og 80 +go 80 +jevoj 79 +de 79 +uc 79 +evoj 79 +_od 78 +_za 78 +tr 78 +S 77 +_koj 76 +ke 75 +_v 75 +ao_ 75 +_dje 74 +_bi 74 +sta 74 +_dj 74 +cij 74 +ik 74 +djev 73 +sl 73 +_djev 72 +ga 72 +djevo 72 +_ka 71 +rij 71 +_iz 71 +P 71 +_pri 70 +_a 69 +us 68 +_S 68 +mo 67 +el 67 +sk 66 +me 66 +zi 66 +ija 65 +n_ 65 +ku 64 +im_ 63 +_st 63 +ica 63 +_na_ 62 +_ne 62 +em 61 +edn 61 +jk 61 +io 61 +li_ 60 +ojk 60 +evojk 60 +_" 60 +zn 60 +vojk 60 +pro 59 +lo 59 +ije_ 59 +jed 58 +ke_ 58 +om_ 58 +jen 58 +sti 57 +_im 57 +le_ 57 +_ra 56 +e. 56 +ze 55 +_pro 55 +nu 55 +nje 55 +ti_ 55 +ec 55 +pre 55 +oc 54 +aci 54 +no_ 54 +et 54 +oji 53 +si 53 +ara 53 +ama 53 +z_ 53 +pos 52 +rad 52 +ran 52 +ima_ 52 +ru 52 +_P 52 +tu 52 +mu 51 +e._ 51 +ja_ 50 +_pre 50 +sa_ 49 +io_ 49 +od_ 48 +ni_ 48 +_nj 48 +j_ 48 +_pos 47 +_c 47 +ila 47 +K 46 +_sa_ 46 +uz 46 +N 46 +_ni 45 +zna 45 +U 45 +za_ 45 +_no 45 +ako 45 +u, 44 +lu 44 +ali 44 +u,_ 44 +sto 44 +ste 44 +ve_ 44 +ani 44 +oli 44 +aka 44 +_jed 43 +i,_ 43 +ji_ 43 +uci 43 +i, 43 +ci_ 43 +osti 43 +_N 42 +dr 42 +so 42 +ust 41 +ila_ 41 +B 41 +- 41 +red 41 +jke 41 +sv 41 +_go 41 +bar 41 +g_ 41 +est 40 +D 40 +iv 40 +vojke 40 +aju 40 +ta_ 40 +A 40 +lje 40 +jedn 40 +bil 40 +ojke 40 +ova 40 +ati 39 +_mu 39 +pa 39 +M 39 +_ba 39 +ca_ 39 +O 39 +ka_ 39 +_a_ 38 +_B 38 +_ima 38 +sn 38 +nu_ 38 +T 38 +to_ 38 +eg 38 +ava 38 +ros 37 +ir 37 +ala 37 +og_ 37 +osl 37 +ovi 37 +koji 37 +_sv 37 +dv 36 +ric 36 +_za_ 36 +br 36 +_on 36 +odi 36 +_koji 36 +_jedn 35 +nik 35 +dno 35 +_D 35 +jo 35 +tra 35 +_M 35 +sp 35 +iz_ 35 +oz 35 +vr 35 +u. 35 +eri 35 +I 35 +eko 35 +ale 35 +_ma 34 +lik 34 +_bil 34 +c_ 34 +ut 34 +je,_ 34 +u._ 34 +str 34 +je, 34 +adi 34 +tit 34 +_iz_ 34 +iti 34 +i. 33 +_rad 33 +ici 33 +rost 33 +aju_ 33 +va_ 33 +_ob 33 +nog 33 diff --git a/libtextcat/data/new_fingerprints/lm/breton.lm b/libtextcat/data/new_fingerprints/lm/breton.lm new file mode 100644 index 000000000000..6d021d739672 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/breton.lm @@ -0,0 +1,400 @@ +_ 21447 +e 6375 +a 5414 +n 3228 +r 3039 +o 2968 +t 2392 +i 1812 +h 1751 +u 1650 +l 1630 +d 1506 +a_ 1352 +z 1319 +t_ 1310 +_e 1168 +_a 1168 +e_ 1133 +m 1105 +s 1100 +g 1090 +r_ 998 +k 997 +n_ 958 +et 941 +v 888 +_d 868 +an 859 +. 846 +' 841 +en 836 +b 757 +, 749 +,_ 743 +._ 716 +ar 703 +ou 700 +et_ 689 +c 686 +ez 572 +'h 572 +_g 565 +er 555 +p 553 +_k 535 +c'h 530 +c' 530 +nt 513 +_h 505 +re 505 +ra 478 +ha 466 +ñ 458 +ne 456 +oa 454 +_o 442 +_b 434 +- 432 +zh 422 +ar_ 415 +_m 414 +_e_ 414 +nn 384 +el 376 +_a_ 356 +ur 350 +o_ 346 +h_ 345 +ve 340 +nt_ 340 +w 339 +ke 338 +de 333 +añ 332 +_p 332 +s_ 327 +he 325 +on 318 +le 318 +ga 316 +ma 315 +_ar 312 +eu 312 +_n 310 +an_ 298 +ant 296 +enn 285 +z_ 282 +_ar_ 281 +be 280 +_v 276 +_r 272 +al 270 +en_ 268 +_ke 267 +l_ 264 +em 264 +_c 263 +ñ_ 262 +da 262 +_s 261 +ho 260 +di 259 +_ha 252 +ll 250 +tr 248 +oa_ 247 +me 246 +us 242 +_ga 234 +la 231 +ket 227 +ant_ 219 +_da 219 +_l 216 +ur_ 216 +_oa 215 +in 214 +ket_ 211 +gan 211 +_c' 207 +_u 207 +_c'h 207 +ad 207 +añ_ 207 +ao 204 +_ma 204 +_t 204 +_ket 201 +_an 199 +_di 197 +ezh 196 +ù 196 +où 196 +_de 195 +ev 193 +? 192 +st 192 +ro 192 +P 192 +_ket_ 188 +er_ 188 +f 186 +na 186 +ue 185 +da_ 184 +?_ 184 +_gan 184 +_da_ 184 +_ne 183 +ed 182 +_P 180 +g_ 180 +pe 179 +m_ 178 +A 177 +ri 176 +us_ 175 +ta 174 +ze 174 +gant 174 +ka 174 +i_ 172 +d_ 171 +G 167 +te 167 +ae 166 +zh_ 164 +ha_ 163 +_ha_ 163 +_he 161 +_gant 159 +do 159 +oue 159 +_G 158 +eus 158 +eo 158 +'h_ 157 +_en 157 +go 157 +am 157 +c'h_ 157 +_be 156 +we 156 +iz 154 +_an_ 151 +_A 150 +eus_ 147 +sk 147 +li 146 +as 146 +_pe 146 +j 146 +_oa_ 146 +av 144 +gant_ 143 +ut 142 +no 141 +vez 140 +va 140 +_ra 140 +ge 138 +ez_ 138 +bo 137 + 137 +_ur 136 +lo 134 +he_ 134 +où_ 133 +ù_ 133 +_ur_ 132 +es 130 +'ho 129 +ni 129 +uz 129 +tra 127 +se 126 +it 125 +ra_ 125 +out 125 +is 125 +at 125 +hi 125 +eg 125 +ig 124 +ko 124 +io 123 +k_ 123 +ch 123 +_w 121 +or 121 +Pe 121 +_ma_ 119 +ma_ 119 +gw 118 +_em 118 +_Pe 118 +un 118 +eme 117 +ne_ 117 +nn_ 117 +c'ho 117 +ol 116 +ag 116 +M 115 +'ha 115 +_en_ 115 +iv 115 +vi 113 +_ka 113 +K 113 +ud 112 +_he_ 111 +ont 110 +oc 110 +vo 110 +ec 109 +wa 109 +.. 107 +_M 107 +_z 107 +br 107 +om 106 +to 105 +_f 105 +N 105 +_c'ho 104 +ti 104 +ut_ 104 +D 104 +_o_ 103 +_la 103 +_go 101 +az 101 +out_ 101 +ba 101 +enn_ 101 +c'ha 101 +our 100 +oc'h 100 +ell 100 +oc' 100 +etr 99 +el_ 99 +_K 99 +_D 99 +: 99 +:_ 99 +eve 98 +_d' 97 +all 97 +d' 97 +E 97 +_ne_ 97 +_me 95 +eo_ 95 +ak 95 +bet 95 +_eu 95 +rc 94 +_do 94 +_gw 94 +zi 93 +oz 93 +aou 93 +etra 92 +pa 91 +ab 90 +on_ 90 +ei 90 +tra_ 90 +n, 89 +zo 89 +ag_ 89 +_ev 88 +ul 88 +'e 88 +n' 88 +n,_ 88 +ouz 87 +v_ 86 +_n' 86 +_eus 84 +H 83 +za 83 +S 83 +etra_ 83 +_eo 82 +t,_ 82 +t, 82 +il 81 +ent 81 +fe 81 +rc'h 81 +rc' 81 +_eus_ 80 +ie 80 +_bo 79 +ele 79 +_ve 79 +mp 79 +_bet 78 +B 78 +it_ 77 +_vo 77 +'a 77 +n. 76 +_S 76 +hag 76 +hoa 75 +_hag 75 +len 75 +_N 75 +'hoa 74 +_E 74 +ir 74 +hag_ 74 +_hag_ 74 +mañ 74 +as_ 73 +eze 73 +ont_ 73 +_. 73 +iñ 73 +r, 72 +pr 72 +ed_ 72 +n._ 72 +re_ 72 +in_ 72 +r,_ 72 +_H 71 +'he 70 +t. 70 +gou 70 +em_ 70 +_br 70 +rae 70 +rez 69 +t._ 69 +bet_ 69 +net 69 +dr 68 +_eo_ 68 +ll_ 68 +mo 67 +po 67 +oul 67 +rou 67 +c'hoa 67 +a- 67 +vel 67 +oc'h_ 67 +nna 66 +_B 66 +met 65 +ec' 65 +ec'h 65 +R 64 +den 64 diff --git a/libtextcat/data/new_fingerprints/lm/catalan.lm b/libtextcat/data/new_fingerprints/lm/catalan.lm new file mode 100644 index 000000000000..086a45b50c50 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/catalan.lm @@ -0,0 +1,400 @@ +_ 16604 +e 5091 +a 4937 +s 3120 +r 3037 +i 2987 +l 2747 +n 2641 +t 2466 +o 2274 +d 1775 +c 1740 +u 1515 +a_ 1460 +s_ 1416 +_d 1116 +e_ 1067 +p 1014 +m 995 +es 985 +de 972 +_de 842 +en 700 +_l 691 +re 652 +_e 637 +de_ 611 +el 602 +_de_ 601 +_a 600 +, 599 +,_ 599 +er 598 +la 584 +ar 583 +_p 550 +l_ 547 +ci 522 +n_ 518 +es_ 515 +an 515 +ra 499 +nt 495 +ta 495 +b 476 +_c 452 +al 450 +v 440 +g 428 +. 420 +on 420 +or 404 +t_ 402 +at 396 +._ 377 +r_ 372 +i_ 371 +la_ 368 +_i 364 +_la 351 +_la_ 325 +f 325 +le 322 +te 312 +' 311 +_s 308 +st 308 +se 305 +ó 302 +ue 302 +na 301 +os 301 +qu 300 +q 300 +el_ 299 +it 292 +co 290 +ri 277 +ca 277 +ti 273 +ac 272 +in 268 +ll 264 +ic 264 +me 259 +un 257 +que 248 +C 248 +tr 244 +ns 242 +ió 238 +_i_ 236 +ad 233 +ent 232 +_el 227 +ne 226 +_t 223 +_co 221 +_m 221 +_C 217 +en_ 217 +li 217 +ia 212 +à 212 +pe 207 +_a_ 207 +pr 206 +ció 203 +ó_ 203 +pa 203 +ro 202 +o_ 198 +E 198 +di 197 +io 197 +ls 196 +h 192 +_q 192 +_qu 192 +_en 187 +aci 186 +am 186 +ec 183 +to 183 +as 180 +om 180 +ni 180 +da 179 +ió_ 179 +si 178 +ls_ 176 +L 175 +ma 172 +res 172 +ur 171 +_el_ 162 +rt 162 +ue_ 160 +A 160 +os_ 159 +_que 159 +que_ 158 +_r 158 +po 157 +_es 155 +er_ 155 +_que_ 155 +M 155 +_se 153 +va 153 +del 153 +ció_ 151 +_pr 151 +is 150 +_en_ 149 +P 147 +_pe 146 +_del 145 +ts 145 +lo 145 +_M 144 +ct 144 +_u 144 +ol 143 +ve 141 +_L 140 +x 140 +y 140 +a,_ 139 +a, 139 +nc 138 +men 137 +al_ 137 +_f 137 +_re 137 +_P 136 +ació 136 +les 136 +rs 134 +est 133 +tu 131 +_E 130 +et 130 +s,_ 129 +_un 129 +na_ 129 +_v 129 +s, 129 +ion 127 +per 126 +so 125 +em 125 +at_ 124 +no 124 +j 124 +br 123 +nt_ 122 +ar_ 122 +sa 121 +_n 119 +les_ 118 +é 118 +ce 117 +il 117 +ell 116 +_per 114 +à 114 +ob 113 +re_ 113 +ir 113 +_A 112 +ons 112 +do 112 +ua 112 +con 112 +ment 111 +gu 111 +ts_ 110 +ss 110 +ns_ 109 +ant 109 +ra_ 109 +Co 109 +par 108 +l' 107 +d' 107 +_l' 107 +els 107 +tat 107 +sc 106 +_d' 106 +an_ 105 +_Co 105 +vi 104 +els_ 104 +ica 104 +ran 103 +ul 102 +iv 102 +S 102 +_del_ 102 +del_ 102 +mb 101 +mi 101 +ita 101 +nta 100 +_pa 99 +_o 99 +_con 98 +ació_ 97 +rn 96 +_in 96 +ia_ 96 +z 96 +im 95 +rr 95 +art 94 +ta_ 93 +com 93 +tre 92 +_h 92 +s. 91 +mp 90 +ie 90 +J 90 +s._ 89 +cio 89 +_le 89 +bre 88 +_ca 88 +_al 88 +sta 88 +_com 88 +cu 88 +à _ 87 +pre 87 +fe 86 +ba 86 +tra 86 +ge 85 +pro 85 +_les 84 +des 84 +ter 84 +_po 84 +_les_ 84 +T 84 +_J 84 +nd 84 +cion 84 +_S 84 +ura 83 +nci 82 +va_ 81 +ha 81 +ona 81 +ent_ 80 +ues 80 +oc 80 +ea 80 +nte 80 +és 80 +_di 79 +ui 79 +as_ 78 +ut 78 +ici 78 +res_ 78 +us 77 +ot 77 +ara 77 +ip 75 +rm 75 +ab 75 +eg 75 +_per_ 75 +ng 75 +'a 75 +I 75 +per_ 75 +rec 74 +du 74 +_tr 74 +è 73 +cia 73 +_no 73 +b_ 73 +_par 72 +ep 72 +id 72 +lle 71 +rc 71 +_pro 70 +D 69 +G 69 +ga 69 +fo 69 +una 68 +El 68 +lit 68 +un_ 68 +ques 68 +amb 67 +ix 67 +és_ 67 +_G 67 +era 67 +cr 67 +) 66 +da_ 66 +- 66 +sp 66 +y_ 66 +ada 66 +tor 66 +( 66 +_( 66 +_T 65 +ment_ 65 +B 65 +_es_ 65 +Ma 65 +V 65 +uc 65 +ect 65 +ame 64 +iu 64 +_Ma 64 +orn 64 +_B 64 +_D 64 +Ca 64 +sti 64 +_g 63 +esc 63 +rd 63 +una_ 63 +là 62 +" 62 +ed 62 +amen 61 +mo 61 +ions 61 +_El 61 +_Ca 61 +Jo 61 +eu 61 +ari 61 +lt 61 +F 61 +u_ 61 +ament 60 +_V 60 +m_ 60 +fi 60 +au 60 +ev 60 +La 60 +itat 59 +_ha 59 diff --git a/libtextcat/data/new_fingerprints/lm/chinese_simplified.lm b/libtextcat/data/new_fingerprints/lm/chinese_simplified.lm new file mode 100644 index 000000000000..622b89c3ba80 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/chinese_simplified.lm @@ -0,0 +1,400 @@ +_ +,_ +_,_ +_, +, +çš„ +çš„_ +_çš„ +_çš„_ +_。 +。_ +。 +_。_ +国 +了_ +了 +_了_ +在 +_了 +_ï¿¥_ +_在 +_1_ +_1 +_ï¿¥ +国_ +1 +在_ +ï¿¥_ +ï¿¥ +å¹´ +1_ +_在_ +_ï¼ +_ï¼_ +部 +ï¼ +ï¼_ +5_ +æ°‘ +_5 +_5_ +5 +政 +å·¥ +ã€_ +_å¹´ +_æ—¥ +æ—¥ +_〠+å¹´_ +è¡Œ +_ã€_ +建 +〠+ï¼” +多_ +多 +ï¼– +百 +作 +会 +æ—¥_ +_百 +_建 +ç† +å’Œ +_ï¼” +军 +_多 +_多_ +_百_ +_部 +_æ—¥_ +å…³ +_ï¼–_ +å’Œ_ +æˆ +百_ +_ï¼– +_å’Œ_ +_å’Œ +ï¼”_ +è¿› +ï¼–_ +_军 +_ï¼”_ +_万_ +ï¼™ +我 +_万 +展 +å¾— +地 +们 +产 +举 +过 +主 +上 +ç›® +è¡Œ_ +于 +一 +万 +_国 +å…¨ +å‘ +到_ +_è¿› +_å·¥ +_过 +人 +_ï¼™_ +_我 +到 +个_ +个 +_上 +_ï¼™ +万_ +_举 +ä¸ +部_ +_å…³ +ï¼™_ +们_ +é•¿ +_政 +å¼€ +战 +ç» +_性 +_上_ +并 +性 +æ²»_ +大 +_工作 +_个 +æ°‘_ +_内_ +ä»– +è¿™ +_内 +æ²» +计 +å“ +术 +工作_ +å…± +县 +内 +―_ +区_ +员_ +_― +术_ +_工作_ +上_ +部门 +_部门 +作_ +_并 +_è¿™ +一_ +于_ +分 +区 +å“_ +_引 +_å¼€ +ç³» +员 +县_ +_―_ +_ä¸ +工作 +_一_ +― +_å‘ +_到_ +_ä»– +_到 +_æ°‘ +主义 +_å¹´_ +_å…¨ +生 +_å…± +_个_ +性_ +é—¨ +_性_ +地_ +å¾€ +机 +案 +引 +åˆ +内_ +_一 +展_ +义 +府 +问题_ +里_ +常 +å–得了 +市 +期 +è¿™_ +æ•° +é—¨_ +å°± +è¦_ +ç‰_ +å° +ä»–_ +å— +家 +社会 +技 +å¾—_ +å°† +_分_ +å–得了_ +è¥ +8 +女 +级 +约 +_å°†_ +ç» +统 +厂_ +我们 +安 +å—_ +_厂_ +大_ +#_ +_作 +å·¥_ +æ–‡ +å°†_ +æ–° +组 +外 +_å·²_ +å·² +产_ +_ä¸ +_产 +_技术_ +_生产 +过_ +但 +ä»· +ç»_ +å‡ +å +å·²_ +_之 +_技 +家_ +事 +ï¼’ +_举行_ +è”åˆå›½_ +ä¸ +严 +è® +_å…ƒ_ +_ç‰_ +å¾€_ +之 +_但 +_å‘展 +问题 +_机 +åŽ +_å–得了_ +_市_ +政治_ +厂 +_战 +å– +务 +_但_ +å +é•¿_ +政_ +å‡ +刚 +_æˆ +_å_ +事_ +_会 +_8_ +会主义 +部门_ +_女 +_ï¼’_ +机_ +_å·² +_†+ç†_ +å…¬ +å„ +技术 +å…ƒ_ +_èŒå·¥ +ç»æµŽ_ +举行 +_#_ +_军民 +_问题 +义_ +之_ +举行_ +_æ–° +解 +_两 +_è¿™_ +得了 +_å¾—_ +_å‡ _ +_é—® +两_ +我们_ +进行 +å–å¾— +_æ–‡ +_ä»–_ +_é•¿ +_è”åˆ +_过_ +_计 +_进行_ +法国_ +建设_ +_说 +题_ +é‡ +_政治 +_市 +里 +_解 +_政府 +å£ +è¦ +_å‡ +å‘展_ +è”åˆå›½ +_组 +å‡ _ +_我们_ +åŽ_ +_ç» +_美 +æ¥_ +åˆå›½_ +济_ +å£_ +â€_ +å…ƒ +# +府_ +军民 +并_ +èŒå·¥ +_8 +æˆ_ +约_ +_ç‰ +级_ +次_ +_社会主 +_ï¼’ +_政治_ +_â€_ +_英 +得了_ +_# +_å‘展_ +_社 +生产 +_æ¡ +军民_ +†+_èŒ +_å–得了 +_è” +å°±_ +_我们 +å‘展 +两 +è®® +_å° +_è”åˆå›½ diff --git a/libtextcat/data/new_fingerprints/lm/chinese_traditional.lm b/libtextcat/data/new_fingerprints/lm/chinese_traditional.lm new file mode 100644 index 000000000000..6708981ba6e7 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/chinese_traditional.lm @@ -0,0 +1,400 @@ +_ +ï¹_ +_ï¹_ +ï¹ +_ï¹ +çš„ +_çš„ +çš„_ +_çš„_ +。 +_。 +。_ +_。_ +å¹´ +_會 +會_ +_會_ +å¹´_ +_å¹´_ +å +é›» +會 +_å¹´ +_在 +é›»_ +_是_ +_é›»_ +一 +æ–¼ +å¸_ +_æ–¼_ +_æ–¼ +在 +_é›» +å¸ +是_ +港 +_å¸_ +在_ +_是 +æ–¼_ +_在_ +_å¸ +是 +ä¸_ +_å +_ä¸ +_港_ +_港 +_ä¸_ +å_ +_一 +å“¡ +å“¡_ +_一_ +ä¸ +_å_ +一_ +港_ +_ç½² +_å“¡_ +_å“¡ +_æ¥ +æ¥ +_ç½²_ +æ¥_ +_æ¥_ +ç½²_ +ç½² +_事 +_åŒ +æ©Ÿ +_國_ +, +〠+_,_ +_å…§ +ã€_ +_外_ +_åŒ_ +_政_ +_å…¬_ +為 +外 +å…¬ +_å…¬ +有 +_å…§_ +åŒ +_政 +國_ +國 +å…¬_ +香 +_事_ +外_ +事 +å…§_ +_, +_〠+政 +_外 +_ã€_ +香_ +政_ +å…§ +åŒ_ +事_ +,_ +_國 +為_ +_é•·_ +_較_ +_æ–¹_ +_用_ +ç‰_ +ç² +錦 +_æ–‡_ +_ä¸ +_錦 +用 +_通 +_較 +_ç‰ +有_ +_é•· +_è·¯ +通 +_è¦_ +_åŠ +_åŠ_ +_è¦ +_香 +錦_ +_訴 +ä¸_ +_話 +_有_ +通_ +時_ +ç²_ +_予 +è·¯_ +較_ +æ–¹_ +ç”° +è¦ +æ©Ÿ_ +_訴_ +_ç”°_ +ç‰ +_有 +話_ +_æ©Ÿ_ +_予_ +è·¯ +時 +æ–¹ +æ—¥ +è¦_ +_通_ +訴_ +_用 +_æ–¹ +åŠ_ +ç”°_ +_話_ +_ä¸_ +_香_ +訴 +_ç”° +_æ–‡ +_錦_ +三 +較 +用_ +_時_ +ä¸ +_ç‰_ +åŠ +話 +_æ©Ÿ +_ç² +ä¹ +æ–‡ +_時 +_è·¯_ +_ç²_ +é•· +_為_ +_為 +予 +é•·_ +予_ +æ–‡_ +_æ—¥_ +ä¹_ +途_ +_高 +è£_ +æ•…_ +_室 +_發_ +_控_ +å‰_ +回: +_查_ +_三 +_安 +_以 +_控 +途 +_安_ +_è£ +_士_ +_雇 +_大_ +_上_ +_èµ· +出_ +_三_ +_人_ +_èµ·_ +室_ +_人 +_分_ +_至_ +上_ +_出_ +第 +_* +至 +_查 +程 +_ç…™_ +_生_ +_以_ +_åª +者 +_å¯ +三_ +_*_ +分_ +_途 +_å‰ +_分 +_ç·š_ +_出 +發 +_與_ +生 +ç”± +å¯_ +ç…™_ +_ç”±_ +_ç·š +者_ +至_ +æ—¥_ +åª_ +ç…™ +發_ +_使 +_使_ +查 +_第 +_雇_ +以_ +_åª_ +控_ +安_ +* +與_ +ç·š +_與 +_至 +改 +人_ +大_ +_上 +_高_ +æ•… +èµ·_ +高 +ç·š_ +大 +控 +士 +_æ•…_ +_作_ +使 +_ç”± +ç”±_ +_途_ +安 +作 +*_ +以 +_生 +來 +_大 +回 +_æ—¥ +_å‰_ +å›› +_ä¹_ +_ä¹ +人 +_è£_ +上 +_發 +_作 +士_ +_æ•… +作_ +高_ +雇 +åª +å¯ +: +è£ +使_ +èµ· +出 +å‰ +_ç…™ +_å¯_ +雇_ +_室_ +生_ +å…« +_士 +查_ +分 +室 +與 +也_ +肇_ +å’Œ_ +_物_ +_如_ +_霧_ +主_ +_商_ +ç´„ +_府_ +到_ +é” +é“ +都_ +黨_ +éŽ +_é‡_ +_者_ +未_ +_ä¿®_ +_ç´„_ +別_ +郵_ +_能_ +來_ +_霧 +_車 +ï¼ +_å¸_ +_排 +_該_ +_柙_ +被_ +總_ +_越 +å‘Ž_ +_資 +_排_ +期_ +_夜_ +_ä¿® +å·´_ +_育_ +_æŒ_ +_號_ +_åœ_ +_調 +_亦_ +_æ ¡ +_) +_後_ +å…¥_ +_便 +_訊 +_期_ +脹_ +_而_ +_é‡ +o +_該 +_o +_( +程_ diff --git a/libtextcat/data/new_fingerprints/lm/croatian.lm b/libtextcat/data/new_fingerprints/lm/croatian.lm new file mode 100644 index 000000000000..b054ac34ab89 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/croatian.lm @@ -0,0 +1,400 @@ +_ 36598 +a 9456 +o 9050 +i 8526 +e 7955 +n 5513 +j 5379 +s 4971 +t 4125 +r 3889 +u 3423 +je 3281 +l 3231 +e_ 3177 +d 3128 +k 2992 +v 2891 +a_ 2835 +o_ 2679 +m 2645 +p 2316 +_s 2212 +i_ 2205 +y 2174 +c 1894 +z 1867 +je_ 1613 +_n 1593 +g 1581 +_p 1490 +b 1386 +u_ 1382 +, 1208 +,_ 1208 +st 1203 +_j 1202 +na 1159 +_je 1118 +_d 1105 +_i 1104 +. 1069 +._ 1058 +ra 1017 +ko 981 +ni 980 +_je_ 918 +ij 914 +ti 911 +no 891 +da 884 +to 879 +_k 863 +_o 856 +cy 839 +li 838 +ne 790 +sy 786 +- 785 +pr 760 +_u 748 +po 734 +ta 711 +_b 705 +_t 704 +la 699 +ja 699 +an 691 +m_ 680 +ov 674 +ije 673 +_z 669 +ka 662 +ri 652 +lo 651 +vo 643 +re 641 +is 626 +in 623 +se 623 +va 622 +_m 617 +oj 614 +_po 594 +_pr 592 +bi 588 +en 582 +il 578 +os 576 +vi 572 +nj 569 +ak 564 +_na 558 +im 558 +da_ 553 +od 550 +ao 550 +al 549 +om 545 +na_ 533 +_se 524 +_i_ 523 +h 517 +_g 511 +ma 508 +at 502 +cj 501 +og 497 +-- 493 +ro 493 +on 485 +av 480 +_da 474 +_ne 474 +ed 472 +zy 471 +se_ 466 +_se_ 458 +_bi 457 +ao_ 452 +ad 452 +lj 445 +es 444 +ji 441 +_v 433 +_c 430 +za 421 +go 418 +_r 413 +_u_ 411 +yi 410 +ek 406 +di 403 +sa 396 +et 393 +ic 391 +io 390 +_da_ 389 +to_ 389 +as 386 +_ko 383 +ye 378 +ar 378 +mo 373 +le 367 +or 364 +ju 363 +el 354 +tr 354 +io_ 352 +ve 345 +lo_ 344 +su 344 +ol 344 +am 344 +iz 343 +li_ 338 +ti_ 336 +sto 323 +n_ 321 +it 319 +_za 319 +sta 317 +_ni 316 +te 312 +nu 312 +a, 309 +a,_ 309 +ya 309 +do 304 +om_ 303 +la_ 300 +no_ 299 +_na_ 299 +d_ 297 +ko_ 296 +aj 294 +ik 292 +ru 291 +ga 291 +em 288 +nje 283 +dj 281 +ne_ 281 +k_ 277 +_st 276 +koj 276 +ec 274 +_ka 272 +_su 270 +ob 265 +-_ 263 +az 261 +sv 260 +_koj 260 +im_ 260 +ije_ 259 +pa 258 +ot 257 +yt 256 +ok 255 +su_ 255 +ih 254 +me 253 +dn 253 +_cy 253 +iv 251 +syt 248 +g_ 247 +--_ 246 +_- 246 +kr 246 +--- 246 +a. 245 +e,_ 245 +---_ 245 +e, 245 +er 245 +a._ 244 +_iz 244 +mi 243 +_---_ 242 +cyi 242 +_-- 242 +_--- 242 +jed 240 +h_ 239 +_a 239 +_sa 237 +j_ 236 +_l 231 +_sv 229 +_to 229 +sk 228 +ih_ 224 +ja_ 223 +pro 223 +yn 222 +t_ 222 +ost 221 +_do 221 +oc 219 +gl 218 +_su_ 216 +og_ 216 +uc 214 +s_ 214 +bil 213 +oz 213 +ki 212 +ni_ 212 +nij 209 +ako 208 +eg 208 +ut 205 +pre 205 +ci 204 +ji_ 203 +_od 203 +ilo 202 +ati 202 +ac 201 +ns 200 +_mo 197 +rij 196 +bo 195 +ovo 195 +ku 195 +dje 194 +ma_ 192 +_bil 191 +cje 186 +sti 186 +_go 186 +de 185 +sl 183 +_pro 182 +ju_ 182 +nije 181 +tv 180 +lje 179 +isy 179 +pri 178 +_pre 177 +dr 177 +e._ 177 +e. 177 +op 176 +ima 176 +anj 175 +jen 175 +us 172 +ilo_ 172 +_ra 170 +S 167 +ecj 166 +iti 166 +sp 163 +_S 161 +vr 161 +i. 161 +i._ 161 +zn 161 +ali 161 +i,_ 160 +i, 160 +ap 157 +nije_ 157 +nst 156 +pi 156 +ga_ 156 +_sy 155 +_nj 155 +jes 155 +ran 155 +vo_ 155 +yto 154 +ev 153 +_to_ 152 +_pri 151 +est 150 +N 150 +ins 150 +ist 149 +ir 149 +o, 148 +vj 148 +vje 148 +o,_ 148 +B 148 +zi 147 +jec 147 +gov 147 +yto_ 147 +syto 146 +ton 146 +od_ 145 +O 144 +rije 144 +lik 143 +on_ 142 +ocy 142 +W 142 +ba 142 +_W 141 +kao 141 +Wi 141 +_N 141 +inst 141 +_nij 141 +_nije 140 +_Wi 140 +syto_ 140 +lic 139 +P 139 +ovi 138 +_tr 138 +rs 137 +ez 137 +edn 136 +_P 136 +si 136 +ili 136 +du 136 +Winst 135 +cye 135 +nston 135 +ston 135 +ud 135 +kao_ 135 +Wins 135 +insto 135 +oji 135 +nsto 135 +Win 135 +raz 135 +zye 135 +_Win 134 +ova 134 +_Wins 134 +_on 133 +ako_ 133 +odi 133 +cya 133 +ila 133 +icy 133 +oj_ 133 +ke 133 +va_ 132 +ija 132 +_is 132 +jel 132 +oje 130 +pu 130 +cje_ 130 +bi_ 129 +rat 128 +ce 128 +tu 128 +mu 128 +ve_ 127 diff --git a/libtextcat/data/new_fingerprints/lm/czech.lm b/libtextcat/data/new_fingerprints/lm/czech.lm new file mode 100644 index 000000000000..097bbc5b2241 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/czech.lm @@ -0,0 +1,400 @@ +_ 26378 +o 5870 +e 5354 +a 4740 +n 4462 +t 3745 +s 3438 +i 3187 +v 3118 +l 2891 +r 2754 +k 2566 +d 2508 +m 2198 +u 2197 +p 2032 +à 1924 +c 1678 +h 1572 +z 1544 +á 1522 +_p 1299 +e_ 1266 +y 1231 +a_ 1219 +j 1219 +_s 1156 +b 1079 +o_ 1065 +Ä› 1043 +_v 1038 +, 952 +,_ 946 +st 945 +_n 911 +é 878 +. 870 +Ã_ 848 +Å™ 822 +._ 803 +nà 782 +ov 757 +_z 723 +i_ 716 +u_ 684 +ro 674 +en 656 +ý 647 +ž 639 +po 630 +ch 629 +Ä 625 +na 599 +_a 598 +sk 592 +Å¡ 563 +ho 552 +_d 551 +ra 545 +m_ 539 +y_ 530 +_t 512 +ko 504 +_k 503 +le 502 +_j 501 +_o 486 +to 479 +pr 471 +ne 468 +nÃ_ 462 +je 458 +é_ 456 +ti 455 +od 433 +li 432 +va 432 +_po 429 +_m 428 +al 424 +te 424 +ou 423 +ed 418 +se 415 +la 410 +no 397 +os 382 +lo 377 +an 376 +_pr 375 +ů 375 +v_ 372 +Å™e 370 +_a_ 364 +em 363 +at 360 +ta 359 +do 357 +t_ 357 +_b 355 +or 349 +h_ 345 +_v_ 338 +ch_ 327 +S 325 +ce 323 +av 323 +pÅ™ 322 +ni 319 +ké 316 +er 315 +nÄ› 315 +_na 313 +na_ 312 +_ne 311 +de 308 +ic 307 +in 306 +_se 306 +l_ 304 +dn 302 +za 298 +_pÅ™ 293 +Ä›_ 291 +ol 290 +_je 281 +ob 280 +is 277 +ve 274 +ho_ 272 +es 270 +ot 268 +ak 265 +vo 263 +ná 260 +il 257 +se_ 257 +it 256 +et 253 +ad 250 +by 249 +P 242 +_r 242 +k_ 242 +ost 241 +_se_ 241 +tr 238 +me 237 +pro 234 +že 234 +ka 230 +_za 227 +om 224 +el 223 +_P 223 +on 218 +_pro 216 +ou_ 216 +tu 215 +O 212 +mi 212 +ku 211 +_u 210 +_do 208 +_l 207 +_na_ 206 +N 205 +ské 205 +Ãm 205 +Ãc 205 +ý_ 203 +mÄ› 203 +_S 200 +oz 200 +V 200 +ze 198 +da 194 +sl 192 +á_ 191 +ova 190 +mo 190 +re 189 +so 187 +vy 186 +ej 185 +rá 184 +ar 184 +s_ 183 +vÄ› 183 +A 181 +ru 180 +_st 178 +f 178 +éh 177 +ého 176 +kt 176 +tn 175 +g 174 +bo 174 +ez 173 +ci 172 +ký 172 +né 170 +M 170 +án 169 +as 168 +vi 167 +Å™i 167 +ac 166 +že_ 165 +ýc 165 +ž_ 165 +ck 164 +K 164 +B 164 +ých 164 +vá 162 +_c 162 +ého_ 162 +enà 161 +lá 160 +_ž 160 +řà 160 +pÅ™e 159 +ec 158 +ů_ 157 +J 156 +vn 156 +_h 155 +ké_ 155 +ok 154 +sta 154 +to_ 152 +vý 152 +nt 151 +ých_ 149 +lo_ 149 +_by 149 +dy 149 +_pÅ™e 148 +ce_ 147 +R 146 +n_ 146 +ád 146 +pa 145 +vé 145 +am 145 +mu 145 +ný 145 +ud 144 +_Ä 144 +_B 142 +ter 141 +nÄ›_ 141 +Äe 140 +Å¡e 140 +_V 140 +_ko 140 +li_ 139 +dÄ› 137 +hl 137 +je_ 137 +ji 137 +ist 135 +jà 135 +- 134 +ik 133 +si 133 +ál 132 +em_ 132 +_to 132 +_vy 131 +sp 130 +ut 130 +_J 130 +_že 130 +_M 129 +di 129 +kon 128 +la_ 128 +tà 128 +_ro 127 +ns 127 +ek 126 +ick 126 +T 126 +yl 125 +Än 125 +rn 125 +_že_ 124 +op 124 +sti 124 +kte 124 +Å™ed 124 +edn 123 +us 121 +Ãm_ 120 +" 119 +z_ 119 +str 118 +ti_ 118 +ém 118 +vat 118 +d_ 118 +_ve 118 +áv 118 +enÃ_ 117 +iv 117 +oj 117 +_kt 117 +Ä›l 117 +val 116 +eb 116 +_K 115 +tÅ™ 115 +_kte 115 +át 115 +_i 114 +_N 114 +D 114 +kl 114 +ny 114 +byl 113 +ev 113 +ri 113 +ú 113 +ky 111 +sm 111 +oh 111 +E 110 +ma 110 +Ä›t 110 +ský 110 +kter 109 +nu 109 +le_ 108 +ja 107 +zá 107 +tÄ› 106 +tá 106 +zn 106 +let 105 +aj 105 +sto 105 +Å¡Ã 105 +me_ 105 +-_ 105 +u, 105 +sv 104 +_le 104 +pol 104 +L 104 +_- 104 +ln 104 +pod 104 +H 103 +zi 103 +kéh 103 +kého 103 +_ná 103 +ent 102 +u,_ 102 +_od 102 +né_ 101 +_-_ 101 +al_ 101 +_kter 101 +do_ 100 +nos 100 +cà 100 +rav 99 +ran 99 +_sv 99 +ká 98 +eÄ 98 +nÃm 98 +bu 97 +_z_ 97 +Ãch 97 +_byl 97 +ánà 96 +rov 96 +kr 96 +dá 95 +ové 95 +dl 95 +uj 95 +nost 95 +ém_ 95 +ech 94 +ly 94 +oc 94 +vé_ 93 +_o_ 93 +dy_ 93 +ak_ 93 +nsk 93 +_so 93 +_ob 92 +nc 92 diff --git a/libtextcat/data/new_fingerprints/lm/danish.lm b/libtextcat/data/new_fingerprints/lm/danish.lm new file mode 100644 index 000000000000..5e5a61a98638 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/danish.lm @@ -0,0 +1,400 @@ +_ 21274 +e 9291 +r 5307 +n 4733 +i 3976 +t 3948 +s 3751 +a 3296 +l 3063 +d 3025 +o 2868 +g 2471 +er 2164 +k 2002 +m 1680 +e_ 1655 +en 1613 +f 1507 +de 1484 +r_ 1379 +v 1245 +u 1176 +t_ 1081 +n_ 1032 +er_ 992 +b 942 +. 870 +ge 868 +._ 831 +re 816 +h 816 +et 813 +te 813 +p 806 +in 788 +or 775 +_s 753 +_a 749 +en_ 712 +_e 691 +ti 689 +an 687 +, 681 +,_ 677 +_f 655 +_d 645 +el 642 +ng 635 +nd 634 +g_ 634 +se 615 +le 615 +st 607 +s_ 601 +_o 572 +ne 560 +li 537 +et_ 524 +es 521 +_i 512 +ri 511 +sk 510 +_de 498 +Ã¥ 497 +ar 475 +ed 473 +ig 463 +at 452 +_m 446 +is 443 +fo 441 +æ 441 +ve 438 +_k 434 +ø 432 +der 429 +ke 428 +ing 427 +og 426 +_b 412 +me 408 +il 407 +for 405 +ns 394 +y 389 +_h 380 +_t 374 +on 371 +d_ 370 +al 362 +be 359 +_fo 351 +af 336 +de_ 335 +_og 333 +_p 332 +og_ 325 +om 325 +_for 324 +_og_ 313 +l_ 308 +nge 302 +i_ 295 +_v 294 +c 289 +ter 283 +ll 280 +ni 278 +nde 278 +rs 277 +_af 277 +un 275 +ra 271 +ko 271 +den 270 +_i_ 268 +id 265 +til 265 +j 265 +vi 264 +D 260 +ere 256 +ma 255 +si 253 +f_ 252 +af_ 238 +_af_ 235 +ik 235 +m_ 234 +Ã¥_ 232 +_ti 227 +_D 226 +_u 226 +_er 225 +nt 224 +_en 224 +ls 221 +es_ 216 +lig 216 +ger 216 +re_ 210 +ag 210 +_me 207 +at_ 204 +lle 200 +ge_ 200 +_til 200 +ige 199 +_er_ 199 +der_ 199 +em 199 +ds 197 +r. 195 +io 195 +r._ 195 +ud 193 +_at 192 +_at_ 191 +ta 190 +els 190 +_l 190 +ha 190 +il_ 189 +or_ 189 +ke_ 186 +rt 185 +gen 184 +ka 183 +- 180 +rk 180 +ning 178 +ol 178 +nin 178 +la 177 +ld 175 +De 175 +it 173 +ede 172 +ed_ 171 +_ko 171 +lse 171 +ek 168 +else 167 +inge 167 +pÃ¥ 167 +ng_ 167 +_pÃ¥ 167 +iv 166 +ør 166 +so 165 +he 165 +ens 165 +ske 165 +ind 164 +til_ 163 +rn 163 +ide 162 +ev 162 +den_ 162 +to 162 +sen 160 +_be 160 +sa 160 +bl 158 +_g 158 +an_ 157 +det 156 +om_ 156 +ru 156 +va 155 +_til_ 155 +ste 154 +rd 153 +_pÃ¥_ 152 +k_ 152 +pÃ¥_ 152 +di 152 +kr 152 +K 151 +_De 149 +for_ 148 +te_ 148 +kon 148 +ver 147 +mm 146 +am 146 +_en_ 145 +_r 145 +ne_ 144 +ing_ 144 +tr 143 +le_ 142 +del 142 +_in 142 +gt 140 +_st 138 +S 138 +eg 138 +gs 138 +tt 138 +r, 137 +ser 137 +r,_ 137 +er. 137 +ro 137 +er._ 137 +_for_ 136 +ent 136 +kt 136 +eri 135 +ur 134 +lin 134 +B 133 +A 133 +sti 133 +ner 133 +da 133 +ris 132 +ion 132 +_K 131 +ern 131 +ers 130 +ist 130 +ær 130 +ige_ 130 +_si 130 +tte 129 +E 128 +_n 128 +nn 127 +_B 126 +_ha 126 +_. 126 +rne 125 +H 125 +_ud 125 +rin 124 +na 124 +und 124 +ft 124 +_der 124 +ku 123 +_A 122 +ler 120 +and 120 +end 120 +ns_ 120 +rg 119 +op 119 +er,_ 119 +er, 119 +ar_ 118 +P 118 +_S 117 +_H 117 +_._ 116 +ov 116 +erne 115 +tio 115 +med 115 +tion 115 +_E 115 +_P 115 +det_ 114 +pr 114 +e. 113 +ter_ 113 +: 113 +kk 113 +e._ 113 +e,_ 113 +e, 113 +od 113 +kke 113 +ten 113 +ling 113 +:_ 112 +mi 112 +eli 112 +lo 111 +som 111 +_den 111 +rb 110 +se_ 110 +ell 110 +sid 110 +nne 109 +fi 108 +lt 107 +v_ 107 +_de_ 107 +ark 106 +lige 106 +ngen 106 +ie 105 +_med 105 +_der_ 105 +ring 105 +a_ 105 +_vi 104 +-_ 104 +ys 103 +gel 103 +_so 103 +ia 103 +ive 102 +ej 101 +ati 101 +ren 101 +_det 101 +side 101 +ske_ 101 +br 100 +gi 100 +F 100 +M 100 +ul 99 +isk 99 +men 99 +n,_ 99 +age 99 +fr 99 +n, 99 +tu 98 +ts 98 +_ma 98 +nder 98 +ot 97 +dt 97 +R 97 +med_ 96 +ho 96 +ans 95 +_kon 95 +pe 95 +ce 94 +gr 93 +mme 92 +ret 92 +lige_ 92 +mu 91 +_med_ 91 +hv 91 +væ 91 +Det 91 +ens_ 91 +kl 91 +_M 90 +T 90 +ingen 90 +rm 90 +ill 89 +elle 89 +ef 89 +ene 89 +nds 89 +ove 89 +som_ 89 +C 88 +_den_ 88 diff --git a/libtextcat/data/new_fingerprints/lm/drents.lm b/libtextcat/data/new_fingerprints/lm/drents.lm new file mode 100644 index 000000000000..e69de29bb2d1 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/drents.lm diff --git a/libtextcat/data/new_fingerprints/lm/dutch.lm b/libtextcat/data/new_fingerprints/lm/dutch.lm new file mode 100644 index 000000000000..17a0626982a5 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/dutch.lm @@ -0,0 +1,400 @@ +_ 20104 +e 9848 +n 5323 +a 3733 +t 3683 +i 3490 +r 3195 +d 2876 +o 2845 +n_ 2443 +en 2439 +s 2195 +e_ 1842 +l 1837 +g 1522 +en_ 1500 +de 1489 +er 1388 +t_ 1377 +v 1253 +u 1217 +k 1204 +_d 1136 +h 1102 +m 1084 +an 939 +te 875 +j 857 +in 810 +_v 793 +r_ 751 +de_ 742 +ee 737 +p 732 +et 718 +ge 716 +aa 708 +b 703 +_e 686 +st 669 +z 668 +ie 662 +_de 655 +w 631 +c 611 +. 604 +s_ 582 +_de_ 576 +_h 572 +el 570 +ij 564 +._ 554 +et_ 531 +an_ 522 +he 505 +_o 497 +nd 478 +_i 475 +ar 459 +_m 451 +re 442 +ve 441 +' 428 +or 424 +ng 421 +at 418 +_s 415 +oo 403 +_z 401 +le 395 +_b 394 +_a 391 +_he 386 +va 385 +er_ 381 +me 372 +_w 368 +f 361 +on 351 +_t 351 +_va 345 +_g 342 +di 342 +nt 340 +, 335 +g_ 335 +,_ 334 +van 327 +ch 326 +is 326 +ing 325 +be 325 +ni 320 +it 317 +een 316 +_van 315 +al 310 +den 309 +ti 309 +van_ 307 +oe 302 +ke 302 +_van_ 299 +aar 299 +d_ 295 +we 293 +da 292 +tu 290 +_ee 290 +ud 287 +een_ 286 +li 284 +es 282 +_st 281 +ver 281 +ten 281 +ri 275 +nde 275 +der 274 +_in 270 +k_ 268 +vo 267 +het 266 +oor 264 +_het 262 +het_ 262 +_het_ 259 +_een 258 +l_ 258 +ze 257 +_n 254 +ro 248 +gen 243 +_een_ 241 +at_ 240 +op 238 +n. 238 +_en 237 +rs 237 +_da 235 +stu 232 +in_ 230 +_be 229 +_ge 228 +_k 226 +rd 226 +tud 220 +_en_ 220 +n._ 217 +te_ 209 +ei 208 +ent 206 +_me 203 +la 202 +ek 202 +ed 201 +ra 200 +stud 200 +en. 200 +ie_ 197 +ste 196 +_vo 195 +_in_ 193 +_stu 191 +zi 191 +om 189 +ui 189 +en._ 186 +ten_ 185 +_stud 185 +ude 184 +die 183 +ns 183 +_j 181 +D 179 +aan 179 +se 179 +ma 178 +_ve 176 +ne 174 +_p 174 +eg 173 +p_ 172 +ar_ 172 +aar_ 171 +_te 170 +ng_ 169 +_we 169 +'' 167 +_D 165 +ers 164 +_op 163 +dat 161 +dat_ 160 +ig 160 +ere 159 +eer 158 +_zi 158 +voor 156 +voo 156 +nge 155 +nder 151 +nte 151 +or_ 150 +ta 150 +je 149 +ing_ 148 +ll 148 +_ver 147 +jk 146 +oor_ 146 +_dat 145 +ijk 145 +ren 145 +is_ 145 +_dat_ 144 +_l 144 +and 144 +lij 143 +ter 143 +na 142 +uden 139 +tude 138 +_voor 136 +_voo 136 +ond 136 +ken 135 +cht 135 +_al 135 +ht 135 +wa 134 +ho 133 +em 133 +den_ 133 +pe 132 +sc 132 +un 131 +ur 131 +_di 130 +gen_ 130 +zo 129 +rt 129 +ev 128 +mo 128 +lijk 127 +_is 126 +stude 124 +ha 123 +to 122 +el_ 121 +og 121 +op_ 121 +sch 120 +ol 120 +ente 119 +_u 118 +pr 118 +end 118 +mi 117 +iet 116 +_aa 116 +eli 115 +dent 115 +ijn 115 +jn 115 +ou 115 +men 114 +_' 114 +tie 113 +_is_ 113 +nie 113 +tr 112 +ak 112 +id 112 +udent 111 +tuden 111 +uit 110 +_te_ 109 +aan_ 109 +ld 109 +S 108 +_aan 108 +ede 108 +ja 107 +nten 107 +it_ 107 +je_ 107 +ts 107 +erd 106 +est 106 +E 105 +_op_ 105 +ad 104 +al_ 104 +_ze 104 +_on 104 +rk 104 +lle 103 +ens 103 +gel 103 +m_ 103 +len 103 +_r 102 +ec 102 +inge 102 +met 102 +_met 101 +si 100 +die_ 100 +us 100 +onde 99 +_ni 99 +De 99 +eu 99 +dente 99 +enten 99 +ic 99 +_met_ 98 +f_ 98 +met_ 98 +no 97 +ko 96 +voor_ 96 +rde 96 +H 96 +ngen 95 +lo 95 +ot 95 +as 94 +zij 93 +_nie 92 +vi 92 +eb 92 +_De 92 +_zij 91 +ep 91 +wi 91 +_zo 91 +kt 91 +ege 91 +G 91 +bi 90 +j_ 90 +ij_ 90 +ze_ 90 +do 90 +lan 89 +ov 89 +udi 89 +ord 89 +onder 89 +V 88 +elij 88 +_wa 88 +elijk 88 +ef 88 +_die 87 +ag 86 +erk 86 +eren 86 +R 85 +ik 85 +_ma 85 +gr 85 +am 85 +_mo 84 +ul 84 +nn 83 +eve 83 +De_ 83 +maa 83 +ingen 83 +wo 83 +_'' 83 +O 83 +tudi 82 +I 82 +nt_ 82 +tudie 81 +ven 81 +udie 81 +nten_ 81 +_die_ 81 +jaa 80 +ka 80 +eke 80 +ite 80 +a_ 80 +_je 80 +ac 80 +jaar 80 +_je_ 79 +_H 79 +_zijn 79 +zijn 79 +n, 78 +nen 78 +N 78 +n,_ 78 +ijn_ 77 diff --git a/libtextcat/data/new_fingerprints/lm/english.lm b/libtextcat/data/new_fingerprints/lm/english.lm new file mode 100644 index 000000000000..ab71632c6214 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/english.lm @@ -0,0 +1,400 @@ +_ 20326 +e 6617 +t 4843 +o 3834 +n 3653 +i 3602 +a 3433 +s 2945 +r 2921 +h 2507 +e_ 2000 +d 1816 +_t 1785 +c 1639 +l 1635 +th 1535 +he 1351 +_th 1333 +u 1309 +f 1253 +m 1175 +p 1151 +_a 1145 +the 1142 +_the 1060 +s_ 978 +er 968 +_o 967 +he_ 928 +d_ 888 +t_ 885 +the_ 844 +_the_ 843 +on 842 +in 817 +y 783 +n_ 773 +b 761 +re 754 +, 734 +,_ 732 +an 732 +g 728 +w 718 +_i 707 +en 676 +f_ 599 +y_ 595 +of 594 +_of 592 +es 589 +ti 587 +v 580 +_of_ 575 +of_ 575 +nd 568 +at 549 +r_ 540 +_w 534 +it 522 +ed 496 +_p 494 +nt 485 +_c 462 +o_ 457 +io 450 +_an 439 +te 432 +or 425 +_b 418 +nd_ 407 +to 406 +st 402 +is 401 +_s 396 +_in 389 +ion 385 +and 385 +de 384 +ve 382 +ha 375 +ar 366 +_m 361 +and_ 360 +_and 360 +_and_ 358 +se 353 +_to 347 +me 346 +to_ 344 +ed_ 339 +. 330 +be 329 +_f 329 +._ 329 +_to_ 320 +co 317 +ic 316 +ns 308 +al 307 +le 304 +ou 304 +ce 293 +ent 279 +l_ 278 +_co 277 +tio 275 +on_ 274 +_d 274 +tion 268 +ri 266 +_e 264 +ng 253 +hi 251 +er_ 249 +ea 246 +as 245 +_be 242 +pe 242 +h_ 234 +_r 232 +ec 227 +ch 223 +ro 222 +ct 220 +_h 219 +pr 217 +in_ 217 +ne 214 +ll 214 +rt 213 +s,_ 210 +s, 210 +li 209 +ra 208 +T 207 +wh 204 +a_ 203 +ac 201 +_wh 199 +_n 196 +ts 196 +di 196 +es_ 195 +si 194 +re_ 193 +at_ 192 +nc 192 +ie 190 +_a_ 188 +_in_ 185 +ing 184 +us 182 +_re 182 +g_ 179 +ng_ 178 +op 178 +con 177 +tha 175 +_l 174 +_tha 174 +ver 173 +ma 173 +ion_ 171 +_con 171 +ci 170 +ons 170 +_it 170 +po 169 +ere 168 +is_ 167 +ta 167 +la 166 +_pr 165 +fo 164 +ho 164 +ir 162 +ss 161 +men 160 +be_ 160 +un 159 +ty 159 +_be_ 158 +ing_ 157 +om 156 +ot 156 +hat 155 +ly 155 +_g 155 +em 153 +_T 151 +rs 150 +mo 148 +ch_ 148 +wi 147 +we 147 +ad 147 +ts_ 145 +res 143 +_wi 143 +I 143 +hat_ 142 +ei 141 +ly_ 141 +ni 140 +os 140 +ca 139 +ur 139 +A 138 +ut 138 +that 138 +_that 137 +ati 137 +_fo 137 +st_ 137 +il 136 +or_ 136 +for 136 +pa 136 +ul 135 +ate 135 +ter 134 +it_ 134 +nt_ 133 +that_ 132 +_ha 129 +al_ 128 +el 128 +as_ 127 +ll_ 127 +_ma 125 +no 124 +ment 124 +an_ 124 +tion_ 122 +su 122 +bl 122 +_de 122 +nce 120 +pl 120 +fe 119 +tr 118 +so 118 +int 115 +ov 114 +e, 114 +e,_ 114 +_u 113 +ent_ 113 +Th 113 +her 113 +j 112 +atio 112 +ation 112 +_Th 111 +le_ 110 +ai 110 +_it_ 110 +_on 110 +_for 109 +ect 109 +k 109 +hic 108 +est 108 +der 107 +tu 107 +na 106 +_by_ 106 +by_ 106 +E 106 +by 106 +_by 106 +ve_ 106 +_di 106 +en_ 104 +vi 104 +m_ 103 +_whi 102 +iv 102 +whi 102 +ns_ 102 +_A 101 +ich 100 +ge 100 +pro 99 +ess 99 +_whic 99 +ers 99 +hich 99 +ce_ 99 +which 99 +whic 99 +all 98 +ove 98 +_is 98 +ich_ 97 +ee 97 +hich_ 97 +n,_ 96 +n, 96 +im 95 +ir_ 94 +hei 94 +ions 94 +sti 94 +se_ 94 +per 93 +The 93 +_pa 93 +heir 93 +id 93 +eir 93 +eir_ 93 +ig 93 +heir_ 93 +_no 93 +ev 93 +era 92 +_int 92 +ted 91 +_The 91 +ies 91 +art 91 +thei 90 +_ar 90 +_thei 90 +their 90 +_pro 90 +et 89 +_pe 88 +_mo 88 +ther 88 +x 87 +gh 87 +S 87 +_is_ 87 +ol 87 +ty_ 87 +_I 86 +nde 86 +am 86 +rn 86 +nte 86 +mp 85 +_su 84 +_we 84 +par 84 +_v 84 +pu 82 +his 82 +ow 82 +mi 82 +go 81 +N 81 +ue 81 +ple 81 +ep 80 +ab 80 +;_ 80 +; 80 +ex 80 +ain 80 +over 80 +_un 79 +q 79 +qu 79 +pp 79 +ith 79 +ry 79 +_as 79 +ber 79 +ub 78 +av 78 +uc 78 +s._ 77 +s. 77 +enc 77 +are 77 +iti 77 +gr 76 +his_ 76 +ua 76 +part 76 +ff 75 +eve 75 +O 75 +rea 74 +ous 74 +ia 74 +The_ 73 +ag 73 +mb 73 +_go 73 +fa 72 +on,_ 72 +ern 72 +t,_ 72 +on, 72 +t, 72 +_me 71 diff --git a/libtextcat/data/new_fingerprints/lm/esperanto.lm b/libtextcat/data/new_fingerprints/lm/esperanto.lm new file mode 100644 index 000000000000..0eef3ec8894a --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/esperanto.lm @@ -0,0 +1,400 @@ +_ 57050 +a 16035 +i 12706 +e 12227 +o 12102 +n 10393 +s 8344 +l 7707 +r 7492 +t 7134 +k 5376 +u 4558 +j 3946 +a_ 3875 +m 3783 +d 3710 +p 3693 +la 2840 +s_ 2769 +e_ 2751 +. 2706 +_l 2635 +_k 2619 +v 2531 +n_ 2504 +o_ 2444 +i_ 2333 +._ 2278 +on 2238 +, 2193 +,_ 2182 +_la 2100 +en 2080 +j_ 2050 +as 2028 +la_ 2012 +ta 1956 +_la_ 1907 +an 1882 +_p 1850 +g 1831 +_e 1791 +_d 1778 +is 1737 +aj 1658 +st 1635 +_s 1575 +c 1526 +de 1517 +oj 1498 +er 1476 +ti 1456 +f 1443 +_a 1442 +b 1427 +ro 1379 +_m 1351 +ra 1341 +nt 1293 +ka 1270 +ri 1258 +al 1249 +as_ 1248 +aj_ 1213 +to 1209 +_de 1203 +_t 1200 +te 1179 +_n 1176 +is_ 1171 +in 1151 +ko 1145 +or 1114 +es 1083 +re 1034 +ia 1029 +li 1022 +de_ 1016 +_de_ 979 +ar 974 +_v 966 +vi 942 +lo 932 +x 928 +io 917 +ne 855 +no 848 +ni 843 +mi 835 +ma 819 +_ka 816 +el 815 +pr 771 +z 744 +un 734 +l_ 732 +po 730 +_f 725 +ø 724 +est 691 +na 687 +ki 679 +kaj 676 +si 665 +u_ 663 +kaj_ 660 +" 654 +tas 651 +le 650 +oj_ 648 +_i 643 +tr 642 +_pr 630 +_es 628 +jn 626 +pe 618 +_kaj 616 +ig 616 +_kaj_ 611 +do 608 +sta 606 +on_ 602 +ek 602 +ci 597 +r_ 595 +ý 594 +_r 593 +il 592 +_est 587 +di 586 +am 586 +_mi 582 +aý 578 +_vi 577 +mo 575 +ant 565 +_ne 562 +en_ 561 +o. 559 +æ 543 +iu 538 +o, 529 +ur 527 +o._ 527 +om 525 +o,_ 524 +at 521 +va 521 +- 519 +_en 518 +: 513 +:_ 512 +_ti 500 +M 496 +h 488 +nd 484 +me 484 +_al 481 +_ko 479 +ve 478 +ie 478 +_ki 473 +it 473 +L 466 +_b 465 +se 462 +em 452 +ol 450 +nta 449 +tu 448 +ik 444 +ov 443 +da 443 +_M 440 +_po 439 +tas_ 438 +ne_ 437 +et 437 +_ma 436 +_en_ 435 +su 429 +pl 426 +_L 425 +pa 420 +_o 417 +vo 408 +an_ 407 +ro_ 406 +sti 406 +nu 399 +kon 396 +stas 391 +m_ 391 +ir 388 +n. 386 +fa 386 +jn_ 382 +ku 382 +os 376 +ke 375 +n, 375 +esta 374 +n,_ 372 +_su 362 +ta_ 362 +stas_ 359 +xi 359 +Mi 358 +_ne_ 356 +al_ 355 +nk 353 +so 353 +n._ 352 +id 349 +_g 348 +estas 347 +ga 346 +_h 345 +per 345 +_Mi 340 +ok 339 +K 339 +mp 337 +_esta 337 +s,_ 335 +s, 335 +_se 333 +anta 332 +ul 326 +ran 325 +_" 323 +ý_ 322 +te_ 320 +ak 320 +aý_ 320 +ed 320 +rt 319 +ojn 318 +gi 318 +_æ 317 +tis 316 +gx 316 +mal 316 +ia_ 315 +ks 310 +_al_ 310 +mi_ 309 +S 309 +lu 309 +ns 308 +kt 305 +io_ 302 +ent 300 +? 300 +_K 300 +ec 300 +el_ 299 +_- 299 +li_ 299 +E 298 +þ 298 +_li 297 +fo 296 +ter 296 +_re 296 +A 295 +nto 294 +vi_ 292 +La 292 +_mal 290 +nte 288 +sp 287 +sa 287 +_mi_ 279 +ut 278 +op 278 +_ke 277 +bo 277 +ajn 276 +un_ 276 +T 274 +to_ 272 +-_ 272 +bl 272 +_an 271 +_La 271 +øi 269 +_S 268 +_pl 267 +_fa 266 +ni_ 266 +La_ 265 +_E 264 +N 263 +tis_ 263 +_tr 263 +' 262 +! 262 +_-_ 262 +pro 261 +iu_ 261 +iø 261 +nc 260 +_si 259 +du 257 +_kon 256 +ru 255 +_vi_ 254 +_j 253 +ce 251 +ke_ 249 +ap 248 +us 247 +be 247 +im 247 +B 246 +_ku 246 +_La_ 246 +tra 245 +ad 245 +uj 245 +ac 245 +ita 243 +pre 242 +_pro 242 +co 241 +rm 241 +_ni 238 +_pe 236 +?_ 234 +on. 234 +toj 234 +"_ 234 +j. 234 +_ke_ 233 +s. 232 +_A 231 +av 230 +ri_ 230 +_el 229 +por 229 +` 224 +ev 224 +las 223 +P 223 +j._ 221 +eni 220 +_T 220 +_B 219 +j,_ 218 +j, 218 +era 217 +_in 216 +on._ 216 +cx 216 +_N 215 +ion 215 +ab 215 +.. 214 +) 213 +fi 213 +or_ 212 +pri 212 +s._ 212 +_por 210 +ez 210 +in_ 210 +am_ 209 +on,_ 209 +ll 209 +æi 209 +on, 209 +_ve 208 +ris 208 +esti 208 +!_ 207 +men 206 +vas 205 +iel 204 +taj 203 +_c 201 +aro 201 +ank 200 +_pri 200 +jo 200 +ja 200 +ont 200 +lt 199 +_P 199 +igi 199 +_pa 197 +oj. 197 +( 196 +au 195 +oro 195 +ng 195 +_( 194 +sto 194 +ast 194 +ag 193 diff --git a/libtextcat/data/new_fingerprints/lm/estonian.lm b/libtextcat/data/new_fingerprints/lm/estonian.lm new file mode 100644 index 000000000000..74a7aa014324 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/estonian.lm @@ -0,0 +1,400 @@ +_ 20738 +a 7004 +e 5699 +i 5321 +s 4731 +t 3769 +l 3448 +u 3446 +n 2902 +k 2584 +d 2202 +m 2043 +a_ 1758 +o 1684 +r 1429 +g 1174 +v 1151 +e_ 1139 +i_ 1136 +_k 1136 +s_ 1077 +h 1009 +, 995 +,_ 995 +_t 953 +p 953 +j 940 +ä 900 +is 896 +st 851 +se 841 +_s 822 +. 821 +as 801 +d_ 801 +le 800 +ta 794 +in 793 +_m 790 +ô 754 +._ 753 +t_ 746 +ma 710 +_p 680 +si 677 +_v 660 +es 636 +al 626 +us 619 +el 602 +_o 596 +_e 586 +ja 580 +_j 563 +te 562 +ü 549 +li 532 +va 515 +id 501 +ol 498 +tu 497 +da 490 +_n 480 +ku 478 +ud 459 +nu 455 +na 438 +ei 432 +ks 418 +mi 411 +ee 411 +u_ 407 +ka 400 +n_ 394 +b 394 +ga 386 +_l 384 +_a 380 +an 366 +ja_ 365 +et 358 +me 358 +l_ 350 +at 348 +la 341 +ad 340 +st_ 339 +ne 336 +ll 333 +_ta 332 +ra 330 +_ja 328 +ik 323 +en 318 +ni 308 +ul 305 +sa 302 +_ol 302 +nd 299 +_ja_ 299 +nud 296 +ii 291 +ko 286 +_se 285 +le_ 283 +aa 281 +is_ 281 +gi 270 +_te 269 +ag 269 +_va 268 +_ku 267 +ed 262 +em 255 +_mi 255 +ma_ 247 +ti 246 +ri 245 +_h 242 +gu 239 +id_ 238 +ast 237 +it 236 +ga_ 236 +un 232 +de 230 +ud_ 230 +ha 230 +ak 228 +ah 228 +uu 228 +il 227 +ôi 226 +as_ 223 +ke 222 +ar 220 +a, 220 +am 220 +_ko 220 +a,_ 220 +_ka 220 +ai 220 +eg 216 +sin 214 +est 214 +ui 214 +he 214 +ks_ 213 +ö 213 +oo 213 +ju 207 +är 205 +ut 203 +in_ 203 +oli 201 +ki 199 +su 199 +es_ 199 +lt 198 +ist 188 +li_ 186 +ea 186 +vi 184 +im 181 +mu 181 +se_ 180 +ts 180 +on 178 +ise 178 +ta_ 177 +ek 176 +_oli 176 +sel 173 +nud_ 173 +_ü 172 +a. 171 +nn 170 +ema 169 +ng 168 +lu 168 +ge 167 +_si 166 +_ei 165 +_i 165 +_ei_ 164 +ei_ 164 +_r 163 +ole 161 +pa 160 +lle 160 +a._ 160 +ust 159 +du 156 +er 156 +vô 153 +da_ 153 +min 152 +et_ 151 +d,_ 149 +_M 149 +ht 149 +d, 149 +M 149 +kui 148 +_et 147 +K 147 +_K 146 +pe 145 +gi_ 145 +_vô 145 +or 144 +_tu 142 +lt_ 141 +_ma 141 +asi 140 +ve 139 +us_ 138 +ig 136 +sin_ 136 +ur 135 +_ta_ 134 +di 134 +_et_ 134 +s,_ 132 +tas 132 +s, 132 +_kui 131 +sk 131 +re 130 +po 129 +oli_ 129 +om 129 +äi 128 +inu 128 +_na 128 +_oli_ 128 +_sa 128 +aj 128 +mis 127 +ui_ 127 +_me 127 +_pa 126 +tus 125 +pi 125 +te_ 124 +ül 123 +- 123 +est_ 122 +_on 121 +kk 121 +tt 120 +aga 119 +na_ 119 +_T 119 +T 119 +b_ 118 +al_ 118 +sta 118 +_mu 116 +_ju 116 +ida 116 +aks 116 +gu_ 116 +_ni 116 +s. 116 +ad_ 116 +_pe 114 +eks 114 +ev 114 +end 113 +s._ 113 +use 111 +ära 111 +_po 111 +_min 110 +S 110 +aja 110 +_la 110 +ele 109 +el_ 108 +on_ 108 +ab 108 +_S 108 +av 107 +ing 107 +kui_ 106 +_on_ 106 +au 104 +ne_ 104 +ti_ 104 +ell 103 +ae 101 +kô 101 +ed_ 100 +_ke 99 +ata 99 +iis 99 +! 98 +!_ 98 +sid 98 +nda 98 +eh 98 +lle_ 97 +pu 97 +ää 97 +vôi 97 +ine 96 +t, 96 +e,_ 96 +ale 96 +_vôi 96 +t,_ 96 +e, 96 +eda 96 +uk 95 +ast_ 95 +ld 95 +? 94 +_kui_ 94 +_sel 93 +_kô 93 +tul 93 +ega 93 +lg 92 +sii 92 +val 92 +e. 92 +_su 92 +ug 92 +oh 92 +kü 92 +d. 91 +ee_ 91 +see 91 +e._ 91 +öö 91 +oma 91 +_ole 90 +ses 90 +stu 90 +ôt 90 +üü 90 +_om 89 +me_ 89 +ot 89 +d._ 89 +_sii 88 +to 88 +_en 87 +atu 87 +?_ 87 +A 86 +J 86 +pea 86 +jä 85 +_A 85 +_see 85 +ime 84 +_pi 84 +_ha 84 +mô 84 +nä 84 +_J 84 +les 84 +ste 84 +kas 84 +_ä 84 +vä 83 +E 83 +pä 83 +_ve 83 +_E 83 +eis 82 +_jä 81 +_pea 81 +_mô 80 +um 80 +_kü 80 +iku 80 +üd 80 +all 79 +eid 79 +ba 79 +_vä 79 +ina 78 +lj 78 +sid_ 78 +hu 78 +tun 78 +lä 78 +_oma 77 +i,_ 77 +i, 77 +agu 77 +uh 77 +lm 76 +ras 76 +ss 76 +kä 76 +ees 76 diff --git a/libtextcat/data/new_fingerprints/lm/finnish.lm b/libtextcat/data/new_fingerprints/lm/finnish.lm new file mode 100644 index 000000000000..328f88604279 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/finnish.lm @@ -0,0 +1,400 @@ +_ 19984 +a 9133 +i 8384 +t 7797 +e 6481 +n 6431 +s 5897 +l 4504 +o 4163 +u 4106 +k 4013 +ä 3354 +n_ 2868 +m 2569 +a_ 1987 +v 1905 +r 1827 +ta 1580 +en 1553 +is 1515 +h 1508 +y 1462 +st 1390 +in 1375 +p 1342 +j 1333 +an 1139 +si 1073 +tt 1030 +te 1008 +en_ 982 +_k 980 +it 974 +ll 947 +aa 942 +ä_ 902 +va 878 +el 855 +_t 851 +ka 846 +i_ 835 +. 832 +se 818 +li 806 +tä 804 +oi 767 +ai 744 +._ 739 +tu 734 +_o 719 +mi 715 +al 703 +on 684 +d 681 +_v 662 +et 654 +_j 641 +t_ 635 +ti 632 +_m 628 +_s 620 +ja 616 +ma 596 +sa 595 +la 582 +ist 575 +_e 565 +to 565 +ks 557 +in_ 554 +es 551 +il 538 +an_ 536 +ki 527 +, 525 +ku 525 +,_ 524 +us 520 +as 514 +nt 512 +ri 495 +ke 494 +at 491 +_p 485 +le 484 +ik 483 +ss 477 +ut 469 +ö 469 +sta 460 +ee 459 +uu 458 +ol 457 +ta_ 451 +ne 445 +ää 445 +ei 443 +uo 436 +ko 433 +un 430 +lu 421 +ii 420 +e_ 418 +nn 413 +_h 412 +ar 408 +er 402 +än 396 +ja_ 386 +im 381 +on_ 365 +_va 363 +aan 354 +_a 352 +me 350 +ak 345 +ssa 331 +na 330 +ie 329 +pa 327 +_ja 326 +ia 325 +tä_ 322 +_l 319 +vi 317 +ise 316 +tta 315 +de 314 +os 312 +lli 309 +_ja_ 304 +jo 295 +vä 290 +su 289 +au 287 +lis 286 +_on 285 +sä 284 +uk 280 +am 280 +ot 280 +ty 275 +ett 271 +ttä 270 +ni 269 +lä 267 +ksi 264 +nk 264 +ht 263 +ul 261 +ell 261 +sa_ 259 +ha 257 +sen 257 +a. 254 +isi 253 +ste 253 +aan_ 252 +_on_ 252 +_ka 252 +sk 251 +kk 246 +itt 245 +ok 242 +a._ 239 +all 239 +yt 239 +mä 237 +mu 237 +av 237 +_y 236 +lla 233 +taa 231 +ais 231 +een 230 +K 230 +lt 228 +s_ 227 +ast 227 +iv 226 +ssa_ 225 +ra 225 +- 223 +kse 223 +oit 220 +om 220 +T 219 +_ku 218 +än_ 216 +aa_ 214 +at_ 214 +tel 211 +ui 210 +si_ 208 +rk 207 +sta_ 207 +_jo 203 +kä 202 +_K 201 +est 200 +em 200 +he 199 +_n 199 +vo 198 +_ta 196 +eh 196 +_ol 196 +S 196 +nta 196 +_ko 194 +je 194 +stä 194 +är 193 +ust 191 +mis 191 +ns 190 +pu 189 +nen 188 +ät 188 +toi 188 +iin 187 +ten 187 +min 186 +ista 185 +hd 184 +a, 184 +a,_ 184 +sen_ 183 +E 182 +lle 181 +vat 179 +ill 177 +no 176 +pä 176 +lm 176 +llis 175 +n. 175 +io 172 +ine 171 +n._ 170 +pi 169 +uks 168 +ava 168 +ään 166 +nen_ 165 +ah 165 +_mu 164 +tus 163 +mm 162 +_to 162 +ek 160 +int 159 +_r 159 +lin 158 +oim 158 +_T 158 +A 158 +imi 157 +tö 157 +la_ 157 +jä 157 +aj 156 +yh 155 +o_ 154 +lo 154 +oli 153 +een_ 153 +le_ 153 +_si 153 +g 152 +aik 151 +vat_ 150 +L 149 +ur 149 +ti_ 149 +sia 148 +ite 147 +inen 147 +ain 146 +sti 146 +lla_ 146 +ys 145 +_mi 145 +val 144 +stu 144 +äm 144 +alli 143 +pe 143 +utt 142 +et_ 141 +_tu 141 +eri 140 +_E 140 +: 140 +nki 139 +ir 139 +llä 138 +up 138 +äi 137 +ama 137 +_ha 135 +id 135 +_se 135 +po 134 +inen_ 134 +tte 133 +nna 133 +ten_ 132 +or 132 +ts 131 +nä 131 +yk 131 +äs 131 +_S 130 +ses 130 +ve 130 +ess 129 +äl 129 +ita 129 +lai 129 +H 129 +van 127 +äk 127 +kin 127 +N 127 +_te 126 +den 126 +tee 126 +P 126 +kaa 126 +iin_ 125 +kun 125 +ois 125 +sit 125 +oh 124 +V 124 +yö 124 +äv 124 +tav 124 +voi 124 +ia_ 123 +I 123 +oll 123 +maa 122 +ih 122 +oj 122 +rj 121 +ro 121 +ikk 120 +so 120 +oo 120 +oimi 120 +do 120 +pp 119 +M 119 +_ei 118 +toim 118 +op 118 +uut 118 +tet 118 +_i 118 +_ma 117 +vai 117 +lä_ 116 +u_ 116 +sy 116 +kau 116 +utta 116 +un_ 115 +eu 115 +ssä 115 +tti 115 +_sa 115 +mp 114 +eis 114 +ka_ 112 +että 112 +taa_ 111 +_et 111 +hu 111 +itu 111 +suu 111 +den_ 111 +ksen 110 +ap 110 +_ke 110 +uv 110 +tam 110 +yv 109 +aup 109 +stä_ 109 +asta 109 +äy 109 +kan 108 +nu 108 +ukse 108 +_toi 107 +ien 107 +hi 107 +iss 107 diff --git a/libtextcat/data/new_fingerprints/lm/french.lm b/libtextcat/data/new_fingerprints/lm/french.lm new file mode 100644 index 000000000000..5080d9cba9d5 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/french.lm @@ -0,0 +1,400 @@ +_ 20800 +e 7258 +i 4051 +s 4003 +a 3972 +n 3903 +r 3650 +t 3590 +u 2968 +o 2823 +l 2723 +e_ 2632 +d 2241 +s_ 1721 +_d 1693 +c 1663 +p 1528 +é 1320 +m 1297 +es 1164 +t_ 1106 +_l 1079 +de 1048 +on 959 +_de 940 +en 939 +_p 852 +nt 825 +le 808 +es_ 791 +re 777 +, 721 +,_ 720 +n_ 703 +de_ 685 +' 670 +an 667 +_de_ 645 +v 641 +_s 610 +r_ 596 +_c 594 +er 585 +ai 575 +_a 558 +_e 554 +ou 554 +q 549 +qu 538 +is 530 +te 528 +ti 525 +ur 519 +it 514 +g 498 +a_ 490 +f 480 +la 476 +in 475 +_le 441 +me 436 +nt_ 432 +. 427 +b 427 +ra 423 +io 416 +ent 415 +._ 404 +ne 395 +ns 392 +ion 383 +h 381 +ue 376 +se 371 +le_ 370 +ar 370 +ie 362 +co 361 +at 359 +tr 359 +et 349 +pr 342 +ce 336 +au 328 +u_ 321 +il 314 +_r 313 +_la 304 +un 303 +eu 303 +st 300 +re_ 296 +ro 290 +la_ 288 +on_ 287 +_m 286 +_la_ 283 +que 281 +_qu 280 +_q 280 +po 275 +tio 273 +tion 273 +pa 273 +li 271 +_t 269 +nc 268 +si 266 +_pr 265 +ri 264 +al 263 +ui 262 +_co 259 +i_ 255 +ta 255 +é_ 251 +x 247 +em 244 +l_ 243 +et_ 238 +_l' 236 +l' 236 +les 233 +ns_ 233 +ir 232 +_le_ 228 +ent_ 227 +or 226 +ré 224 +_f 224 +ne_ 222 +à 221 +ve 220 +ch 220 +it_ 219 +di 219 +oi 217 +- 216 +ni 215 +à _ 215 +les_ 215 +d' 214 +el 212 +ss 212 +_n 212 +ut 211 +our 210 +des 210 +" 208 +ur_ 207 +nd 207 +er_ 206 +ait 206 +ion_ 204 +rs 202 +_en 201 +_et 200 +j 200 +_d' 200 +ll 199 +_des 198 +des_ 197 +_pa 197 +té 196 +_et_ 195 +_à 195 +_à _ 195 +om 193 +ma 192 +ati 190 +_des_ 189 +L 188 +so 187 +_u 185 +è 184 +_" 183 +sa 182 +_po 181 +tre 181 +dé 181 +ue_ 180 +pe 179 +en_ 179 +ont 178 +_un 178 +_L 178 +us 176 +_les 176 +_les_ 176 +rt 176 +is_ 173 +_i 173 +du 172 +e,_ 171 +e, 171 +na 171 +s, 170 +s,_ 170 +as 169 +men 169 +M 167 +ait_ 167 +'a 166 +vi 162 +ci 159 +ant 158 +_au 158 +da 157 +_M 157 +ation 155 +atio 155 +con 154 +que_ 153 +ons 153 +eur 151 +est 149 +me_ 149 +mi 149 +par 148 +tion_ 148 +_so 147 +te_ 147 +res 144 +lo 144 +ment 144 +és 144 +ans 143 +_du 142 +du_ 141 +ux 141 +un_ 140 +y 138 +pro 138 +_du_ 136 +_dé 136 +ce_ 135 +_se 134 +_re 134 +pl 133 +A 132 +ge 131 +ic 131 +su 130 +x_ 129 +ien 129 +nce 129 +"_ 129 +ac 128 +il_ 128 +qui 128 +_pro 127 +no 127 +av 126 +_v 125 +_o 125 +rs_ 125 +ans_ 124 +eme 124 +bl 123 +emen 122 +_en_ 122 +iqu 122 +ct 122 +iq 122 +lle 122 +nn 121 +ts 121 +ement 121 +ét 120 +_"_ 120 +ér 119 +té_ 119 +_ce 119 +mp 119 +ire 119 +ui_ 119 +to 118 +he 117 +_é 117 +ca 117 +_j 116 +ec 116 +va 116 +_par 116 +ée 115 +_con 115 +se_ 114 +tre_ 113 +ique 112 +dan 111 +éc 111 +ha 110 +une 110 +P 110 +lu 110 +ux_ 109 +_b 108 +s. 108 +pou 108 +_pou 108 +ier 107 +C 107 +ais 106 +s._ 105 +ain 104 +_un_ 104 +nte 103 +'e 103 +mo 103 +mm 103 +ment_ 102 +une_ 102 +com 101 +_P 101 +'i 101 +_ma 100 +do 99 +ant_ 98 +anc 98 +che 97 +ap 97 +ont_ 97 +_que 97 +os 97 +urs 96 +_di 96 +fi 96 +im 96 +pour 96 +_pour 96 +ê 95 +ts_ 95 +_g 95 +our_ 94 +_sa 94 +ntr 94 +_da 94 +_ré 93 +rai 93 +rm 93 +_qui 93 +e. 92 +am 92 +_com 91 +uv 91 +_C 91 +D 91 +qui_ 90 +e._ 90 +pu 89 +_qui_ 88 +ia 87 +_dan 87 +_dans 87 +dans 87 +ter 87 +fo 87 +son 87 +dans_ 87 +id 86 +ag 86 +ine 86 +tu 85 +ran 85 +au_ 85 +ol 85 +oc 84 +est_ 84 +st_ 84 +enc 84 +F 82 +_tr 81 +'u 81 +tai 81 +ell 80 +R 79 +_su 79 +S 79 +ions 79 +pré 79 +sé 78 +ab 78 +né 77 +_que_ 77 +_in 77 +_av 76 +pour_ 76 +fa 76 +rr 76 +air 75 +_ch 75 +_a_ 75 +ba 74 +_pl 74 +gr 74 +tt 74 +ssi 74 +rd 73 +pas 73 +bi 73 diff --git a/libtextcat/data/new_fingerprints/lm/frisian.lm b/libtextcat/data/new_fingerprints/lm/frisian.lm new file mode 100644 index 000000000000..9efa35f370c8 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/frisian.lm @@ -0,0 +1,400 @@ +_ 46446 +e 15767 +n 9616 +i 7837 +a 7830 +t 7562 +r 7297 +s 6307 +o 4784 +n_ 4595 +d 4564 +e_ 4213 +l 4032 +k 3951 +t_ 3079 +en 2863 +m 2605 +er 2492 +y 2462 +f 2371 +_d 2253 +de 2124 +h 1943 +in 1899 +w 1898 +en_ 1857 +_i 1815 +u 1769 +g 1738 +an 1690 +j 1678 +p 1651 +r_ 1619 +_f 1602 +. 1601 +ar 1561 +te 1545 +b 1488 +s_ 1476 +._ 1435 +_s 1370 +de_ 1235 +_de 1224 +_w 1181 +it 1172 +ie 1140 +, 1078 +_o 1056 +,_ 1056 +oa 1045 +_e 1040 +st 1038 +an_ 1010 +_b 972 +_de_ 965 +ke 949 +_m 947 +_t 933 +ne 920 +er_ 895 +at 863 +sk 856 +c 850 +it_ 848 +_h 838 +ei 832 +k_ 817 +yn 790 +ch 784 +le 772 +is 769 +je 765 +el 761 +me 758 +ea 754 +_k 740 +fa 736 +in_ 735 +' 733 +_it 733 +_it_ 727 +ge 727 +re 725 +al 724 +_fa 684 +yn_ 640 +aa 628 +fan 626 +_y 618 +ar_ 617 +et 616 +ri 615 +_fan 613 +_n 612 +li 611 +_yn 610 +_en 598 +fan_ 594 +oe 589 +_fan_ 584 +_en_ 583 +at_ 581 +_in 570 +oar 565 +_in_ 560 +y_ 555 +F 549 +es 540 +_a 536 +ng 523 +be 514 +sj 512 +nt 510 +l_ 510 +_F 506 +ns 506 +D 499 +te_ 499 +der 497 +_yn_ 497 +ti 493 +ek 490 +ro 476 +rs 474 +rd 473 +se 470 +fo 470 +ys 462 +op 461 +û 461 +we 456 +ry 451 +da 445 +d_ 443 +is_ 442 +_D 440 +ur 433 +i_ 432 +ha 426 +_me 425 +ear 422 +_l 417 +ed 410 +as 409 +om 407 +ei_ 406 +Fr 405 +nd 404 +_fo 394 +_p 393 +oc 390 +rys 389 +ol 386 +_Fr 386 +och 383 +fe 381 +ik 379 +ma 379 +ra 377 +nn 374 +_g 368 +_da 367 +di 363 +ts 362 +ta 361 +a_ 360 +ko 359 +et_ 358 +ysk 356 +Fry 354 +Frys 354 +z 354 +ll 350 +_be 349 +ke_ 348 +I 347 +ing 346 +_' 345 +m_ 343 +h_ 340 +ske 339 +_ha 338 +sje 336 +_Fry 336 +_Frys 336 +wi 335 +_op 334 +p_ 334 +_is 333 +ch_ 333 +tr 330 +ten 328 +ers 327 +wa 325 +ter 322 +ji 322 +rysk 319 +_ne 319 +je_ 312 +foa 311 +ê 309 +jo 307 +_is_ 307 +ste 307 +_te 306 +â 301 +n. 301 +nne 300 +rt 300 +foar 299 +S 299 +mei 299 +_r 298 +_oa 297 +wu 293 +ak 291 +si 290 +wur 290 +ni 290 +pe 288 +Frysk 287 +oan 286 +n._ 285 +_mei 284 +der_ 281 +_foar 281 +_foa 281 +ht 278 +cht 277 +- 275 +ú 275 +_j 274 +ne_ 273 +ken 273 +fer 271 +ûn 270 +am 270 +on 268 +nde 264 +_S 264 +B 262 +ki 261 +id 261 +le_ 261 +dat 260 +v 259 +_I 256 +oar_ 255 +op_ 254 +lle 252 +la 244 +tt 243 +dat_ 242 +_fe 242 +mm 240 +inn 239 +_wi 239 +_dat 237 +g_ 236 +mei_ 236 +al_ 235 +_B 234 +e. 234 +ld 231 +_al 230 +_wur 230 +urd 230 +_wu 230 +_op_ 230 +De 229 +inne 228 +ng_ 227 +_mei_ 227 +'t 226 +ks 226 +'t_ 224 +_dat_ 224 +_ko 223 +_st 220 +ân 219 +rk 219 +sa 219 +e._ 219 +tte 218 +en. 218 +n, 217 +ier 216 +ten_ 216 +_fer 215 +mi 215 +hi 215 +ien 214 +_wa 213 +n,_ 213 +no 213 +_te_ 212 +ig 212 +_De 212 +ske_ 212 +_der 212 +W 211 +H 211 +_oan 210 +ee 209 +dy 208 +ek_ 207 +en._ 207 +ic 207 +mme 206 +yk 204 +pr 204 +net 203 +foar_ 203 +he 203 +wurd 201 +_der_ 199 +jen 199 +_dy 199 +kr 198 +ka 197 +im 196 +_H 196 +il 196 +ze 196 +_ma 195 +by 194 +oer 194 +kt 193 +us 193 +M 193 +sk_ 192 +wo 192 +_hi 191 +or 190 +ing_ 190 +_W 190 +ich 189 +De_ 187 +rr 186 +int 185 +_ú 184 +: 184 +ij 184 +_ek 183 +eg 182 +:_ 180 +gen 180 +as_ 180 +_se 179 +e, 178 +_net 177 +e,_ 177 +ins 177 +N 177 +ls 176 +st_ 176 +_wurd 176 +ie_ 175 +E 175 +nne_ 175 +_De_ 175 +sy 175 +wer 174 +gr 174 +f_ 173 +nk 172 +och_ 172 +net_ 169 +ad 169 +_we 169 +rde 168 +sl 168 +bi 168 +of 168 +so 168 +_no 167 +_ta 167 +re_ 167 +to 167 +den 167 +J 167 +t. 166 +It 165 +út 165 +inne_ 165 +ysk_ 165 +lik 164 +sp 163 +_ek_ 162 +ou 162 +tsj 162 +It_ 161 +_sa 160 +A 160 +wol 160 +lâ 159 +_wo 159 +ge_ 159 +lân 159 +ige 158 diff --git a/libtextcat/data/new_fingerprints/lm/georgian.lm b/libtextcat/data/new_fingerprints/lm/georgian.lm new file mode 100644 index 000000000000..0e88ab08a895 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/georgian.lm @@ -0,0 +1,400 @@ +_ 14926 +À 7221 +È 5780 +Ä 4137 +à 2966 +Ñ 2908 +Ë 2607 +Ê 2372 +à 2260 +Ã… 2080 +à 2078 +ÃŒ 2005 +Ç 1598 +à 1383 +Ó 1363 +È_ 1248 +À_ 1149 + 1145 +ÊÈ 1009 +Þ 974 +_Ë 965 +Ñ_ 964 +ÃÀ 912 +ÈÑ 901 +Àà 897 +Ø 820 +. 813 +Äà 767 +" 762 +._ 735 +, 720 +,_ 718 +ÑÀ 684 +_À 669 +Ú 653 +ËÀ 631 +ÀÌ 622 +ÄÊ 575 +ÃÈ 570 +ÅÄ 567 +Õ 551 +_à 550 +Ã’ 542 +_Ñ 526 +É 518 +ÀË 517 +ÅÀ 485 +ÅÈ 479 +ÂÀ 478 +_ 474 +ÓÊ 468 +_È 436 +ÃÀ 435 +ÀÅ 420 +ËÈ 419 +ÌÈ 418 +Äà 416 +ÄÌ 412 +ÈÑ_ 407 +_ÃÀ 404 +ÃÀ_ 393 +ÀÊ 384 +Ëà 382 +Ãœ 376 +_" 374 +ÊÈ_ 371 +× 369 +Ãà 364 +ÀÑ 360 +ÈÇ 358 +Àà 353 +ÌÀ 349 +Ô 349 +ÃÀ 342 +Æ 341 +ËÄ 335 +ÈÀ 334 +ÈÊ 332 +ÃÇ 326 +_à 322 +ÃÈ 321 +ØÈ 319 +_Ä 319 +_Ø 319 +ÃÄ 317 +_ÂÀ 316 +ÇÀ 316 +ÄÑ 306 +Ä_ 299 +_Ç 288 +ÃÄ 279 +ÓÊÈ 273 +Êà 271 +Ö 270 +ÃŒ_ 267 +ÌÄ 267 +_ÑÀ 266 +Ãà 263 +Óà 260 +Ç_ 256 +ÄÊÈ 255 +ÃÈ 238 +ÊÄ 238 +ÑÈ 234 +ÊÀ 233 +Ãà 230 +ÈÌ 229 +_Þ 227 +"_ 225 +ÄÃÈ 224 +: 224 +:_ 223 +È. 221 +_ÃÀ_ 217 +Û 215 +ÞÄ 213 +È, 213 +È,_ 212 +_Ó 211 +Ã_ 209 +_ËÀ 208 +ÈÑÀ 208 +ÃË 206 +_Àà 204 +ÇÅ 203 +ÀÃÇ 203 +ØÄ 203 +È._ 201 +À. 200 +- 193 +ÀÞ 192 +ÅÄÊ 192 +Ú_ 189 +Èà 188 +Ù 188 +ÕÀ 187 +Ã_ 185 +ÈÊÈ 183 +Äà 179 +À._ 177 +Ã_ 177 +Ãà 174 +ÞÀ 174 +Èà 173 +Ìà 172 +_Ãà 172 +ÃÑ 172 +ÄÃÀ 171 +_Ëà 170 +ÃÈ_ 170 +_ËÈ 170 +_Ãœ 169 +ÇÈ 166 +ÃÄ 166 +_Õ 165 +ØÈ_ 165 +ÄÑ_ 163 +ÀÈ 162 +_ØÄ 160 +ÄÅ 158 +_É 155 +ÀÚ 154 +ÊÈÑ 153 +ÃŽ 151 +Ë_ 149 +ÕÀà 148 +À,_ 147 +À, 147 +ÀÇ 147 +À 145 +ÕÀÃÇ 145 +È 143 +ËÀà 143 +ÃÊ 143 +ÀÊÈ 142 +ÂÈ 142 +ÌÄà 141 +ÄË 140 +ÀØ 139 +ÓÊÈ_ 139 +ÄÇ 139 +ÇÓ 138 +_ÃŒ 136 +ÈÇ_ 135 +ÀÃÈ 133 +ÀÌ_ 132 +ÊÄà 131 +×à 130 +ÑÄ 130 +ÈË 130 +ÃÄà 128 +ÀÕ 125 +ÆÄ 125 +ÔÄ 125 +Ê_ 124 +ÀÅÈ 124 +ÀÃ_ 124 +ÅÈÑ 123 +_ËÄ 123 +ÀËÈ 122 +_ÕÀ 121 +ÉÈ 121 +_Ã… 120 +×Å 120 +ÃÃ… 116 +_à 114 +ÀÌÈ 113 +ÀËà 113 +ÂÀÌ 113 +ÃÃŒ 112 +ÉÀ 112 +ÈÚ 112 +ØÀ 112 +ÀÃÀ 111 +ÃÓ 111 +ÞÅ 109 +ÀÖ 109 +Âà 107 +ÃÀ_ 107 +ÌÈ_ 107 +ÅÀà 107 +ÄÊÈ_ 106 +Ãà 106 +ÀÃ_ 106 +_ÀË 105 +ß 104 +ÄÃÇ 104 +_Äà 104 +ÅÄÌ 103 +_ÄÑ 103 +ÃÃË 103 +ÄÌÈ 103 +_Ú 103 +Ãà 102 +_Ù 102 +ËÈÑ 102 +ÃÈÑ 102 +Þà 102 +_ÃÃË 101 +_ÕÀà 101 +ÈÅ 100 +_ÕÀÃÇ 100 +ÓÃÈ 99 +ÒÈ 99 +ÂÄ 99 +ÈÒ 99 +ÀÀ 97 +ÀÒ 97 +ÃÓÊ 96 +ÕÅ 94 +ÈÄ 94 +_ÇÀ 94 +Ñ,_ 93 +ÃÑ 93 +ÅÊ 93 +_ØÀ 93 +Ñ, 93 +_ÑÈ 93 +ÀÉ 93 +ÀÆ 92 +ÃÃÀ 92 +ÀÑ_ 92 +Ìà 91 +ÄÃÓ 91 +ÇÕ 91 +ËÓ 90 +ÄÌ_ 90 +ÇÀÅ 89 +ÄÃÓÊ 89 +ÊÑ 89 +ÀØÈ 89 +ÃÄà 89 +Àà 89 +È" 89 +Ñ. 88 +ÚÞ 88 +ÂÀË 88 +ÃÑ_ 87 +_ÄÃÇ 87 +È× 87 +ÃÈÑ 87 +ÌÃÀ 87 +ØÅ 87 +ÞÄÊ 87 +ÃÈÇ 85 +ÑÀ_ 85 +ÇÅÄ 85 +ÓÌ 85 +ÒÀ 85 +_ÃŽ 84 +ÊÈ. 84 +_ÃÀ 83 +Ñ._ 83 +_Ô 83 +_ÂÀË 83 +ÊÈÀ 83 +ÊÈ._ 82 +ÄÃÈ 82 +ÈÀ_ 82 +ÈÀÌ 82 +ÜÈ 81 +ÀÚ_ 81 +"Ë 81 +ÈÓ 80 +_"Ë 80 +ÃÇÅ 80 +_ÄÑ_ 79 +_È 79 +ÀÓ 79 +ÈÕ 79 +ÀÃÇÅ 79 +ÇÈ_ 79 +ÑÀÞ 79 +ÃÇÓ 78 +ÊÈ, 78 +ÚÈ 78 +ÞÈ 78 +ÃÇÓÊ 78 +ÇÓÊ 78 +ÊÈ,_ 78 +ÀÃÀ 78 +ÃÑ 78 +_ÀÃ_ 77 +ÞÊ 77 +ÃÓ 77 +ÀÃÇÓÊ 77 +_ÌÀ 77 +ÅÈÇ 77 +ÈÂÈ 77 +ÀÃÇÓ 77 +ÜÀ 76 +ÅÀ_ 75 +_ÞÀ 75 +ÉÅ 75 +ÒÄ 75 +ÃÀ_ 75 +ÅÀÊ 75 +ÇÅÄÊ 74 +ÃÇÅÄ 74 +ÀÃÇÅÄ 74 +ÄÃÀ_ 74 +ÃË 73 +ÑÞ 73 +ÑÒ 73 +ÅÑ 73 +ÑÓ 73 +ÃÇÅÄÊ 73 +ËÞ 73 +ÃÊà 72 +ÃÚ 72 +_× 72 +Âà 71 +ÅÄ_ 71 +Ã’Ã 71 +ÇÄ 71 +à71 +ÅÄà 71 +ÕÀÃÇÓ 71 +ÃÀà 70 +ÀË_ 70 +Åà 69 +ËÀ_ 69 +ÃÄ 69 +_ÀÃÀ 67 +à 67 +ÄÃÀ 67 +ÇÀÌ 67 +È×à 67 +ÕÀÃÇÅ 67 +Ò× 67 +ÂÀËà 66 +_ÂÀËà 66 +ÄÒ 66 +ÃÃÀ 66 +ÅÄÊÈ 66 +ÀÌÀ 66 +ÄÃÈÇ 66 +_ÈÂÈ 66 +ÄÊà 65 +Ä 65 +ËÈÊ 65 +ÊË 65 +ÈÊÈ_ 65 +ÃÃÈ 65 +Ã…Ã 65 +_ÈÑ 64 +Ó_ 63 +Ëà 63 +", 63 +Ãà 63 +ÊÃà 63 +ÂÅ 63 +à63 +Äà 62 +À" 62 +ÀÅÀ 62 +? 62 +",_ 62 +! 62 +_È× 62 diff --git a/libtextcat/data/new_fingerprints/lm/german.lm b/libtextcat/data/new_fingerprints/lm/german.lm new file mode 100644 index 000000000000..eb4eda0f8239 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/german.lm @@ -0,0 +1,400 @@ +_ 31586 +e 15008 +n 9058 +i 7299 +r 6830 +t 5662 +s 5348 +a 4618 +h 4176 +d 4011 +er 3415 +en 3412 +u 3341 +l 3266 +n_ 2848 +c 2636 +ch 2460 +g 2407 +o 2376 +e_ 2208 +r_ 2128 +m 2077 +_d 1948 +de 1831 +en_ 1786 +ei 1718 +er_ 1570 +in 1568 +te 1505 +ie 1505 +b 1458 +t_ 1425 +f 1306 +k 1176 +ge 1144 +s_ 1137 +un 1113 +, 1104 +,_ 1099 +w 1099 +z 1060 +nd 1039 +he 1004 +st 989 +_s 952 +_de 949 +. 909 +_e 906 +ne 906 +der 880 +._ 847 +be 841 +es 829 +ic 796 +_a 791 +ie_ 779 +is 769 +ich 763 +an 755 +re 749 +di 732 +ein 730 +se 730 +" 720 +ng 709 +_i 706 +sc 683 +sch 681 +it 673 +der_ 652 +h_ 651 +ch_ 642 +S 630 +le 609 +p 609 +ä 607 +ü 603 +au 603 +v 602 +che 599 +_w 596 +d_ 585 +die 576 +_di 572 +m_ 562 +_die 559 +el 548 +_S 540 +_der 529 +li 527 +_der_ 523 +si 515 +al 514 +ns 507 +on 501 +or 495 +ti 490 +ten 487 +ht 486 +die_ 485 +_die_ 483 +D 479 +rt 478 +nd_ 476 +_u 470 +nt 468 +A 466 +in_ 464 +den 461 +cht 447 +und 443 +me 440 +_z 429 +ung 426 +ll 423 +_un 421 +_ei 419 +_n 415 +hr 412 +ine 412 +_A 408 +_ein 405 +ar 404 +ra 403 +_v 400 +_g 400 +as 395 +zu 392 +et 389 +em 385 +_D 380 +eine 376 +gen 376 +g_ 376 +da 368 +we 366 +K 365 +lt 360 +B 354 +_" 353 +nde 349 +ni 347 +und_ 345 +E 345 +ur 345 +_m 342 +ri 341 +ha 340 +eh 339 +ten_ 338 +es_ 336 +_K 336 +_und 335 +ig 335 +_b 335 +hen 334 +_und_ 332 +_au 329 +_B 327 +_da 325 +_zu 324 +_in 322 +at 321 +us 318 +wi 307 +n, 305 +n,_ 304 +nn 304 +te_ 301 +eit 301 +_h 300 +ter 299 +M 298 +n. 295 +ß 294 +ng_ 289 +sche 289 +- 283 +rs 282 +den_ 282 +_si 280 +G 280 +im 278 +_ge 277 +chen 276 +rd 273 +_E 273 +n._ 270 +icht 270 +rn 268 +uf 267 +isch 264 +isc 264 +nen 263 +_in_ 262 +_M 260 +_er 257 +ich_ 255 +ac 253 +lic 252 +_G 252 +ber 252 +la 251 +vo 251 +eb 250 +ke 249 +F 248 +as_ 248 +hen_ 248 +ach 245 +en, 244 +ung_ 243 +lich 243 +ste 243 +en,_ 243 +_k 241 +ben 241 +_f 241 +en. 241 +_be 239 +it_ 239 +L 238 +_se 237 +mi 236 +ve 236 +na 236 +on_ 236 +P 235 +ss 234 +ist 234 +ö 234 +ht_ 233 +ru 233 +st_ 229 +_F 229 +ts 227 +ab 226 +W 226 +ol 225 +_eine 225 +hi 225 +so 224 +em_ 223 +"_ 223 +ren 222 +en._ 221 +chen_ 221 +R 221 +ta 221 +ere 220 +ische 219 +ers 218 +ert 217 +_P 217 +tr 217 +ed 215 +ze 215 +eg 215 +ens 215 +ür 213 +ah 212 +_vo 212 +ne_ 211 +cht_ 210 +uc 209 +_wi 209 +nge 208 +lle 208 +fe 207 +_L 207 +ver 206 +hl 205 +V 204 +ma 203 +wa 203 +auf 201 +H 198 +_W 195 +T 195 +nte 193 +uch 193 +l_ 192 +sei 192 +nen_ 190 +u_ 189 +_den 189 +_al 189 +_V 188 +t. 188 +lte 187 +ut 186 +ent 184 +sich 183 +sic 183 +il 183 +ier 182 +am 181 +gen_ 180 +sen 179 +fü 178 +um 178 +t._ 177 +f_ 174 +he_ 174 +ner 174 +nst 174 +ls 174 +_sei 173 +ro 173 +ir 173 +ebe 173 +mm 173 +ag 172 +ern 169 +t,_ 169 +t, 169 +eu 169 +ft 168 +icht_ 167 +hre 167 +Be 166 +nz 165 +nder 165 +_T 164 +_den_ 164 +iche 163 +tt 163 +zu_ 162 +and 162 +J 161 +rde 160 +rei 160 +_we 159 +_H 159 +ige 159 +_Be 158 +rte 157 +hei 156 +das 155 +aus 155 +che_ 154 +_das 154 +_zu_ 154 +tz 154 +_ni 153 +das_ 153 +_R 153 +N 153 +des 153 +_ve 153 +_J 152 +I 152 +_das_ 152 +men 151 +_so 151 +_ver 151 +_auf 150 +ine_ 150 +_ha 150 +rg 149 +ind 148 +eben 148 +kt 147 +mit 147 +_an 147 +her 146 +Ge 146 +Sc 145 +_sich 145 +U 145 +Sch 145 +_sic 145 +end 145 +Di 144 +abe 143 +ck 143 +sse 142 +ür_ 142 +ell 142 +ik 141 +o_ 141 +nic 141 +nich 141 +sa 141 +_fü 140 +hn 140 +zi 140 +no 140 +nicht 140 +im_ 139 +von_ 139 +von 139 +_nic 139 +_nich 139 +eine_ 139 +oc 138 +wei 138 +io 138 +schen 138 +gt 138 diff --git a/libtextcat/data/new_fingerprints/lm/greek.lm b/libtextcat/data/new_fingerprints/lm/greek.lm new file mode 100644 index 000000000000..6dff6cd4f767 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/greek.lm @@ -0,0 +1,400 @@ +_ 89284 +α 19666 +Ï„ 16086 +ο 15826 +ε 14848 +ι 12766 +ν 12189 +Ï€ 8776 +σ 8653 +Ï 8399 +κ 7761 +μ 7529 +Ï… 6850 +ά 6284 +_Ï„ 5918 +λ 5802 +Ï‚ 5371 +α_ 5272 +η 5236 +Î 4739 +ί 4609 +ÏŒ 4489 +το 4461 +Ï‚_ 4111 +ου 4075 +ι_ 4033 +ε_ 4019 +. 3916 +_κ 3897 +ο_ 3836 +._ 3810 +ν_ 3661 +_Ï€ 3414 +_σ 3333 +" 3247 +_μ 3242 +_το 3118 +, 3106 +,_ 3068 +γ 3058 +_α 2912 +ω 2689 +να 2593 +δ 2455 +τα 2420 +χ 2411 +κα 2406 +στ 2406 +_ε 2353 +- 2335 +-_ 2192 +Ï 2171 +αν 2162 +τη 2151 +ή 2141 +Ï…_ 2133 +αι 2100 +θ 2053 +φ 1989 +ου_ 1974 +ει 1889 +_κα 1882 +εί 1864 +το_ 1802 +πο 1771 +αι_ 1727 +σε 1709 +_ν 1647 +_" 1619 +η_ 1609 +ια 1602 +να_ 1518 +τι 1501 +ον 1501 +του 1495 +με 1460 +_του 1407 +_Î 1405 +_στ 1396 +ÏŽ 1364 +πε 1359 +τε 1300 +μα 1298 +β 1274 +Ïι 1252 +ÏŒ_ 1241 +_δ 1231 +ξ 1223 +ντ 1220 +_το_ 1196 +απ 1187 +Ïο 1184 +_γ 1183 +_τη 1177 +ζ 1158 +_εί 1150 +ά_ 1150 +_να 1124 +Îν 1115 +και 1110 +_και 1093 +αν_ 1082 +Ïα 1078 +και_ 1061 +_να_ 1060 +_και_ 1053 +μπ 1049 +νο 1048 +ατ 1036 +ιο 1024 +ια_ 1018 +ÎµÏ 1003 +Î¿Ï 993 +_απ 984 +σε_ 974 +ικ 973 +_ο 967 +εν 947 +ος 936 +ει_ 935 +πό 901 +λο 892 +_με 890 +νε 884 +του_ 871 +ον_ 869 +ας 865 +_του_ 854 +ασ 841 +με_ 840 +σα 834 +κο 833 +Î±Ï 832 +Ï€Ï 824 +ίν 820 +κά 808 +_πο 804 +πι 796 +Κ 796 +μΠ783 +μο 777 +ÎÏ 772 +αλ 766 +ην 762 +Îœ 759 +ισ 745 +κε 742 +τα_ 740 +στο 738 +ω_ 730 +ην_ 728 +Ο 717 +Τ 714 +_φ 711 +ετ 705 +δε 704 +πα 697 +ας_ 688 +τά 684 +ος_ 683 +_ÏŒ 680 +_Κ 675 +οι 671 +_χ 670 +την 663 +την_ 658 +_στο 647 +ή_ 645 +πό_ 638 +_θ 633 +_ο_ 631 +", 630 +_Ï€Ï 626 +_Îœ 624 +ίπ 624 +άν 623 +",_ 623 +από 620 +που 619 +ότ 618 +λα 617 +τον 617 +_από 616 +μι 612 +Ο_ 611 +Ïε 607 +Ï…Ï„ 604 +λε 595 +_λ 594 +ÏÏŒ 590 +_με_ 586 +ιά 580 +τον_ 577 +γι 577 +_Ο 575 +από_ 572 +Î¬Ï 570 +πά 570 +_από_ 569 +Ïά 562 +ταν 554 +Îνα 553 +υν 552 +που_ 551 +δι 547 +Ï„Ï 547 +Ï„ÏŒ 544 +_β 540 +χε 536 +εν_ 534 +ησ 528 +_Ο_ 525 +ης 520 +". 519 +_που 516 +_Τ 515 +"._ 513 +τη_ 512 +Ï‚. 510 +είπ 508 +ταν_ 504 +_είπ 503 +Ï‚._ 501 +_τα 500 +ξε 497 +στο_ 496 +λλ 493 +ακ 492 +Α 491 +_που_ 490 +άλ 489 +ίπε 489 +είπε 485 +_είπε 483 +_την 483 +τι_ 482 +_την_ 481 +άτ 480 +λι 480 +_δε 479 +άμ 477 +_στο_ 475 +σο 473 +_ά 468 +"_ 463 +μÎν 463 +ιν 461 +ις 452 +_αν 452 +κό 451 +αυ 451 +_τον 450 +_γι 449 +θα 447 +ες 446 +ση 446 +_μι 445 +_τον_ 443 +ε. 441 +ε._ 441 +ους 439 +λά 439 +Ï…Ï‚ 439 +ολ 438 +πιο 437 +Î¿Ï 432 +_πα 428 +_κά 427 +τε_ 427 +αμ 423 +; 423 +_μο 422 +σκ 421 +της 421 +_ξ 418 +στη 415 +Κά 411 +νη 408 +για 405 +α. 403 +ÎºÏ 402 +κι 402 +Ï‚, 402 +Ï‚,_ 401 +α._ 398 +ηκ 397 +_Κά 397 +ελ 396 +_Îν 395 +_ή 393 +_μα 392 +ία 391 +ως 391 +λη 390 +ίνα 389 +πί 389 +μου 388 +μά 388 +_αυ 387 +ης_ 386 +συ 384 +ναι 384 +Î 383 +αυτ 382 +ί_ 376 +μπι 375 +ίσ 372 +_της 370 +_τα_ 367 +_για 365 +_Îνα 362 +_μπ 361 +θε 361 +ιον 359 +ις_ 358 +τή 358 +_θα 354 +_αυτ 354 +άμπ 352 +κ. 352 +κ._ 351 +είν 351 +ομ 350 +ίναι 348 +ντα 348 +ναι_ 348 +ως_ 347 +χα 346 +Ε 346 +Ï…. 346 +για_ 346 +Ï…._ 345 +δεν 345 +Îνα_ 345 +α, 344 +α,_ 344 +δεν_ 344 +ÏÏ 343 +όν 343 +α- 342 +Ïσ 341 +_κ. 340 +στε 339 +Κάμ 339 +_κ._ 339 +Ï„Î 339 +α-_ 338 +ευ 338 +ιλ 338 +Ïί 338 +Κάμπ 338 +_μου 338 +_Κάμ 338 +_Κάμπ 337 +υμ 336 +σει 336 +πιον 336 +μπιον 336 +μπιο 336 +_κο 334 +Κάμπι 334 +_η 334 +άμπι 334 +θα_ 333 +νι 332 +της_ 331 +ψ 331 +όμ 330 +ησε 330 +_σα 329 +μα_ 328 +ός 328 +Σ 326 +_δεν 325 +_δεν_ 325 +σμ 324 +ες_ 324 +Ï€ÎµÏ 324 +ίχ 323 +ίναι_ 323 +τικ 322 +_Ï 321 +άμπιο 321 +Ï…Ï‚_ 321 +ους_ 321 +_πε 321 +σω 320 +_για_ 320 +Ï€Ïο 320 +γε 318 +;" 316 +;"_ 316 +Δ 315 +Ï„Ï… 314 +Ïα_ 313 +_συ 312 +Ïω 312 +_θα_ 310 +όσ 309 +ου. 309 diff --git a/libtextcat/data/new_fingerprints/lm/hebrew.lm b/libtextcat/data/new_fingerprints/lm/hebrew.lm new file mode 100644 index 000000000000..31b4ee0af280 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/hebrew.lm @@ -0,0 +1,400 @@ +_ 81560 +×™ 23072 +ו 19215 +×” 15606 +ל 12612 +ר 11293 +ת 11070 +מ 10090 +ב 9648 +× 9601 +ש 9081 +×”_ 7811 +× 6685 +×¢ 6326 +_ת 5878 +× 5542 +ד 5019 +×— 4669 +_× 4570 +ב_ 4267 +×› 3984 +_×” 3812 +ק 3769 +פ 3699 +ל_ 3693 +××™ 3535 +מ_ 3489 +ס 3374 +×_ 3362 +תו 3130 +, 3067 +_ל 3037 +_, 3035 +_×™ 2936 +ש_ 2886 +_××™ 2840 +×’ 2768 +ט 2626 +ן 2424 +_ר 2395 +_תו 2291 +. 2256 +צ 2215 +_. 2205 +×™× 2144 +×¢_ 2003 +_ן 1959 +_ו 1913 +" 1815 +יב 1797 +לש 1742 +יל 1687 +יר 1665 +×™_ 1647 +וי 1620 +ו_ 1578 +ור 1525 +×ª× 1475 +×›_ 1469 +רו 1411 +×– 1383 +×•× 1353 +מה 1351 +תי 1343 +×™×™ 1343 +לו 1315 +יד 1285 +רי 1236 +מה_ 1211 +הל 1206 +ומ 1192 +× ×™ 1163 +רש 1155 +×™× 1146 +×ר 1138 +_×ª× 1137 +שי 1134 +יש 1087 +× _ 1080 +×œ× 1074 +וח 1067 +_× 1062 +שמ 1059 +ימ 1052 +×”×™ 1047 +- 1024 +לש_ 1008 +וה 973 +רב 967 +×•× 954 +ת×_ 951 +ול 948 +_ד 941 +×™×” 896 +× ×• 888 +וע 883 +×™×› 873 +וש 871 +לע 867 +×ל 850 +עו 843 +_- 830 +×—_ 830 +דו 824 +ןו 820 +ר_ 806 +וב 805 +_לש 799 +יט 784 +××” 773 +_לש_ 748 +×™×¢ 746 +די 743 +_ש 726 +_ת×_ 723 +ך 720 +_תי 719 +-_ 716 +_-_ 713 +בי 709 +בו 706 +ות 699 +××”_ 690 +רמ 686 +שה 683 +וד 678 +×™×— 675 +פ_ 672 +×”× 669 +_ב 668 +_×¢ 659 +_ך 655 +יס 652 +ןי 649 +_לע 640 +יו 635 +×מ 635 +יב_ 632 +ת_ 631 +×ž× 628 +שו 627 +_ןו 624 +לי 624 +לע_ 621 +תה 619 +ית 600 +הל_ 599 +וכ 599 +יפ 596 +פה 595 +וק 586 +הש 578 +×¢×™ 575 +_ןי 569 +מו 564 +_לע_ 561 +קי 560 +×™×’ 557 +×™×”_ 557 +רשי 554 +×ו 548 +×ל_ 548 +תוי 548 +ל×ר 546 +×¨× 542 +הר 540 +"_ 540 +מב 539 +שה_ 538 +ופ 538 +×רש 535 +רע 534 +×—×” 533 +וג 532 +×רשי 530 +ל×רשי 530 +ל×רש 530 +×ו 527 +מי 525 +_×™×› 518 +המ 518 +פה_ 511 +×™×›_ 510 +_×™× 509 +לכ 506 +תמ 502 +מב_ 500 +סו 498 +×—× 497 +יק 497 +וו 494 +_ק 485 +×’_ 481 +×יל 477 +_×”×™ 477 +דמ 472 +בה 470 +,×” 470 +_,×” 470 +ק_ 469 +עב 468 +_×ו 467 +הב 467 +×—×”_ 466 +_×™×›_ 463 +×ב 462 +רח 462 +_×ל 461 +×¥ 455 +מל 454 +×™× ×™ 454 +×©× 453 +רה 453 +יצ 452 +×יר 451 +_×”× 447 +טי 443 +ד_ 441 +מע 440 +××™×™ 439 +וה_ 439 +' 435 +×מ_ 430 +.× 429 +×ª× 429 +_.× 428 +רק 427 +תר 423 +וס 417 +× ×© 417 +_הל 414 +סל 413 +× ×ª 408 +ס_ 405 +סה 400 +לפ 400 +בש 399 +,× 399 +_,× 399 +ממ 397 +שי_ 396 +× ×‘ 396 +×¢×” 394 +תה_ 393 +×ª×•× 393 +××™× 389 +_רו 385 +×¨×ž× 384 +×™×_ 384 +לב 384 +תב 381 +בר 378 +בה_ 377 +טס 374 +_×¥ 374 +עמ 374 +×—×™ 373 +רפ 373 +הו 371 +חו 370 +בל 370 +_×œ× 370 +קו 367 +_הר 366 +_×יר 364 +חמ 363 +×–_ 362 +_×— 360 +× ×› 360 +_לו 360 +כו 359 +,ת 358 +_,ת 357 +מת 356 +×™× ×• 353 +ורי 353 +ו×_ 349 +רד 348 +תור 348 +××™ 345 +×£ 345 +לשמ 344 +×›×™ 340 +_×יל 340 +וצ 338 +תל 338 +_××™×™ 336 +×¨×—× 335 +_תוי 335 +מ×_ 332 +לח 331 +_תר 329 +_×ª×•× 325 +×’×™ 325 +×¢×”_ 325 +: 324 +פל 324 +×ב_ 324 +שר 322 +רט 321 +תש 320 +צ_ 320 +מע_ 319 +וי_ 319 +_: 319 +צמ 316 +שממ 315 +_ס 315 +×ª×™× 315 +סמ 315 +הד 313 +רה_ 312 +וט 312 +_×£ 310 +ש×ר 309 +רשי_ 305 +×רשי_ 305 +.××™ 305 +_.××™ 304 +הת 303 +יטס 303 +_' 303 +×’×” 302 +שמ_ 302 +_××™× 301 +לה 298 +רג 294 +חט 293 +דע 293 +×יד 292 +×ש 292 +לשממ 292 +××¢ 292 +ידי 290 +של 289 +פו 289 +דב 289 +צו 287 +_××¢ 286 +,××™ 285 +_,××™ 285 +× ×ž 284 +סי 282 +שב 282 +_רש 281 +דר 281 +_תור 281 +קה 280 +תוש 278 +הש_ 276 +מל_ 276 +_×ל_ 275 +והי 274 +_ל×ר 274 +פי 274 +עב_ 271 +ל×_ 271 +×‘× 270 +×—×_ 269 +חל 268 +עפ 267 +בע 267 +.×” 266 +_רי 266 +_ל×רש 266 +×•× ×™ 266 +_.×” 265 +יטסל 265 +טסל 265 +×™×œ× 265 +תע 264 +× ×” 263 +קל 262 +× ×™×˜ 260 +_.ת 260 +.ת 260 +פל_ 260 +הו_ 259 +סלפ 258 +טסלפ 258 +יטסלפ 258 +× ×™×˜×¡×œ 257 +× ×™×˜×¡ 257 +ומ_ 256 +סה_ 256 +מש 255 +ירו 255 +× ×©_ 254 +ומת 254 +×”×” 252 +בק 251 +יש_ 251 +_×•× 251 +ירב 251 +_רב 249 +_יד 249 +_×יד 249 +×›×” 248 +×’× 247 +_דו 247 diff --git a/libtextcat/data/new_fingerprints/lm/hindi.lm b/libtextcat/data/new_fingerprints/lm/hindi.lm new file mode 100644 index 000000000000..3b4e1584a943 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/hindi.lm @@ -0,0 +1,400 @@ +_ 75620 +æ 19109 +U 16333 +ð 11131 +¤ 11107 +· 10241 +·¤ 7855 +Ú 6993 +ÚU 6598 +ç 6322 +è 6151 +Ù 5887 +_· 5800 +ã 5370 +â 5168 +U_ 5118 +æ_ 4935 +ÃŒ 4508 +× 4467 +Ø 4292 +ð_ 3962 +_·¤ 3937 +è_ 3904 + 3718 +¤_ 3609 +Ãœ 3563 +ãU 3514 +_ç 3394 +´ 3363 +ß 2962 +Â¥ 2907 +ÚU_ 2851 +_Â¥ 2668 +_ã 2632 +_â 2575 +ô 2527 +ÃŽ 2513 +´_ 2512 +à 2451 +Uæ 2336 +Õ 2314 +_ 2280 +_× 2260 +ñ 2187 +¢ 2082 +» 2078 +¤æ 1988 +ð´ 1913 +·¤æ 1886 +·ð 1881 +ð¤ 1877 +·ð¤ 1860 +à 1842 +Ã_ 1830 +·¤_ 1746 +é 1714 +æð 1703 +ð¤_ 1695 +·ð¤_ 1694 +æÚ 1631 +ü 1610 +_·ð 1579 +_·ð¤ 1567 +ð´_ 1556 +æÚU 1536 +Øæ 1528 +Ùð 1525 +Uè 1515 +_·ð¤_ 1488 +Ù_ 1411 +Ùð_ 1407 +à 1390 +_ãñ 1357 +ãñ 1357 +Ã… 1337 +ÃŒ_ 1319 +_Õ 1315 +×ð 1300 +ç· 1279 +Ö 1270 +_à 1258 +_·¤æ 1209 +ç·¤ 1203 +¤è 1195 +_Ù 1187 +° 1181 +§ 1166 +×ð´ 1163 +à æ 1162 +ý 1157 +¿ 1155 +_×ð 1150 +×ð´_ 1140 +·¤è 1127 +¤è_ 1092 +_×ð´ 1086 +_×ð´_ 1077 +ñU 1075 +_ãU 1071 +·¤è_ 1065 +ãñU 1062 +_ãñU 1062 +æÙ 1053 +¥æ 1052 +Ìæ 1038 +¤Ú 1035 +_ç· 1009 +¤ÚU 972 +âð 970 +_ç·¤ 967 +_·¤è 959 +ÚUæ 936 +ãUæ 930 +_·¤è_ 929 +·¤Ú 928 +à 926 +_¥æ 904 +Ã…U 904 +Ç 904 +© 890 +ê 887 +©U 885 +·¤ÚU 866 +_© 865 +_©U 862 +_Ö 854 +âð_ 850 +_Ú 847 +_ÃŒ 837 +S 834 +Uà 831 +_ÃŽ 830 +UÃ_ 823 +_ß 798 +Ùæ 788 +â_ 783 +_ÚU 773 +Uè_ 770 +ô´ 753 +æØ 752 +à 749 +Øæ_ 732 +×æ 732 +ô_ 704 +_§ 704 +Ãæ 704 +_âð 697 +Âý 695 +ãUè 693 +¤æ_ 690 +ü_ 688 +æÚU_ 685 +, 684 +_» 682 +·¤æ_ 679 +,_ 674 +_·¤Ú 670 +¹ 665 +ðU 664 +ßæ 648 +_Âý 647 +UÃŒ 644 +Ãœ_ 643 +_Ãœ 641 +ç·¤_ 639 +Ø_ 639 +Üæ 633 +_âð_ 633 +æç 623 +Uô 621 +ô´_ 615 +_·¤ÚU 611 +Uæ_ 599 +ãU_ 595 +Üð 594 +UÙ 589 +ñUà 580 +_ãñUà 580 +ãñUà 580 +_° 576 +_Ãæ 573 +ñUÃ_ 572 +ãñUÃ_ 572 +æÜ 569 +_Ø 569 +_Ùð 569 +ÂÚ 561 +_ç·¤_ 557 +‡ 556 +¤ô 552 +ææ 550 +ÂÚU 549 +çß 544 +Õæ 538 +_·¤æ_ 535 +×_ 532 +çÜ 525 +âæ 523 +·¤ô 519 +æð_ 502 +æ¢ 501 +¸ 498 +_Ùð_ 495 +‹ 494 +_ÂÚ 493 +Âæ 493 +Ìæ_ 490 +_ÂÚU 485 +çÙ 484 +õ 481 +È 478 +‡æ 469 +. 459 +ÂÚU_ 458 +Öæ 449 +Øð 449 +_çß 445 +§â 444 +¤ÚU_ 443 +Öè 442 +_§â 440 +_ÂÚU_ 439 +æð´ 437 +Ùæ_ 435 +€ 434 +_¿ 433 +ÚUè 431 +⢠431 +_·¤ô 430 +Îð 427 +æÌ 425 +ÃŽ_ 420 +Öè_ 419 +¸U 415 +˜ 412 +˜æ 412 +§ü 410 +´U 405 +ÇU 399 +Ⱦ 399 +·¤ÚU_ 394 +Uã 394 +æÙ_ 393 +çÌ 393 +¤ô_ 392 +·¤ô_ 392 +çÎ 389 +ÚUÃŒ 385 +æà 385 +Ǹ 383 +æð´_ 382 +Ìè 381 +Ãæ 379 +çÚ 375 +°_ 374 +ãUô 374 +ÚUã 374 +æà 373 +æÃ_ 372 +_·¤ô_ 371 +_Õæ 369 +æè 364 +çÚU 364 +ðU_ 362 +¤æð 358 +Ȥ 357 +Uæð 354 +è´ 353 +â· 352 +ß_ 350 +U· 349 +¤æÚ 346 +Ìð 346 +·¤æð 346 +æü 345 +õÚ 344 +õÚU 342 +·¤æÚ 342 +_×æ 341 +_Öè 341 +_çÜ 340 +ñU_ 337 +_ãñU_ 337 +ÿ 337 +ãñU_ 337 +Ùè 336 +ãUè_ 334 +¿æ 334 +ñ´ 334 +_Öè_ 332 +æ× 327 +¤æÚU 327 +ÿæ 326 +_à 325 +U·¤ 323 +·¤æÚU 323 +Uè´ 322 +ãUè´ 321 +_âæ 320 +ǸU 319 +_¥õ 319 +¥õ 319 +õÚU_ 319 +_ÚUã 318 +Úð 317 +è´_ 316 +_⢠316 +æ· 313 +Øô 310 +_ãUæ 309 +Øã 309 +À 308 +ØãU 308 +_Øã 308 +_ØãU 307 +_·¤æð 304 +_¥õÚ 304 +¥õÚ 304 +_¥õÚU 304 +_Öæ 304 +¥õÚU 304 +¥õÚU_ 303 +_çÙ 303 +ãUè´_ 300 +Uè´_ 300 +_S 300 +Îæ 300 +UÃœ 298 +ÚUè_ 296 +æÎ 296 +æß 294 +Ã…U_ 294 +Øð_ 293 +Ùã 292 +âè 291 +_Ùã 290 +æè_ 290 +ðà 290 +Üð_ 289 +UãU 288 +ÙãU 288 +Uâ 288 +_ÙãU 286 +_à 286 +_ÙãUè 285 +_Âæ 285 +ÙãUè 285 +æ·¤ 284 +_à æ 284 +ÙãUè´ 283 +UÃŒ_ 282 +ãñ´ 280 +_ãñ´ 280 +ñ´U 280 +â× 279 +_çÎ 278 +_ãñ´U 278 +ãñ´U 278 +»_ 277 +_Îð 275 +ðà æ 274 +à æ_ 273 +æñ 272 +·¤ã 272 +¤ã 272 +Ìð_ 272 +_ãUô 272 +‡æ_ 272 +- 271 +¤ãU 271 +·¤ãU 271 +¢_ 271 +_·¤ã 270 +ãUæ_ 270 +_·¤ãU 269 +ÚUãU 268 +ãé 267 +æâ 265 +°· 263 +¤Ø 263 +¤æð_ 262 +·¤æð_ 262 +°·¤ 262 +ÚðU 258 +_°· 258 +Ùè_ 258 +_°·¤ 257 +ÀU 256 +v 253 +ÂÙ 252 +_ÚUæ 252 +Üè 249 +ç× 247 +çâ 246 +_Ã… 246 +ÚUÙ 246 +×é 245 +._ 245 +UÚ 244 +éU 243 diff --git a/libtextcat/data/new_fingerprints/lm/hungarian.lm b/libtextcat/data/new_fingerprints/lm/hungarian.lm new file mode 100644 index 000000000000..307348b47789 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/hungarian.lm @@ -0,0 +1,400 @@ +_ 19186 +e 5753 +a 4627 +t 4522 +s 3480 +l 3437 +n 3137 +k 3036 +i 2527 +r 2437 +z 2399 +o 2303 +á 2039 +é 1995 +g 1978 +m 1695 +y 1338 +_a 1256 +b 1186 +d 1148 +a_ 1108 +v 1057 +t_ 901 +sz 889 +el 832 +, 819 +,_ 818 +h 792 +k_ 769 +. 767 +et 743 +gy 711 +s_ 705 +_m 702 +_a_ 695 +en 671 +ö 662 +n_ 646 +_k 645 +j 623 +._ 613 +i_ 606 +eg 601 +p 586 +_e 580 +u 579 +le 576 +ó 542 +er 495 +f 485 +ek 477 +te 477 +és 473 +_s 471 +al 464 +ta 458 +à 453 +_h 444 +_t 442 +an 426 +ze 425 +me 406 +at 405 +l_ 401 +es 395 +õ 387 +y_ 381 +z_ 375 +tt 374 +ke 372 +_v 369 +ás 368 +ak 367 +_é 365 +ny 363 +tá 359 +c 358 +re 350 +to 347 +A 343 +e_ 340 +ü 332 +ne 330 +os 326 +ál 320 +_f 320 +az 317 +zt 317 +ár 317 +_n 315 +ko 312 +_A 303 +_sz 302 +is 301 +ve 299 +gy_ 297 +Ãt 293 +_b 293 +ra 291 +or 289 +ol 284 +_i 281 +em 279 +_l 274 +la 264 +ez 262 +be 260 +lt 260 +ok 260 +ye 256 +_me 252 +on 251 +en_ 247 +ar 245 +_az 245 +in 243 +án 242 +se 242 +ég 238 +egy 237 +ha 237 +r_ 237 +té 237 +ér 235 +sze 233 +én 226 +ly 224 +g_ 221 +" 221 +ll 219 +iz 214 +de 214 +ek_ 213 +mi 212 +rt 211 +ba 209 +ho 209 +A_ 209 +és_ 209 +az_ 205 +va 204 +ag 203 +ka 202 +na 202 +og 201 +ik 201 +nt 200 +_A_ 199 +ô 199 +- 198 +tt_ 198 +_az_ 197 +ni 195 +cs 194 +ki 190 +kö 187 +át 187 +_eg 187 +nd 186 +fe 185 +_és 185 +lá 182 +bi 181 +nk 180 +_le 179 +an_ 179 +_és_ 177 +tás 175 +ké 174 +meg 173 +_egy 172 +ma 171 +as 170 +sa 170 +sà 169 +ge 167 +ot 167 +za 167 +E 166 +m_ 165 +_meg 165 +_el 165 +bb 164 +ro 164 +zá 163 +he 162 +má 161 +sé 160 +_r 160 +sÃt 160 +tos 159 +ti 159 +st 158 +_j 158 +él 157 +it 156 +_ho 156 +ül 156 +_ha 155 +vé 154 +am 152 +oz 152 +ele 151 +ya 151 +zto 150 +ú 149 +biz 147 +so 147 +et_ 145 +izto 144 +izt 144 +ap 141 +"_ 141 +ed 141 +ss 140 +bizt 140 +ék 140 +bizto 140 +iztos 139 +ét 139 +ztos 139 +osÃt 138 +zet 138 +osà 138 +mé 137 +_is 137 +t, 136 +tosÃt 136 +tosà 136 +t,_ 136 +ó_ 135 +agy 135 +ztosà 135 +li 134 +om 134 +_fe 134 +ere 133 +ág 133 +t. 132 +nek 131 +vi 129 +_d 129 +zo 128 +k,_ 128 +k, 128 +_kö 127 +_p 127 +M 126 +let 126 +ak_ 125 +já 125 +ett 125 +û 124 +si 124 +ész 123 +_E 123 +ép 123 +vá 123 +rá 123 +t._ 123 +is_ 123 +S 123 +Ãtás 122 +Ãtá 122 +kor 121 +ai 121 +fel 120 +da 120 +_mi 120 +pe 119 +ogy 118 +ban 118 +ad 117 +ga 116 +_va 116 +ott 114 +_ne 114 +_ki 113 +ör 113 +zé 112 +ben 112 +_te 111 +zi 111 +sá 110 +ség 109 +do 109 +tó 108 +em_ 108 +_" 108 +_ta 108 +_M 107 +ogy_ 107 +_á 107 +k. 106 +ól 105 +_ke 105 +_g 104 +: 103 +gye 102 +ák 102 +hog 102 +ri 102 +mo 101 +ok_ 101 +:_ 101 +hogy 101 +il 101 +el_ 100 +zer 100 +ete 99 +nn 99 +nak 98 +je 98 +sÃtá 98 +szá 98 +yo 98 +osÃtá 98 +sÃtás 98 +_ké 98 +_hog 98 +lé 97 +_S 97 +_hogy 97 +ig 97 +_- 96 +hogy_ 96 +ban_ 96 +ese 95 +_bi 94 +fo 94 +ja 94 +ul 94 +õ_ 94 +k._ 94 +_c 93 +ká 91 +es_ 91 +ná 91 +ény 91 +gé 91 +ás_ 91 +egy_ 90 +áb 90 +rd 89 +I 89 +ány 89 +_biz 89 +_fel 88 +öv 88 +ala 88 +szer 88 +po 88 +_ma 88 +leg 88 +tö 88 +ket 87 +un 87 +di 87 +ai_ 87 +nek_ 87 +rm 86 +tal 86 +év 85 +_is_ 85 +nem 85 +ti_ 84 +öz 84 +szt 84 +ut 83 +ter 83 +dé 83 +kk 83 +or_ 83 +b_ 82 +né 82 +os_ 82 +re_ 82 +rs 82 +_bizt 82 +min 82 +ben_ 81 +ra_ 81 +ik_ 81 +go 80 +len 80 +lm 80 +öt 80 +ely 80 +aj 80 +öl 80 +_sze 80 +_be 79 +ev 79 +ré 79 +ssz 79 +nt_ 79 +gya 79 +K 79 +si_ 79 +sza 78 diff --git a/libtextcat/data/new_fingerprints/lm/icelandic.lm b/libtextcat/data/new_fingerprints/lm/icelandic.lm new file mode 100644 index 000000000000..b1fe0f2c27dd --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/icelandic.lm @@ -0,0 +1,400 @@ +_ 26104 +a 6496 +r 6044 +n 5160 +i 5123 +s 3987 +e 3891 +u 3582 +t 3300 +ð 3126 +l 3071 +g 2726 +m 2459 +k 2256 +f 2230 +r_ 1967 +v 1641 +ar 1472 +ð_ 1420 +_s 1332 +á 1248 +o 1221 +í 1197 +a_ 1155 +in 1150 +i_ 1114 +h 1023 +j 975 +d 974 +st 929 +að 925 +. 922 +n_ 875 +._ 875 +_v 874 +m_ 854 +nn 842 +_f 840 +ur 824 +_a 806 +ó 785 +_h 782 +æ 779 +ið 771 +er 765 +um 727 +g_ 716 +y 711 +_e 709 +þ 672 +b 672 +ir 671 +ri 670 +an 667 +ö 650 +_á 648 +að_ 645 +u_ 633 +na 631 +í_ 625 +ar_ 622 +_þ 608 +á_ 606 +_í 595 +ta 567 +ei 566 +la 558 +_m 549 +_í_ 549 +um_ 547 +t_ 535 +ti 529 +_o 524 +ur_ 523 +_að 519 +ið_ 512 +ði 496 +ve 494 +og 494 +ú 482 +og_ 479 +_og 478 +_og_ 478 +nd 469 +p 464 +ra 455 +un 454 +ir_ 452 +_að_ 451 +ni 439 +en 439 +ðu 439 +_á_ 436 +ng 434 +il 404 +ga 395 +_t 395 +nu 393 +ki 392 +ja 383 +inn 379 +_b 375 +sk 352 +s_ 350 +vi 349 +rð 347 +ða 345 +ef 339 +ag 336 +_u 330 +se 324 +lu 324 +af 321 +_ve 320 +tu 318 +em 307 +eg 304 +nn_ 303 +_l 303 +va 301 +_k 296 +, 295 +ns 292 +re 292 +tt 291 +,_ 291 +l_ 286 +am 286 +es 285 +yr 285 +al 281 +da 277 +S 275 +gu 273 +_se 271 +ver 268 +_g 266 +ing 266 +_n 262 +is 258 +_er 257 +sa 256 +ður 255 +le 255 +_st 255 +_S 254 +sem 254 +ll 254 +me 253 +ha 251 +li 249 +kk 249 +rs 247 +_vi 247 +rn 246 +sl 244 +gi 243 +ss 242 +rf 241 +fy 240 +ði_ 240 +mi 238 +ka 237 +ma 231 +ld 230 +é 229 +rir 227 +sta 227 +fyr 227 +ví 227 +di 226 +ru 224 +var 224 +_fy 222 +ku 221 +em_ 221 +nar 220 +_sem_ 220 +_sem 220 +sem_ 220 +he 219 +yri 217 +_fyr 216 +si 216 +yrir 215 +au 212 +er_ 212 +ek 211 +_ha 210 +þe 209 +fyri 207 +fyrir 207 +_þe 205 +fi 204 +fr 203 +ge 201 +or 200 +ne 200 +ann 198 +jó 198 +_va 196 +_fyri 196 +_ver 194 +fl 192 +_er_ 191 +_um 189 +ík 188 +til 187 +_he 186 +fa 186 +il_ 182 +_ti 178 +_til 177 +gar 176 +_var 176 +na_ 176 +ý 175 +eð 171 +fu 170 +nni 169 +_me 168 +ki_ 167 +við 166 +ey 165 +fn 165 +arf 164 +til_ 163 +st_ 162 +_til_ 162 +þa 161 +num 161 +_þa 161 +as 160 +_við 160 +rt 159 +el 158 +uð 156 +inn_ 155 +_um_ 154 +ra_ 153 +bæ 153 +tar 151 +ta_ 151 +erð 151 +ór 148 +and 148 +_sa 146 +ig 146 +_en 146 +nga 145 +rir_ 145 +us 144 +jar 143 +et 143 +ár 142 +_sk 140 +ndi 140 +æð 139 +var_ 139 +_r 138 +av 138 +æk 137 +nna 137 +ður_ 136 +ál 136 +ko 135 +nin 135 +við_ 135 +ól 135 +ins 134 +ik 133 +E 133 +K 133 +yrir_ 133 +ns_ 133 +on 133 +ein 132 +_við_ 132 +ög 132 +já 132 +Þ 132 +öl 132 +ðs 132 +_mi 131 +f_ 131 +sí 131 +sj 131 +stu 131 +nda 130 +_var_ 130 +gr 129 +ús 128 +tæ 127 +ri_ 126 +haf 126 +_sí 125 +vík 124 +rin 124 +te 124 +r. 124 +r._ 123 +H 123 +nes 123 +ót 123 +ru_ 123 +kr 122 +F 122 +ær 121 +num_ 121 +k_ 121 +a. 121 +_H 121 +_fr 120 +_ge 120 +rá 120 +_E 120 +_Þ 120 +ug 120 +ngu 119 +an_ 119 +inga 118 +_K 118 +_haf 118 +enn 117 +ars 117 +rið 117 +en_ 117 +sin 116 +kur 116 +it 116 +ða_ 116 +ti_ 115 +rði 114 +tj 114 +ni_ 114 +at 114 +tarf 114 +br 113 +slu 113 +kki 113 +rg 113 +má 113 +kv 113 +_en_ 113 +a._ 112 +gar_ 112 +du 112 +ju 110 +eið 110 +und 110 +lag 110 +tur 110 +ega 109 +hú 109 +íð 109 +gn 109 +hef 109 +kj 109 +_hef 109 +_sta 108 +B 108 +V 108 +sam 107 +_ei 106 +_B 106 +ft 106 +ga_ 106 +G 106 +_G 105 +lö 105 +kki_ 105 +star 104 +in_ 104 +R 104 +með 104 +_ár 103 +_með 103 +ekk 103 +inu 103 +tö 103 +_V 103 +m. 103 +aði 103 +jö 102 +æj 102 +_bæ 102 +ess 102 +hús 101 +ut 101 +gs 101 +aví 101 +mu 101 +_R 101 +_ú 101 +rst 100 +æjar 100 +leg 100 +æja 100 +ja_ 99 +avík 99 diff --git a/libtextcat/data/new_fingerprints/lm/indonesian.lm b/libtextcat/data/new_fingerprints/lm/indonesian.lm new file mode 100644 index 000000000000..3fa5a09b4691 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/indonesian.lm @@ -0,0 +1,400 @@ +_ 19406 +a 10666 +n 5455 +e 4535 +i 4387 +r 2936 +t 2902 +an 2853 +u 2841 +k 2761 +s 2311 +m 2178 +d 2134 +g 2105 +l 1780 +a_ 1506 +n_ 1476 +ng 1449 +p 1397 +b 1275 +an_ 1270 +o 1246 +h 1130 +i_ 1108 +er 1038 +ka 1032 +_d 1006 +y 997 +, 951 +en 941 +ar 914 +,_ 900 +_m 880 +ya 842 +ta 838 +ang 797 +di 787 +da 773 +. 754 +la 742 +._ 738 +me 732 +ak 728 +_s 718 +at 690 +ra 688 +ga 683 +_k 671 +_me 650 +in 628 +ah 601 +_t 583 +_p 570 +g_ 551 +_b 551 +_di 548 +ng_ 544 +ma 536 +se 526 +tu 511 +na 506 +al 500 +ri 490 +as 483 +k_ 482 +j 480 +si 470 +ny 467 +h_ 457 +sa 452 +ang_ 439 +it 424 +kan 423 +ti 418 +_se 417 +pe 412 +S 409 +ba 407 +ke 407 +em 405 +men 405 +be 403 +un 401 +te 401 +am 396 +pa 395 +nya 390 +_men 374 +el 374 +t_ 371 +_a 360 +_i 356 +u_ 355 +kan_ 345 +_ke 339 +is 335 +ah_ 324 +_S 324 +eng 321 +nga 320 +ia 318 +_pe 316 +ha 313 +ap 311 +r_ 308 +w 305 +li 301 +_da 300 +s_ 299 +P 296 +nd 290 +_be 287 +ik 283 +ja 281 +yan 281 +ad 275 +ek 273 +uk 272 +di_ 270 +bu 269 +ya_ 268 +yang 268 +ak_ 266 +ber 265 +_y 265 +_ya 264 +_P 263 +ru 260 +K 259 +yang_ 256 +_yang 256 +_yan 256 +nt 255 +de 253 +_te 250 +wa 249 +et 247 +at_ 246 +ara 245 +gan 243 +A 237 +ari 235 +ala 230 +itu 229 +c 225 +ol 225 +ni 225 +us 225 +dan 224 +_K 224 +M 224 +B 223 +ata 222 +ai 221 +ur 219 +nya_ 217 +ua 215 +_ka 214 +_ber 210 +eb 209 +ran 206 +D 206 +ela 206 +_di_ 205 +_l 204 +ngan 204 +ter 203 +re 201 +- 199 +aka 198 +l_ 194 +_A 191 +era 191 +a, 191 +e_ 190 +ir 187 +I 186 +tan 185 +_B 184 +ut 184 +ku 183 +a,_ 183 +il 182 +J 181 +um 180 +_it 180 +_itu 180 +_ta 179 +su 179 +dan_ 177 +es 177 +on 177 +or 177 +_dan 176 +lu 174 +_M 172 +tu_ 172 +_dan_ 172 +enga 171 +mb 169 +R 169 +si_ 168 +per 168 +gan_ 168 +ngan_ 165 +" 162 +_ter 162 +a. 161 +man 161 +gk 160 +a._ 160 +asi 160 +ngk 160 +ep 160 +ag 159 +ul 158 +da_ 157 +m_ 155 +du 155 +ada 153 +ki 153 +rt 150 +mp 150 +T 150 +ama 148 +ing 148 +na_ 147 +_J 147 +_D 145 +ung 145 +ana 145 +n, 144 +ju 144 +ud 144 +rin 143 +gi 143 +aw 141 +lah 138 +lan 138 +_sa 136 +ri_ 136 +meng 136 +_meng 135 +_ma 134 +n,_ 134 +awa 134 +st 134 +eka 133 +mi 133 +mu 132 +_T 132 +po 131 +ge 131 +ar_ 130 +id 129 +ko 129 +le 128 +_h 128 +ena 127 +_j 126 +emb 126 +ina 125 +_r 124 +itu_ 124 +ay 123 +ngg 123 +gg 123 +rang 123 +pi 120 +nan 120 +_ba 119 +_la 119 +apa 119 +_I 118 +p_ 118 +bi 117 +ai_ 117 +ta_ 116 +san 116 +Ke 116 +ro 115 +eri 114 +kar 113 +lah_ 113 +_itu_ 112 +aya 111 +i, 110 +an, 110 +ra_ 110 +_per 110 +im 110 +ika 109 +isi 109 +mem 109 +tah 108 +_Ke 108 +ian 108 +_mem 108 +akan 108 +Se 108 +to 107 +ab 107 +ngka 106 +rs 106 +gka 106 +uk_ 105 +seb 104 +_de 104 +pu 104 +i,_ 104 +ita 104 +nda 103 +_ti 103 +ni_ 103 +ca 103 +_Se 103 +ers 103 +pen 103 +ini 102 +an,_ 102 +angk 101 +uh 101 +han 101 +nta 100 +_in 99 +f 99 +nj 99 +ok 99 +aga 99 +_R 98 +as_ 98 +tr 98 +mer 97 +lam 97 +and 97 +end 96 +anga 96 +ne 96 +Sa 96 +ka_ 96 +arin 95 +gu 95 +_ha 94 +Z 94 +al_ 94 +ga_ 94 +_Z 93 +_pen 93 +dar 93 +Ad 93 +i._ 93 +ada_ 93 +atan 93 +tak 93 +i. 93 +ia_ 92 +asa 92 +ap_ 92 +ari_ 92 +kat 92 +_seb 92 +_Za 91 +Za 91 +den 91 +n. 90 +_u 90 +_Ad 90 +engan 89 +ib 89 +any 89 +n._ 89 +o_ 89 +Zar 88 +Zari 88 +rina 88 +_Zar 88 +_Zari 88 +Zarin 88 +arina 88 +ks 88 +angka 87 +oli 87 +eg 87 +kt 86 +_Sa 86 +hu 85 +ih 85 +us_ 85 +adi 85 +om 85 +eba 85 +anya 85 +_bu 84 +denga 83 +L 83 +ed 83 +dak 83 +deng 83 +ma_ 82 +asi_ 82 diff --git a/libtextcat/data/new_fingerprints/lm/irish_gaelic.lm b/libtextcat/data/new_fingerprints/lm/irish_gaelic.lm new file mode 100644 index 000000000000..b6874862da01 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/irish_gaelic.lm @@ -0,0 +1,400 @@ +_ 8010 +a 2622 +i 1573 +h 1334 +n 1247 +r 968 +_a 847 +e 830 +s 817 +t 747 +l 639 +c 636 +g 598 +o 590 +d 554 +n_ 501 +a_ 487 +m 432 +an 415 +u 413 +b 379 +h_ 352 +ai 350 +ch 350 +ea 346 +r_ 346 +à 334 +é 321 +_s 309 +á 306 +in 281 +ar 277 +. 269 +_d 266 +s_ 254 +ir 253 +_b 250 +f 250 +an_ 246 +, 241 +,_ 226 +ag 225 +_an 221 +bh 218 +_c 216 +._ 212 +ac 210 +ha 208 +_a_ 201 +" 199 +_m 199 +th 198 +_t 190 +ach 182 +_ag 180 +_an_ 179 +Ã_ 176 +_l 168 +na 168 +nn 160 +e_ 159 +ar_ 158 +_g 157 +ú 156 +_i 152 +il 150 +le 150 +is 143 +ó 142 +_bh 138 +ei 138 +g_ 135 +_f 135 +dh 135 +l_ 126 +t_ 125 +ig 123 +é_ 122 +_n 120 +gu 120 +á_ 120 +mh 118 +id 117 +ch_ 117 +ad 116 +he 114 +ir_ 114 +ra 109 +o_ 109 +ach_ 107 +ia 105 +_ar 105 +us 104 +ui 104 +_" 101 +us_ 100 +T 99 +am 99 +ta 98 +gus 98 +gus_ 98 +_le 97 +gh 97 +_ch 97 +agus 94 +agus_ 94 +agu 94 +éa 93 +_agus 93 +_agu 93 +ean 93 +na_ 92 +d_ 92 +ái 91 +p 89 +it 89 +A 89 +_ar_ 88 +rt 86 +al 85 +oi 84 +sa 84 +"_ 82 +hai 81 +_r 79 +nn_ 79 +hu 79 +as 79 +éi 78 +_T 78 +ma 77 +air 77 +at 77 +ann 76 +B 76 +sé 76 +hà 75 +igh 74 +st 74 +ga 73 +go 71 +ua 71 +ne 71 +la 71 +- 71 +de 71 +te 71 +re 70 +inn 70 +ith 69 +eac 69 +_sé 69 +in_ 68 +_go 68 +hi 68 +each 68 +dh_ 68 +si 67 +ag_ 67 +_go_ 66 +hea 66 +go_ 66 +tha 64 +om 64 +_sé_ 63 +sé_ 63 +hÃ_ 63 +on 62 +se 61 +úi 60 +nt 60 +C 60 +D 59 +i_ 58 +_ag_ 58 +is_ 58 +Ão 58 +_de 57 +_B 56 +il_ 56 +or 56 +_th 54 +ca 53 +fa 53 +amh 53 +_A 53 +le_ 52 +? 52 +S 51 +io 51 +_in 51 +sà 51 +li 51 +rai 50 +hf 50 +ht 50 +eo 50 +sc 50 +ri 49 +: 49 +igh_ 49 +gh_ 49 +_sà 49 +:_ 49 +há 49 +_D 49 +be 49 +aig 49 +hé 48 +oc 48 +idh 48 +rt_ 48 +ho 47 +os 47 +ann_ 47 +_C 46 +! 46 +Bh 46 +bhf 45 +_si 45 +lt 45 +_bhf 45 +irt 45 +ear 44 +_na 44 +ta_ 44 +air_ 44 +_p 44 +im 44 +aga 44 +_ma 44 +_S 44 +aigh 43 +án 43 +_dh 43 +uai 43 +ao 43 +cht 43 +ain 42 +bhe 42 +ait 42 +fh 42 +sa_ 41 +m_ 41 +adh 41 +ile 41 +_é 41 +ail 41 +eir 41 +ói 41 +_Bh 40 +as_ 40 +cha 40 +idh_ 40 +hái 39 +_i_ 39 +bh_ 39 +th_ 39 +ad_ 39 +och 39 +mh_ 39 +tr 39 +rea 38 +_se 38 +ro 38 +rà 38 +hair 38 +_is 38 +uil 37 +iú 37 +áin 37 +I 37 +ll 37 +mé 37 +_be 36 +ba 36 +eann 36 +tá 36 +_o 36 +M 36 +aid 36 +aith 36 +ib 36 +' 36 +tea 36 +_mé 35 +chu 35 +ibh 35 +each_ 35 +ean_ 34 +irt_ 34 +_na_ 34 +N 34 +ist 34 +fu 34 +mha 34 +bea 34 +h. 34 +_bhe 34 +lá 34 +ic 34 +_sÃ_ 33 +eis 33 +bhà 33 +ni 33 +héa 33 +_sa 33 +ith_ 33 +sÃ_ 33 +har 33 +_bhà 33 +ig_ 32 +ur 32 +aà 32 +hr 32 +_am 32 +_bhÃ_ 31 +da 31 +úir 31 +hfu 31 +_chu 31 +ol 31 +ne_ 31 +_fa 31 +An 31 +BhÃ_ 31 +Bhà 31 +n, 31 +_ac 31 +bhÃ_ 31 +_bhfu 30 +_ó 30 +ana 30 +_M 30 +mé_ 30 +_fh 30 +aigh_ 30 +bhfu 30 +_mé_ 30 +tú 29 +_le_ 29 +tá_ 29 +hean 29 +háin 29 +sin 29 +eir_ 29 +nne 29 +cé 29 +_aga 29 +h._ 29 +Tá 29 +ibh_ 29 +iste 28 +An_ 28 +do 28 +hui 28 +fui 28 +nà 28 +ste 28 +acht 28 +n,_ 28 +co 28 +dea 28 +ng 28 +nach 28 +id_ 28 +hfui 28 +.. 28 +lei 28 +nac 28 +ce 27 +a. 27 +c_ 27 +lea 27 +hfuil 27 +_BhÃ_ 27 +_bea 27 +adh_ 27 +di 27 +fuil 27 +." 27 +Tá_ 27 +ha_ 27 +ú_ 27 +uil_ 27 +."_ 27 +bhfui 27 +_Bhà 27 +éan 27 +_do 27 +lta 27 +aoi 27 +_lei 27 +_mh 26 +dú 26 +fuil_ 26 +eat 26 +-_ 26 +teac 26 +ath 26 diff --git a/libtextcat/data/new_fingerprints/lm/italian.lm b/libtextcat/data/new_fingerprints/lm/italian.lm new file mode 100644 index 000000000000..543cadcfa88e --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/italian.lm @@ -0,0 +1,400 @@ +_ 25028 +a 7570 +e 6477 +i 5481 +o 5104 +l 3905 +n 3866 +r 3502 +t 2934 +c 2862 +s 2862 +a_ 2504 +e_ 2404 +d 2004 +i_ 1749 +o_ 1679 +u 1650 +v 1611 +p 1561 +m 1414 +_c 1325 +, 1192 +,_ 1192 +_s 1190 +_d 1094 +g 1067 +an 925 +er 915 +_a 914 +_p 895 +la 858 +_l 830 +re 799 +ar 769 +h 762 +no 753 +co 726 +va 698 +_e 657 +n_ 656 +on 656 +ra 653 +to 651 +f 638 +di 638 +_i 634 +ch 634 +ll 633 +l_ 624 +la_ 598 +ta 593 +el 576 +in 567 +_m 558 +en 529 +b 528 +ri 525 +_co 523 +_n 523 +_di 522 +li 513 +av 507 +al 501 +le 494 +ia 492 +se 484 +ol 479 +_f 477 +or 477 +te 469 +_e_ 467 +ve 454 +at 449 +de 447 +. 443 +ne 429 +va_ 428 +ca 426 +._ 422 +tt 422 +re_ 415 +nt 415 +io 411 +_v 407 +pe 405 +z 392 +to_ 391 +_ch 389 +na 384 +si 384 +' 383 +he 382 +no_ 379 +ci 374 +_la 373 +ro 371 +_g 370 +st 368 +cc 366 +he_ 362 +di_ 362 +ma 358 +ev 354 +che 354 +es 352 +me 352 +pa 351 +_t 349 +ti 348 +_di_ 347 +ss 345 +che_ 344 +a,_ 337 +a, 337 +nd 335 +o, 333 +o,_ 333 +ell 330 +gl 323 +sa 322 +il 322 +gli 321 +da 318 +as 318 +do 314 +_che 308 +_che_ 306 +eva 306 +_la_ 300 +lla 298 +le_ 293 +un 291 +_pe 290 +_de 288 +q 283 +qu 283 +ava 280 +po 277 +on_ 275 +r_ 273 +li_ 273 +_b 269 +_il 268 +_il_ 268 +il_ 268 +lo 267 +om 263 +e, 263 +e,_ 263 +ni 258 +tr 258 +so 255 +ra_ 253 +os 251 +_in 249 +_u 248 +per 244 +are 243 +et 243 +_se 240 +ano 239 +si_ 238 +_ca 238 +_qu 238 +lla_ 238 +_q 238 +_a_ 236 +ac 236 +_r 234 +ic 233 +_no 232 +ie 227 +fa 227 +hi 226 +del 225 +ua 222 +_per 218 +ce 218 +_ma 216 +sc 216 +_del 215 +mi 212 +_un 208 +chi 206 +era 205 +i, 205 +i,_ 205 +su 203 +and 202 +vo 202 +_fa 201 +eva_ 200 +ano_ 199 +gli_ 197 +non 196 +pi 196 +vi 195 +er_ 195 +_al 194 +se_ 193 +_ne 192 +_non 191 +am 190 +is 187 +ava_ 187 +_non_ 186 +non_ 186 +in_ 185 +ent 185 +_si 184 +_pa 184 +com 183 +! 182 +_le 182 +_su 181 +uo 181 +el_ 180 +!_ 180 +l' 178 +ue 177 +te_ 177 +_com 177 +are_ 176 +pr 176 +_in_ 176 +van 172 +mo 172 +ta_ 171 +gn 167 +ere 166 +na_ 166 +tto 163 +it 161 +_per_ 161 +per_ 161 +é 161 +all 160 +ess 159 +ut 159 +col 158 +acc 157 +gi 155 +lo_ 154 +oc 154 +vano 153 +io_ 153 +_av 151 +ndo 151 +é_ 151 +ato 149 +ave 148 +_st 147 +me_ 147 +'a 146 +ia_ 144 +con 143 +mp 143 +fi 142 +ett 142 +_si_ 141 +_pi 140 +era_ 140 +ti_ 140 +ó 140 +vano_ 140 +_gl 139 +qua 139 +ella 139 +sta 138 +ome 137 +S 137 +_gli 137 +_S 137 +ad 136 +_ve 134 +ant 134 +ne_ 134 +ó_ 133 +sp 133 +do_ 133 +_po 132 +ro_ 132 +ov 132 +_le_ 131 +ella_ 130 +sse 129 +_con 128 +ir 128 +_vi 128 +ig 127 +_gli_ 127 +_ave 127 +vev 127 +un_ 126 +ot 126 +veva 125 +dell 125 +que 125 +a. 125 +_o 125 +a._ 124 +tu 124 +cia 123 +za 123 +_que 123 +_da 121 +par 121 +_pr 120 +cch 120 +_dell 120 +eg 119 +_sa 119 +o._ 119 +o. 119 +_col 118 +lt 118 +_un_ 118 +rt 118 +ur 117 +_vo 117 +_me 117 +ome_ 117 +L 116 +ap 116 +_L 116 +zi 116 +nto 116 +og 115 +_an 115 +_so 115 +em 114 +ag 114 +be 111 +ni_ 111 +im 110 +cchi 110 +ver 110 +lle 109 +nz 109 +cci 109 +_ri 109 +nc 108 +_er 108 +come_ 107 +come 107 +aveva 107 +ui 107 +avev 107 +tto_ 107 +_come 106 +ed 106 +P 105 +man 105 +_P 105 +rs 105 +occ 104 +ndo_ 103 +ato_ 103 +_qua 103 +_era 103 +ari 102 +ba 100 +_mo 100 +nel 100 +id 99 +men 98 +_fi 98 +_all 98 +rr 97 +_do 97 +_avev 97 +att 97 +l'a 96 +ei 96 +zz 96 +; 96 +vol 95 +pp 95 +tra 95 +;_ 95 +ere_ 94 +lle_ 94 +nda 94 +utt 94 +est 93 +_nel 93 +ul 92 +ola 92 +iv 92 +ando 90 +ale 90 +lu 90 +rn 90 +e. 89 +e._ 89 +ll' 89 +tta 88 +nte 87 +_l' 87 +uel 87 diff --git a/libtextcat/data/new_fingerprints/lm/japanese.lm b/libtextcat/data/new_fingerprints/lm/japanese.lm new file mode 100644 index 000000000000..654341bfeae2 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/japanese.lm @@ -0,0 +1,400 @@ +_ +ã® +〠+ã« +ã‚’ +㨠+ã— +㟠+㪠+㦠+。 +㧠+ã‚‹ +ã‹ +ã™ +㯠+ã‚Š +ã„ +ã¾ +ら +ã +㌠+  +。_ +㣠+ã‚‚ +_  +ã—㦠+ã™ã€‚ +1 +地 +ã‹ã‚‰ +ㆠ+ã +ã +ã¾ã™ +å¹´ +ー +ä¸ +ã‚Œ +ã‚ +ã¾ã™ã€‚ +〠+ãŸã€‚ +大 +分 +「 +ス +ã—㟠+ã‘ +ã“ +人 +国 +政 +ã‚“ +å¦ +ã£ã¦ +æ–¹ +ã‚Šã¾ +ã£ãŸ +ã™ã‚‹ +æ”¹é© +é© +改 +çš„ +ã¯ã€ +ã¡ +ã• +ï¼’ +ã‚ +ã¦ã€ +部 +ã© +生 +ã¦ã„ +ã‚¿ +会 +ン +ï¼™ +よ +æžœ +ã› +ã‚ +る㨠+地方 +ã«ã€ +è¡Œ +ã‚„ +ã™ã€‚_ +3 +力 +自 +ã¨ã— +レ +ç«‹ +) +ã¨ã—㦠+_) +ãªã‚Š +ã§ã +進 +月 +æ–° +ã‚Šã¾ã™ +æ°´ +åŒ +女 +下 +ã‚Šã¾ã™ã€‚ +作 +(_ +ã“㨠+実 +å½¹ +権 +ã„㟠+( +ã—ã¦ã„ +ル +ク +( +ã¿ +ニ +ã +㤠+ãã‚‹ +ã +経 +ï¼– +時 +å¼· +家 +性 +5 +内 +調 +集 +る。 +上 +財 +改é©( +安 +的㪠+事 +å‹™ +ã€ã¨ +一 +ã£ãŸã€‚ +å +å…¨ +ã§ãã‚‹ +å· +ãŸã€‚_ +次 +æ¥ +ãªã„ +ド +) +ç¾ +é©(_ +ï¼ +る。_ +改é©(_ +é©( +ã§ã™ +ã§ã€ +マ +ミ +ジ +社 +ã§ã‚‚ +ç›® +å¹´ã‹ã‚‰ +発 +çœ +ã€ä¸ +ã‚¢ +ã¹ +ッ +ã° +済 +女性 +ï¼… +法 +ãªã‚“ +㈠+ç”» +地方分権 +ãªã +æ°‘ +構 +æ ¹ +ã—〠+ãŒã€ +高 +推 +æ–½ +ã¨ã„ +何 +é›» +調査 +éš› +も〠+å’Œ +分権 +効 +é™ +地方分 +国㮠+é•· +経済 +è¨ +計 +方分権 +野 +é‡ +å° +ã«ãª +æ§‹é€ +本 +æ ¡ +査 +ã—ã¾ +é€ +ã¾ã™ã€‚_ +ãªã‚Šã¾ +れ㟠+方分 +ã‚ã‚Š +財政 +ç† +ãª_ +ã—ãŸã€‚ +å¹´ã‹ +ç´„ +_㦠+1年 +よㆠ+ç§ +・ +メ +育 +ã‚» +ビ +ã‚· +ã—ã¦ã€ +ナ +ã‚« +ト +生㮠+ç€ +ã» +æµ +æ§‹é€ æ”¹ +ん㦠+ム+ã‚ +èŒ +æ²» +時㫠+%〠+æ°— +㊠+ドレ +ãªã£ +ãªã© +ãŸã‚Š +何㋠+ã‚‹ã“ +ã€ãã—㦠+ã¦ã +ãŸã¡ +ã§ã¯ +ã¦ã¯ +進゠+é€ æ”¹ +æ§‹é€ æ”¹é©( +ã®å +率 +期 +度 +æ—¥ +éƒ¨çœ +計画 +ã_ +æ— +æ–‡ +æ§‹é€ æ”¹é© +ã¨ã„ㆠ+ã—ã¾ã™ +ãªã£ãŸ +å±€ +踊り +ãªã‚“㦠+æ£ +夫 +増 +多 +食 +使 +ã ã£ãŸ +戻 +ã㪠+ã„ã¾ +体 +スを +é¡Œ +ãŸãŒã€ +ã„ã‚‹ +ã„ㆠ+身 +も㪠+ã„。 +ä¸éƒ¨ +ã‹ã‘ +踊 +_ã™ +é¢ +_ã« +ã ã£ãŸã€‚ +çš„ã« +é–€ +ドレス +レス +ï¼’å¹´ +ã¾ã—㟠+制 +åˆ +ã—ã¾ã™ã€‚ +ã¾ã§ +ã¾ã— +ã‚ã‚Šã¾ +ã〠+冬 +ã¨ã‚’ +æ–‡éƒ¨çœ +を進゠+推進 +å…ƒ +ãã— +é ƒ +è¾² +å…¥ +域 +解 +ã¦ã„ã‚‹ +ら〠+ã€ãã— +ã¨ã€ +é€ æ”¹é© +å®¶æ— +見 +è¦ +ã—ãŸã€‚_ +è¦ +8 +5年 +ã‚ã‚Šã¾ã™ã€‚ +葉 +ï¼ï¼… +é€ æ”¹é©( +ã‚ã‚Šã¾ã™ +å…¬ +ã¯ãª +ã«ã‚ +ã—㪠+çµæžœ +表 +ã‚㟠+ãã—㦠+ã ã‘ +ã 㣠+_ã™ã€‚ +ï¼” +第 +ã“ã¨ã‚’ +ã„ã¾ã™ +文部 +ã®ç›® +ã€ã +を進 +効果 +ã‚‹ã“㨠+ã®ã¾ +ã‚ã‚‹ +ï¼“ï¼ +ã€åœ° +自分 +組 +çµ + 「 +ã«ã‚‚ +ãŸãŒ +é€ æ”¹é©(_ +ã‚‹é™ã‚Š +ã女性㟠+ナ増強 +戻る。 +ã£ãŸã€‚_ +ç¾ã™ diff --git a/libtextcat/data/new_fingerprints/lm/korean.lm b/libtextcat/data/new_fingerprints/lm/korean.lm new file mode 100644 index 000000000000..159493270c3f --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/korean.lm @@ -0,0 +1,400 @@ +_ 11636 +À 2659 +° 1629 +Ç 1578 +¸ 1458 +¿ 1397 +´ 1299 +µ 1118 +à 1005 +± 928 +» 849 +¼ 840 +_À 809 +¹ 808 +º 780 +¡ 773 +à 764 +³ 674 +¾ 640 +ÃŒ 591 +ÃŽ 580 +ÀÌ 538 +½ 526 +à 519 + 518 +_° 518 +Ù 506 +· 483 +È 464 +ö 463 + 454 +´Ù 451 +ø 404 +Æ 395 +ë 384 +_¿ 382 +à 377 +. 374 +Ã… 373 +ç 358 +._ 347 +_Ç 344 +´Â 338 +¡_ 333 +Â_ 329 +¿¡ 327 +_à 319 +´Â_ 317 +´Ù. 313 +Ù. 313 +â 308 +¶ 308 +´Ù._ 301 +Ù._ 301 +»_ 299 +_¹ 291 +î 288 +® 282 +À» 279 +Ö 278 +¦ 274 +À»_ 271 +ÃŽ_ 269 +, 266 +Ñ 266 +°à 261 +Ãö 258 +,_ 258 +°¡ 250 +ß 248 +_¼ 246 +Ä 245 +ÀÇ 244 +ÃŒ_ 243 +_¸ 243 +_µ 242 +ü 238 +é 235 +¼ 234 +ÀÌ_ 234 +Çà 234 +_± 233 +´ë 228 +·Î 227 +ÇÑ 222 +½Ã 222 +_³ 215 +±â 212 +_à 210 +Ã_ 208 +_´ 206 +¦_ 206 +Ç_ 205 +_¾ 205 +º_ 204 +÷ 198 +˼ 198 +°Ã_ 194 +·Î_ 193 +ó 193 +¿¡_ 193 +¸¦_ 192 +¸¦ 192 +_ÀÌ 190 +Â_ 189 +˼_ 189 +_½ 189 +µµ 188 +Àà 188 +ÀÇ_ 188 +_º 186 +à 177 +Ú 175 +ú 171 +ȍ 161 +Ô 154 +_Àà 153 +Ø 152 +¿ø 151 +±¸ 150 +µ¿ 147 +Ã’ 147 +¸¸ 145 +¼ö 143 +¤ 142 +ºÎ 142 +_» 141 +ÀÖ 140 +æ 139 +à 138 +ù 138 +ª 137 +µé 136 +è 134 +À¸ 134 +_ÀÖ 133 +² 132 +Ñ_ 127 +ÀÎ 125 +°¡_ 123 +ÀÚ 122 +ÇÑ_ 121 +Àü 121 +¾Æ 118 +ý 117 +Ã¥ 116 +Ã_ 115 +¾î 115 +µ_ 115 +¢ 114 +Ö´ 113 +°ú 112 +¸· 111 +¯ 109 +ÇØ 109 +§ 108 +à 108 +_È 108 +±¹ 107 +¼Â_ 107 +× 107 +£ 106 +ÀÖ´ 105 +ȸ 103 +¸® 101 +ö_ 101 +û 100 +_ÀÖ´ 100 +õ 100 +¸·Î 100 +" 98 +À¸·Î 97 +ÃÖ 97 +À¸· 97 +Àå 95 +_´ë 94 +¡¼ 94 +³ª 94 +_½Ã 94 +Â¥ 94 +µî 94 +ä 92 +°ø 92 +_Ã… 91 +á 91 +ð 90 +Çà 90 +¸·Î_ 90 +Ê 89 +© 89 +Ã_ 89 +¸_ 89 +À¸·Î_ 89 +éÀ 89 +Þ 88 +¡¼ 88 +¿¡¼ 88 +¿¡¼ 88 +ê 87 +µµ_ 87 +±³ 85 +_Æ 85 +î_ 84 +°æ 84 +µéÀ 84 +¸à 84 +ëÇ 83 +¿ù 83 +_Ãö 82 +Çð 82 +¿À 81 +¶ó 80 +¿¡¼Â_ 79 +¡¼Â_ 79 +ÀÃ_ 78 +à 78 +æ 78 +ç_ 78 +øÀ 78 +' 77 +Çß 77 +ì 77 +Ãœ 77 +Ãß 77 +ú_ 76 +Ãö_ 76 +Ó 76 +â_ 75 +( 74 +) 74 +»ó 74 +°ü 74 +»ý 73 +_¿ù 72 +_°¡ 72 +_¼ö 72 +- 72 +¿© 72 +Ö´Ù 72 +º¸ 71 +ÀÖ´Ù 71 +Ȑ 70 +°³ 70 +½º 70 +¼± 69 +¿ë 69 +°ú_ 69 +_ÀÖ´Ù 69 +_µî 69 +ø_ 69 +ß´ 69 +°à 68 +³» 68 +_¡ 68 +_ÇÑ 68 +ù_ 67 +ü 67 +¸¿ 66 +_ÀÃ_ 66 +Çß´ 66 +°Ô 65 +ñ 65 +_¶ 65 +_°à 65 +Çß´Ù 64 +ß´Ù 64 +_Çà 63 +¹Î 62 +« 62 +ô 62 +¼Ò 62 +¿ù_ 62 +ö 61 +³â 61 +ðà 61 +Ì´ 60 +¾È 60 +Çðà 60 +õ 59 +_¿ù_ 59 +_¼ 59 +Åë 59 +ÆÄ 58 +®_ 58 +_¸¸ 58 +Ø_ 58 +´ç 57 +ß´Ù. 57 +¬ 57 +Õ 57 +Çß´Ù. 57 +¹° 57 +¾÷ 57 +Ö´Ù. 56 +ß_ 56 +ÀÖ´Ù. 56 +¿à 56 +ß´Ù._ 56 +Ãø 56 +Àç 55 +¸° 55 +ÃÀ 55 +Æ® 55 +¹é 55 +ÀÌ´ 55 +Ö´Ù._ 54 +_¹é 54 +¿Ã_ 53 +Àû 53 +ð 53 +¹® 53 +_ÃÖ 53 +´Ü 53 +¼º 53 +ÇØ_ 53 +°Ç 53 +ÃÀ 52 +_Àü 52 +Ô_ 52 +¿¬ 52 +_°æ 51 +°ÃÀ 51 +°£ 51 +¿ì 51 +È 51 +ä 50 +_¾Æ 50 +´ëÇ 50 +Ä¡ 50 +ðÃ_ 49 +ÇðÃ_ 49 +_ÀÎ 49 +Û 49 +É 49 +_" 48 +öµ 48 +ˤ 48 +ÀÌ´Ù 48 +ÀÔ 48 +_°ÃÀ 48 +Ì´Ù 48 +¡ 47 +Ì´Ù. 47 +_°ø 47 +ÀÌ´Ù. 47 +°Ô_ 47 +_ȍ 47 +¸ç 47 +½Å 47 +Ã_ 46 +Ã…Ã 46 +é_ 46 +Ã’_ 46 +ï 46 +Çô 46 +_°³ 45 +_´Ù 45 +_ÀÚ 45 +¸¶ 45 +°è 45 +Çà 45 +Ì´Ù._ 45 +÷À 44 +îà 44 +ÈÄ 44 +±â_ 43 +(_ 43 +¸é 43 +¿ 43 +ü_ 43 +ºñ 42 +´ 42 +ë_ 42 +ÀÎ_ 42 +_ÀÌ_ 42 +µî_ 42 +° 42 +Îà 41 +µÇ 41 +¿øÀ 41 +¿µ 41 +À½ 40 +Îõ 40 +±× 40 +Ë 40 +¿¹ 40 +_¿À 40 +øÇ 40 +¸» 40 +_Çà 40 +¡Â_ 39 +_³â 39 +³² 39 +ÇÒ 39 +¿_ 39 +_³ª 39 +³ 38 +¸ÃÀ 38 +âÀ 38 +ª_ 38 diff --git a/libtextcat/data/new_fingerprints/lm/latin.lm b/libtextcat/data/new_fingerprints/lm/latin.lm new file mode 100644 index 000000000000..177cd35aa174 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/latin.lm @@ -0,0 +1,400 @@ +_ 20136 +e 6892 +i 5604 +a 5443 +u 4581 +t 4552 +s 4354 +r 3923 +n 3375 +m 3063 +o 2921 +c 2224 +l 1805 +e_ 1625 +s_ 1503 +p 1424 +d 1397 +, 1285 +,_ 1276 +er 1077 +qu 1028 +q 1028 +a_ 1019 +t_ 1018 +is 942 +_a 921 +re 902 +m_ 891 +v 858 +b 821 +um 808 +_s 773 +us 772 +en 766 +nt 733 +in 729 +ue 727 +te 720 +g 718 +_i 710 +_p 679 +it 676 +_c 669 +et 653 +que 652 +_e 643 +at 643 +ue_ 616 +ra 614 +que_ 611 +f 601 +or 598 +ri 576 +ti 572 +ta 559 +tu 552 +an 551 +ae 527 +_m 513 +am 501 +_t 493 +us_ 488 +is_ 487 +es 479 +em 479 +_f 451 +um_ 443 +_v 442 +ia 442 +li 438 +_d 436 +. 432 +i_ 430 +et_ 429 +ni 412 +ne 409 +h 406 +de 404 +ur 396 +._ 392 +ar 388 +os 388 +mi 382 +pe 382 +la 376 +st 371 +s, 368 +di 367 +_et 366 +s,_ 365 +_in 363 +on 360 +o_ 359 +_n 351 +_et_ 351 +as 346 +im 336 +na 327 +se 320 +ma 315 +cu 307 +vi 306 +si 303 +ro 303 +r_ 302 +su 299 +un 295 +_l 291 +to 291 +ec 290 +ci 288 +co 287 +_r 287 +ere 286 +ce 284 +tr 280 +re_ 278 +ent 275 +x 275 +ct 274 +ve 271 +ru 259 +ul 256 +me 255 +ui 255 +c_ 252 +_o 250 +ic 249 +ns 247 +_qu 242 +_q 242 +no 241 +ant 235 +am_ 235 +_co 233 +sa 231 +ca 230 +t, 226 +mu 225 +t,_ 225 +_re 223 +el 222 +ib 222 +id 218 +om 212 +_te 211 +al 209 +le 209 +it_ 208 +mo 208 +ol 206 +_u 203 +; 199 +_h 199 +ac 198 +;_ 198 +bu 197 +nu 196 +ua 195 +n_ 195 +ll 194 +tis 191 +A 189 +rt 188 +ge 188 +nd 187 +au 187 +lu 186 +iu 185 +squ 185 +per 185 +sq 185 +ter 185 +pa 183 +_A 183 +em_ 183 +ia_ 180 +ed 179 +_pe 178 +m, 176 +sque 175 +_su 175 +ae_ 175 +m,_ 175 +pr 175 +bi 175 +bus 174 +_vi 174 +os_ 173 +ta_ 172 +mqu 171 +mq 171 +ss 170 +sque_ 169 +ibu 167 +ad 166 +ibus 165 +I 164 +nte 163 +ra_ 163 +mque 162 +_de 162 +po 161 +_se 160 +ere_ 160 +nc 160 +qua 159 +T 159 +lo 157 +oc 156 +mque_ 156 +_T 155 +_pa 155 +_pr 155 +tem 154 +bus_ 152 +nti 149 +rum 149 +er_ 149 +ab 148 +ir 148 +da 147 +_ve 146 +ibus_ 146 +ex 146 +ut 145 +pi 145 +tur 145 +_ca 143 +_me 142 +es_ 142 +gi 142 +te_ 141 +_I 141 +vo 141 +do 141 +_si 140 +tus 139 +il 137 +_ar 136 +du 133 +nt_ 133 +uc 133 +fa 132 +as_ 132 +rr 131 +ba 130 +_ad 128 +ne_ 127 +_ma 127 +ens 127 +gn 126 +s. 126 +y 126 +min 125 +ris 124 +in_ 123 +tum 123 +P 123 +_g 123 +mp 123 +e, 122 +io 122 +_P 122 +ea 122 +hi 122 +e,_ 121 +era 120 +sc 120 +_la 120 +qui 120 +unt 120 +fe 119 +_in_ 118 +_no 118 +ore 118 +iam 118 +va 117 +tis_ 117 +s._ 117 +at_ 117 +eri 116 +d_ 116 +con 115 +fu 115 +pu 114 +cum 114 +ub 114 +ng 114 +ine 113 +_au 113 +: 113 +_di 112 +ag 111 +_con 111 +ect 111 +i, 111 +equ 111 +i,_ 111 +be 111 +eq 111 +_po 110 +so 110 +:_ 110 +nis 109 +ha 109 +uo 109 +_fa 108 +na_ 107 +ip 107 +is, 107 +_cu 106 +cr 106 +ate 105 +is,_ 105 +ig 105 +tor 105 +rat 104 +_qua 103 +eg 103 +a, 103 +a,_ 102 +tra 102 +_mo 101 +sp 101 +mis 100 +itu 100 +D 99 +ali 99 +eb 99 +eni 99 +_sa 98 +ie 98 +imu 98 +_ex 97 +_D 96 +res 95 +est 94 +tri 94 +ene 94 +_mi 94 +str 94 +enti 93 +t. 92 +av 92 +_per 91 +ur_ 91 +ora 91 +lt 91 +umqu 90 +_vo 90 +umq 90 +up 89 +t._ 88 +quo 88 +_ne 88 +gen 88 +rum_ 87 +tqu 87 +tq 87 +_fu 86 +ep 86 +ma_ 86 +umque 86 +it,_ 85 +ine_ 85 +it, 85 +men 85 +mus 84 +ort 83 +ven 83 +ina 83 +us,_ 83 +us, 83 +tque 82 +_ge 82 +per_ 82 +mor 82 +inc 82 +are 81 +tus_ 81 +_an 81 +rim 81 +tque_ 81 +ot 81 +ani 80 +H 80 +_tu 80 +ho 80 +tem_ 80 +u_ 80 +ser 79 +um,_ 79 +um, 79 +S 79 +ten 79 +ver 79 +sti 79 +ntu 78 +fer 78 diff --git a/libtextcat/data/new_fingerprints/lm/latvian.lm b/libtextcat/data/new_fingerprints/lm/latvian.lm new file mode 100644 index 000000000000..bffdd309b9bb --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/latvian.lm @@ -0,0 +1,400 @@ +_ 24504 +a 8735 +i 7353 +s 6064 +t 5176 +e 4390 +r 4339 +u 4200 +n 3422 +â 3055 +m 2817 +p 2755 +k 2703 +o 2643 +s_ 2595 +d 2131 +l 2035 +j 1972 +î 1958 +ie 1887 +v 1822 +as 1530 +u_ 1396 +_p 1380 +b 1357 +z 1311 +ç 1267 +a_ 1261 +as_ 1217 +. 1065 +, 1048 +,_ 1033 +g 1010 +ar 970 +ð 965 +ti 962 +c 922 +_a 876 +ja 838 +st 828 +_i 822 +_v 805 +pa 792 +_t 775 +._ 774 +um 760 +_k 749 +â_ 734 +_n 728 +es 712 +i_ 706 +at 703 +is 695 +_s 680 +ai 649 +ta 635 +an 631 +ka 621 +r_ 615 +îb 586 +ij 585 +_u 568 +_pa 548 +un 541 +no 539 +va 537 +ma 525 +ra 522 +ri 515 +iz 512 +in 509 +vi 508 +pr 506 +sa 487 +ju 486 +tâ 483 +ik 473 +n_ 472 +am 454 +en 452 +tie 440 +na 432 +ða 430 +tu 420 +ija 420 +ir 419 +o_ 415 +m_ 414 +li 411 +ci 409 +ku 404 +mi 401 +_no 398 +ek 398 +_un 397 +t_ 395 +_l 394 +un_ 393 +_d 393 +ot 392 +_un_ 392 +ies 391 +ar_ 388 +nâ 387 +da 387 +ei 385 +em 382 +_ti 382 +pi 380 +al 368 +ba 356 +ts 355 +_iz 353 +ap 350 +jas 349 +ko 346 +au 345 +ðan 339 +re 339 +ni 337 +kâ 328 +ro 327 +et 326 +rî 325 +ga 323 +mu 322 +jas_ 321 +tr 317 +_va 312 +te 310 +_pr 307 +sp 298 +sk 297 +ne 295 +îba 293 +âs 289 +ijas 287 +_ar 283 +la 283 +to 282 +ad 280 +dz 278 +îg 277 +tî 277 +çj 277 +_sa 277 +jâ 274 +_m 273 +ties 273 +lî 272 +ijas_ 269 +_at 269 +ed 267 +_pi 267 +ò 267 +s, 265 +s,_ 265 +çr 261 +gu 258 +f 258 +si 257 +mâ 256 +nt 254 +tu_ 254 +_r 253 +ru 252 +âj 251 +jum 251 +âs_ 250 +ât 249 +iem 248 +_b 248 +_vi 245 +par 244 +pie 244 +rt 243 +vç 242 +_ne 240 +âr 240 +ai_ 239 +_. 239 +_j 237 +us 237 +_ka 236 +er 232 +bas 232 +_tie 232 +_par 230 +di 229 +û 228 +ms 228 +âd 226 +îbas 224 +om 220 +bu 219 +umu 217 +iek 216 +kt 216 +it 216 +râ 215 +_ko 214 +ana 211 +âk 211 +or 210 +de 210 +inâ 207 +dî 204 +ur 203 +pri 203 +_c 202 +ta_ 202 +bas_ 201 +es_ 201 +_pie 200 +il 200 +os 199 +ðana 199 +id 198 +L 198 +_ie 197 +bi 194 +ak 194 +ja_ 194 +îbas_ 193 +çt 192 +îbu 192 +_g 191 +ï 189 +ma_ 189 +ien 188 +kum 187 +— 187 +_ir 186 +on 186 +_ties 185 +zî 185 +ce 185 +ied 184 +ist 183 +ts_ 183 +_— 183 +—_ 182 +âm 181 +vie 180 +_—_ 180 +cij 180 +ka_ 179 +_ap 175 +sî 174 +ir_ 174 +zi 174 +uma 173 +âl 172 +_ir_ 172 +oð 171 +_da 169 +î_ 168 +arî 168 +) 167 +do 166 +ve 166 +( 165 +ls 165 +bu_ 164 +val 164 +mu_ 164 +isk 163 +uz 161 +av 160 +par_ 160 +_vç 159 +_par_ 158 +vçr 158 +_ar_ 157 +rie 157 +_uz 157 +aj 156 +îbu_ 156 +oj 155 +kâ_ 155 +ld 154 +iet 154 +iku 154 +ks 153 +du 153 +ep 153 +ms_ 152 +ec 152 +V 151 +ais 150 +str 150 +nie 149 +am_ 149 +ums 148 +_( 147 +ju_ 146 +z_ 142 +ru_ 142 +îj 142 +gum 142 +u,_ 141 +u, 141 +iò 141 +uma_ 141 +çrt 141 +îgu 141 +me 140 +dâ 140 +ît 140 +ent 139 +ikum 138 +pâ 138 +em_ 138 +_L 138 +lie 136 +sta 136 +rî_ 136 +lst 136 +eik 135 +se 135 +s. 134 +pro 134 +rs 134 +s._ 134 +tik 134 +lç 134 +ska 133 +pap 133 +kas 133 +rm 133 +âju 133 +pî 132 +nu 132 +T 132 +pç 131 +tei 131 +nas 131 +_V 131 +jo 131 +lîg 131 +ut 131 +iem_ 130 +ras 129 +pu 129 +_li 129 +_ga 128 +dar 128 +_kâ 128 +umu_ 127 +îv 126 +îgum 126 +kas_ 126 +tîb 125 +vai 125 +lîgu 125 +P 125 +arî_ 125 +ev 124 +lai 124 +îr 124 +lîgum 124 +tv 123 +að 123 +_arî_ 122 +_arî 122 +_vie 121 +S 121 +_T 120 +îju 120 +teik 120 +cija 119 +rb 119 +_ð 119 +jâ_ 119 +nas_ 118 +tâj 118 +vçrt 118 +iec 118 +_la 117 +îd 116 +_vai 116 +îjum 116 +vien 116 +als 116 +_lî 116 +apîr 115 +rts 115 +isi 115 +pîr 115 +papî 115 +papîr 115 +apî 115 +tsp 115 +mç 114 +im 114 +be 114 +is_ 114 +alst 114 +_ja 114 +rtspa 113 diff --git a/libtextcat/data/new_fingerprints/lm/lithuanian.lm b/libtextcat/data/new_fingerprints/lm/lithuanian.lm new file mode 100644 index 000000000000..eca25a45b2ba --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/lithuanian.lm @@ -0,0 +1,400 @@ +_ 23070 +i 8196 +a 7763 +s 5179 +e 3458 +o 3425 +u 3371 +t 3344 +r 3240 +n 3070 +k 3019 +l 2060 +p 1889 +s_ 1756 +m 1680 +d 1640 +v 1590 +, 1501 +,_ 1497 +ó 1400 +g 1302 +j 1200 +o_ 1138 +. 1128 +ai 1113 +÷ 1075 +_p 1047 +_k 1035 +i_ 990 +._ 984 +as 975 +b 937 +au 903 +is 863 +_n 815 +ka 811 +si 804 +_s 799 +ia 796 +ý 759 +ta 735 +ie 731 +ti 728 +y 721 +_t 711 +in 697 +_i 686 +us 675 +pa 658 +ir 640 +ar 633 +ù 620 +_a 617 +r_ 609 +ne 583 +a_ 578 +ri 569 +_v 558 +_pa 541 +al 540 +ra 531 +li 522 +à 509 +u_ 493 +vi 490 +_ka 489 +da 487 +_ne 482 +_j 481 +en 478 +ù_ 445 +os 444 +ki 442 +õ 441 +an 440 +ik 434 +ma 433 +as_ 430 +st 420 +_d 420 +ai_ 417 +s,_ 396 +am 396 +is_ 396 +s, 396 +_ir 390 +jo 387 +ir_ 385 +at 384 +_ir_ 383 +i÷ 381 +ni 377 +ga 376 +_b 367 +na 366 +e_ 365 +to 352 +pr 337 +_g 332 +_m 332 +û 328 +ß 326 +io 323 +ei 323 +Ã_ 321 +su 320 +ßi 320 +uo 319 +la 317 +er 303 +va 299 +vo 298 +ýi 293 +es 291 +- 290 +ó_ 286 +-_ 285 +_vi 284 +_- 283 +ko 283 +_-_ 283 +iau 281 +sa 278 +s. 278 +iu 276 +et 274 +nu 265 +ja 262 +õ_ 262 +_ta 262 +s._ 259 +el 258 +ój 258 +os_ 256 +im 255 +it 254 +_pr 253 +no 252 +av 251 +ur 251 +_÷ 248 +mi 246 +ve 245 +ak 245 +ku 243 +ek 241 +tu 241 +_ý 241 +ad 241 +ñ 238 +_l 238 +_i÷ 236 +us_ 236 +_su 232 +oj 231 +au_ 231 +ba 224 +nt 218 +me 214 +te 210 +jo_ 208 +ro 205 +ós 204 +iù 203 +tai 203 +_õ 201 +ló 201 +pas 200 +ip 198 +iai 197 +di 195 +usi 194 +vo_ 193 +kai 190 +i, 189 +i,_ 189 +sk 187 +_r 186 +aus 184 +ap 183 +gi 182 +mo 182 +uk 181 +t_ 180 +_da 177 +be 176 +ien 176 +o, 174 +il 174 +o,_ 173 +ti_ 173 +re 172 +_nu 171 +ßia 171 +_pas 169 +pri 169 +÷_ 169 +ok 168 +dó 166 +ól 162 +bu 161 +÷i 160 +on 160 +asi 159 +id 158 +ul 157 +ji 157 +aip 156 +ia_ 155 +_sa 154 +ot 154 +gal 153 +nk 153 +om 152 +p_ 151 +_pri 151 +od 151 +vis 150 +_at 150 +vie 150 +uv 150 +ab 150 +iù_ 149 +ama 149 +nó 149 +A 149 +sta 148 +kr 148 +_A 145 +ais 144 +rt 143 +J 142 +_J 142 +ójo 142 +_õ_ 141 +K 140 +_K 139 +pi 139 +avo 139 +ng 139 +mas 138 +du 138 +ug 137 +_ga 137 +ol 136 +tik 135 +ali 135 +.. 135 +_vis 135 +iek 134 +ini 133 +kad 133 +a, 133 +le 133 +kó 133 +T 133 +tó 132 +i. 132 +_T 131 +dý 131 +a,_ 131 +go 131 +ip_ 131 +aip_ 131 +ró 129 +a÷ 129 +dýi 129 +ke 128 +k_ 128 +d_ 127 +ad_ 127 +_ti 127 +_ma 126 +_va 125 +i._ 124 +z 123 +÷k 123 +iau_ 121 +m_ 120 +kar 119 +um 119 +tù 119 +_kad 118 +uý 117 +tr 117 +_kai 117 +as, 116 +og 116 +kad_ 116 +as,_ 116 +_ji 116 +rie 115 +yt 114 +_ja 114 +_ko 114 +vó 114 +ys 114 +jau 113 +ar_ 113 +pra 112 +aý 112 +ant 112 +kl 111 +tas 111 +gu 111 +_ku 110 +si_ 110 +pe 109 +eb 109 +N 109 +ójo_ 108 +_kad_ 108 +se 108 +_ki 108 +or 107 +ly 107 +lo 107 +iai_ 107 +_la 106 +ts 106 +ñs 106 +ig 105 +_N 105 +ut 105 +_u 105 +ós_ 105 +avo_ 104 +ie_ 104 +a. 104 +kt 104 +em 103 +do 103 +je 102 +ks 102 +ats 101 +_be 101 +ris 101 +l_ 101 +_vie 101 +tin 101 +ag 100 +dam 100 +_st 100 +a._ 99 +_bu 99 +? 99 +V 98 +io_ 98 +kas 98 +_jo 98 +vien 97 +_gal 97 +ót 97 +kia 97 +lia 96 +c 96 +uri 96 +_V 96 +?_ 96 +uvo 95 +ru 95 +ty 95 +ep 94 +nd 94 +lai 94 +_tai 94 +ju 93 +man 92 +o. 92 +buv 92 +tà 92 +_÷i 91 +i÷_ 91 +_i÷_ 91 +i÷k 90 +o._ 90 +_to 90 +bi 90 +up 89 +po 89 +rs 89 +_prie 88 +ù,_ 88 +aik 88 +ui 88 +_tik 88 +ij 88 +ù, 88 +pat 88 +prie 88 +I 87 +ies 87 +tai_ 87 +n_ 87 +pasi 87 +ius 87 +÷t 87 +eik 86 +_me 86 +ina 86 diff --git a/libtextcat/data/new_fingerprints/lm/luxembourgish.lm b/libtextcat/data/new_fingerprints/lm/luxembourgish.lm new file mode 100644 index 000000000000..149c3d9d4359 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/luxembourgish.lm @@ -0,0 +1,400 @@ +_ +e +n +r +t +i +a +s +u +er +h +d +o +l +n_ +g +en +c +ch +_d +m +t_ +r_ +_a +de +e_ +en_ +er_ +an +z +é +, +un +,_ +. +ge +b +k +w +ë +f +._ +éi +_de +v +p +sc +sch +es +nn +ng +te +_v +ne +_g +ue +h_ +ie +ch_ +m_ +ou +i_ +ä +s_ +_e +ee +el +ze +at +re +ss +' +vu +_vu +D +le +se +st +_an +_s +un_ +g_ +et +_D +_h +he +an_ +ec +in +rt +_an_ +ro +der +ll +is +ht +ech +cht +d' +_d' +S +nt +uer +der_ +éi_ +em +_o +it +L +l_ +on +esc +A +esch +ti +al +us +ier +_z +ra +E +we +che +_S +ir +ei +ët +äi +vun +_w +me +_m +_A +be +ert +vun_ +_vun +_vun_ +nne +tz +ng_ +eb +_der +_E +_der_ +ar +_n +rg +n,_ +u_ +_L +n, +den +eng +um +_b +io +ns +erg +au +_en +K +M +_ge +dé +as +eg +tze +ung +a_ +n. +hu +ner +op +n._ +ur +et_ +B +oun +ën +hi +si +bu +nn_ +ëtz +_dé +déi +_f +inn +de_ +_déi +li +tt +ebu +W +ass +uerg +nd +ebue +bue +rt_ +ëtze +buer +déi_ +ma +ebuer +_déi_ +_K +_hu +nge +_M +buerg +ëtzeb +zebue +zeb +zebu +tzebu +tzeb +um_ +ioun +ss_ +iou +ges +ere +ha +den_ +Lë +éie +F +ke +_a_ +t. +ta +Lëtz +G +Lëtze +Lët +t._ +en,_ +t,_ +en, +t, +at_ +ech_ +o_ +_de_ +ren +ri +ic +ter +_W +_eng +éier +la +ol +rs +scht +ir_ +ru +_B +ert_ +_k +oc +P +ten +ht_ +_Lë +nz +em_ +p_ +ent +wa +cht_ +_den +_Lët +_Lëtz +ich +_G +_den_ +tio +il +nner +ger +_as +sse +_op +och +ll_ +_ass +R +am +- +_se +_F +sche +d_ +sch_ +ati +_be +ts +ik +nen +De +ers +_. +_P +_._ +fe +ass_ +mm +gi +aa +zu +_De +ve +pe +fi +tr +lt +en. +_ass_ +eng_ +op_ +ck +en._ +sen +na +rge +ës +kt +ed +_al +äit +so +uro +_op_ +gesc +_R +gesch +Eur +tiou +Euro +aus +Eu +erge +hen +tioun +lec +no +fir_ +chen +fir +V +hue +_si +or +ut +ac +uf +Z +gen +tu +ver +lech +da +mat +_V +_Eur +_Eu +wer +lle +_Euro +uerge +_ze +éis +J +est +ger_ +tt_ +_hue +go +_fi +dee +_fir_ +uet +vum +_zu +ni +_vum +_fir +een +_vum_ +vum_ +huet +ann +_huet +T +I +el_ +_wa diff --git a/libtextcat/data/new_fingerprints/lm/malay.lm b/libtextcat/data/new_fingerprints/lm/malay.lm new file mode 100644 index 000000000000..911d0cfef150 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/malay.lm @@ -0,0 +1,400 @@ +_ 87128 +a 50232 +n 25424 +e 18746 +i 18605 +an 14419 +u 12470 +k 11955 +t 11875 +r 11007 +d 9856 +g 9545 +m 9390 +s 8926 +l 8631 +n_ 8004 +an_ 7095 +p 6890 +a_ 6739 +b 6645 +ng 6630 +h 5964 +da 5153 +_d 5107 +er 4625 +ka 4448 +la 4339 +y 4323 +i_ 4295 +en 4159 +ya 3800 +ang 3778 +_m 3750 +o 3593 +. 3539 +._ 3310 +at 3288 +ah 3216 +_b 3199 +_s 3177 +ta 3076 +ra 3036 +_k 2949 +g_ 2939 +ng_ 2933 +ar 2915 +_p 2906 +me 2884 +ga 2781 +di 2778 +ak 2727 +al 2705 +_me 2671 +ang_ 2524 +h_ 2509 +ba 2508 +pa 2454 +kan 2449 +in 2437 +tu 2411 +_t 2376 +sa 2338 +_da 2276 +j 2276 +pe 2225 +, 2150 +k_ 2147 +ma 2143 +se 2138 +am 2131 +kan_ 2117 +_di 2082 +,_ 2079 +ke 2048 +un 2004 +be 1947 +_a 1925 +na 1871 +ti 1868 +ri 1861 +u_ 1857 +as 1854 +ny 1827 +ha 1789 +te 1788 +_pe 1768 +em 1750 +it 1737 +_i 1732 +_ke 1711 +yan 1706 +ad 1698 +ia 1673 +yang 1673 +_y 1668 +_ya 1655 +yang_ 1653 +_se 1648 +ah_ 1646 +_yan 1639 +_yang 1639 +ala 1612 +nya 1587 +el 1576 +ik 1571 +t_ 1568 +ai 1549 +men 1531 +eng 1522 +_men 1464 +nga 1441 +dan 1366 +_be 1365 +si 1343 +uk 1328 +ada 1299 +nt 1291 +__ 1287 +ap 1276 +ua 1265 +___ 1238 +- 1213 +ja 1211 +ber 1204 +gan 1203 +_ba 1193 +____ 1189 +ni 1181 +_te 1169 +c 1143 +ran 1141 +_____ 1140 +m_ 1127 +ara 1118 +per 1099 +le 1084 +_dan 1083 +dan_ 1079 +ngan 1060 +_dan_ 1050 +ya_ 1046 +at_ 1044 +da_ 1021 +li 1016 +aka 1013 +A 999 +r_ 999 +w 997 +eb 995 +lah 980 +ata 980 +ak_ 978 +nd 974 +_ber 955 +gi 936 +is 933 +il 931 +tu_ 923 +s_ 920 +gan_ 915 +mb 913 +wa 904 +ag 903 +ngan_ 898 +ter 887 +nya_ 877 +S 873 +ek 853 +ru 852 +_l 838 +ela 828 +itu 824 +ol 822 +aha 822 +ada_ 820 +pu 812 +di_ 807 +bu 807 +am_ 804 +ur 801 +tan 790 +mp 790 +_per 786 +_sa 784 +M 782 +ut 781 +us 779 +era 779 +lam 778 +lah_ 775 +asa 767 +ki 761 +ir 759 +de 756 +enga 750 +su 748 +du 741 +id 739 +" 733 +akan 732 +apa 728 +_S 724 +ul 721 +lu 717 +ari 717 +dal 704 +et 698 +es 698 +pad 688 +_ma 688 +_M 685 +ana 684 +bi 679 +pada 673 +dala 673 +l_ 671 +ep 664 +f 662 +_di_ 658 +B 655 +ing 655 +_j 654 +ika 653 +ku 650 +_. 644 +akan_ 642 +ama 637 +pen 636 +alam 634 +eh 634 +pada_ 633 +ai_ 632 +_ter 632 +K 631 +mu 628 +ju 628 +P 626 +mem 625 +au 622 +_mem 614 +lan 612 +_._ 611 +ntu 608 +lam_ 605 +um 601 +on 600 +gk 597 +_in 597 +ngk 597 +a. 584 +meng 582 +_meng 578 +alam_ 577 +_A 576 +aa 575 +uk_ 572 +_pen 569 +ban 569 +or 569 +st 566 +ay 566 +dar 565 +_pa 564 +a._ 564 +_h 562 +bah 562 +_P 560 +D 559 +ri_ 558 +ini 552 +_de 551 +rt 550 +aan 545 +_it 542 +_itu 542 +nda 540 +eri 540 +dalam 537 +_B 533 +_dal 532 +ip 532 +_dala 532 +ta_ 528 +_u 527 +ung 525 +ih 524 +aw 520 +_n 519 +atu 517 +ila 513 +mi 513 +leh 513 +ian 512 +tuk 509 +awa 508 +gu 506 +ert 506 +engan 505 +ole 504 +_K 501 +seb 497 +ca 496 +gg 493 +_ta 489 +ra_ 488 +ngg 488 +itu_ 487 +emb 482 +ni_ 482 +ida 482 +nj 482 +_ti 479 +man 478 +den 477 +_D 474 +_ka 473 +aj 470 +oleh 468 +n. 468 +n._ 464 +ngka 464 +gka 464 +dak 464 +anga 461 +ena 459 +san 458 +pat 458 +rk 458 +( 455 +ent 454 +agi 453 +) 451 +ia_ 450 +ge 450 +ab 449 +im 447 +_ini 446 +ntuk 445 +I 445 +ar_ 440 +N 439 +aan_ 436 +_la 433 +pi 432 +baha 431 +deng 430 +han 430 +bag 429 +eh_ 429 +hu 429 +denga 428 +_o 427 +na_ 427 +T 425 +leh_ 422 +_den 422 +ka_ 419 +any 415 +ud 415 +rang 414 +anya 413 +gi_ 412 +angan 412 +a, 411 +_deng 410 +ita 409 +kat 408 +re 408 +_( 406 +tuk_ 403 +aga 401 +ne 400 +and 399 +aya 398 +_dar 397 +a,_ 396 +ro 396 +ntuk_ 393 +eba 392 +aran 390 +_" 387 +ed 385 +end 384 +ko 383 +sa_ 381 +p_ 381 +ara_ 380 +_seb 379 +alah 379 +oleh_ 379 +an. 378 +dak_ 378 +eg 378 +hi 376 +dari 375 +an._ 375 +au_ 373 +bo 373 +ti_ 371 +ula 371 diff --git a/libtextcat/data/new_fingerprints/lm/manx_gaelic.lm b/libtextcat/data/new_fingerprints/lm/manx_gaelic.lm new file mode 100644 index 000000000000..e6eceebeb080 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/manx_gaelic.lm @@ -0,0 +1,400 @@ +_ 36004 +e 9455 +a 8302 +y 6395 +n 6395 +h 5736 +r 4939 +s 4799 +o 4429 +i 4129 +l 3462 +y_ 2896 +g 2731 +n_ 2549 +d 2232 +t 2160 +_a 1836 +m 1823 +sh 1774 +e_ 1746 +h_ 1645 +yn 1550 +c 1531 +ee 1515 +gh 1461 +s_ 1460 +_s 1388 +ey 1345 +_e 1316 +ag 1312 +a_ 1201 +r_ 1173 +agh 1165 +in 1148 +as 1136 +_d 1136 +u 1124 +he 1060 +yn_ 1025 +oo 1012 +ey_ 1005 +_m 1002 +er 965 +v 949 +_y 941 +_v 895 +_c 891 +_as 880 +' 869 +, 866 +_n 832 +ll 828 +,_ 827 +_sh 818 +. 802 +hi 797 +as_ 770 +ee_ 768 +ne 767 +_as_ 764 +b 762 +re 735 +ay 730 +._ 730 +an 726 +ar 725 +gh_ 722 +ny 709 +en 703 +_r 697 +ch 688 +agh_ 687 +dy 686 +t_ 677 +le 667 +k 658 +er_ 616 +oi 612 +ea 607 +_t 601 +yr 596 +_er 585 +ra 574 +_dy 572 +in_ 570 +l_ 564 +f 557 +_l 556 +ha 551 +_g 548 +_ny 534 +nn 530 +" 528 +_ch 527 +_y_ 524 +ie 514 +dy_ 514 +_dy_ 513 +aa 510 +_f 509 +j 504 +sh_ 487 +oa 480 +is 478 +_h 470 +rr 468 +ny_ 467 +_ny_ 463 +_er_ 454 +ish 445 +ho 442 +ai 441 +d_ 435 +ro 423 +ht 418 +ei 417 +shi 416 +il 409 +me 408 +_ay 403 +_b 403 +la 400 +_j 400 +my 394 +va 391 +ns 386 +on 385 +_o 381 +ys 380 +_shi 379 +ia 377 +ayn 373 +_va 371 +hen 362 +she 356 +ri 345 +lle 342 +ooi 342 +mee 340 +ley 335 +_me 331 +el 330 +rt 328 +ie_ 327 +eh 324 +w 316 +_ayn 313 +al 311 +g_ 309 +ish_ 308 +lley 307 +mee_ 305 +_mee 304 +ill 301 +es 299 +na 299 +je 298 +yns 296 +C 294 +_my 291 +_she 290 +ley_ 282 +V 280 +_yn 278 +_" 278 +_mee_ 277 +ta 272 +_V 271 +ys_ 268 +- 268 +lley_ 265 +hin 264 +_ro 259 +shin 256 +_yn_ 255 +_je 255 +do 253 +va_ 253 +ne_ 253 +_va_ 252 +ns_ 252 +_shin 251 +yns_ 250 +ayns 248 +en_ 247 +che 246 +_ayns 246 +eh_ 246 +_do 242 +ad 241 +ney 240 +o_ 240 +ym 240 +ed 239 +yr_ 239 +ayns_ 237 +ur 237 +st 234 +_C 234 +rt_ 234 +'n 232 +m_ 232 +p 231 +li 231 +or 230 +ow 228 +hin_ 225 +da 225 +shen 223 +"_ 223 +'n_ 223 +Va 221 +ght 220 +shin_ 219 +tr 217 +_Va 217 +ry 216 +ve 216 +_shen 215 +ty 214 +mo 206 +_' 205 +_ve 205 +ma 203 +be 203 +te 203 +hie 203 +hey 203 +nag 202 +ll_ 201 +yl 200 +w_ 200 +ss 200 +aa_ 198 +nagh 198 +an_ 197 +io 195 +ow_ 194 +it 194 +sy 193 +ayr 193 +ney_ 192 +E 192 +sht 192 +ni 191 +_k 190 +ha_ 190 +ain 189 +u_ 189 +hy 189 +aght 188 +oo_ 188 +ree 188 +lh 187 +_tr 186 +esh 186 +_che 183 +yrt 182 +_da 182 +oar 182 +doo 181 +k_ 181 +se 180 +au 180 +ille 179 +ar_ 179 +_lh 179 +ki 177 +arr 176 +ec 176 +ol 175 +_doo 175 +T 175 +row 175 +_row 174 +ge 173 +so 172 +oy 171 +oil 170 +_re 170 +_ag 170 +'e 169 +rey 169 +illey 169 +ck 168 +ad_ 168 +ann 168 +n, 166 +eea 166 +_ta 166 +ht_ 165 +ae 162 +_row_ 162 +! 162 +row_ 162 +ane 161 +fe 161 +dd 160 +go 159 +tyn 159 +oin 158 +ooa 158 +n. 158 +eg 156 +_ec 156 +_ma 156 +_agh 155 +n,_ 154 +_fe 154 +Ch 154 +nyn 153 +fo 152 +eay 152 +nagh_ 152 +n._ 152 +_go 151 +S 150 +ke 150 +hey_ 150 +enn 150 +cha 149 +rre 149 +_fo 149 +ghe 149 +raa 149 +G 148 +lan 148 +mm 147 +ym_ 147 +A 146 +c_ 146 +oill 145 +hee 144 +ooar 144 +_E 144 +nne 143 +tey 142 +ir 141 +de 141 +hyn 140 +_he 140 +nyn_ 140 +'s 139 +_gh 139 +_cha 138 +L 138 +yrt_ 138 +_Ch 137 +e, 137 +lla 136 +Va_ 136 +ooin 136 +ell 135 +a' 135 +os 135 +_oo 134 +am 134 +rish 133 +th 133 +_mo 133 +ris 133 +iag 133 +gg 133 +_Va_ 132 +iagh 132 +_G 132 +e,_ 132 +ass 132 +!_ 132 +my_ 132 +hoo 131 +_T 131 +nee 130 +a'n_ 129 +rag 129 +a'n 129 +_'s 129 +_so 128 +co 128 +rey_ 128 +_eh 126 +_my_ 126 +nni 126 +ou 126 +_'sy 125 +'sy 125 +_p 125 +vo 125 +_S 125 +H 125 +_agh_ 125 +tra 124 +moo 124 +hu 123 +ooy 123 +ragh 123 +al_ 123 +it_ 123 +hia 122 +id 122 diff --git a/libtextcat/data/new_fingerprints/lm/marathi.lm b/libtextcat/data/new_fingerprints/lm/marathi.lm new file mode 100644 index 000000000000..479f4fd9b760 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/marathi.lm @@ -0,0 +1,400 @@ +þ 17815 +_ 14534 +þþ 4035 +þ_ 3087 +· 2180 +ø 1762 +·þ 1669 +ú 1654 +¡ 1534 +¥ 1480 +¡þ 1433 +Å 1342 +£ 1289 +ˆ 1237 +› 1163 + 1158 +¬ 1153 +þþ_ 1133 +¿ 1099 +¨ 1084 +Ÿ 1062 + 1059 +ú_ 1054 +¥þ 1054 +ˆÅ 1050 +¡þþ 1039 +þú 989 +œ 983 +›þ 968 +‚ 965 +¹ 949 +_‚ 936 +. 936 +¨þ 915 +þ¿ 859 +þ£ 842 +¬þ 838 +Ÿþ 836 +þ· 769 +þ 764 +µ 710 +þ¥ 710 +œþ 684 +þ·þ 681 +._ 670 +þú_ 637 +½ 609 +‚þ 578 +_Ÿ 578 +_‚þ 569 +_ˆ 543 +þ 530 +_· 530 +µþ 524 +_œ 523 +Š 511 + 504 +ø_ 495 +·þ_ 477 +þ¥þ 476 +þ 474 +_¹ 474 +_¬ 457 +_ˆÅ 447 +_Ÿþ 447 +¡þþ_ 446 +þø 444 +¸ 438 +Šþ 423 +¥þþ 422 +þ 409 +þ¨ 388 +Åþ 387 +þþ 381 +£_ 366 +ˆÅþ 359 +é 339 +þ¨þ 334 +_¬þ 332 +þµ 331 +þ› 324 +›þþ 320 +_œþ 320 +Ÿþþ 318 +ª 315 +_ 315 +›þ_ 314 +Û 313 +þ¹ 306 +þ£_ 303 +þ¬ 301 +þˆ 297 + 297 +þþ 295 +ªþ 284 +þ¡ 280 +§ 274 +¿_ 272 +þþ· 270 +þ¡þ 268 +¥þþ_ 268 +£þ 264 +þ 264 +ú 263 +, 258 +þþ·þ 256 +þ¥þþ 256 +þ·þ_ 256 +,_ 255 +þˆÅ 255 +Ù 249 +¬þþ 247 +þþ¿ 243 +þþ£ 241 +_› 234 +_·þ 233 +·þþ 232 +þœ 232 +·¡ 231 +þ¿_ 231 +þ›þ 230 +þ¬þ 228 +Ä 227 +š 226 +þµþ 225 +½_ 223 +·¡þ 221 +½ 220 +_ 220 +¥ø 219 +µþ_ 218 +¨þþ 217 +þ. 216 +þŸ 215 +_›þ 213 +.. 209 +_Ÿþþ 209 +þþþ 205 +þþ 205 +·¡þþ 204 +‰ 204 +·þú 203 +þ¥þþ_ 202 +Å_ 202 +þþ¥ 201 +ê 198 +_¨ 198 +þŠ 193 +_¡ 191 +þœþ 190 +þ._ 189 +¡þ 188 +¡ 188 +¡þþ 187 +Ï 186 +ø· 186 +ž 185 +ú_ 180 +þ_ 180 +_þ 180 +‰þ 179 +Ú 178 +_ˆÅþ 176 +_·¡þ 175 +_·¡þþ 175 +þ 175 +_·¡ 175 +þŠþ 174 +þú 171 +¥¡ 170 +_ 170 +... 168 +¥¡þ 167 +þú 166 +_¥ 164 +þŸþ 164 +‚þ 163 +¥¡þþ 163 +žþ 162 +ø·þ 162 +_‚þ 162 +¡þþ_ 161 +ø 160 +ˆÅ_ 159 +þþ¥þ 159 +þ½ 157 +œþþ 154 +þ 154 +_þ 154 +_¸ 154 +ø¥ 153 +þþ 153 +šþ 151 +þ£þ 151 +_Š 151 +‚þ½ 150 +þ§ 150 +·ø 150 +ø. 149 +_‚þ½ 149 +‚¬ 148 +¥þú 148 +£ú 147 +œÏ 147 +Å£ 146 +þú_ 146 +_‚¬ 145 +ø._ 145 +þþ¨ 144 +þÄ 143 +‹ 142 + 140 +¡þþ 140 +¹¨þ 139 +¹¨ 139 +þþ 137 +þþ› 136 +_ø 136 +Ÿ 136 +_¡þ 135 +ú. 135 +·þø 135 +' 135 +ˆÅ£ 134 +_¥þ 132 +þþ¨þ 131 +þþ·þ_ 131 +µþþ 129 +'_ 129 +_›þþ 129 +ú._ 128 +þÛ 127 +.... 127 +˜ 127 +¿ 126 +þþ¬ 125 +_¨þ 124 +¡þ_ 123 +þ·þþ 123 +« 122 +; 122 +_œÏ 121 +_¡þþ 119 +» 118 +¬þ¿ 115 +¥þ¿ 114 +¥þ_ 114 +þþŠ 113 +þ¥¡þ 113 +þ¥¡ 113 +b 113 +þ£ú 112 +_' 112 +s 111 +& 111 +þþ 111 +‚¬þ 111 +þþú 111 +p 111 +ê› 111 +_£ 110 +þ¥¡þþ 110 +bs 109 +ê›þ 109 +n 109 +º 109 +_‚¬þ 108 +&n 107 +bsp; 107 +nbsp; 107 +sp; 107 +nb 107 +bsp 107 +nbs 107 +sp 107 +nbsp 107 +&nb 107 +  107 +p; 107 +&nbs 107 +þþþ 106 +_'_ 106 +þ¥ø 106 +·þ£ 106 +ø¥þ 106 +þþ_ 105 +¨þ£ 104 +þ¿ 104 +˜þ 104 +þ¸ 104 +_Šþ 103 +þ¬þþ 103 +›ø 103 +ø· 102 +þþŠþ 101 +þþú 100 +þ›þþ 99 +þ, 99 +_ 98 +þ,_ 98 +_ˆÅ£ 98 +‚þœ 98 +þþú_ 98 +_þþ 97 +;& 97 +_‚þœ 97 +‚þœþ 97 +_‚þœþ 96 +þ 96 +¡þþþ 96 +¬þ· 96 +‡ 96 +;&nb 95 +p;&nb 95 +;&n 95 +p;& 95 +;&nbs 95 +bsp;& 95 +..... 95 +ê›þ_ 95 +sp;&n 95 +sp;& 95 +p;&n 95 +° 94 +_‡ 94 +ƒ 94 +¥¡þþ_ 94 +þþ¬þ 93 +Ÿþú 93 +þµþ_ 93 +£ú_ 93 +œþµ 93 +_ž 92 +µ¡ 92 +_ø· 92 +þú_ 92 +µ¡þ 92 +·þú_ 91 +¹ 91 +½_ 90 +_¬þþ 90 +_œþþ 90 +þþ¥þþ 90 +„ 90 +µ¡þþ 90 +‹þ 89 +¹· 89 +þ¡þ 89 +þ£ú_ 89 +þÚ 89 +þ¡ 89 +þˆÅþ 89 +¿Š 89 +Šþþ 89 +_Ÿ 88 +½Å 88 +þ¡þþ 88 +_¹¨ 88 +_¹¨þ 88 +ø·þ 87 +ø 87 +ˆ½ 87 +›þþú 87 +_‹ 87 +µ 87 +›þþ 87 +œþµþ 86 +¥þú_ 86 +_‚þ¹ 85 +‚þ¹ 85 +£· 85 +_„ 85 +¹·þ 85 +_›þþ 84 +¡ø 84 +þ¹µþ 83 +þþµ 83 +¹µþ 83 +¹µ 83 +þ¹µ 83 +þþ_ 83 +_ª 82 +_‚þ¹µ 82 +þ¹µþ_ 82 +›þþ_ 82 +‚þ¹µþ 82 +‚þ¹µ 82 +Ÿþú_ 82 +_þ 82 +¹µþ_ 82 +þþ£_ 81 +øˆ 81 +½._ 80 +¿Šþ 80 +_·þ£ 80 +½. 80 +ú 79 +_žþ 79 +œþµþ_ 79 +² 79 diff --git a/libtextcat/data/new_fingerprints/lm/middle_frisian.lm b/libtextcat/data/new_fingerprints/lm/middle_frisian.lm new file mode 100644 index 000000000000..17e4f149d122 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/middle_frisian.lm @@ -0,0 +1,400 @@ +_ 60524 +e 20138 +n 10664 +t 8674 +i 7833 +a 7549 +o 7278 +r 6963 +n_ 5391 +s 5087 +d 4731 +t_ 4327 +l 4306 +e_ 4271 +en 4068 +k 3483 +m 3346 +g 2990 +er 2965 +en_ 2835 +y 2597 +w 2481 +h 2475 +j 2441 +_d 2375 +r_ 2250 +u 2059 +s_ 1921 +ie 1780 +_h 1725 +z 1710 +de 1666 +_m 1652 +_w 1629 +_i 1611 +et 1550 +te 1540 +b 1538 +. 1517 +er_ 1481 +in 1420 +an 1408 +p 1392 +f 1311 +, 1280 +,_ 1273 +k_ 1266 +._ 1258 +y_ 1238 +_z 1230 +oe 1162 +v 1144 +et_ 1138 +ne 1131 +ee 1125 +st 1120 +_o 1107 +_s 1107 +_e 1076 +_n 1068 +_b 989 +_t 988 +l_ 983 +oo 981 +ge 975 +je 963 +_g 952 +at 949 +me 946 +ij 943 +is 913 +he 899 +' 892 +aa 882 +el 870 +_v 864 +re 843 +ar 837 +_de 805 +on 800 +ke 796 +de_ 788 +ei 782 +_a 776 +_' 770 +le 744 +at_ 742 +it 741 +_k 738 +or 734 +an_ 729 +in_ 723 +da 715 +te_ 703 +_he 680 +_l 664 +H 661 +al 652 +_H 652 +a_ 650 +_da 648 +D 648 +d_ 646 +_D 643 +es 617 +g_ 608 +is_ 606 +æ 596 +_f 594 +'t 591 +_'t 584 +'t_ 583 +_me 580 +c 576 +_'t_ 576 +ri 571 +_en 564 +yn 563 +_en_ 559 +ze 538 +m_ 535 +om 535 +ik 522 +_de_ 522 +ed 521 +be 517 +wi 509 +ch 509 +ol 498 +ar_ 498 +ha 496 +_ne 495 +we 494 +ou 486 +nd 474 +ma 474 +ik_ 470 +J 459 +_ik 456 +_ik_ 451 +_J 451 +ll 450 +M 446 +li 443 +wa 442 +_M 442 +ien 435 +ro 432 +di 421 +nn 418 +ste 415 +wo 415 +yn_ 415 +_r 413 +_ha 411 +it_ 409 +tt 408 +ve 407 +S 404 +_j 399 +_S 398 +_in 395 +as 395 +der 395 +hi 389 +_al 388 +uw 384 +tte 384 +ng 381 +_wi 381 +nne 381 +het 380 +_het 378 +lle 376 +nt 374 +ns 373 +op 373 +je_ 367 +E 366 +ek 363 +B 363 +dat 362 +_B 362 +_ma 362 +_in_ 361 +_dat 359 +_wo 359 +ier 358 +og 357 +_E 354 +_ge 352 +dat_ 349 +ne_ 349 +eer 349 +het_ 348 +_het_ 348 +_is 348 +_dat_ 348 +ey 347 +W 346 +_W 343 +_te 339 +eg 337 +ra 335 +rs 335 +! 332 +zi 332 +gt 329 +_He 327 +He 327 +_be 324 +j_ 320 +ij_ 320 +u_ 319 +f_ 317 +_di 316 +; 315 +_we 315 +_is_ 314 +rt 313 +!_ 313 +;_ 312 +n. 311 +p_ 310 +la 310 +_wa 310 +ea 308 +_u 308 +pe 304 +ta 304 +il 301 +my 300 +ig 300 +n, 300 +n,_ 298 +iet 297 +sc 297 +ter 296 +sch 296 +oor 295 +no 294 +_my 293 +n._ 291 +ti 290 +_zi 289 +st_ 288 +el_ 283 +gen 282 +se 280 +K 279 +_oo 278 +ui 278 +e, 275 +ten 274 +net 274 +oon 274 +ho 274 +jn 273 +e,_ 273 +ijn 272 +ien_ 272 +ko 272 +N 271 +ni 271 +_N 269 +den 269 +za 268 +_net 267 +i_ 266 +wol 266 +een 264 +va 264 +am 264 +do 263 +ol_ 262 +le_ 261 +_te_ 260 +rd 260 +ke_ 259 +ey_ 259 +ers 258 +_K 255 +_ie 255 +ver 254 +to 254 +_hi 253 +nde 253 +: 253 +der_ 252 +jo 251 +net_ 251 +al_ 250 +_wol 250 +_p 250 +_no 250 +aar 248 +_za 245 +_net_ 245 +nk 242 +est 241 +om_ 241 +_va 241 +ak 241 +tj 239 +ae 238 +_op 237 +mo 236 +tr 236 +_st 233 +ier_ 233 +_ve 233 +mm 232 +T 231 +eu 229 +_ze 228 +:_ 228 +mi 228 +ag 227 +zo 227 +_ko 226 +vo 225 +_T 225 +nne_ 225 +F 224 +.. 224 +fo 224 +_F 223 +en, 223 +oe_ 222 +wie 222 +en,_ 221 +kk 221 +_ien 220 +on_ 218 +wol_ 218 +_fo 218 +I 217 +_do 216 +eit 214 +ei_ 214 +mme 214 +G 213 +_I 213 +as_ 213 +or_ 211 +A 211 +_G 210 +_A 210 +lle_ 209 +Da 208 +_Da 207 +_wol_ 207 +ren 206 +_ee 206 +e. 205 +van 205 +jen 205 +een_ 205 +_van 204 +kke 204 +en. 204 +O 204 +_vo 204 +V 203 +_O 203 +_V 203 +ken 203 +_mo 203 +ot 202 +ie_ 201 +ou_ 201 +ur 201 +tte_ 200 +em 200 +_van_ 199 +L 199 +van_ 199 +_zo 199 +op_ 199 +_L 197 +iet_ 196 +of 194 +ten_ 194 +oer 193 +_ien_ 193 +e._ 193 +oed 192 +hie 192 +zy 191 +en._ 191 +De 191 +_ver 191 +_ho 189 +_De 189 +oa 189 +den_ 188 +_zy 188 +lo 187 +dan 187 +nt_ 186 diff --git a/libtextcat/data/new_fingerprints/lm/mingo.lm b/libtextcat/data/new_fingerprints/lm/mingo.lm new file mode 100644 index 000000000000..4d1947a33345 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/mingo.lm @@ -0,0 +1,400 @@ +_ 3156 +' 991 +a 829 +t 788 +k 788 +n 752 +e 640 +h 602 +ë 551 +s 495 +w 466 +ö 422 +y 403 +u 397 +'_ 368 +_n 330 +i 269 +. 269 +._ 240 +e' 238 +ô 219 +a' 209 +ne 209 +e_ 194 +_ne 184 +wa 172 +_k 164 +kw 162 +á 154 +e'_ 146 +_t 145 +'t 141 +" 141 +ë' 134 +ê 132 +_h 131 +é 129 +ak 129 +ne_ 126 +ta 123 +_ne_ 123 +at 121 +u' 121 +hu 118 +_u 108 +ka 107 +à 106 +nö 105 +N 103 +ö' 100 +hs 95 +ha 95 +te 94 +ya 93 +_ë 92 +kh 90 +'k 86 +ú 85 +æ 84 +ni 84 +'. 84 +wë 83 +ny 83 +sh 80 +_N 80 +'._ 76 +ö_ 76 +kë 72 +_" 71 +ën 70 +th 69 +yu 66 +_w 66 +ëh 66 +t_ 66 +ô_ 66 +Ne 65 +Ne' 65 +'s 64 +ne' 64 +"_ 64 +_. 63 +_._ 61 +Ne'_ 61 +, 60 +ne'_ 58 +në 58 +kwa 57 +_ne' 57 +öt 57 +a't 57 +ek 56 +s_ 56 +ët 55 +i' 55 +_hu 54 +T 54 +ë_ 54 +_s 54 +_Ne 53 +të 53 +tö 53 +_Ne' 53 +_ne'_ 53 +ht 53 +- 53 +ts 52 +ya' 52 +ë'_ 51 +_wa 51 +_Ne'_ 51 +'ö 50 +Ãy 50 +_ka 50 +as 50 +nà 49 +un 49 +ê_ 49 +ty 48 +hu_ 48 +ke 48 +u_ 48 +,_ 47 +yô 46 +he 46 +ye 46 +kê 45 +si 45 +nô 44 +khu 44 +a'k 44 +_a 43 +ák 43 +wat 42 +'ë 42 +nö' 42 +wá 42 +aw 41 +an 41 +we 41 +ôt 41 +i_ 41 +_kh 41 +tak 41 +_te 41 +ik 41 +_khu 40 +ës 40 +yö 40 +k_ 39 +khu_ 39 +yu' 38 +hö 38 +wën 38 +n- 38 +ëhs 38 +_ha 38 +wa' 38 +_n- 37 +sa 37 +? 37 +_T 37 +æ' 37 +_un 36 +hô 36 +wé 36 +ah 36 +_khu_ 35 +iy 35 +ëk 35 +ut 35 +ök 35 +öh 35 +te' 35 +u'_ 35 +_kë 34 +yô_ 34 +ha' 34 +st 34 +ti 34 +ta' 34 +u't 33 +ya't 33 +'t_ 33 +'ö_ 33 +akw 33 +ôk 33 +nÃy 32 +'ta 32 +ku 32 +ui 32 +_sh 32 +aa 31 +én 31 +ay 31 +ënö 31 +â 30 +se 30 +tw 30 +yë 30 +(_ 29 +_(_ 29 +_)_ 29 +us 29 +_ëhs 29 +( 29 +kö 29 +_ëh 29 +ae 29 +hë 29 +_) 29 +)_ 29 +_( 29 +sy 29 +) 29 +ëë 28 +ôn 28 +sk 28 +tá 28 +té 28 +tk 28 +ên 28 +kê_ 28 +a_ 28 +án 27 +_huik 27 +ön 27 +_hui 27 +kwé 27 +huik 27 +ék 27 +hui 27 +uik 27 +_na 27 +na 27 +a'_ 26 +uikê 26 +_ni 26 +." 26 +ææ 26 +wi 26 +huikê 26 +ikê 26 +nya 25 +."_ 25 +?" 25 +të' 25 +Ãyu 25 +awë 25 +sn 25 +në_ 24 +ö'_ 24 +ikê_ 24 +ekh 24 +'ke 24 +uikê_ 24 +?"_ 24 +tek 24 +êt 24 +', 24 +"N 23 +_ta 23 +'a 23 +hk 23 +tê 23 +tsi 23 +wö 23 +yu'_ 23 +_"N 23 +Ãyu' 23 +nÃyu 23 +a'ta 23 +é' 23 +nÃyu' 23 +shô 22 +wënà 22 +kwa' 22 +ënà 22 +kwë 22 +_u' 22 +ey 22 +ënÃyu 22 +K 22 +ai 22 +_shô 22 +úw 22 +aö 22 +ëht 22 +ënÃy 22 +kwe 22 +wënÃy 22 +_he 22 +_te' 21 +ún 21 +s. 21 +nöh 21 +ëö 21 +_ëk 21 +'sh 21 +kë' 21 +htö 21 +H 21 +ata 21 +U 20 +'h 20 +_ha' 20 +ææ' 20 +ae_ 20 +a'ke 20 +ény 20 +unö 20 +we' 20 +wë' 20 +i'_ 20 +kës 20 +ya'ta 19 +es 19 +awënà 19 +hô_ 19 +uk 19 +awën 19 +hw 19 +ë'. 19 +ô' 19 +tö' 19 +sé 19 +ë'ë 19 +ë'._ 19 +'ëë 19 +hkw 19 +-a 19 +twa 19 +'kw 18 +he_ 18 +_K 18 +nöt 18 +hsa 18 +hsi 18 +ôni 18 +aya 18 +is 18 +ëë' 18 +kæ 18 +skw 18 +uw 18 +',_ 18 +e't 18 +kææ 18 +ka' 18 +nae 17 +shô_ 17 +sat 17 +niy 17 +ëts 17 +ath 17 +ye' 17 +Ã_ 17 +'ëë' 17 +'u 17 +_ët 17 +ës_ 17 +_kës 17 +_y 17 +kak 17 +ö'ö 17 +kat 17 +_th 17 +'ë_ 17 +út 17 +u'k 17 +_H 17 +tekh 17 +s._ 17 +wê 17 +_nae 17 +kwá 17 +_shô_ 17 +yê 17 +öte 16 +sha 16 +ke_ 16 +k. 16 +si' 16 +hö' 16 +yá 16 +hta 16 +kn 16 +_nae_ 16 +hse 16 +ké 16 +ë'ë_ 16 +unë 16 +nya' 16 +nae_ 16 +nê 16 +á' 16 +ás 16 +tô 16 +a's 16 +a'u 16 diff --git a/libtextcat/data/new_fingerprints/lm/mongolian_cyrillic.lm b/libtextcat/data/new_fingerprints/lm/mongolian_cyrillic.lm new file mode 100644 index 000000000000..7ac690c8ac2e --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/mongolian_cyrillic.lm @@ -0,0 +1,363 @@ +_ 77671 +о 60289 +г 57097 +Ó© 51540 +л 41421 +а 40332 +Ñ€ 35484 +Ñ… 31764 +н 31695 +д 30802 +Ñ 29381 +и 28336 +Ò¯ 26898 +й 25901 +у 21671 +Ñ‚ 21639 +Ñ 21118 +ч 19759 +оо 12185 +м 12159 +Ó©Ó© 11964 +ц 11468 +гү 10443 +үй 10090 +чи 9604 +н_ 9584 +гүй 9529 +в 9133 +_Ñ… 8958 +й_ 8712 +ор 8462 +з 8134 +г_ 8106 +ий 8078 +Ó©Ñ€ 7638 +б 7524 +ол 7372 +ÑÑ 6997 +_Ñ‚ 6942 +уу 6908 +их 6787 +йг 6584 +ог 6380 +го 6337 +чих 6301 +өл 6221 +_о 6219 +д_ 6092 +_Ó© 6085 +Ò¯Ò¯ 5978 +Ñ€_ 5841 +гө 5674 +ш 5656 +өг 5647 +_б 5547 +аа 5471 +он 5259 +_з 5115 +Ð³Ñ 5047 +нг 5025 +Ñ…Ó© 4840 +хо 4806 +ж 4794 +ар 4704 +л_ 4687 +до 4578 +ал 4489 +йн 4345 +е 4326 +өн 4232 +Ñ_ 4191 +дө 4122 +лг 4089 +то 4085 +ан 4062 +чх 4039 +а_ 3970 +о_ 3935 +үйг 3903 +ул 3888 +гүйг 3887 +га 3756 +_Ñ 3746 +_ц 3600 +ÑÑ_ 3567 +гÑÑ 3545 +аг 3529 +гч 3515 +Ñо 3502 +Ñ‹ 3455 +цг 3441 +Ó©_ 3396 +лд 3389 +ха 3367 +ла 3290 +оо_ 3290 +ло 3281 +ги 3278 +Ñ_ 3252 +Ñ…_ 3249 +ийн 3227 +ÑÑ€ 3212 +ро 3204 +_д 3176 +_м 3157 +Ó©Ó©_ 3138 +рч 3048 +Ð¹Ð³Ñ 3033 +йн_ 3031 +йгÑÑ 3027 +ÑÓ© 3006 +үл 2995 +Ò¯Ð¹Ð³Ñ 2931 +үйгÑÑ 2931 +Ð³Ò¯Ð¹Ð³Ñ 2929 +та 2922 +үй_ 2876 +йг_ 2861 +_г 2851 +гүй_ 2831 +Ñ‚Ñ 2816 +ай 2795 +гоо 2759 +да 2724 +уул 2723 +рд 2687 +Ð¾Ñ 2650 +аа_ 2642 +гөө 2602 +рг 2598 +Ñг 2567 +Ñй 2566 +гий 2547 +Ñл 2546 +_ш 2543 +дг 2513 +Ñ€Ó© 2478 +ли 2469 +лө 2462 +_а 2461 +ох 2455 +Ñй_ 2448 +к 2440 +ийг 2425 +Ñ‚Ó© 2423 +ур 2418 +үүл 2406 +оро 2401 +_хо 2378 +Ó©Ñ… 2370 +лч 2367 +_Ñ…Ó© 2359 +од 2253 +Ó©Ñ 2252 +ра 2246 +_то 2227 +_н 2215 +Ð»Ñ 2177 +Ñ‚_ 2162 +лц 2141 +Ó©Ñ€Ó© 2123 +онг 2044 +Ó©Ó©Ñ€ 2037 +ийн_ 2022 +_ор 2015 +Ñа 2014 +зо 1998 +хг 1989 +_Ó©Ñ€ 1989 +ин 1981 +нх 1972 +ов 1970 +Ñ 1960 +гд 1959 +дог 1959 +хгү 1951 +хгүй 1948 +цо 1931 +ма 1925 +үр 1920 +лт 1919 +өнг 1896 +нд 1895 +дөг 1882 +ил 1878 +оол 1876 +оор 1871 +уд 1867 +ийг_ 1864 +ба 1859 +на 1852 +_у 1850 +мо 1818 +зө 1816 +өөл 1813 +хи 1810 +өд 1804 +мө 1794 +хд 1790 +өв 1772 +_Ò¯ 1767 +Ñн 1758 +в_ 1757 +Ð´Ñ 1757 +Ñон 1745 +_зо 1734 +_Ñ‚Ó© 1728 +цө 1726 +Ñ‚Ñй 1722 +но 1719 +ам 1716 +ÑÑ€_ 1707 +Ð¾Ð¾Ñ 1697 +ц_ 1696 +ав 1681 +дч 1680 +дчи 1668 +_зө 1665 +Ó©Ó©Ñ 1645 +нгү 1634 +Ñөн 1631 +дчих 1631 +Ñ‚Ñй_ 1623 +Ñ…Ó©Ó© 1610 +йл 1600 +ÑÑÑ€ 1583 +рл 1572 +гу 1569 +Ñ…Ñ 1565 +рчи 1555 +ÑÑ 1543 +ши 1514 +жи 1503 +ÑŒ 1492 +гÑÑ_ 1481 +_ха 1478 +гийн 1477 +ихд 1476 +цго 1474 +_цо 1470 +оог 1470 +цгоо 1466 +өөг 1465 +чихд 1464 +хоо 1463 +ри 1457 +ан_ 1447 +ай_ 1440 +ой 1438 +ни 1428 +ÑÑÑ€_ 1427 +лий 1417 +нгүй 1410 +үд 1402 +цгө 1402 +цгөө 1400 +_цө 1396 +ын 1383 +_ба 1378 +гг 1372 +оч 1372 +ггү 1368 +ггүй 1368 +өгч 1353 +ом 1350 +иг 1335 +огч 1320 +нө 1313 +ууд 1312 +Ñ…Ñ 1300 +ÑÑ 1298 +йд 1296 +ар_ 1289 +Ñ€Ñ 1288 +Ð°Ñ 1287 +ад 1282 +ч_ 1280 +ж_ 1278 +аг_ 1271 +өч 1269 +_Ñ 1268 +Ñн 1268 +ху 1267 +роо 1266 +ыг 1260 +ыг_ 1253 +лгү 1251 +гт 1249 +ÑÑ_ 1246 +_ол 1238 +чд 1230 +Ñ€Ó©Ó© 1223 +бу 1220 +йнх 1203 +ÑÑÑ 1203 +ын_ 1201 +бо 1196 +ид 1194 +ийнх 1188 +гчи 1184 +ороо 1181 +Ñ€Ñ 1179 +ат 1174 +оÑо 1170 +Ð¸Ñ…Ñ 1168 +дги 1166 +дгий 1166 +Ñ‡Ð¸Ñ…Ñ 1158 +ах 1155 +ÑÑÑ_ 1154 +ту 1153 +уг 1152 +лгүй 1150 +_бу 1139 +Ñ‚Ò¯ 1129 +_өл 1129 +өрч 1124 +цоо 1121 +лз 1119 +нго 1119 +гц 1110 +Ó©Ñ€Ó©Ó© 1100 +йгÑÑ_ 1098 +Ð½Ñ 1096 +ор_ 1094 +онго 1090 +за 1089 +оло 1087 +от 1085 +лчи 1079 +ал_ 1066 +өлг 1066 +гÑÑÑ€ 1063 +олг 1062 +Ñ…Ò¯ 1060 +цөө 1059 +тл 1054 +_ху 1054 +_мө 1052 +чхө 1050 +он_ 1047 +үүд 1046 +гÑÑÑ€_ 1046 +лу 1040 +аар 1036 +п 1026 +рх 1025 +рчих 1024 +Ð»Ñ 1021 +йгÑÑÑ€ 1020 +гчд 1019 +Ñв 1016 +вл 1014 +лчх 1014 +орд 1013 +орч 1013 +ихг 1012 +ихгү 1012 +ихгүй 1012 +йт 1010 +нгө 1008 +йд_ 1005 +өнгө 1004 diff --git a/libtextcat/data/new_fingerprints/lm/nepali.lm b/libtextcat/data/new_fingerprints/lm/nepali.lm new file mode 100644 index 000000000000..5d3507646f93 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/nepali.lm @@ -0,0 +1,400 @@ +_ 7044 +f 2698 +] 1196 +g 933 +s 815 +l 781 +/ 698 +; 662 +k 645 +d 635 +]_ 630 +f_ 611 +sf 549 +f] 541 +n 526 +t 460 +o 412 +j 399 +f]_ 374 +L 365 +_; 364 +x 356 +_k 353 +{ 337 +b 323 +u 321 +' 303 +df 299 +sf] 290 +_l 283 +sf]_ 265 +g] 245 +G 232 +e 231 +| 227 +a 222 +/_ 222 +L_ 211 +_g 210 +c 207 +df_ 207 +{_ 207 +_c 201 +g_ 201 +kf 184 +f/ 177 +_u 171 +/f 168 +P 163 +_/ 162 +k| 161 +_s 161 +} 159 ++ 158 +h 151 +fn 150 +n] 145 +of 141 +tf 140 +: 136 +p 136 +t_ 134 +. 133 +_k| 133 +._ 133 +_e 131 +_d 131 +_f 129 +_._ 127 +_. 127 +n]_ 123 +;f 119 +lj 118 +O 118 +? 118 +q 118 +nf 118 +y 116 +_a 113 +m 111 +Ps 110 +r 110 +fg 106 +w 105 +sf_ 103 +lg 102 +g]_ 100 +gf 94 +_g] 91 +Psf 90 +x? 89 +fd 88 +fO 88 +_lj 85 +]s 85 +z 85 +s_ 84 +_p 82 +_sf 80 +D 79 +kl 78 +_;f 78 +f/_ 77 +cf 77 +}_ 76 +Psf] 76 +jf 76 +, 75 +fl 75 +\ 75 +_x 74 +kfn 74 +f{ 74 +_/f 74 +]k 73 +of] 73 +,_ 72 +v 72 +lt 72 +_cf 72 +_b 71 +O{ 70 +i 69 +xf 68 +]kf 68 +_/_ 68 +Psf]_ 68 +_h 67 +g]kfn 67 +g]k 67 +d_ 67 +g]kf 67 +O{_ 67 +]kfn 67 +_g]k 66 +g' 66 +_g]kf 66 +o_ 64 +" 64 +;b 63 +Gq 63 +f+ 63 +n_ 63 +fk 62 +sf/ 62 +/L 62 +_kl 62 +I 61 +T 61 +/s 61 +If 61 +dG 60 +_j 60 +]sf 60 +;_ 59 +u_ 59 +yf 59 +dGq 58 +fO{ 58 +af 57 +eP 57 +fs 57 +fO{_ 57 +fj 56 +f; 56 +qL 56 +l/ 55 +\_ 55 +;D 54 +dGqL 54 +GqL 54 +;+ 53 +g\ 53 +/sf 52 +nfO 51 +_kf 51 +ug 51 +pg 51 +:t 51 +fp 50 +_ug 50 +u/ 50 +lg_ 49 +ePs 49 +fdf 48 +cl 48 +nfO{_ 48 +nfO{ 48 +;/ 48 +_cl 47 +F 47 +'/ 47 +g\_ 47 +_r 47 +fo 47 +_eP 47 +]{ 46 +Z 46 +/L_ 46 +_f_ 45 +ePsf 45 +gd 45 +_;D 45 +if 45 +Gb 44 +lb 44 +Q 44 +_n 44 +fsf 44 +]sf] 43 ++; 43 +nL 43 +lx 43 +_t 43 +_: 42 ++;b 42 +_o 42 +fdf_ 42 +bf 42 +hf 42 +_lg 42 +ePsf] 41 +j_ 41 +ln 41 +ef 41 +/] 41 +_df 41 +To 40 +klg_ 40 +_klg_ 40 +]sf]_ 40 +klg 40 +_klg 40 +_u/ 40 +f{_ 40 +_ePs 40 +'g 40 +Gg 39 +:y 39 +_g\ 39 +f} 39 +kf_ 39 +:tf 38 +]{_ 38 +fpg 38 +b' 38 +hg 38 +g]{ 38 +jZ 38 +_;+ 37 +gs 37 +wf 37 +o{ 37 +GqL_ 37 +l_ 37 +qL_ 37 +dGqL_ 37 +gf_ 37 +]; 37 +_ePsf 36 +x' 36 +?_ 36 +! 36 +fx 35 +u| 34 +;d 34 +_lb 34 +fn_ 34 +_P 34 +fg_ 34 += 34 +bn 34 +Zj 34 +o; 34 +g]{_ 34 +_g\_ 34 +kIf 33 +dl 33 +kI 33 +ljZj 33 +jZj 33 +rf 33 ++u 33 +]d 33 +ljZ 33 +nL_ 32 +f+; 32 +fnL 32 +gL 32 +;/sf 32 +tf_ 32 +wfg 32 +_;/ 32 +Qm 32 +fb 32 +f;_ 32 +ld 32 +;/sf/ 32 +/sf/ 32 +_x' 32 +;/s 32 +;f+ 32 +_;f+ 32 +fu 32 +x?_ 32 +;f+; 31 +_k|:t 31 +s' 31 +_kI 31 +_k|: 31 +of]_ 31 +_kIf 31 +_;f+; 31 +]l 31 +k|: 31 +bg 31 +_;/s 31 +|: 31 +f+;b 31 +_;/sf 31 +|:t 31 +|] 31 +k|:t 31 +m_ 31 +;f+;b 31 +Jo 30 +k|wf 30 +|w 30 +|wf 30 +k|w 30 +km 30 +J 30 +k|wfg 30 +|wfg 30 +]t 30 +@ 30 +;f] 30 +b_ 29 +> 29 +kfn_ 29 +fn] 29 +gsf 29 +sf+ 29 +Zjf 29 +]kfn_ 29 +jif 29 +ji 29 +jf; 29 +_k|w 29 +fgd 29 +_k|wf 29 +Zjf; 29 +ljZjf 28 +jZjf; 28 +]n 28 +_z 28 +_s' 28 +_dG 28 +|]; 28 +_;d 28 +jZjf 28 +u|] 27 +gdGq 27 +gdG 27 +?n 27 +x'g 27 +|wfgd 27 +wfgd 27 +gdGqL 27 ++u|] 27 +f+u 27 +fgdG 27 +u|]; 27 ++u|]; 27 +wfgdG 27 +dfg 27 +a} 27 +_dGq 27 +f+u| 27 +fgdGq 27 +sf+u| 27 +_sf+u 27 +f+u|] 27 ++u| 27 +sf+u 27 +_sf+ 27 +x?n 27 +S 26 +_To 26 +]kfnL 26 +_! 26 +_T 26 +ul 26 +/f] 26 +fsf] 26 +af_ 26 +;s 26 +kfnL 26 +|:tf 26 +k|:tf 26 diff --git a/libtextcat/data/new_fingerprints/lm/norwegian.lm b/libtextcat/data/new_fingerprints/lm/norwegian.lm new file mode 100644 index 000000000000..f2c3cec3aae3 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/norwegian.lm @@ -0,0 +1,400 @@ +_ 22970 +e 6833 +n 4206 +r 3516 +t 3112 +a 2587 +s 2440 +i 2112 +l 1901 +o 1900 +n_ 1875 +r_ 1761 +k 1713 +g 1630 +en 1615 +m 1508 +e_ 1450 +d 1444 +er 1436 +h 1306 +t_ 1300 +_h 1180 +_s 1148 +er_ 1105 +v 982 +en_ 976 +an 919 +. 901 +._ 791 +_. 781 +et 770 +g_ 762 +_._ 726 +Ã¥ 725 +u 719 +f 709 +p 702 +ha 682 +_ha 672 +de 657 +te 651 +_e 621 +et_ 614 +re 581 +ne 565 +_o 554 +an_ 544 +ke 534 +_, 522 +,_ 522 +, 522 +_,_ 522 +_f 519 +_m 515 +or 503 +_d 483 +_i 480 +Ã¥_ 479 +se 476 +m_ 469 +nn 454 +b 449 +me 441 +ø 434 +_a 413 +st 404 +_t 398 +og 380 +_v 377 +_og 366 +ar 364 +el 364 +le 361 +i_ 356 +om 353 +og_ 351 +_og_ 351 +li 350 +_k 346 +_de 339 +ge 339 +han 337 +y 333 +_han 332 +ve 330 +kk 323 +in 311 +_b 307 +fo 301 +j 301 +il 298 +_H 291 +H 291 +han_ 288 +_han_ 288 +for 287 +ik 281 +l_ 278 +kke 277 +tt 276 +ti 270 +ne_ 270 +d_ 269 +ed 269 +om_ 268 +nne 266 +_me 264 +ng 257 +_er 257 +_fo 256 +eg 256 +_se 256 +_g 256 +un 255 +ig 255 +sk 253 +_er_ 252 +_p 252 +_for 250 +ke_ 249 +_n 238 +_l 233 +al 232 +ør 222 +s_ 221 +ar_ 215 +at 214 +_en 211 +he 211 +pe 209 +_i_ 208 +am 200 +es 200 +si 200 +enn 197 +det 195 +or_ 193 +vi 190 +ns 189 +ikk 188 +det_ 185 +so 185 +un_ 183 +il_ 181 +nd 181 +te_ 181 +"_ 180 +" 180 +_"_ 180 +_" 180 +em 179 +_ti 176 +kke_ 176 +lig 174 +ten 174 +Ha 173 +_Ha 173 +re_ 172 +ikke 168 +je 165 +Han 165 +ter 165 +_Han 165 +eg_ 164 +pÃ¥ 164 +_pÃ¥ 163 +_si 163 +_Ã¥ 163 +_Han_ 162 +Han_ 162 +pÃ¥_ 162 +_pÃ¥_ 161 +til 160 +som 160 +_so 159 +den 159 +_det 157 +ed_ 155 +ll 155 +_ik 155 +rt 155 +som_ 153 +ra 152 +a_ 152 +har 152 +nt 152 +de_ 152 +tr 151 +v_ 151 +_har 151 +ka 151 +ig_ 150 +_som 150 +for_ 150 +_som_ 150 +_en_ 149 +hu 149 +_ikk 148 +_ham 148 +ham 148 +ste 148 +_det_ 148 +_ikke 148 +enne 148 +ikke_ 148 +har_ 147 +nge 147 +D 147 +_har_ 147 +_D 147 +am_ 147 +ere 147 +ham_ 146 +_ham_ 146 +it 145 +_he 144 +_til 144 +av 143 +va 140 +men 140 +Ã¥r 140 +_ve 140 +_hu 139 +ta 139 +pen 137 +sp 137 +_st 135 +tte 135 +la 135 +_E 133 +E 133 +den_ 130 +is 130 +til_ 128 +_r 128 +tt_ 128 +Ã¥r_ 127 +k_ 124 +_Ã¥_ 124 +ri 124 +_til_ 124 +at_ 123 +ene 123 +seg 123 +_av 123 +med 122 +_vi 122 +_seg 122 +seg_ 121 +_seg_ 121 +_for_ 120 +nne_ 120 +ut 120 +_u 119 +mm 119 +mme 119 +De 118 +_De 118 +_at 118 +_hun 117 +hun 117 +ko 117 +be 116 +_at_ 115 +ter_ 115 +pen_ 114 +ker 113 +hun_ 113 +_hun_ 113 +on 111 +lig_ 111 +.. 110 +hen 107 +_med 107 +rs 106 +ser 106 +med_ 105 +_men 104 +_hen 104 +_sk 104 +_med_ 104 +ak 103 +ans 103 +ker_ 102 +av_ 101 +_ka 101 +no 100 +ver 100 +ler 99 +J 99 +spe 99 +ten_ 99 +_J 99 +ene_ 98 +ld 98 +hv 98 +_av_ 98 +ger 97 +ni 96 +gen 96 +ie 95 +ser_ 94 +_et 94 +spen 94 +_hv 94 +men_ 93 +Espe 92 +Es 92 +_Esp 92 +_Es 92 +_Espe 92 +Esp 92 +_al 92 +Espen 92 +lle 89 +rem 89 +id 89 +fø 89 +ei 88 +inn 88 +rd 88 +enne_ 88 +_henn 87 +henne 87 +henn 87 +kt 86 +spen_ 86 +_om 86 +ler_ 86 +da 86 +ett 86 +itt 86 +bl 85 +to 85 +_Je 84 +ger_ 84 +Je 84 +æ 84 +ma 83 +ing 83 +ær 83 +ns_ 83 +eli 82 +ang 82 +_be 82 +sÃ¥ 82 +_den 82 +pp 81 +rk 81 +dr 81 +oe 81 +ss 81 +_fø 80 +ek 80 +le_ 79 +_no 79 +kj 78 +elig 78 +nes 78 +nn_ 77 +nk 77 +fr 77 +sl 77 +my 77 +kan 77 +sÃ¥_ 76 +as 76 +_om_ 76 +_kan 75 +_ko 75 +_bl 73 +Hu 73 +nen 73 +_Hu 73 +eng 73 +gj 73 +rt_ 72 +ge_ 72 +ba 72 +lv 71 +rer 71 +nde 71 +ls 70 +lo 70 +ga 70 +_noe 70 +ro 70 +_den_ 70 +_ut 70 +noe 70 +Hun 69 +Hun_ 69 +_in 69 +_Hun 69 +_Hun_ 69 +ren 68 +øre 68 +ør_ 68 +sen 68 +sa 67 diff --git a/libtextcat/data/new_fingerprints/lm/persian.lm b/libtextcat/data/new_fingerprints/lm/persian.lm new file mode 100644 index 000000000000..858f468ae54a --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/persian.lm @@ -0,0 +1,400 @@ +_ 12318 +? 5938 +¤ 2613 +?_ 1815 +¢ 1745 +þ 1569 +ø 1546 +õ 1422 +- 1333 +÷ 1155 +ù 991 +¨ 841 +_ù 778 +ö 761 +î 693 +ü 664 +_ö 663 +ª 660 +¤? 631 +õ_ 624 +_? 601 +?¤ 598 +Â¥ 589 +_¤ 565 +?? 564 +ý 513 +¢_ 486 +_ü 473 +_ý 469 +þ? 459 +û 428 +ø_ 427 +_¢ 398 +î_ 392 +ó 390 +ê 384 +ö? 379 +÷? 361 +_ø 337 +_- 334 +> 327 +ú 324 +_ö? 319 +ù? 313 +_ø_ 308 +ð 305 +¤¢ 303 +¤_ 297 +?õ 290 +¨? 285 +¢÷ 272 +-_ 271 +S 266 +_ù? 265 +ô 259 +-¨ 255 +¡ 255 +??_ 255 +ý? 254 +Â¥? 250 +Y 246 +¤¢_ 245 +ì 243 +_ý? 238 +?¢ 234 +_Â¥ 229 +ò 229 +_S 228 +¤ø 216 +ø¤ 215 +?þ 215 +?¨ 214 +_¤¢ 214 +ñ 208 +þ?_ 207 +¢¤ 203 +?- 202 +_ô 199 +ù?_ 199 +öþ 195 +¨_ 190 +¨?_ 190 +_?? 189 +õ? 188 +. 188 +_¤¢_ 185 +_ñ 183 +?ø 181 +_öþ 180 +¢? 177 +* 176 +÷_ 176 +_ù?_ 174 +?û 173 +¤þ 172 +ª_ 170 +þõ 164 +ä 161 +¯ 158 +-¨? 156 +û_ 155 +¤?_ 155 +_Â¥? 152 +üõ 151 +ý¤ 150 +_¤? 146 +Â¥?_ 144 +þ¤ 143 +¢ø 141 +?¤_ 140 +*_ 138 +ü÷ 137 +?÷ 136 +¬ 133 +ùî 132 +_ý¤ 131 +-ª 131 +ø? 130 +?ú 129 +_. 129 +þó 129 +¡_ 128 +ù¢ 124 +¤õ 121 +¤- 120 +ùî_ 120 +_ùî 120 +_??_ 119 +ª? 119 +-¨?_ 119 +ã 118 +-? 118 +>_ 116 +öþ? 114 +ê_ 111 +¢þ 109 +_Â¥?_ 109 +_ü÷ 109 +_ùî_ 108 +_öþ? 107 +ö?¤ 107 +© 106 +_¢÷ 106 +.¢ 104 +?Â¥ 103 +_?¤ 103 +¤ê 103 +ó? 103 +þ¢ 103 +ñ? 102 +ªî 100 +?î 100 +?¤? 98 +ð_ 98 +ý?û 96 +¤ú 95 +öþ?_ 95 +_© 94 +§ 94 +Y_ 94 +_ö?¤ 93 +_ý?û 93 +¥ø 92 +øª 91 +_ñ? 91 +_öþ?_ 91 +?¨_ 90 +÷?_ 90 +óþ 90 +õ?_ 90 +ü- 90 +øõ 89 +: 89 +ªî_ 88 +ü÷? 88 +öø 88 +_ù¢ 87 +-ê 86 +þ÷ 85 +_öø 83 +, 82 +þõ_ 82 +þ¨ 82 +?¢_ 80 +® 80 +?¤þ 79 +_ü÷? 79 +¢ª 77 +??? 77 +øþ 76 +_-? 75 +֔ 75 +?ª 75 +?ì 75 +¤î 74 +_¤ø 73 +ä_ 73 +þóþ 73 +û? 73 +óþõ 72 +þóþõ 71 +ù¤ 71 +?> 71 +ê? 70 +_?¤_ 70 +_.¢ 70 +ø¡ 69 +à 69 +_§ 68 +¥ø¤ 68 +þóþõ_ 67 +_ü- 67 +¤øª 67 +óþõ_ 67 +õû 67 +¤ê_ 67 +¨õ 67 +¤õ_ 66 +?-_ 65 +ù÷ 65 +ý?¤ 65 +öøþ 64 +¤þ? 64 +¬_ 64 +?ð 63 +?-¨ 63 +ø- 63 +_ý?¤ 62 +ø?_ 62 +ø¤_ 62 +_-¨ 61 +õþ 61 +øþóþõ 60 +øþóþ 60 +øþó 60 +?ù 60 +øªî 60 +_¢ø 60 +¤øªî 60 +_öøþ 60 +-ª? 60 +ì_ 60 +¨¤ 60 +ý¤? 60 +÷þ 59 +öøþóþ 59 +¯? 59 +_¢? 59 +öøþó 59 +¢ª_ 59 +¤?? 58 +¤?Â¥ 58 +_üõ 58 +.- 58 +?¡ 58 +ø÷ 57 +à 57 +¢¤î 57 +?¤?_ 57 +þ?¤ 57 +ù- 57 +üõ_ 56 +ñ?¨ 56 +øõ_ 56 +?¤þ? 56 +ú? 56 +î¤ 56 +¤?¢ 56 +õû_ 55 +ô? 55 +ª?¢ 55 +_öøþó 55 +-ª?¢ 55 +þ_ 55 +ö?¤þ 55 +¤î_ 55 +S¤ 55 +üþ 54 +_S¤ 53 +øªî_ 53 +_ù¤ 53 +ò¢ 53 +¤ò 53 +¤øªî_ 53 +ø¡_ 52 +¢÷? 52 +û¤ 52 +üþ? 52 +õ> 52 +-õ 52 +¢¤? 52 +üð 52 +ì? 52 +_à 52 +î? 52 +üó 52 +??¤ 52 +ð÷ 51 +¤ò¢ 51 +-þ 51 +_ý¤? 51 +¡? 50 +_¢¤ 50 +¢¤î_ 50 +.-¨ 50 +ö?õ 50 +ë 50 +_ô? 50 +ø¢ 50 +_ñ?¨ 49 +.¢÷ 49 +÷¤ 49 +ý?¤? 49 +ú- 48 +_üð 48 +_ö?¤þ 48 +¤þ?_ 48 +ò¢_ 48 +ûù 48 +ú?_ 48 +¤ò¢_ 47 +ê¤ 47 +â 47 +.-¨? 47 +_¢þ 47 +>÷ 47 +úª 47 +÷?õ 47 +£ 47 +?¤þ?_ 47 +?ûù 47 +¤¯ 46 +_.- 46 +_ý?¤? 46 +õ¤ 46 +õó 46 +-¤ 46 +_üþ 46 +¥þ 46 +¤ð 45 +ôþ 45 +_üó 45 +ö?- 45 +§þ 45 +_ë 45 +f 45 +õ- 44 +_ù- 44 +úõ 44 +_üþ? 44 +ü-ª 44 +-î 44 +ð¢ 44 +ö?¤þ? 44 +õª 44 +.-¨?_ 44 +¨õ_ 44 +¤ª 44 +_-¨? 44 +¤?î 43 +ä? 43 +é 43 +?õ_ 43 +_¤øªî 43 +þª 43 +?ê 43 +?î_ 43 +_¤øª 43 +¯_ 43 +¤¨ 43 +þ?? 42 +Y? 42 +Â¥_ 42 +÷÷ 42 +÷õ 42 +_¥ø 42 +þ¨? 42 +¢õ 42 +/ 42 +_â 42 +???_ 42 +_¥ø¤ 42 +_ù÷ 42 +s 42 +¢÷÷ 42 +ö?-¨ 41 +÷ø 41 +ù÷? 41 +?* 41 +S¤ò¢ 41 +_ö?õ 41 +ñþ 41 +_.-¨ 41 +îþ 41 +÷¢ 41 +ùõ 41 +S¤ò 41 +ªø 41 +ý?¤?_ 40 +¤úª 40 diff --git a/libtextcat/data/new_fingerprints/lm/polish.lm b/libtextcat/data/new_fingerprints/lm/polish.lm new file mode 100644 index 000000000000..eac3b27eca6a --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/polish.lm @@ -0,0 +1,400 @@ +_ 31480 +a 7945 +i 7766 +e 7462 +o 6838 +z 5104 +n 5077 +r 4178 +w 4139 +s 3596 +c 3580 +y 3569 +t 3381 +d 3027 +k 2819 +p 2639 +m 2494 +ie 2484 +u 2016 +l 1947 +j 1932 +ni 1930 +e_ 1746 +_p 1712 +³ 1649 +a_ 1496 +o_ 1431 +, 1368 +,_ 1366 +b 1296 +_w 1257 +g 1249 +i_ 1140 +. 1118 +_s 1045 +ze 1041 +._ 1035 +_n 997 +nie 958 +cz 956 +rz 952 +h 950 +_z 933 +ê 918 +ow 911 +ie_ 902 +y_ 899 +na 885 +ch 871 +po 864 +pr 863 +wi 851 +st 847 +¿ 835 +± 832 +an 814 +ó 800 +ra 778 +zy 766 +ia 726 +za 718 +_t 699 +wa 692 +ro 692 +_d 683 +_pr 675 +¶ 661 +w_ 658 +sz 652 +_po 649 +_o 628 +m_ 613 +li 613 +dz 611 +ki 611 +en 609 +mi 593 +ta 571 +ci 564 +ej 562 +nie_ 558 +_m 553 +_k 549 +ar 543 +go 541 +_i 540 +em 532 +od 525 +yc 520 +a³ 511 +rze 508 +do 504 +eg 503 +ko 502 +ac 482 +to 478 +_na 467 +_ni 467 +h_ 466 +ch_ 466 +æ 465 +iê 461 +_b 458 +on 458 +u_ 458 +zi 454 +ka 450 +er 448 +sk 447 +si 447 +wy 444 +te 437 +ak 434 +ê_ 431 +_j 429 +je 429 +z_ 427 +ny 422 +aw 422 +ne 420 +ów 418 +_w_ 415 +_c 414 +ego 412 +prz 412 +_r 410 +al 407 +³a 405 +" 405 +re 405 +es 401 +_nie 396 +dzi 394 +ty 389 +j_ 380 +ic 380 +_prz 379 +ad 367 +ej_ 364 +le 359 +æ_ 358 +ed 354 +ych 346 +_za 346 +_do 344 +zn 344 +go_ 344 +ani 343 +_i_ 342 +no 339 +or 337 +³o 336 +tr 334 +P 330 +os 329 +am 329 +da 328 +ec 327 +ol 325 +±_ 325 +by 322 +ego_ 321 +at 321 +¿e 319 +ym 317 +wie 314 +³_ 312 +- 310 +na_ 309 +_si 303 +W 301 +as 301 +wo 300 +pa 299 +siê 295 +nia 293 +owa 292 +o¶ 286 +_P 283 +el 282 +_siê 279 +ja 278 +rzy 276 +prze 276 +_wy 275 +iê_ 269 +in 267 +_- 267 +de 267 +kie 267 +dn 264 +ob 262 +_u 261 +ych_ 260 +-_ 258 +ez 257 +_-_ 256 +¶c 255 +ws 255 +¶ci 254 +em_ 253 +_siê_ 253 +siê_ 253 +_nie_ 253 +kt 252 +ski 252 +we 251 +_g 251 +_W 250 +t_ 249 +_prze 249 +_je 248 +aj 247 +_a 247 +¿e_ 246 +_¿ 244 +ia_ 243 +eni 241 +om 240 +la 240 +k_ 235 +mo 235 +f 234 +pi 232 +is 231 +cze 231 +_z_ 230 +ñ 228 +nt 227 +ce 224 +sta 221 +ry 220 +ma 219 +cj 219 +zie 218 +ek 216 +oc 213 +dy 212 +owi 208 +sp 208 +K 208 +tó 205 +_" 204 +ud 203 +S 202 +ier 202 +pra 202 +czn 201 +ys 200 +nia_ 199 +j± 198 +_¿e 196 +oz 194 +N 192 +zo 191 +dzie 190 +ku 190 +ów_ 189 +czy 189 +_mi 188 +_¿e_ 187 +mie 186 +do_ 186 +iej 185 +ym_ 185 +_do_ 184 +cy 184 +_pa 183 +y³ 182 +_na_ 181 +zy_ 181 +ew 180 +_l 180 +_cz 178 +_by 178 +ru 177 +to_ 175 +±c 175 +_wi 175 +ln 174 +_K 171 +ok 170 +ot 170 +raw 169 +nych 168 +nyc 168 +az 168 +ik 167 +bi 167 +i,_ 166 +i, 166 +_te 165 +tu 163 +wan 163 +et 162 +J 162 +ór 161 +cie 160 +_od 160 +ba 160 +oj 159 +o¶c 158 +i³ 158 +_ro 158 +og 158 +mu 158 +A 158 +o¶ci 158 +d_ 157 +tw 156 +io 155 +gr 155 +_J 155 +_N 155 +_to 155 +us 155 +¿_ 154 +aæ 154 +ach 154 +yd 153 +_kt 153 +_ko 152 +_st 152 +pow 151 +yt 150 +kr 149 +_S 149 +a³_ 149 +ak_ 149 +li_ 148 +T 148 +ur 148 +sa 148 +op 147 +yw 147 +ent 147 +bo 146 +_sp 145 +O 145 +a,_ 144 +a, 144 +iem 144 +któ 143 +praw 143 +wn 142 +tór 142 +i± 141 +³y 141 +zu 141 +dni 140 +im 139 +icz 139 +ró 139 +³e 138 +któr 137 +my 137 +est 136 +awi 135 +przy 134 +nych_ 134 +e,_ 132 +e, 132 +ki_ 132 +pro 131 +_któ 131 +_któr 131 +_przy 130 +_ja 129 +szy 127 +ia³ 127 +Po 127 +ania 127 +M 126 +ze_ 125 +ne_ 125 +ñs 124 +_ty 124 +sze 124 +a¿ 124 +nn 124 +pod 124 +aæ_ 122 +B 122 +za_ 122 +cza 122 +³o_ 122 +Z 122 +_to_ 121 +iu 120 +zc 119 +esz 119 +skie 118 +i. 118 +_ka 117 +so 117 +trz 117 +o¿ 116 +ieg 116 +nik 116 +ga 116 diff --git a/libtextcat/data/new_fingerprints/lm/portuguese.lm b/libtextcat/data/new_fingerprints/lm/portuguese.lm new file mode 100644 index 000000000000..4e1a5d7698e4 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/portuguese.lm @@ -0,0 +1,400 @@ +_ 35328 +a 10423 +e 10132 +o 8919 +s 6795 +r 6033 +i 5443 +n 4588 +d 4531 +t 4217 +m 3476 +u 3404 +o_ 3240 +a_ 3029 +e_ 2879 +c 2756 +s_ 2461 +_d 2379 +l 2307 +p 2242 +_a 1753 +de 1751 +, 1660 +,_ 1658 +_e 1454 +es 1447 +os 1412 +ra 1343 +_p 1328 +nt 1302 +_de 1248 +do 1215 +en 1176 +re 1150 +as 1123 +v 1115 +m_ 1113 +de_ 1096 +er 1082 +g 1053 +_c 1047 +da 1008 +co 986 +os_ 975 +te 974 +ar 950 +or 943 +q 938 +qu 938 +_s 908 +ta 902 +_de_ 901 +_o 858 +se 841 +ue 831 +to 799 +ad 777 +. 761 +que 752 +em 751 +an 748 +f 746 +r_ 745 +b 732 +st 718 +is 716 +al 712 +_qu 706 +_q 706 +in 701 +as_ 696 +ã 695 +do_ 685 +ent 678 +ão 677 +_n 671 +_co 660 +_a_ 654 +_m 646 +on 645 +ç 624 +ri 623 +_que 619 +ma 602 +po 581 +ia 580 +ão_ 575 +._ 573 +na 572 +me 564 +ro 554 +_t 544 +pa 533 +da_ 528 +h 523 +ue_ 515 +ca 511 +que_ 509 +nte 503 +no 499 +tr 498 +am 496 +em_ 491 +_que_ 487 +_se 485 +om 471 +io 460 +_do 459 +ti 448 +ci 445 +_da 444 +nd 442 +ei 435 +ra_ 435 +pr 427 +_r 423 +_e_ 420 +_f 420 +ss 412 +es_ 412 +el 407 +id 406 +_o_ 399 +_pa 390 +um 379 +pe 378 +_po 376 +la 374 +ir 371 +á 371 +ic 362 +di 362 +li 359 +é 359 +_re 353 +ve 353 +mo 350 +s, 349 +s,_ 349 +ou 347 +com 340 +sa 338 +si 338 +men 337 +rt 331 +_i 330 +con 330 +o, 327 +_da_ 326 +o,_ 326 +se_ 325 +_com 325 +ado 323 +to_ 322 +ai 322 +it 320 +A 319 +ec 316 +dos 316 +_em 312 +ção 310 +aç 310 +çã 310 +ara 305 +so 299 +tu 299 +res 297 +im 296 +_pr 295 +mi 293 +ua 292 +nto 291 +ment 290 +à 290 +par 288 +_do_ 287 +ce 286 +est 286 +u_ 284 +ente 284 +S 278 +l_ 278 +_u 278 +" 276 +ni 276 +z 274 +sta 273 +nc 272 +_em_ 270 +P 269 +ção_ 267 +_v 267 +at 267 +dos_ 266 +_es 262 +« 259 +_« 259 +te_ 258 +» 257 +va 255 +le 252 +ur 252 +_um 252 +vi 251 +_par 250 +a, 247 +a,_ 247 +_con 247 +ant 242 +lo 240 +ia_ 240 +gu 237 +ar_ 235 +ac 235 +e,_ 234 +e, 234 +no_ 232 +eg 232 +il 232 +ns 232 +er_ 231 +_ma 230 +por 230 +_in 228 +_l 226 +ó 225 +ont 224 +_no 223 +_P 222 +tra 220 +E 219 +ida 218 +is_ 217 +ol 216 +açã 215 +ter 215 +ação 215 +_A 211 +un 211 +- 210 +_te 210 +or_ 209 +ma_ 208 +_pe 208 +ara_ 208 +C 206 +ist 202 +para 202 +nta 201 +ais 201 +ut 198 +nte_ 198 +j 197 +dad 196 +_na 195 +am_ 195 +ade 193 +ica 191 +x 190 +al_ 189 +O 188 +des 187 +_para 187 +ada 187 +nh 186 +_se_ 186 +mp 185 +ndo 184 +R 183 +_por 181 +ação_ 181 +para_ 179 +eir 177 +ui 177 +vo 177 +ou_ 177 +ta_ 177 +M 176 +ria 175 +tos 175 +rr 174 +D 174 +io_ 174 +br 174 +_di 173 +õ 173 +õe 173 +fo 173 +I 172 +ões 172 +_C 171 +mo_ 171 +ov 170 +pro 169 +_os_ 169 +_os 169 +das 167 +iv 166 +uma 165 +gr 165 +su 164 +fi 164 +um_ 162 +na_ 162 +ga 162 +ais_ 161 +_S 161 +lh 159 +ort 159 +cia 158 +.. 157 +_est 156 +cont 156 +ig 155 +á_ 154 +ran 154 +ça 154 +om_ 153 +_en 152 +dade 152 +_as 152 +ho 152 +ntr 151 +nto_ 151 +fe 150 +N 149 +das_ 149 +uma_ 149 +ess 149 +é_ 148 +ndo_ 147 +ob 147 +»_ 147 +ul 146 +ente_ 146 +go 146 +ento 144 +ver 144 +_des 144 +gi 144 +ha 142 +cu 142 +idad 142 +av 141 +ões_ 141 +_pro 141 +ura 141 +ap 139 +_com_ 139 +_ca 139 +com_ 139 +ao 139 +ne 138 +od 138 +_" 137 +_M 137 +pre 137 +ras 136 +_me 136 +_ao 136 +_no_ 134 +oc 134 +str 133 +tes 133 +_b 133 +and 133 +_g 133 +ro_ 133 +omo 133 +_dos 132 +_fo 132 +_dos_ 132 +rn 132 +mento 131 +ito 131 +ev 131 +rio 130 +ass 130 +eu 130 +be 128 +os, 128 +os,_ 128 +sp 127 +_uma 127 +ep 126 +tad 125 +s. 125 +_uma_ 125 +_E 125 +idade 124 +_um_ 124 +nã 124 +não 124 +ct 123 +ram 123 +ado_ 123 +ela 123 +omo_ 121 +iz 121 +_an 121 diff --git a/libtextcat/data/new_fingerprints/lm/quechua.lm b/libtextcat/data/new_fingerprints/lm/quechua.lm new file mode 100644 index 000000000000..e59992d781b3 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/quechua.lm @@ -0,0 +1,400 @@ +_ 5766 +a 4900 +n 1941 +i 1666 +u 1384 +s 1032 +t 995 +y 939 +h 929 +k 915 +q 909 +p 882 +a_ 847 +an 821 +r 783 +m 740 +c 705 +l 695 +ta 637 +ch 613 +ay 587 +qa 557 +pa 490 +ha 486 +e 474 +ma 457 +o 441 +na 434 +ku 411 +j 409 +un 367 +w 358 +in 353 +, 345 +,_ 344 +cha 318 +ar 317 +n_ 315 +as 291 +wa 289 +ta_ 269 +ll 259 +man 255 +_k 248 +._ 243 +. 243 +nt 227 +am 224 +pi 222 +la 222 +ka 217 +ac 214 +ni 214 +at 213 +aq 213 +i_ 208 +ri 207 +qa_ 204 +una 204 +y_ 192 +aj 192 +_p 192 +is 188 +_m 181 +lla 175 +ach 174 +rq 173 +us 172 +an_ 171 +_ka 171 +ata 169 +rqa 165 +sq 163 +hu 162 +sp 161 +_w 157 +nk 157 +hay 157 +_s 156 +sqa 155 +ki 153 +kun 152 +_c 152 +al 150 +nta 149 +ap 147 +ant 146 +yk 146 +ay_ 144 +spa 141 +hi 137 +_ch 136 +_n 136 +ya 135 +' 134 +j_ 133 +uy 132 +ra 132 +a,_ 132 +a, 132 +ti 130 +_a 125 +nc 125 +kuna 122 +s_ 121 +su 121 +ak 121 +_ma 118 +ana 118 +ari 115 +_t 114 +ama 114 +chi 114 +ñ 113 +a._ 111 +a. 111 +nch 111 +iy 111 +all 110 +aw 110 +_r 110 +anta 109 +ayk 109 +na_ 109 +chay 108 +sa 104 +_wa 104 +si 103 +chu 102 +pa_ 101 +acha 101 +_cha 101 +pi_ 101 +qan 100 +_pa 99 +_q 97 +aj_ 97 +awa 97 +ank 95 +nku 95 +im 94 +q_ 92 +uk 92 +C 92 +mu 90 +tu 89 +J 89 +_ni 88 +taj 87 +_J 87 +nin 86 +_chay 86 +u_ 86 +_C 84 +wan 83 +nta_ 81 +_j 81 +mant 80 +ut 79 +in_ 79 +ik 79 +manta 79 +asq 79 +yt 78 +n, 78 +asqa 78 +n,_ 78 +pay 78 +li 77 +yn 77 +nq 76 +yta 76 +ic 76 +up 76 +_Ch 75 +yku 75 +Ch 75 +he 75 +hay_ 74 +nan 74 +ina 74 +ur 73 +er 72 +S 72 +arq 72 +or 72 +_l 71 +_u 71 +aq_ 70 +os 70 +yp 70 +anc 69 +man_ 69 +mi 69 +ich 68 +_i 68 +st 67 +_S 67 +arqa 66 +it 66 +anch 66 +ru 66 +aku 65 +pu 65 +ña 65 +alla 64 +mp 64 +sqa_ 64 +'a 64 +ayku 63 +es 63 +A 63 +ia 63 +_man 63 +_Cha 61 +Cha 61 +taj_ 60 +api 60 +_ll 60 +wi 60 +ayp 60 +aman 59 +g 58 +anku 58 +yki 57 +ima 57 +yni 57 +oj 57 +mana 57 +anta_ 57 +_su 57 +uc 56 +isp 56 +ispa 56 +uch 56 +M 56 +ir 56 +_h 55 +nqa 55 +kuy 55 +ayt 54 +_M 54 +b 54 +_y 53 +_mana 53 +: 53 +uku 53 +:_ 53 +nm 53 +au 52 +ayta 52 +io 52 +qo 51 +an,_ 51 +apa 51 +spa_ 51 +erq 51 +_wi 51 +erqa 51 +_sa 51 +an, 51 +el 50 +um 50 +ana_ 50 +han 50 +il 50 +on 49 +chay_ 49 +sta 49 +_D 49 +D 49 +iku 49 +aqa 49 +che 48 +en 48 +yta_ 48 +Ma 47 +P 47 +_lla 47 +_Je 47 +yq 47 +Je 47 +ita 47 +rqan 47 +ypi 46 +har 46 +Jes 46 +_Jes 46 +ios 46 +ayq 46 +Dio 46 +un_ 46 +kus 46 +_Dio 46 +taq 46 +_Dios 46 +_Ma 46 +_Di 46 +Dios 46 +Di 46 +d 46 +kan 45 +Chay 45 +oq 45 +_Chay 45 +_pay 45 +upa 45 +mun 45 +ata_ 44 +_tu 44 +nis 44 +re 44 +paq 44 +yo 44 +ej 44 +qay 43 +ncha 43 +ha_ 43 +_A 43 +I 43 +_kan 43 +_nis 43 +_P 43 +nman 43 +nma 43 +ataj 42 +ara 42 +ku_ 42 +nata 42 +nat 42 +i, 41 +tin 41 +qh 41 +t' 41 +orq 41 +nki 41 +_ru 41 +_ku 41 +i,_ 41 +ip 40 +ham 40 +usq 40 +_ya 40 +qank 39 +orqa 39 +ayn 39 +mana_ 39 +ray 39 +ym 39 +uma 39 +_pu 39 +par 39 +kay 39 +n. 38 +qa,_ 38 +n._ 38 +sus 38 +aypi 38 +usqa 38 +qanku 38 +ill 38 +qa, 38 +was 38 +pa, 38 +pa,_ 38 +asp 38 +qa._ 37 +_mu 37 +paj 37 +amp 37 +hin 37 +uti 37 +rin 37 +_im 37 +_ima 37 +ja 37 +_ri 37 +rqa_ 37 +taq_ 37 +qa. 37 +sh 36 +spa,_ 36 +cha_ 36 +spa, 36 +achi 36 +una_ 36 +rqank 36 +jt 36 +K 36 +amu 36 +aspa 35 +_Jesu 35 +Jesus 35 +nispa 35 +ki_ 35 +waw 35 +ko 35 +ne 35 +esus 35 +int 35 diff --git a/libtextcat/data/new_fingerprints/lm/romanian.lm b/libtextcat/data/new_fingerprints/lm/romanian.lm new file mode 100644 index 000000000000..65b8e7554d8e --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/romanian.lm @@ -0,0 +1,400 @@ +_ 20674 +a 6376 +e 5815 +i 5746 +t 3396 +r 3280 +n 3103 +u 2835 +s 2611 +c 2582 +e_ 2235 +l 2224 +o 2149 +a_ 1974 +d 1629 +m 1528 +p 1410 +i_ 1358 +in 1308 +_c 1167 +_s 1118 +_d 999 +re 905 +ar 898 +, 791 +,_ 786 +_p 785 +de 771 +_a 754 +te 687 +_i 667 +at 654 +ti 645 +ca 639 +n_ 630 +ta 617 +si 614 +_de 609 +f 606 +st 583 +ri 581 +u_ 567 +nt 553 +. 542 +ra 540 +_m 534 +g 528 +v 516 +ul 516 +de_ 513 +_in 503 +b 492 +_de_ 474 +._ 472 +le 459 +l_ 444 +un 443 +_si 440 +es 437 +tr 426 +ea 420 +t_ 412 +ce 412 +ma 407 +cu 402 +er 398 +_ca 397 +si_ 388 +_f 387 +_l 383 +z 382 +la 381 +ne 370 +sa 364 +as 360 +_e 357 +in_ 356 +an 352 +it 351 +te_ 346 +or 345 +el 345 +ci 339 +_si_ 333 +_n 330 +are 324 +pe 319 +re_ 317 +al 310 +_t 309 +se 304 +ic 295 +ie 290 +_u 290 +ul_ 290 +ni 289 +int 285 +_o 280 +en 279 +ta_ 279 +ur 261 +pa 256 +co 255 +_pe 254 +ia 252 +mi 251 +pr 249 +_ma 249 +oa 249 +me 246 +lu 246 +li 241 +im 238 +_in_ 237 +da 237 +na 237 +_sa 235 +ac 234 +- 234 +ii 232 +est 231 +r_ 231 +h 230 +_cu 230 +le_ 229 +ai 229 +ca_ 227 +il 226 +ru 223 +sc 223 +_v 221 +nu 220 +tu 220 +_un 220 +nd 220 +di 219 +are_ 216 +to 215 +am 214 +on 213 +o_ 208 +is 208 +sa_ 203 +la_ 200 +_b 200 +ste 195 +et 194 +ec 191 +_r 186 +car 185 +ui 180 +un_ 179 +lo 178 +cu_ 177 +ei 176 +e, 175 +e,_ 175 +pe_ 171 +m_ 167 +_la 166 +a, 165 +a,_ 164 +_ce 164 +rt 163 +_co 163 +ent 162 +ro 162 +ele 162 +_pe_ 160 +po 160 +ea_ 159 +" 158 +ntr 158 +_cu_ 158 +_pr 157 +ut 157 +nc 156 +ata 155 +care 154 +um 153 +au 151 +va 151 +_o_ 150 +_car 150 +ii_ 145 +ind 145 +_un_ 144 +os 144 +ad 141 +_la_ 140 +I 140 +este 138 +ste_ 138 +care_ 138 +ir 137 +ga 136 +ap 136 +ol 136 +ra_ 136 +_di 134 +D 134 +_care 133 +se_ 133 +om 133 +ara 133 +ati 133 +fi 133 +_sa_ 131 +zi 130 +vi 130 +_ca_ 129 +_se 128 +_nu 128 +ai_ 127 +ch 127 +pi 124 +ve 123 +fa 122 +ot 121 +_a_ 120 +este_ 120 +du 119 +ine 119 +s_ 118 +fo 118 +_ci 118 +ui_ 118 +ba 118 +i, 117 +i,_ 116 +ne_ 115 +us 115 +_g 115 +a. 115 +fe 114 +A 114 +pu 114 +ce_ 113 +ar_ 113 +_pa 113 +oc 112 +sta 112 +lui 112 +ns 112 +em 112 +' 112 +oar 112 +din 111 +iu 111 +_int 111 +ate 111 +mu 111 +hi 110 +ele_ 110 +mp 109 +_D 109 +S 109 +sti 108 +bi 108 +ata_ 107 +ti_ 107 +tra 107 +C 107 +c_ 106 +tre 106 +_al 105 +rea 105 +mai 105 +j 104 +a._ 104 +gi 104 +e. 103 +d_ 103 +_fa 103 +E 102 +mo 102 +at_ 101 +_e_ 101 +nte 101 +lt 101 +sp 101 +za 100 +mai_ 100 +su 99 +na_ 98 +tat 97 +sin 97 +ez 96 +tru 96 +e._ 96 +ie_ 96 +ia_ 96 +_re 96 +tul 96 +_fo 96 +ina 95 +art 95 +_C 95 +no 95 +nu_ 94 +_es 94 +_po 94 +cr 94 +inc 93 +_da 92 +_mai 92 +lui_ 92 +_din 92 +_est 92 +pre 91 +_mai_ 91 +io 91 +chi 91 +ge 90 +pri 90 +eu 90 +uri 90 +az 90 +_nu_ 89 +_me 89 +ct 89 +au_ 88 +esc 88 +ev 88 +ei_ 88 +min 87 +ace 87 +op 86 +ng 86 +ici 86 +_lu 85 +ari 85 +_mi 84 +ita 84 +_S 84 +_tr 84 +ere 83 +or_ 83 +ast 83 +ist 83 +nt_ 83 +_se_ 82 +ou 82 +tin 82 +intr 82 +con 82 +do 81 +_fi 81 +str 81 +am_ 80 +rat 80 +ru_ 80 +ri_ 80 +par 80 +oi 80 +uc 79 +ze 79 +pl 79 +res 78 +_ac 77 +ulu 77 +din_ 76 +va_ 76 +ada 76 +ului 75 +_con 75 +id 75 +inte 74 +ile 73 +cit 73 +_din_ 73 +lor 73 +_" 72 +ig 72 +rin 72 +da_ 72 +_st 72 +-_ 71 +_- 71 +it_ 71 +ani 71 +nd_ 71 +ci_ 70 +ag 70 +eri 70 +i. 70 +tru_ 70 +_ne 70 +rm 70 +P 69 +_este 69 +nta 69 +bu 69 +une 69 +ma_ 69 +nti 69 +imp 68 +_-_ 68 +iv 68 +ind_ 68 diff --git a/libtextcat/data/new_fingerprints/lm/romansh.lm b/libtextcat/data/new_fingerprints/lm/romansh.lm new file mode 100644 index 000000000000..e65969ca34fa --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/romansh.lm @@ -0,0 +1,400 @@ +_ 10888 +a 3490 +e 2268 +i 2196 +s 2169 +n 1961 +t 1555 +r 1510 +l 1281 +u 1249 +a_ 1155 +c 1060 +d 954 +o 815 +s_ 744 +g 726 +m 686 +h 620 +p 617 +n_ 559 +v 493 +_s 480 +er 476 +ch 469 +_d 457 +in 455 +da 418 +_c 404 +r_ 388 +_e 377 +. 358 +_p 340 +as 333 +l_ 328 +, 327 +._ 320 +_i 320 +,_ 315 +la 313 +en 312 +sc 310 +an 301 +ta 300 +_da 285 +f 282 +_t 268 +_a 262 +nt 259 +_m 252 +un 248 +ra 247 +i_ 247 +na 245 +ma 245 +ia 241 +ar 234 +sch 228 +b 228 +da_ 217 +es 214 +ai 213 +st 212 +' 209 +e_ 208 +as_ 208 +er_ 200 +t_ 199 +re 199 +_l 195 +al 193 +_n 193 +el 192 +tg 192 +te 188 +z 187 +ha 180 +_f 180 +sa 174 +_da_ 172 +ve 169 +ei 168 +_v 165 +at 165 +ss 161 +is 161 +_ch 161 +on 160 +la_ 158 +cu 158 +ad 158 +he 156 +in_ 147 +va 147 +_in 146 +gl 145 +ns 141 +ur 140 +ü 140 +u_ 138 +ts 138 +pe 136 +li 134 +gi 133 +et 133 +de 132 +ig 132 +or 130 +ti 129 +il 127 +d_ 126 +che 123 +ut 122 +us 122 +cha 121 +di 120 +ia_ 120 +_b 118 +_la 117 +na_ 116 +ain 115 +per 115 +to 115 +_cu 113 +_sc 112 +se 111 +ls 109 +- 108 +iu 108 +ca 107 +si 104 +ir 102 +rt 102 +ie 102 +_g 102 +un_ 102 +nd 101 +av 101 +ni 100 +q 99 +au 97 +ls_ 97 +ll 96 +qu 96 +_pe 96 +le 96 +rs 95 +pa 95 +ri 95 +_ma 94 +_per 93 +am 93 +eg 92 +_r 92 +gn 92 +me 92 +pi 91 +an_ 91 +en_ 91 +ga 91 +ent 90 +hi 90 +_e_ 90 +it 89 +ta_ 88 +ter 87 +ns_ 86 +iv 86 +igl 86 +a. 85 +em 85 +I 85 +chi 84 +_en 84 +int 84 +ue 83 +su 82 +tt 82 +a, 82 +a,_ 82 +o_ 82 +_ve 82 +a._ 82 +_q 81 +_qu 80 +ge 80 +" 80 +_la_ 80 +ar_ 80 +vi 79 +gl_ 79 +tu 78 +ng 78 +ro 76 +mi 76 +sta 75 +ed 75 +lla 74 +ei_ 73 +_o 73 +ic 73 +el_ 73 +_il 73 +_in_ 72 +g_ 72 +pr 71 +nu 70 +ina 70 +_h 69 +scha 68 +mai 68 +pl 68 +il_ 68 +os 68 +ha_ 68 +be 67 +uo 67 +cun 67 +ra_ 67 +_pa 65 +ts_ 64 +s. 64 +co 64 +_u 64 +fi 64 +_I 63 +sa_ 63 +s._ 63 +_re 62 +ün 62 +_nu 62 +? 62 +S 62 +_se 61 +no 61 +nt_ 61 +E 61 +tr 61 +mp 60 +igl_ 60 +_su 60 +_st 60 +ess 60 +im 60 +zi 59 +?_ 59 +nc 59 +_E 58 +_- 58 +_cun 58 +_te 58 +_sa 58 +ant 58 +main 58 +eu 57 +ssa 57 +-_ 57 +iun 57 +_-_ 57 +aint 56 +op 56 +al_ 56 +dal 56 +j 56 +ama 55 +_tg 55 +ua 55 +M 55 +per_ 55 +tsc 54 +nta 54 +tsch 54 +um 54 +fa 54 +za 54 +_di 54 +pia 54 +_per_ 54 +T 54 +_M 53 +ne 53 +era 53 +A 53 +que 53 +_dal 53 +cr 53 +_de 53 +lla_ 53 +_che 52 +h' 52 +_me 51 +ot 51 +_pr 51 +_pl 51 +_sch 51 +ch' 50 +_S 50 +opi 50 +ou 50 +tta 50 +mo 50 +ada 50 +ba 50 +_an 49 +top 49 +id 49 +è 49 +he_ 49 +C 49 +va_ 49 +uto 49 +ins 48 +topi 48 +L 48 +ir_ 48 +ist 48 +c_ 48 +_il_ 48 +P 48 +ss_ 48 +ag 47 +_no 47 +res 47 +las 47 +_vi 46 +s, 46 +schi 46 +_en_ 46 +tg_ 46 +s,_ 46 +_si 46 +_que 45 +_T 45 +az 45 +'i 45 +cun_ 44 +_fa 44 +_mi 44 +utop 44 +utopi 44 +sche 44 +_C 44 +ur_ 44 +tge 44 +po 44 +es_ 44 +x 44 +nz 44 +_L 44 +_cun_ 44 +man 44 +_ch' 43 +_fi 43 +pu 43 +ell 43 +opia 43 +_igl 43 +_ig 43 +sp 43 +topia 43 +ava 42 +egn 42 +che_ 42 +on_ 42 +ci 42 +_P 42 +ev 42 +ond 41 +_" 41 +à 41 +us_ 41 +_ha 41 +D 40 +_co 40 +etg 40 +'e 40 +las_ 40 +est 40 +ura 40 +uel 39 +ed_ 39 +vo 39 +gia 39 +bu 39 +mu 39 +nn 39 +gli 39 +_A 39 +lu 39 +cha_ 39 +ul 38 +mal 38 +_ün 38 +ina_ 38 +_bu 38 +_ca 38 +_ü 38 +uc 38 +nts 38 +tra 38 +_tu 37 diff --git a/libtextcat/data/new_fingerprints/lm/russian.lm b/libtextcat/data/new_fingerprints/lm/russian.lm new file mode 100644 index 000000000000..bddb68514275 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/russian.lm @@ -0,0 +1,400 @@ +_ 76249 +о 19732 +е 16714 +а 14389 +и 13942 +Ñ‚ 13160 +н 12444 +Ñ 9867 +Ñ€ 8461 +в 7895 +л 7330 +к 6498 +м 5935 +. 5725 +у 5287 +д 5019 +п 4877 +Ñ 4083 +, 3899 +,_ 3878 +Ñ‹ 3656 +ÑŒ 3376 +и_ 3167 +_п 3144 +е_ 3135 +о_ 3098 +- 3019 +з 2983 +_в 2952 +._ 2930 +_Ñ 2919 +ч 2887 +г 2876 +б 2797 +ÑÑ‚ 2672 +_н 2631 +то 2585 +.. 2407 +_и 2316 +но 2300 +-_ 2294 +й 2281 +а_ 2249 +на 2057 +Ñ_ 2029 +ов 1981 +ни 1950 +_Ñ‚ 1944 +Ñ… 1874 +ен 1856 +_о 1774 +... 1744 +ра 1709 +не 1685 +по 1636 +_- 1625 +ко 1616 +те 1595 +ро 1584 +_к 1558 +_-_ 1531 +й_ 1521 +ет 1518 +ж 1509 +_и_ 1454 +та 1433 +ан 1419 +ер 1396 +от 1389 +ÑŒ_ 1381 +го 1375 +ал 1370 +_по 1364 +ре 1345 +ка 1338 +пр 1337 +ва 1329 +ти 1306 +ли 1300 +_д 1297 +_м 1290 +ÐµÑ 1284 +во 1271 +че 1256 +ор 1245 +.... 1232 +Ñ‚ÑŒ 1219 +Ð¾Ñ 1212 +ш 1192 +ÑŽ 1187 +в_ 1168 +..... 1167 +он 1147 +ак 1144 +ц 1109 +ог 1101 +ло 1099 +Ñ‚_ 1093 +ри 1076 +м_ 1074 +_пр 1063 +ол 1062 +ль 1045 +_не 1034 +ел 1029 +_б 1026 +ин 1000 +од 998 +ом 996 +ме 993 +Ñ‹_ 975 +ве 968 +Ñк 968 +_на 966 +де 946 +ны 943 +_Ñ€ 931 +_ч 923 +ат 913 +ем 910 +у_ 907 +за 898 +Ñ…_ 898 +ле 889 +то_ 881 +ит 878 +ой 875 +Ñе 862 +_в_ 833 +_з 816 +ки 816 +Ñ‚Ñ€ 807 +" 786 +Ñ‚ÑŒ_ 769 +ед 767 +не_ 762 +ÑÑ 754 +ла 748 +об 747 +мо 741 +да 730 +го_ 715 +к_ 713 +Ð¸Ñ 711 +_у 702 +ой_ 697 +ил 694 +ма 692 +нн 687 +до 662 +Ð°Ñ 660 +ам 656 +Ð¸Ñ 649 +же 646 +аз 638 +Ñо 629 +на_ 619 +_г 615 +ми 612 +_а 610 +Ñ 609 +Ð²Ñ 608 +.._ 605 +вы 604 +ого 604 +им 604 +_ко 600 +ав 597 +Ñл 594 +ие 593 +_не_ 589 +_е 587 +_те 583 +ту 583 +ич 583 +ру 575 +оÑÑ‚ 571 +щ 571 +л_ 570 +_Ñ 569 +_Ñ 559 +Ð 556 +ени 544 +из 540 +ек 536 +ова 533 +Ñ„ 525 +: 519 +Ð’ 513 +ани 511 +_Ð²Ñ 510 +ий 510 +Ð 508 +_ка 508 +! 503 +? 501 +ди 498 +ли_ 489 +П 488 +про 486 +_ра 485 +Ñи 484 +ир 484 +_ÑÑ‚ 484 +ьн 484 +льн 484 +:_ 484 +ÑÑ_ 480 +_за 477 +бо 470 +_л 469 +..._ 465 +бы 464 +их 464 +И 462 +ег 461 +тв 459 +Ð½Ð¸Ñ 458 +ÑÑ‚ 454 +чт 454 +ÑÑ‚ 445 +Ñ‡ÐµÑ 442 +_то 442 +иÑ_ 441 +ик 440 +ви 437 +ак_ 436 +Ñта 436 +ого_ 435 +_Ð’ 434 +иче 433 +ци 431 +что 431 +Ñ‹Ñ… 429 +_ÑÑ‚ 429 +С 425 +_чт 424 +_Ð 424 +ÑŽ_ 423 +пе 422 +Ð½Ñ 422 +_что 422 +Ð»Ñ 419 +вÑе 418 +ду 418 +еÑк 415 +нт 413 +как 411 +Ñто 411 +_вы 409 +ну 408 +Ñ‚Ñ 406 +н_ 406 +_П 401 +Ðœ 401 +но_ 399 +_про 398 +_. 397 +ров 396 +це 396 +кт 394 +еÑÑ‚ 394 +_" 393 +ше 393 +Ñ_ 392 +_от 392 +О 391 +_на_ 391 +ред 391 +чеÑк 390 +о- 390 +ван 388 +а, 385 +ад 384 +Ð°Ñ 384 +_Ñ_ 383 +а,_ 383 +Ñ‚Ñ‹ 383 +?_ 383 +_об 380 +_вÑе 380 +_та 378 +_как 376 +хо 375 +так 375 +аль 374 +ож 373 +Ñ‹Ñ…_ 372 +ово 372 +ив 371 +_во 369 +му 369 +_Ð 369 +ей 368 +пре 368 +зн 366 +Ð¸Ñ‡ÐµÑ 365 +пи 365 +его 362 +_Ñо 360 +ое 360 +!_ 360 +ать 360 +Ñти 358 +их_ 358 +тн 358 +мен 358 +Ñ‚ÑÑ 356 +ие_ 356 +ичеÑк 354 +бе 352 +_бы 352 +ÑÑ‚Ñ€ 349 +ку 349 +_мо 348 +ет_ 348 +Т 346 +_ме 344 +ев 344 +при 343 +чи 342 +мн 341 +ниÑ_ 339 +ар 338 +нно 337 +ован 334 +ÐºÑ 334 +ур 328 +_че 328 +оль 328 +ут 327 +что_ 324 +ом_ 323 +оп 323 +рм 322 +Ð_ 322 +_что_ 322 +Ñто 322 +оч 321 +о,_ 320 +о, 320 +Ñко 319 +кон 319 +лов 318 +ый 318 +ÑƒÑ 317 +от_ 316 +иро 315 +Ñтв 314 +и, 314 +ий_ 313 +ÑÑ 313 +и,_ 313 +том 312 +ае 312 +енн 311 +ез 311 +ной 311 +_Ñто 310 +К 309 +Ñа 309 +раз 309 +еп 309 +_до 308 +оло 308 +ÑÑ‚ÑŒ 308 +уд 307 +дел 307 +_И 305 +аб 305 +ÑÑŒ 303 +альн 302 +ок 300 +Ñки 300 +ных 300 +_при 298 +ча 297 +_ж 297 +е, 297 +е,_ 297 +_Ð_ 296 +_ни 296 +_._ 295 +Ñ‚ÑÑ_ 295 +ии 294 +зна 293 +Ñ, 292 +Ñ,_ 292 +_пре 291 +_С 290 +ной_ 290 +аÑ_ 288 +_О 288 +али 286 +же_ 285 +Ñп 285 +иÑÑ‚ 285 +ных_ 285 +з_ 283 +_Ðœ 283 diff --git a/libtextcat/data/new_fingerprints/lm/sanskrit.lm b/libtextcat/data/new_fingerprints/lm/sanskrit.lm new file mode 100644 index 000000000000..e21b8712111d --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/sanskrit.lm @@ -0,0 +1,400 @@ +a 15017 +_ 14975 +h 5028 +i 3820 +t 2976 +s 2788 +r 2599 +| 2437 +n 2432 +aa 2276 +ha 2007 +m 1982 +a_ 1802 +v 1799 +d 1768 +u 1629 +y 1599 +_| 1470 +|_ 1470 +e 1403 +k 1371 +sh 1330 +ra 1243 +p 1213 +va 1152 +A 1134 +ya 1120 +ma 1097 +na 1091 +ar 1080 +ta 1054 +M 1050 +. 1047 +am 1037 +an 1006 +|| 967 +||_ 965 +_|| 965 +_||_ 963 +at 962 +M_ 946 +as 910 +_s 882 +o 848 +b 844 +i_ 827 +aM 764 +aM_ 685 +c 630 +ch 629 +sa 623 +N 616 +ad 612 +H 601 +pa 595 +H_ 584 +g 580 +_n 579 +l 554 +bh 552 +hi 547 +ka 542 +it 536 +ii 535 +ama 519 +_|_ 505 +e_ 477 +_p 475 +dh 475 +av 469 +ak 445 +aH 444 +da 440 +aH_ 439 +ay 437 +j 437 +_na 432 +ana 430 +hh 428 +ti 426 +ara 425 +aa_ 410 +_k 394 +shh 389 +_v 388 +_sa 381 +.h 379 +ah 369 +_b 368 +h_ 363 +.h_ 363 +cha 362 +haa 361 +_t 358 +ri 352 +sha 345 +ap 333 +vi 330 +is 329 +^ 321 +o_ 321 +_m 320 +ai 311 +_d 311 +la 302 +na_ 301 +.n 298 +ava 295 +al 294 +_sh 291 +ja 288 +a. 280 +aan 277 +ish 274 +aN 273 +aya 273 +ash 266 +ha_ 265 +ga 264 +st 255 +ni 255 +ii_ 254 +hu 253 +Na 253 +R 248 +R^ 248 +^i 247 +R^i 247 +a.n 243 +th 241 +_c 240 +_ch 239 +maa 238 +bha 237 +vaa 233 +ab 228 +ir 226 +\ 226 +ita 223 +uu 222 +dha 220 +har 218 +_a 216 +_bh 216 +nam 212 +u_ 212 +he 212 +m.h_ 211 +m. 211 +m.h 211 +ur 210 +es 209 +ata 208 +te 206 +yaa 205 +_ma 204 +esh 202 +aka 200 +id 199 +pr 199 +aha 198 +hy 198 +T 197 +aat 197 +_OM_ 196 +OM 196 +_O 196 +_OM 196 +OM_ 196 +O 196 +ti_ 195 +ari 194 +raa 193 +ag 192 +_y 192 +aas 190 +_ta 190 +_j 189 +I 189 +_na_ 187 +am.h_ 185 +am.h 185 +am. 185 +_pa 183 +iv 182 +de 182 +ada 178 +nd 178 +_cha 177 +_h 176 +ati 175 +taa 173 +ev 172 +nt 171 +rii 171 +ishh 170 +ya_ 168 +_vi 166 +ast 165 +tr 164 +abh 164 +kh 162 +ala 160 +tha 160 +apa 160 +asa 158 +naa 158 +_nam 156 +ru 156 +A_ 155 +_ka 154 +aar 153 +_pr 152 +_g 151 +pra 150 +ham 150 +hha 149 +aana 149 +di 149 +ra_ 147 +ik 146 +.a 144 +yat 143 +ks 143 +hA 143 +hya 143 +ksh 143 +ut 142 +sy 141 +nama 140 +_va 140 +.\ 140 +paa 140 +han 139 +eva 138 +U 138 +mi 138 +_r 136 +_ja 136 +asy 135 +hr 135 +sya 134 +cha_ 132 +rv 132 +tv 130 +asya 130 +kar 130 +ho 129 +yo 129 +in 128 +adh 127 +yA 127 +va_ 126 +su 125 +_ya 125 +shha 124 +pu 124 +R^it 123 +sta 123 +mu 123 +^it 123 +ty 123 +_nama 121 +ac 120 +rii_ 120 +ach 120 +aNa 119 +tas 119 +shi 119 +iva 119 +hav 119 +tra 118 +var 118 +par 118 +haM 117 +aad 117 +kaa 117 +hch 117 +mas 117 +ai_ 117 +hc 117 +sar 116 +aam 116 +_bha 115 +_pra 114 +et 114 +haM_ 113 +aay 113 +aj 113 +ye 113 +o. 113 +An 111 +arii 111 +t.h 110 +t. 110 +ath 110 +t.h_ 109 +man 109 +te_ 108 +o.a 108 +hara 108 +rA 108 +rva 108 +tva 108 +asta 108 +up 108 +shr 107 +daa 104 +me 104 +dr 104 +ram 104 +arii_ 102 +_ni 102 +arv 102 +iH 102 +hit 101 +ras 101 +aga 101 +Am 101 +mA 101 +ba 101 +amas 100 +tu 100 +yaM 100 +ant 99 +ud 99 +uk 98 +iH_ 98 +yaM_ 98 +kha 98 +au 98 +ira 97 +shhT 97 +rah 97 +hT 97 +hhT 97 +D 96 +_sar 96 +re 96 +eshh 95 +sarv 94 +amaa 94 +and 94 +arva 94 +_ra 93 +_dh 93 +tt 92 +tad 92 +hm 92 +raM_ 91 +dev 91 +raM 91 +C 91 +ani 91 +_sarv 90 +atha 89 +Ad 89 +chi 89 +tA 88 +sarva 88 +avi 88 +taM 87 +hava 87 +anaa 86 +vA 86 +dd 86 +nA 85 +Ar 85 +hv 85 +taM_ 85 +dhi 84 +ksha 84 +ip 84 +ma_ 84 +_sha 84 +ati_ 83 +yai 83 +vat 83 +At 83 +kR 82 +kR^ 82 +bhi 82 +_shr 82 +to 82 +ta_ 82 +br 82 +ek 82 +kR^i 82 +tat 81 +nta 81 +hma 81 +aaya 80 +tam 80 +en 80 +us 79 +bra 79 +ke 79 +kt 79 +ddh 79 +mo 79 diff --git a/libtextcat/data/new_fingerprints/lm/scots.lm b/libtextcat/data/new_fingerprints/lm/scots.lm new file mode 100644 index 000000000000..7aac457075f6 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/scots.lm @@ -0,0 +1,400 @@ +_ 11688 +e 3223 +a 2469 +t 2269 +i 1928 +n 1903 +r 1414 +o 1406 +h 1369 +s 1249 +l 929 +n_ 890 +_t 862 +_a 843 +d 818 +e_ 798 +th 704 +w 661 +he 625 +an 612 +t_ 606 +u 592 +_th 575 +c 508 +s_ 471 +the 470 +, 469 +- 458 +in 455 +m 445 +,_ 440 +b 434 +g 429 +er 409 +ee 408 +_the 407 +k 402 +an_ 402 +f 385 +_w 378 +he_ 376 +the_ 364 +_an 362 +_o 360 +y 358 +_the_ 354 +_s 353 +_an_ 342 +a_ 335 +r_ 327 +_b 316 +d_ 303 +i_ 278 +en 277 +p 270 +ei 245 +A 236 +wa 232 +_A 231 +re 229 +in_ 229 +ui 218 +oo 217 +le 217 +ai 216 +et 212 +ti 209 +it 209 +_f 206 +te 204 +_a_ 203 +_m 202 +ha 200 +as 193 +on 188 +at 184 +_i 183 +_wa 183 +_c 182 +o_ 180 +or 178 +_h 176 +_g 169 +ch 165 +A_ 159 +_l 158 +_A_ 157 +st 156 +_d 155 +_ti 148 +. 148 +._ 146 +ke 144 +ti_ 143 +-- 143 +_o_ 142 +ow 142 +--- 140 +ed 138 +---- 138 +_r 137 +as_ 137 +y_ 136 +er_ 136 +----- 136 +ir 135 +aa 135 +la 131 +een 130 +ae 129 +_ti_ 128 +ra 126 +es 125 +nd 124 +de 120 +h_ 120 +ie 120 +ar 119 +ll 119 +nt 118 +ot 118 +en_ 115 +ma 115 +eet 113 +her 112 +el 112 +is 112 +' 112 +at_ 111 +ic 109 +se 108 +or_ 106 +wu 104 +me 104 +ne 103 +fo 102 +on_ 101 +was 99 +_was 98 +et_ 98 +ri 98 +_e 97 +_ma 97 +v 97 +_n 97 +! 97 +li 97 +ht 93 +hi 92 +_wu 92 +ng 91 +ro 91 +it_ 90 +ck 90 +_fo 90 +tha 90 +k_ 89 +il 89 +cht 86 +eet_ 86 +_p 86 +we 86 +_was_ 85 +was_ 85 +rt 84 +ed_ 83 +ter 83 +id 83 +ga 82 +; 82 +;_ 81 +ther 79 +tt 76 +air 76 +e, 75 +un 75 +ho 75 +for 74 +ge 74 +_st 73 +_y 72 +_he 72 +wh 71 +_on 71 +sh 70 +z 70 +e,_ 69 +bi 68 +_tha 68 +wui 67 +!_ 67 +ad 67 +een_ 66 +l_ 66 +ts 66 +_for 66 +n, 66 +_wh 65 +re_ 65 +be 65 +eh 64 +hat 64 +ns 64 +br 64 +g_ 64 +ui_ 64 +rr 64 +wui_ 63 +ni 63 +_wui 62 +ay 62 +s, 62 +pe 61 +n,_ 61 +bo 61 +al 61 +ye 61 +_bi 60 +oot 60 +na 60 +ang 60 +s,_ 59 +es_ 59 +ill 58 +that 58 +_wui_ 58 +nn 58 +eh_ 58 +oa 57 +han 57 +_that 56 +_br 56 +ca 56 +_ga 56 +ng_ 56 +um 55 +hat_ 55 +oon 55 +od 55 +for_ 55 +no 55 +ree 55 +_for_ 54 +_le 54 +ht_ 54 +ot_ 54 +_k 53 +rd 53 +ki 53 +aw 53 +nd_ 53 +_on_ 53 +_it 53 +ik 53 +t, 53 +_be 52 +that_ 52 +ve 52 +rn 52 +'s 51 +au 51 +co 51 +ich 51 +to 51 +lo 51 +t,_ 51 +ea 51 +tee 51 +lan 50 +fi 50 +_at 50 +am 50 +_in 50 +ere 50 +ur 50 +le_ 50 +nt_ 49 +'s_ 49 +hin 49 +yi 49 +hr 49 +ts_ 49 +_ca 48 +" 48 +ta 48 +cht_ 48 +-_ 48 +_as 47 +T 47 +ang_ 47 +lei 46 +_ma_ 46 +tr 46 +_ro 46 +fe 46 +ma_ 46 +icht 46 +_as_ 46 +der 46 +cl 46 +e- 45 +n- 45 +thr 45 +ba 45 +m_ 45 +st_ 45 +rt_ 45 +_u 45 +do 45 +_T 45 +im 44 +_se 44 +sk 44 +_la 44 +eik 44 +bit 43 +ike 43 +B 43 +kee 43 +tte 43 +di 43 +eed 43 +_B 42 +_aa 42 +her_ 42 +da 42 +ff 42 +tu 42 +ie_ 42 +_cl 42 +_ba 42 +oot_ 42 +bu 41 +eike 41 +oc 41 +hu 41 +_thr 41 +ther_ 41 +_co 41 +aa_ 41 +so 41 +_me 41 +H 41 +_H 40 +ke_ 40 +ert 40 +lu 40 +ist 40 +si 40 +iz 40 +ar_ 39 +uc 39 +thi 39 +ad_ 39 +ru 39 +owe 39 +gi 38 +_bit 38 +_do 38 +int 38 +bl 38 +ld 38 +_at_ 38 +lt 38 +ac 38 +_ha 38 +ae_ 38 +rs 37 +here 37 +ei_ 37 +han_ 37 +p_ 37 +is_ 37 +eth 37 +fa 37 +_sk 37 +ll_ 37 +ss 36 +bra 36 +wha 36 +gl 36 +ck_ 36 +pl 36 +lin 36 +ir_ 36 +ab 36 +_ther 36 +_da 35 +ce 35 +rin 35 +_oo 35 +rl 35 +wee 35 +and 35 +sa 35 +_yi 35 +_bra 35 +'d 35 +ds 35 +_bo 35 diff --git a/libtextcat/data/new_fingerprints/lm/scots_gaelic.lm b/libtextcat/data/new_fingerprints/lm/scots_gaelic.lm new file mode 100644 index 000000000000..491862a8c9d2 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/scots_gaelic.lm @@ -0,0 +1,400 @@ +_ 12634 +a 5353 +h 3268 +i 2898 +n 2792 +e 1651 +r 1563 +d 1455 +_a 1425 +c 1245 +n_ 1236 +s 1165 +l 1152 +an 1121 +t 980 +ai 979 +g 962 +u 905 +ch 902 +ha 836 +h_ 833 +a_ 829 +ea 821 +o 794 +dh 726 +an_ 711 +b 639 +m 585 +na 514 +nn 506 +ac 498 +r_ 495 +s_ 482 +ir 480 +ach 466 +id 458 +_an 450 +_c 427 +th 403 +à 388 +he 383 +in 379 +bh 367 +idh 358 +ad 342 +_n 341 +il 332 +nn_ 323 +_t 322 +_d 319 +ar 317 +e_ 311 +dh_ 307 +_an_ 303 +_b 302 +_na 294 +air 289 +ig 279 +. 275 +ir_ 272 +ag 272 +_ai 272 +, 270 +gu 269 +,_ 269 +._ 265 +ean 264 +ch_ 261 +f 259 +? 258 +_s 255 +ann 250 +ra 241 +ei 241 +_a_ 241 +ha_ 241 +d_ 238 +- 235 +_m 231 +gh 230 +hea 228 +le 226 +_f 224 +ui 223 +is 223 +as 218 +adh 218 +l_ 216 +g_ 208 +à i 207 +ò 207 +hai 205 +cha 205 +air_ 204 +na_ 201 +inn 198 +tha 190 +C 189 +G 188 +ann_ 187 +_ag 186 +_air 186 +eac 185 +_g 185 +_na_ 184 +ach_ 184 +_C 183 +us 183 +_ch 183 +la 182 +_G 182 +each 181 +us_ 178 +al 178 +gus 176 +gus_ 176 +_th 169 +_air_ 168 +_agus 167 +agus_ 167 +_agu 167 +agus 167 +agu 167 +ta 164 +aid 163 +hi 163 +hd 163 +chd 160 +T 157 +A 156 +ic 152 +_T 152 +adh_ 150 +idh_ 148 +mh 147 +?_ 146 +ar_ 145 +oi 144 +da 143 +aidh 143 +_bh 139 +ean_ 138 +sa 138 +ig_ 138 +_r 136 +_A 134 +ì 134 +te 134 +achd 131 +hu 131 +_e 130 +aig 130 +_l 130 +_ann 129 +ain 127 +ne 127 +dhe 125 +_dh 125 +à id 123 +o_ 121 +hl 119 +acha 119 +ga 118 +à idh 118 +on 118 +it 117 +aidh_ 116 +de 115 +nan 115 +ua 115 +_ann_ 115 +ich 115 +il_ 114 +m_ 114 +eil 114 +ri 112 +at 112 +ma 111 +li 109 +ao 109 +re 109 +inn_ 108 +_tha 107 +fh 106 +as_ 106 +bh_ 106 +nan_ 103 +lea 103 +lt 103 +S 103 +a? 103 +a?_ 102 +io 102 +E 101 +am 101 +' 100 +_a? 100 +igh 100 +_a?_ 99 +_gu 99 +idhe 99 +t_ 99 +se 99 +si 98 +ba 97 +ù 97 +tha_ 96 +bha 95 +B 94 +is_ 94 +u_ 94 +_B 94 +_i 93 +ile 92 +aic 91 +hei 91 +ia 90 +ho 89 +Th 88 +ath 88 +_Th 88 +rt 87 +ib 87 +Gà id 86 +_Gà i 86 +_Gà 86 +Gà i 86 +òr 86 +Gà 86 +Gà idh 86 +_Gà id 86 +had 85 +ibh 85 +_fh 85 +p 84 +ad_ 83 +_? 83 +_E 83 +hd_ 82 +dhea 82 +chd_ 82 +ear 81 +ith 81 +_tha_ 80 +h- 79 +eal 78 +hean 78 +sg 77 +rea 77 +_S 76 +ais 75 +ll 75 +han 74 +hà 74 +achd_ 74 +ead 74 +idhea 73 +am_ 72 +dha 72 +_nan 71 +_nan_ 71 +hadh 71 +gh_ 71 +ail 70 +hui 70 +Ch 69 +eachd 69 +h. 69 +aich 69 +hli 69 +chai 69 +om 68 +fa 68 +chad 68 +I 67 +h._ 67 +_Ch 67 +tea 67 +nea 66 +chadh 66 +achad 66 +rai 66 +lig 66 +haid 66 +dea 66 +rt_ 65 +à r 65 +dhl 65 +ana 64 +eann 64 +Ei 64 +le_ 64 +hn 64 +ilt 64 +uid 64 +_fa 63 +_Tha 63 +Tha 63 +ob 63 +_si 62 +ro 62 +cu 62 +ainn 62 +un 62 +dhli 61 +idhli 61 +lean 61 +idhl 61 +à idhl 61 +hlig 61 +dhlig 61 +in_ 60 +_à 60 +st 60 +rr 60 +_cu 60 +hr 60 +_aig 60 +bhe 59 +i_ 59 +aigh 59 +Tha_ 59 +è 59 +_ri 59 +_Tha_ 59 +lb 58 +che 58 +ran 58 +nac 58 +haidh 58 +hadh_ 58 +aig_ 58 +Gh 58 +ilea 58 +_Gh 58 +lte 58 +_le 58 +ru 58 +à idhe 57 +_I 57 +ilte 57 +eadh 57 +M 56 +hlig_ 56 +L 56 +chu 56 +nach 56 +_ma 56 +lig_ 56 +h,_ 55 +th_ 55 +ibh_ 55 +_aig_ 55 +D 55 +atha 55 +_Ei 55 +h, 55 +gu_ 54 +_gu_ 54 +im 54 +eil_ 54 +eu 53 +_M 53 +Al 53 +irt 53 +_L 53 +iad 53 +sea 52 +lba 52 +Alba 52 +F 52 +Alb 52 +uai 52 +ich_ 52 +_F 51 +ilean 51 +has 51 +tai 51 +each_ 50 +eacha 50 +har 50 +ni 50 +_de 50 +irt_ 50 +n,_ 50 +mha 50 +n, 50 +_e_ 50 +ide 49 +neach 49 +neac 49 +ur 49 +rd 49 +_h 49 +hean_ 49 +oc 49 +eò 49 +te_ 49 +han_ 49 +on_ 49 diff --git a/libtextcat/data/new_fingerprints/lm/serbian_ascii.lm b/libtextcat/data/new_fingerprints/lm/serbian_ascii.lm new file mode 100644 index 000000000000..9471be6eabd5 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/serbian_ascii.lm @@ -0,0 +1,400 @@ +_ 34122 +a 9113 +o 8135 +i 7736 +e 7535 +n 5207 +s 4860 +j 3995 +t 3797 +r 3660 +u 3224 +l 3065 +d 3061 +e_ 2941 +v 2786 +a_ 2746 +k 2701 +m 2492 +o_ 2328 +y 2238 +p 2151 +_s 2148 +i_ 2094 +je 1887 +c 1854 +z 1731 +_n 1437 +_p 1432 +g 1418 +b 1368 +u_ 1333 +je_ 1237 +, 1226 +,_ 1214 +_i 1130 +st 1105 +na 1076 +. 1056 +_d 1033 +._ 1030 +_j 967 +ra 934 +ko 908 +ni 900 +cy 893 +sy 875 +_je 871 +_o 824 +ta 799 +no 780 +_u 777 +re 766 +_b 764 +_k 763 +da 760 +ne 754 +li 750 +ti 745 +se 722 +po 713 +to 713 +_je_ 696 +an 688 +ja 683 +pr 665 +va 651 +lo 634 +_z 626 +m_ 625 +is 625 +il 622 +ov 621 +la 621 +_m 615 +bi 604 +_t 603 +_po 594 +en 586 +_se 578 +os 578 +in 576 +od 576 +ka 552 +ve 548 +ij 538 +_pr 536 +al 536 +vo 535 +om 530 +_i_ 525 +nj 515 +ed 509 +_na 507 +na_ 503 +og 499 +oj 498 +ma 493 +_bi 492 +on 489 +ak 482 +im 481 +ye 481 +ro 480 +vi 473 +sa 469 +ri 464 +da_ 451 +av 450 +at 449 +se_ 447 +es 446 +h 443 +ao 441 +ji 437 +yi 436 +_da 433 +ad 432 +_se_ 430 +lj 428 +zy 426 +za 426 +_ne 425 +de 422 +tr 417 +cj 415 +_u_ 414 +_c 412 +le 402 +_v 397 +ar 390 +_g 390 +ic 384 +n_ 382 +ju 379 +lo_ 377 +aj 376 +_ko 369 +ao_ 366 +ek 361 +_da_ 359 +et 356 +go 354 +iz 346 +_za 345 +_r 344 +or 342 +mo 341 +el 340 +as 339 +ik 336 +te 332 +_sa 329 +d_ 323 +am 320 +me 318 +sto 317 +di 315 +ec 311 +ol 310 +a,_ 307 +a, 307 +_ni 302 +ya 296 +do 295 +yt 294 +su 292 +syt 289 +li_ 288 +sta 286 +ije 284 +ko_ 277 +ti_ 277 +la_ 277 +ga 276 +bil 275 +no_ 274 +a. 273 +nu 272 +a._ 271 +ne_ 271 +om_ 268 +_cy 266 +_na_ 263 +_bil 263 +sv 263 +ru 259 +to_ 256 +_od 253 +cyi 253 +nje 251 +it 251 +pa 250 +az 248 +e,_ 245 +e, 245 +ob 244 +dn 243 +ac 242 +ost 242 +k_ 240 +iv 239 +io 238 +_su 238 +_iz 237 +ilo 235 +_sv 234 +_ka 233 +koj 231 +mi 229 +im_ 229 +ije_ 227 +g_ 226 +em 223 +su_ 223 +ih 223 +ji_ 221 +kr 220 +ut 220 +_koj 220 +V 218 +_st 218 +ye_ 217 +_l 214 +_V 213 +ovo 211 +j_ 210 +uc 208 +ja_ 208 +h_ 207 +nij 206 +sk 206 +ot 203 +io_ 203 +gl 203 +_do 201 +ok 200 +ns 199 +ilo_ 199 +er 197 +ih_ 195 +pre 193 +ci 193 +og_ 193 +ki 192 +sl 191 +t_ 189 +ni_ 189 +_a 189 +vr 188 +ati 187 +_su_ 186 +nije 181 +pro 181 +be 180 +yn 179 +cye 178 +ju_ 178 +ku 177 +isy 177 +ta_ 174 +sye 172 +_tr 172 +O 172 +jen 172 +_to 171 +pi 168 +_pre 168 +S 168 +ima 167 +nije_ 167 +_mo 166 +eg 166 +e._ 164 +za_ 164 +e. 164 +_pro 164 +gov 163 +N 162 +dr 162 +ako 162 +tv 162 +_S 160 +P 159 +ma_ 159 +_on 159 +sp 158 +nst 158 +anj 158 +dj 157 +oc 157 +_sy 156 +ev 155 +ce 155 +lik 154 +_nij 153 +_N 152 +ist 151 +_P 151 +_nije 151 +- 151 +ba 150 +jed 150 +sti 150 +ova 149 +_is 148 +id 148 +ton 148 +ke 147 +pos 147 +od_ 147 +osy 146 +Vi 146 +ila 145 +ins 145 +bo 145 +_Vi 145 +ir 144 +_za_ 144 +oz 144 +ecj 144 +cje 143 +on_ 143 +zn 142 +_O 141 +us 141 +i, 141 +i,_ 141 +mu 140 +inst 140 +cya 140 +oji 139 +esy 139 +icy 139 +lja 138 +_go 138 +i. 138 +_re 137 +_bilo 137 +edn 137 +acy 137 +rat 137 +bilo 137 +ali 136 +ecy 136 +ija 135 +pri 135 +ad_ 135 +lic 135 +i._ 135 +Vins 134 +Vin 134 +ston 134 +Vinst 134 +ga_ 134 +nston 134 +insto 134 +nsto 134 +_Vins 133 +_Vin 133 +zi 132 +ran 131 +le_ 130 +ili 130 +bilo_ 130 +_pos 129 +ila_ 129 +est 128 +_ve 128 +tre 128 +zye 127 +_nj 127 +si 126 +f 126 +alo 125 +ako_ 125 +tra 125 +sa_ 125 +pu 124 +ud 124 +z_ 124 +_ra 124 +iti 124 +_de 124 +odi 123 +T 123 +-_ 122 +o,_ 121 +o, 121 +du 121 +rs 121 +B 120 +ka_ 119 +red 119 +_od_ 118 +an_ 118 +nu_ 118 +iko 117 +dno 117 +_pa 117 +s_ 116 diff --git a/libtextcat/data/new_fingerprints/lm/slovak_ascii.lm b/libtextcat/data/new_fingerprints/lm/slovak_ascii.lm new file mode 100644 index 000000000000..29c8736b3ba3 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/slovak_ascii.lm @@ -0,0 +1,400 @@ +_ 20064 +a 4991 +o 4983 +e 3838 +n 3342 +i 3317 +r 2583 +s 2501 +v 2383 +t 2325 +c 1918 +k 1912 +l 1888 +d 1736 +u 1725 +p 1543 +a_ 1527 +y 1371 +m 1339 +z 1227 +h 1194 +e_ 956 +_p 881 +_s 828 +o_ 814 +na 809 +b 808 +_v 798 +j 797 +. 796 +ov 795 +._ 785 +st 687 +i_ 665 +, 657 +,_ 657 +_n 625 +ch 623 +u_ 618 +ro 617 +po 598 +_o 546 +ne 529 +en 520 +v_ 519 +_a 516 +ra 506 +pr 488 +y_ 481 +od 476 +_z 467 +ie 462 +ni 448 +an 447 +vy 434 +to 433 +h_ 431 +_na 429 +re 416 +ch_ 404 +ho 401 +al 399 +ci 394 +va 387 +na_ 386 +_pr 385 +_d 384 +_k 378 +la 377 +ko 375 +do 374 +_po 373 +si 353 +_t 346 +_r 337 +os 336 +no 334 +in 328 +tr 324 +om 321 +_v_ 320 +ny 319 +m_ 319 +ri 317 +S 306 +ac 302 +sa 300 +ti 300 +_m 298 +za 298 +er 291 +ia 290 +ce 290 +li 289 +yc 286 +ych 285 +ed 284 +at 281 +ob 281 +ak 280 +_na_ 280 +il 279 +_, 279 +_,_ 279 +ok 277 +sk 268 +ych_ 265 +_c 262 +mi 261 +ol 260 +me 260 +l_ 259 +t_ 259 +ku 258 +ta 256 +le 256 +_b 254 +or 252 +_a_ 250 +lo 247 +oc 246 +vo 246 +es 244 +ve 242 +_vy 240 +on 238 +_sa 231 +as 231 +da 230 +aj 228 +av 218 +el 216 +ova 216 +ic 215 +ne_ 209 +_do 208 +sa_ 207 +ka 205 +_sa_ 204 +te 203 +j_ 201 +_ro 199 +P 198 +_za 196 +ky 196 +_S 195 +je 194 +ar 193 +_. 193 +it 192 +s_ 192 +em 191 +ej 191 +ur 190 +ad 189 +_o_ 187 +_._ 187 +ov_ 185 +de 180 +_% 179 +om_ 179 +% 179 +_u 174 +pre 173 +dn 173 +D 172 +rok 170 +ie_ 170 +sp 169 +pri 167 +_pre 167 +am 165 +- 164 +ke 164 +eh 162 +oz 161 +k_ 160 +ost 160 +_j 156 +f 155 +zn 153 +g 152 +kt 152 +ho_ 151 +eho 151 +hod 150 +ku_ 148 +is 148 +zi 147 +cn 147 +eho_ 146 +ej_ 145 +az 145 +tu 145 +_pri 144 +cen 144 +_st 143 +ma 142 +ast 141 +_ce 140 +rov 140 +la_ 138 +ot 138 +nych 135 +nyc 135 +_ob 133 +z_ 133 +nych_ 131 +N 130 +li_ 129 +ani 129 +co 128 +nt 128 +ny_ 127 +E 125 +_ne 124 +) 123 +_( 123 +sti 123 +A 123 +( 123 +cho 122 +vi 122 +_sp 122 +di 120 +pa 120 +n_ 119 +ju 118 +ys 117 +bo 117 +_P 116 +_tr 115 +V 114 +je_ 114 +ln 114 +_i 113 +ze 113 +spo 112 +_N 112 +nd 111 +nu 111 +so 111 +red 110 +vn 110 +kl 110 +kov 110 +_cen 110 +_rok 109 +tn 109 +du 109 +nc 109 +ap 109 +d_ 108 +van 108 +ca 108 +M 108 +chod 107 +ti_ 107 +U 106 +_ak 106 +ru 105 +sta 105 +ym 105 +_- 104 +et 103 +_h 102 +est 102 +_je 102 +nos 101 +aci 101 +us 100 +dov 100 +pod 100 +_to 100 +tor 99 +uc 99 +ras 98 +ky_ 98 +_s_ 98 +_mi 97 +* 97 +uj 97 +nost 97 +vys 97 +ovy 97 +ez 97 +oku 96 +_V 96 +op 96 +bc 96 +rast 96 +se 95 +B 95 +roku 95 +kto 94 +ove 94 +by 94 +-_ 94 +_ko 93 +obc 92 +nie 91 +ia_ 91 +ka_ 91 +*_ 90 +ali 90 +lo_ 89 +ovan 89 +to_ 88 +iz 88 +_bo 88 +_l 88 +odo 87 +bch 87 +bchod 87 +bcho 87 +sl 86 +st_ 86 +pred 86 +C 86 +pol 85 +_pred 85 +R 85 +ik 84 +uro 84 +pi 84 +ek 84 +zo 83 +eni 83 +obch 83 +cie 83 +oku_ 83 +obcho 83 +ns 83 +roku_ 82 +ii 82 +tv 82 +ba 82 +ent 82 +_spo 81 +tov 81 +pe 81 +kon 80 +kc 80 +ec 80 +kci 80 +ck 80 +x 79 +osti 79 +_Sk 79 +mi_ 79 +_in 79 +Sk 79 +sia 79 +br 78 +rh 78 +val 78 +olo 77 +_pod 77 +%_ 77 +_%_ 77 +bu 77 +_f 77 +iv 77 +_obc 77 +_obch 77 +eb 76 +str 76 +nej 76 +_D 76 +ni_ 75 +ou 75 +im 75 +ena 74 +tre 74 +_A 74 +mo 74 +su 74 +rz 73 +_trh 73 +trh 73 +_U 73 +al_ 73 +_ra 73 +_e 72 +_C 72 +sti_ 72 +zv 72 +te_ 72 +cno 72 +oj 72 +ktor 71 +_roku 71 +ocn 71 +ina 71 +sil 71 +nov 71 +alo 71 +odn 70 +nan 70 +oh 70 diff --git a/libtextcat/data/new_fingerprints/lm/slovenian.lm b/libtextcat/data/new_fingerprints/lm/slovenian.lm new file mode 100644 index 000000000000..0fb3f18f1659 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/slovenian.lm @@ -0,0 +1,400 @@ +_ 10406 +a 2828 +e 2676 +i 2458 +o 2418 +n 1814 +r 1484 +v 1253 +l 1248 +s 1228 +t 1172 +j 1107 +d 1085 +k 911 +p 880 +a_ 823 +m 763 +i_ 681 +e_ 678 +_p 603 +o_ 566 +u 521 +z 516 +b 456 +_s 435 +je 434 +, 416 +,_ 411 +ni 399 +Ä 383 +_v 372 +_d 356 +pr 355 +g 345 +ra 336 +_n 332 +st 323 +an 313 +po 303 +re 301 +na 295 +h 287 +ov 276 +_pr 276 +li 275 +al 274 +_z 270 +je_ 259 +la 255 +Å¡ 253 +ne 248 +en 246 +ko 244 +in 237 +c 234 +ti 234 +v_ 234 +_po 232 +no 230 +ve 230 +_k 227 +_i 224 +da 224 +. 221 +_j 221 +ri 220 +ja 216 +_t 214 +se 213 +ed 212 +._ 211 +em 206 +te 205 +za 201 +od 201 +av 200 +lo 196 +nj 194 +_o 194 +_je 193 +il 190 +or 183 +ka 181 +sk 179 +_b 178 +_je_ 178 +ih 178 +n_ 177 +_za 173 +h_ 171 +er 171 +os 171 +_na 168 +va 168 +ta 164 +le 163 +m_ 161 +ev 157 +ij 157 +ar 157 +do 155 +to 155 +ž 154 +A 153 +el 150 +_m 148 +ro 147 +ol 146 +_v_ 145 +aj 145 +di 143 +N 142 +S 142 +at 140 +ih_ 139 +ki 138 +de 137 +_in 135 +vo 135 +ga 134 +me 131 +in_ 129 +vi 129 +om 127 +_in_ 125 +et 124 +pre 124 +O 123 +bi 120 +I 119 +da_ 117 +ik 117 +ma 115 +E 114 +so 113 +bo 112 +it 112 +anj 112 +eg 110 +ni_ 109 +mi 108 +ke 108 +na_ 108 +u_ 108 +lj 106 +iz 105 +ob 105 +_da 103 +li_ 103 +is 103 +im 102 +red 102 +_pre 102 +dr 100 +mo 99 +P 99 +_se 99 +ji 98 +r_ 97 +ad 97 +pri 97 +K 97 +_l 97 +tr 95 +pa 94 +no_ 94 +j_ 92 +ki_ 91 +ti_ 91 +_pri 91 +dn 89 +_P 88 +ej 88 +_da_ 87 +ne_ 86 +ega 86 +_r 86 +_bi 86 +l_ 86 +em_ 86 +go 86 +" 85 +sl 85 +ek 84 +ali 84 +ove 84 +aÄ 84 +ak 84 +ci 83 +ga_ 83 +ko_ 83 +se_ 82 +_S 82 +jo 81 +ot 81 +ja_ 81 +_so 80 +lov 80 +L 80 +D 79 +V 79 +as 78 +_do 78 +am 78 +nje 77 +es 77 +za_ 77 +_pa 76 +T 75 +tu 75 +_za_ 74 +sti 74 +_dr 74 +la_ 74 +_N 74 +_de 74 +ega_ 73 +_ko 73 +og 73 +ns 72 +Äe 72 +ds 72 +_bo 71 +ora 71 +vn 71 +ost 71 +_ne 71 +iÄ 70 +ven 69 +z_ 69 +Äi 69 +_te 68 +ce 68 +_se_ 67 +Äa 67 +oÄ 67 +M 66 +_u 66 +un 65 +ln 65 +pos 64 +ju 64 +sta 64 +op 64 +di_ 63 +ud 63 +vs 63 +t_ 62 +nsk 62 +tv 62 +on 62 +ski 62 +R 62 +pa_ 62 +_ka 62 +i, 61 +so_ 61 +_iz 60 +_pa_ 60 +s_ 60 +i,_ 60 +pro 59 +del 59 +rav 59 +eni 59 +oli 58 +rj 58 +e, 57 +Å¡e 57 +ili 57 +vr 57 +d_ 57 +_le 57 +pred 57 +jo_ 56 +e,_ 56 +nik 56 +love 56 +_pred 56 +ske 56 +er_ 55 +str 55 +Än 54 +pra 54 +J 54 +_Å¡ 54 +oven 53 +_ra 53 +tn 53 +_na_ 53 +_so_ 53 +nih 53 +loven 53 +si 52 +ke_ 52 +_g 52 +ic 52 +udi 51 +bi_ 51 +eds 51 +oj 51 +ru 51 +a, 51 +_pro 50 +_pos 50 +nc 50 +nih_ 50 +Äu 50 +a,_ 50 +_a 50 +az 50 +ok 50 +B 50 +let 49 +udi_ 49 +_od 49 +_K 49 +aj_ 48 +_bi_ 48 +_ve 48 +raÄ 48 +o, 47 +_tu 47 +ija 47 +ter 47 +ist 47 +Z 47 +reds 46 +nd 46 +ali_ 46 +A_ 46 +iti 46 +bil 46 +_ob 46 +o,_ 46 +ati 46 +tud 45 +tudi 45 +_ki 45 +k_ 45 +be 45 +aÅ¡ 45 +ir 45 +ža 45 +do_ 45 +sp 45 +_ki_ 45 +_st 45 +ep 44 +_del 44 +tudi_ 44 +rž 44 +aÄu 44 +_ni 44 +ah 43 +raÄu 43 +raÄun 43 +iÅ¡ 43 +_mo 43 +avn 43 +_tud 43 +Äun 43 +aÄun 43 +_tudi 43 +_to 42 +raz 42 +kr 42 +ova 42 +_e 42 +ogo 42 +ani 42 +_" 42 +ev_ 42 +br 42 +eb 42 +sa 42 +mi_ 42 +tem 42 +ta_ 41 +prav 41 +i. 41 +slov 41 +ens 41 +bo_ 41 +že 41 +_T 41 +_let 41 +odo 41 +slo 41 +ensk 40 +ka_ 40 +neg 40 +ez 40 +nos 40 +eÄ 40 +_sl 40 +_V 40 +rža 40 +nega 40 +ili_ 39 diff --git a/libtextcat/data/new_fingerprints/lm/spanish.lm b/libtextcat/data/new_fingerprints/lm/spanish.lm new file mode 100644 index 000000000000..e40317f956a9 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/spanish.lm @@ -0,0 +1,400 @@ +_ 25044 +e 7830 +a 7437 +o 5102 +s 4394 +n 4358 +i 4065 +r 3998 +l 3634 +d 3118 +c 2931 +t 2834 +u 2316 +a_ 2269 +e_ 2211 +s_ 1862 +de 1679 +p 1673 +_d 1644 +m 1447 +_de 1443 +n_ 1332 +o_ 1301 +en 1295 +_e 1216 +es 1177 +_l 1132 +de_ 1080 +la 1060 +os 1028 +_de_ 1027 +_p 963 +l_ 910 +ci 890 +_c 866 +_a 866 +os_ 801 +ar 777 +er 775 +as 768 +ra 746 +nt 736 +_la 727 +re 726 +,_ 724 +, 724 +el 722 +ta 708 +ue 701 +g 678 +on 674 +al 670 +_s 666 +co 653 +b 637 +an 622 +v 616 +la_ 616 +or 612 +te 599 +st 596 +el_ 580 +_la_ 573 +y 545 +to 543 +r_ 517 +ad 512 +ó 511 +do 504 +ro 504 +se 488 +as_ 488 +q 487 +qu 487 +. 479 +._ 478 +en_ 475 +ca 460 +in 459 +un 456 +_co 450 +es_ 449 +ic 449 +_en 440 +ac 440 +que 439 +na 439 +lo 430 +_m 430 +f 429 +ent 428 +da 412 +ue_ 411 +po 405 +le 399 +_q 399 +_qu 399 +que_ 393 +_que 388 +ie 386 +h 385 +pa 382 +y_ 371 +ti 367 +_que_ 365 +_en_ 365 +_y 361 +tr 358 +_el 353 +ri 349 +ia 342 +_el_ 333 +_se 330 +ió 330 +_y_ 330 +io 329 +pr 320 +ón 317 +ec 317 +no 314 +id 301 +à 300 +mi 299 +_t 299 +ión 292 +nte 292 +me 286 +aci 283 +do_ 279 +li 276 +con 276 +nd 273 +est 272 +ni 272 +á 271 +di 270 +_es 268 +_lo 267 +ció 265 +ma 265 +ón_ 264 +_pr 263 +_r 261 +ción 255 +z 254 +ra_ 251 +si 247 +ión_ 246 +oc 245 +nc 244 +_u 244 +_po 243 +los 243 +or_ 242 +_con 241 +is 239 +del 238 +_del 237 +ado 236 +se_ 233 +_i 233 +los_ 231 +_re 231 +por 229 +_del_ 228 +sta 228 +del_ 228 +al_ 228 +ne 226 +_h 226 +cu 225 +_n 225 +_a_ 224 +_v 224 +_un 223 +ce 222 +so 220 +ción_ 218 +res 218 +vi 217 +om 216 +te_ 212 +_pa 211 +ien 210 +j 209 +E 208 +_los 207 +_los_ 207 +to_ 206 +ol 204 +it 203 +am 202 +ació 201 +rt 201 +ación 201 +pe 197 +ha 190 +_se_ 189 +nto 188 +_o 184 +_E 184 +on_ 184 +sa 183 +na_ 182 +ta_ 181 +su 180 +cia 180 +mo 180 +ct 178 +par 178 +_f 177 +_por 176 +eg 172 +_in 172 +ur 170 +L 168 +ve 166 +im 164 +ga 163 +_est 161 +ar_ 161 +ab 160 +_L 159 +tu 158 +at 158 +no_ 157 +s, 157 +s,_ 157 +_por_ 156 +por_ 156 +las 156 +ba 154 +o,_ 154 +o, 154 +ento 151 +et 150 +C 150 +_ha 149 +A 149 +tra 148 +ient 148 +_al 147 +a,_ 146 +ica 146 +a, 146 +pro 146 +ado_ 145 +ici 144 +_ca 144 +an_ 144 +las_ 143 +ara 143 +nci 143 +ente 142 +ú 142 +rr 142 +ir 142 +da_ 141 +em 141 +ll 140 +il 139 +Ãa 138 +iv 138 +_su 138 +_par 136 +ul 136 +ant 136 +_A 135 +mp 135 +_las_ 134 +_las 134 +_C 134 +_pro 133 +men 132 +P 132 +des 131 +com 130 +ion 130 +era 130 +ed 129 +ida 129 +sp 128 +gu 127 +nte_ 127 +ns 127 +za 126 +dos 125 +M 125 +cio 125 +les 125 +_P 124 +bl 124 +_com 122 +s._ 122 +s. 122 +_M 121 +ua 120 +nta 120 +mu 119 +_no 118 +dad 118 +ñ 117 +é 116 +un_ 116 +va 116 +ist 116 +nes 116 +iento 115 +one 114 +ara_ 113 +S 113 +ada 113 +_un_ 113 +fi 111 +pre 110 +tos 110 +ter 109 +ot 109 +esta 108 +_me 107 +ido 107 +ob 107 +_g 105 +br 105 +go 105 +ea 104 +nto_ 104 +ona 103 +pu 103 +dos_ 103 +tro 103 +ier 103 +para 102 +ment 101 +ag 101 +ero 101 +gr 101 +rec 101 +bi 101 +ia_ 100 +una 100 +nic 99 +ncia 99 +Ãa_ 98 +a._ 98 +tos_ 98 +a. 98 +ran 98 +lo_ 97 +ones 97 +rm 96 +lu 96 +ron 95 +con_ 95 +ó_ 95 +nes_ 95 +_ci 95 +ante 94 +ch 94 +_con_ 94 +_para 94 +ntr 93 +una_ 93 +para_ 93 +mie 92 +ico 92 +fe 92 +les_ 92 +uc 92 +ip 91 +sto 91 +_ma 91 +ui 91 +sta_ 91 +_ve 90 +cion 90 +" 90 +op 90 +cal 89 +_mu 89 +_S 89 +ro_ 89 +_pe 88 +ste 88 +ras 88 +pl 88 +_una 88 +_di 87 +ento_ 86 +ita 86 +ione 85 +ect 85 +_una_ 85 +mien 85 +tan 85 +du 84 +den 84 +ndo 84 +per 84 +eri 84 diff --git a/libtextcat/data/new_fingerprints/lm/swahili.lm b/libtextcat/data/new_fingerprints/lm/swahili.lm new file mode 100644 index 000000000000..56090b40153e --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/swahili.lm @@ -0,0 +1,400 @@ +_ 16483 +a 9342 +i 5293 +a_ 4071 +u 2730 +k 2609 +n 2351 +w 2076 +m 1928 +e 1866 +h 1800 +o 1775 +wa 1743 +l 1486 +s 1419 +i_ 1401 +t 1399 +_k 1306 +y 1100 +_w 969 +li 945 +wa_ 911 +z 891 +_wa 890 +ka 834 +ku 799 +r 770 +b 733 +an 727 +ma 723 +o_ 711 +_m 707 +na 698 +ya 675 +ha 672 +g 602 +al 580 +d 570 +at 560 +am 554 +_n 549 +_ku 549 +ik 543 +_h 515 +ya_ 514 +A 494 +is 487 +_y 485 +hi 474 +na_ 471 +_ya 471 +ta 468 +sh 456 +ali 449 +j 426 +u_ 423 +ki 418 +e_ 402 +p 401 +ti 401 +_wa_ 399 +f 392 +_ya_ 390 +ba 390 +ri 385 +ng 385 +il 380 +c 358 +hu 356 +_na 356 +ni 355 +za 354 +zi 351 +ia 344 +_na_ 343 +_a 336 +in 327 +_ma 326 +ch 322 +mb 317 +ika 314 +. 311 +._ 309 +_ka 307 +as 306 +ak 306 +ati 301 +, 300 +ka_ 296 +,_ 294 +_u 292 +kw 286 +ili 278 +K 274 +en 271 +si 266 +_kw 262 +la 261 +ni_ 261 +ma_ 261 +_s 258 +kwa 258 +ar 256 +ut 245 +za_ 245 +nd 242 +mba 241 +_kwa 239 +_z 234 +li_ 233 +un 233 +ny 230 +it 229 +se 229 +yo 227 +ia_ 222 +M 221 +sa 221 +kat 217 +_K 214 +_i 213 +ika_ 213 +ana 212 +ish 212 +kati 206 +_ha 204 +on 201 +ai 200 +I 198 +aa 196 +um 195 +im 190 +v 188 +mu 187 +amb 187 +sha 185 +em 183 +fa 181 +zi_ 180 +di 179 +mi 178 +_M 178 +us 176 +_ki 176 +ha_ 175 +iw 172 +ama 172 +_kat 168 +_kati 168 +_hi 166 +_l 166 +ra 166 +kwa_ 165 +la_ 164 +W 164 +ja 163 +U 163 +N 163 +amba 161 +ao 161 +_za 160 +ji 160 +B 157 +iwa 155 +tik 155 +wal 155 +le 155 +tika 154 +ge 153 +lis 153 +tu 152 +atika 152 +to 152 +atik 152 +uw 152 +_kwa_ 151 +A_ 151 +ke 150 +S 147 +tika_ 145 +aj 145 +we 144 +cha 144 +bi 141 +az 140 +er 139 +ek 138 +katik 138 +ez 138 +uwa 137 +kut 135 +_al 134 +_B 134 +ad 134 +mu_ 133 +_ali 133 +rik 132 +_W 131 +ba_ 131 +kuw 131 +me 130 +ali_ 128 +kuwa 128 +ema 127 +wan 127 +bu 126 +sem 126 +_A 125 +ir 125 +ata 125 +iz 124 +_hu 124 +ay 124 +ul 124 +af 123 +iki 122 +ema_ 121 +da 120 +ti_ 120 +sema 119 +aka 118 +sema_ 118 +te 118 +uz 117 +yo_ 117 +_v 117 +io 116 +iy 115 +uta 115 +ani 115 +_wal 115 +he 115 +if 114 +_la 114 +ab 114 +go 112 +_za_ 111 +ama_ 111 +sa_ 111 +pa 110 +_t 110 +zo 110 +nge 110 +wam 109 +wali 108 +ua 107 +ur 106 +_c 106 +ise 105 +_ch 105 +isem 105 +ho 105 +ye 104 +iyo 104 +E 104 +el 104 +mo 103 +ung 103 +eri 103 +_wali 103 +_b 102 +mba_ 102 +ari 101 +ita 101 +isema 100 +ot 99 +_la_ 99 +uk 99 +ao_ 99 +di_ 99 +sha_ 99 +ini 99 +kuwa_ 98 +uwa_ 98 +ana_ 98 +lise 98 +lisem 98 +uli 97 +shi 97 +ga 96 +iwa_ 96 +fu 96 +T 96 +R 95 +_il 95 +wak 94 +aw 94 +isha 94 +ri_ 93 +_am 93 +ara 92 +_cha 92 +aji 92 +_ili 91 +ifa 91 +O 90 +_p 90 +uh 90 +iri 90 +chi 90 +asi 89 +po 89 +a. 89 +ong 89 +azi 88 +_j 88 +_kut 88 +eny 88 +nc 88 +a._ 88 +ko 87 +uu 87 +id 87 +w_ 87 +no 87 +P 86 +ah 86 +ina 86 +rika 86 +_Bw 85 +H 85 +gu 85 +uo 85 +Bw_ 85 +_Bw_ 85 +_se 85 +Bw 85 +ib 84 +_S 84 +kam 84 +hi_ 84 +nya 84 +si_ 83 +a, 82 +no_ 81 +pi 81 +ok 81 +i. 81 +ip 81 +kwam 81 +i._ 81 +amba_ 80 +dh 80 +end 80 +ani_ 80 +a,_ 79 +wamb 79 +kwamb 79 +_sh 79 +eza 79 +nz 79 +wi 79 +_kwam 79 +wamba 79 +alis 78 +_kuw 78 +ngo 78 +ap 77 +_N 77 +any 77 +ili_ 77 +C 77 +WA 76 +vy 76 +wana 76 +_hiy 75 +Wa 75 +hiyo 75 +nch 75 +_hiyo 75 +de 75 +_kuwa 75 +ing 75 +hiy 75 +vi 75 +isha_ 74 +es 74 +atu 74 +_Wa 74 +nchi 74 +aki 74 +lim 73 +da_ 73 +ini_ 73 +ash 73 +ala 73 +i, 73 +ano 73 +i,_ 72 +_kam 71 +_wan 71 +ano_ 71 +mw 71 +nde 71 +ji_ 71 +ion 70 +_amb 70 +ndi 70 +_Ka 70 +eza_ 70 diff --git a/libtextcat/data/new_fingerprints/lm/swedish.lm b/libtextcat/data/new_fingerprints/lm/swedish.lm new file mode 100644 index 000000000000..1c021242b9fe --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/swedish.lm @@ -0,0 +1,400 @@ +_ 33494 +e 8992 +n 7900 +t 7859 +a 7781 +r 7251 +s 6435 +i 5649 +l 4541 +d 4079 +o 3724 +m 3203 +k 3058 +g 2478 +en 2403 +n_ 2389 +t_ 2073 +de 1939 +r_ 1910 +v 1890 +h 1789 +u 1782 +_s 1768 +ä 1724 +er 1709 +f 1597 +en_ 1537 +a_ 1526 +an 1357 +p 1320 +et 1317 +ö 1278 +Ã¥ 1261 +st 1236 +ar 1226 +c 1191 +_d 1158 +e_ 1116 +in 1045 +_f 1027 +te 1000 +b 997 +_a 978 +s_ 974 +ra 958 +. 956 +tt 935 +_i 898 +_m 890 +._ 886 +ll 870 +ta 844 +_o 842 +_e 839 +nd 820 +ti 804 +sk 798 +re 779 +at 769 +_de 754 +om 743 +m_ 739 +ör 720 +, 697 +,_ 695 +ng 686 +li 673 +ka 666 +oc 662 +_h 654 +on 652 +et_ 647 +ch 645 +ns 643 +is 642 +er_ 630 +är 625 +_v 614 +_t 614 +ni 611 +i_ 609 +_oc 592 +tt_ 587 +na 586 +y 586 +la 579 +_b 579 +h_ 577 +kt 575 +ch_ 568 +ig 564 +fö 563 +och 555 +or 555 +_och 554 +och_ 554 +_och_ 553 +me 548 +den 548 +om_ 535 +_i_ 531 +d_ 530 +j 529 +ik 520 +de_ 520 +för 518 +ge 498 +ad 497 +_k 491 +_fö 487 +ri 484 +el 482 +il 481 +so 480 +al 474 +g_ 469 +le 464 +an_ 461 +_för 447 +si 437 +ar_ 437 +att 435 +_p 434 +es 420 +ing 413 +se 407 +to 404 +_u 403 +_en 403 +and 398 +den_ 395 +nde 393 +nn 393 +_l 391 +Ã¥_ 391 +D 385 +än 383 +nt 382 +l_ 381 +tr 378 +_D 372 +va 370 +am 369 +sa 367 +_so 365 +ga 364 +_en_ 361 +är_ 358 +ck 357 +av 354 +v_ 351 +ed 347 +ma 346 +da 346 +som 346 +rs 344 +som_ 344 +ve 342 +ter 341 +att_ 341 +ha 338 +ne 337 +ut 335 +as 332 +ska 329 +_at 327 +_att 326 +_som 324 +_att_ 324 +_som_ 323 +vi 322 +ikt 317 +_av 316 +det 316 +_den 315 +he 315 +ss 314 +un 307 +ke 304 +_g 303 +us 302 +di 302 +_st 300 +rn 297 +_me 296 +_ä 295 +ade 294 +" 290 +_ha 290 +av_ 289 +ill 288 +_n 286 +_in 279 +io 275 +_r 275 +der 275 +it 274 +_av_ 274 +sta 274 +gen 272 +isk 270 +_ti 269 +id 265 +na_ 265 +ns_ 264 +ko 262 +_den_ 261 +ag 258 +det_ 257 +lig 257 +era 256 +ll_ 255 +_det 252 +_är 251 +be 249 +_är_ 248 +ra_ 247 +ion 244 +- 241 +pr 240 +oni 233 +til 231 +ten 228 +_si 225 +k_ 222 +pÃ¥ 222 +fr 221 +ro 219 +till 219 +iv 216 +ls 216 +ande 215 +ör_ 214 +_det_ 213 +äl 212 +_pÃ¥ 211 +ts 210 +ens 209 +med 209 +mm 208 +rt 208 +_till 208 +_til 208 +_va 207 +_fr 205 +_sk 205 +var 205 +nin 204 +ning 203 +ol 201 +ka_ 200 +lle 198 +ett 198 +rd 197 +em 196 +pÃ¥_ 195 +x 195 +rk 194 +_ut 194 +ste 194 +ds 193 +_vi 192 +Ã¥r 192 +S 192 +nde_ 191 +are 191 +ver 190 +_pÃ¥_ 190 +nis 189 +kr 189 +_med 188 +all 188 +Ã¥n 187 +nge 185 +mo 184 +os 183 +ld 182 +ade_ 181 +_S 181 +ed_ 180 +rä 176 +De 175 +_- 175 +kan 174 +ta_ 173 +ng_ 172 +vä 171 +för_ 170 +ill_ 170 +han 170 +_De 170 +pp 169 +lt 169 +sam 168 +nte 167 +ans 167 +ton 166 +ur 165 +mi 165 +ess 165 +kl 164 +ig_ 164 +ks 164 +as_ 163 +und 163 +men 162 +med_ 161 +_med_ 161 +ak 161 +Di 160 +ot 159 +rna 159 +ul 159 +_var 159 +te_ 158 +gen_ 158 +het 157 +kto 157 +str 156 +_Di 155 +tad 155 +lan 154 +ga_ 154 +iska 154 +fa 154 +fi 154 +sÃ¥ 154 +Dikt 153 +Dik 153 +pe 153 +ska_ 152 +ja 152 +H 151 +res 151 +ku 151 +iu 150 +ande_ 150 +till_ 150 +t. 150 +ern 150 +rm 149 +_Dikt 149 +_Dik 149 +ie 149 +bl 148 +-_ 147 +od 147 +_H 147 +n. 147 +ist 147 +_di 146 +ius 146 +_" 145 +la_ 145 +sl 145 +man 145 +ren 145 +_för_ 145 +toni 144 +kton 144 +n._ 144 +ktoni 144 +ikton 144 +I 144 +ikto 144 +nius 143 +ten_ 143 +onius 143 +oniu 143 +toniu 143 +ing_ 143 +Dikto 143 +niu 143 +_ko 143 +ic 142 +_sa 142 +_han 142 +ett_ 142 +sm 141 +ba 141 +M 141 +gr 140 +lä 140 +ex 138 +t._ 138 +sp 137 +lla 137 +_et 137 +_M 137 +dr 137 +rö 136 +rad 136 +ek 136 +_be 135 +tar 135 +_-_ 135 +_om 134 +rl 134 +E 134 +mä 133 diff --git a/libtextcat/data/new_fingerprints/lm/tagalog.lm b/libtextcat/data/new_fingerprints/lm/tagalog.lm new file mode 100644 index 000000000000..bc87d38d0c92 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/tagalog.lm @@ -0,0 +1,400 @@ +_ 10664 +a 6092 +n 3421 +i 2332 +g 2149 +ng 1488 +an 1332 +g_ 1308 +ng_ 1243 +t 1155 +a_ 1138 +o 1137 +l 1105 +s 1067 +k 1035 +_n 957 +m 909 +y 801 +ang 787 +u 767 +ang_ 683 +p 682 +na 670 +_a 634 +la 596 +. 572 +in 561 +r 559 +sa 545 +_s 502 +._ 487 +ma 476 +ka 473 +_na 464 +b 462 +_m 458 +al 448 +d 445 +pa 414 +n_ 412 +at 394 +ak 393 +h 378 +ya 358 +_k 356 +ag 350 +ni 344 +_p 339 +_sa 335 +o_ 328 +ta 308 +_an 307 +iy 300 +sa_ 296 +_ang 294 +_ang_ 293 +t_ 290 +ay 286 +ga 283 +on 280 +it 275 +_sa_ 265 +ala 258 +_ng 256 +am 252 +_i 248 +_pa 248 +i_ 243 +na_ 243 +an_ 240 +e 239 +_ma 237 +_ka 235 +iya 231 +y_ 229 +il 228 +li 228 +w 226 +_ni 225 +_na_ 224 +_ng_ 220 +as 214 +ba 207 +si 206 +" 195 +ti 195 +ha 186 +, 179 +ar 178 +ing 173 +ra 173 +A 172 +ki 168 +ong 167 +_b 167 +ap 166 +,_ 162 +ong_ 161 +ko 159 +ay_ 159 +un 157 +ul 155 +yo 154 +to 152 +_l 150 +ah 148 +is 147 +hi 147 +_t 146 +lan 145 +ama 142 +niy 139 +at_ 138 +_niy 138 +aka 136 +wa 136 +niya 135 +_niya 135 +ab 134 +- 134 +di 133 +_si 132 +"_ 131 +aw 129 +_d 126 +_A 126 +yan 123 +ya_ 122 +ata 120 +a. 120 +gi 120 +P 120 +ing_ 118 +um 115 +o. 113 +aki 113 +ri 113 +ik 112 +nd 112 +ila 111 +mo 110 +da 110 +a._ 110 +in_ 109 +la_ 107 +ali 106 +S 106 +man 105 +ig 105 +iya_ 105 +s_ 104 +_ak 104 +_at 103 +_h 102 +yon 102 +asa 101 +ina 101 +_P 101 +n. 99 +N 98 +aa 98 +ga_ 97 +_mo 97 +_ba 97 +_" 95 +ito 94 +bi 94 +yang 94 +n._ 93 +pag 92 +lang 92 +yang_ 92 +_la 92 +o._ 90 +K 90 +_at_ 90 +tu 88 +_S 88 +ara 87 +nga 87 +ro 85 +apa 83 +rr 82 +lam 82 +lo 81 +nan 81 +_N 80 +aman 79 +aha 78 +mg 78 +mga 78 +mga_ 78 +_mga_ 78 +_mg 78 +_mga 78 +_K 78 +siy 77 +kan 76 +it_ 76 +san 76 +d_ 75 +ad 75 +di_ 74 +tin 74 +' 74 +ati 73 +siya 73 +kin 72 +M 72 +lang_ 71 +mo_ 70 +_mo_ 70 +ako 70 +uma 70 +_pag 69 +pi 69 +l_ 68 +_siy 68 +rrr 68 +_siya 67 +ula 67 +_M 66 +Pa 66 +iyo 66 +mi 66 +bu 66 +mu 65 +no 65 +pu 65 +nag 65 +ung 65 +Na 65 +ot 64 +_Na 64 +niya_ 64 +iyan 64 +ku 64 +k_ 63 +go 62 +awa 62 +ip 61 +_Pa 61 +lu 61 +_di 60 +pan 60 +_ta 60 +ini 60 +isa 60 +nt 60 +iyang 60 +_iyo 59 +_iy 59 +on_ 59 +tan 59 +mang 59 +aba 59 +gan 59 +ut 58 +I 58 +hin 58 +nak 58 +an. 57 +akin 57 +_r 57 +han 57 +Ka 57 +_ay 57 +_ako 56 +may 56 +iyon 56 +rrrr 55 +Sa 55 +aga 55 +to_ 55 +nit 55 +_ko 54 +er 54 +ib 54 +ari 54 +ana 54 +ili 54 +an._ 54 +ahi 54 +au 54 +ala_ 54 +gk 53 +pa_ 53 +_is 53 +rin 53 +ilan 52 +_kan 52 +_Ka 52 +_it 52 +_Sa 51 +king 51 +ko_ 51 +_nak 51 +gin 51 +_ay_ 50 +bo 50 +_iyon 50 +amang 50 +'y 49 +os 49 +mang_ 49 +_pa_ 49 +kat 49 +a, 49 +An 48 +Ma 48 +ny 48 +mag 48 +_ku 48 +_ito 48 +_ha 47 +yong 47 +? 47 +aking 47 +T 47 +ni_ 47 +yong_ 47 +_An 47 +king_ 47 +_akin 46 +sang 46 +_nag 46 +kas 46 +_aki 46 +_ni_ 46 +ayo 45 +kit 45 +'y_ 45 +mat 45 +_Ma 45 +lal 45 +ot_ 45 +nya 44 +ban 44 +ndi 44 +oo 44 +_u 44 +ngi 44 +_hi 44 +sang_ 44 +B 43 +su 43 +may_ 43 +rrrrr 43 +p_ 43 +ita 43 +wal 43 +ika 43 +abi 43 +aan 43 +_may 43 +lama 42 +naka 42 +mal 42 +_I 42 +_ri 42 +alan 42 +any 42 +im 42 +_pu 42 +ai 42 +wala 41 +anya 41 +a,_ 41 +_B 41 +ndi_ 41 +as_ 41 +pat 41 +po 41 +nang 41 +_mag 41 +laman 40 +lala 40 +kal 40 +g- 40 +.. 40 +ir 40 +! 40 +uk 40 +gu 39 +ito_ 39 +ro_ 39 +_g 39 +_da 39 +_isa 39 +_lam 39 +ilang 39 +kanya 39 +w_ 39 +kany 39 +agk 38 +pal 38 +ka_ 38 +_naka 38 +siya_ 38 +isan 38 diff --git a/libtextcat/data/new_fingerprints/lm/tamil.lm b/libtextcat/data/new_fingerprints/lm/tamil.lm new file mode 100644 index 000000000000..8563707d5e9d --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/tamil.lm @@ -0,0 +1,400 @@ +_ 11468 +Õ 2697 +Ô 2533 +´ 1960 +» 1786 +½ 1249 + 1103 +Ù 990 +£ 976 +Ø 849 +¡ 847 +¨ 831 +¿ 821 +À 794 +¾ 758 +§ 750 +à 721 +¹ 715 +¶ 713 +. 702 +£_ 671 +× 655 +â 648 +Æ 625 +._ 611 +ç 601 +Ã… 571 +¯ 557 +»Õ 556 +É 524 +_É 520 +¢ 518 +_½ 492 +Ç 489 +_× 488 +Þ 484 +_Ç 482 +¤ 461 +Ä 442 +¼ 418 +ÿ 416 +Â¥ 403 +§_ 398 +½Ô 373 +_Ø 368 +ª 360 +à 359 +_à 349 +»Ô 348 +´Õ 342 +à 337 +_ 331 +õ 331 +¨_ 323 +_¼ 321 +¡´ 320 +_´ 310 +Õ¿ 308 +Ãœ 299 +¢» 294 +Ó 292 +´Ô 292 +ÿ» 290 +Õ§ 288 +¡Þ 284 +â_ 279 +ý 271 +õ´ 266 +ÂÕ 265 +_» 265 +¾Ô 241 +_¾ 234 +´ª 232 += 231 +± 229 +¤½ 227 +== 221 +¶Õ 218 +»_ 218 +Õ§_ 213 +=== 211 +Õ_ 210 +==== 201 +ª_ 199 +´_ 198 +ÀÕ 197 +===== 191 +, 188 +ؽ 188 +¿Õ 187 +º 186 +_¶ 184 +,_ 184 +à 183 +Ô¯ 181 +¿_ 179 +Ô¨ 178 +Ã¥ 172 +´ª_ 171 +ì 169 +Ùà 168 +¢»Õ 165 +¿Ô 165 +Ô_ 165 +ç¡ 164 +È 161 +Þ_ 160 +è 160 +_è 157 +« 156 +_È 156 +§à 152 +Ô´ 152 +Ù» 151 +à 150 +_à 149 +ä 149 +ö 146 +ÕÅ 146 +ؽÔ 145 +¯´ 143 +Ö 142 +׶ 142 +ÃÔ 142 +½Õ 140 +Ã_ 139 +Ô§ 138 +_Éç 138 +Éç 138 +¶Ô 137 +Õ¨ 137 +Æ_ 136 +¢â 133 +_׶ 132 +ÔÆ 128 +¡´Õ 128 +ÀÔ 127 +¥¹ 127 +_Ë 127 +Ë 127 +_ؽ 127 +ÕÀ 125 +öÓ 125 +ÄÕ 124 +ÆÔ 122 +ÅÕ 121 +Þ£ 120 +Õç 119 +¼Ô 119 +×½ 119 +´Ù 119 +_Ã¥ 118 +¯_ 118 +£. 117 +´ÕÅ 116 +»Õ¿ 115 +ÂÔ 115 +_ÂÕ 113 +â. 113 +×» 113 +£._ 112 +¡Þ_ 112 +ÕÙ 111 +Õ¡ 111 +ÙÄ 109 +×´ 109 +é 109 +â._ 108 +_½Ô 107 +Õ´ 106 +¤_ 105 +ÆÕ 104 +' 104 +½¹ 103 +ç_ 102 +¹Ô 102 +Ø´ 101 +¼Õ 100 +Éÿ 100 +_Éÿ 99 +_×» 99 +ÔÀ 98 +Éÿ» 98 +æ 98 +_×½ 97 +Ã…_ 97 +Ô» 97 +_Éÿ» 97 +Ô¨_ 97 +¹_ 97 +ß 96 +±Å 96 +×´Ô 96 +»Ô¨ 95 +¡Þ£ 94 +Õ¿_ 94 +Ø» 94 +ø 94 +Ø 93 +à 93 +_ؽÔ 93 +¾Õ 93 +Õ¨_ 92 +ÿ»_ 91 +ÃÕ 91 +_Éç¡ 90 +Éç¡ 90 +Ãç 90 +ÕÆ 90 +_Ãç 90 +ÿ»Õ 90 +Ù¹ 89 +_´Ô 89 +_Ãç_ 88 +_¼Ô 88 +Ãç_ 88 +ì_ 86 +ç¡´ 86 +ÅÔ 86 +ÿâ 85 +¥à 85 +¯´ª 84 +¨Æ 84 +¨ì 83 +Ô¥ 83 +÷ 83 +_Þ 83 +´Ä 82 +à _ 81 +Ô´_ 81 +¨. 80 +_¾Ô 80 +¿Õ§ 80 +² 80 +Õà 80 +Ä_ 79 +´ÙÄ 79 +_½¹ 79 +¨Å 79 +Ô£ 79 +Ǧ 78 +¡_ 78 +¨._ 77 +_¶Õ 77 +§ÃÔ 77 +_¼Õ 77 +ë 77 +Åâ 76 +Þ£_ 75 +ÔÆ_ 75 +Ù¾ 75 +¯ 75 +Ù 74 +çÿ 73 +ýà 72 +è 72 +¨ì_ 71 +Ô§_ 71 +´ë 71 +¥Ü 71 +§Ù 70 +»Õ§ 70 +§Ùà 70 +£½ 69 +Ù»_ 69 +ªÄ 69 +ç¡Þ 69 +Ó_ 68 +Ôõ 68 +ؾ 67 +_Ø´ 67 +Õ¢ 66 +ÄÔ 66 +»Ô¨_ 66 +à £ 66 +_Ç» 66 +Ç» 66 +Ô¹ 66 +ÃŽ 66 +¿Õ§_ 65 +Ôâ 65 +_»Õ 64 +¯. 64 +¹¢ 63 +Õ¥ 63 +Ô¡ 63 +_×´ 63 +_ÃŽ 63 +Ù´ 62 +´ÄÕ 62 +Õ¿Ô 62 +¯._ 61 +Éÿ»Õ 61 +_½Õ 61 +_×»Ô 61 +×»Ô 61 +Ôõ´ 61 +½¥ 60 +ÿ»Õ¿ 60 +_ä 60 +_Éÿ»Õ 60 +Ô¾ 60 +Ôç 59 +׶Ô 59 +¡Þ£_ 59 +¤ä 59 +_ÇÙ 58 +ÿâ_ 58 +ÙÅ 58 +Ç 58 +ÇÙ 58 +Éÿ»Õ¿ 58 +ºÕ 58 +»À 57 +½Ô¯ 57 +¹£ 57 +ýº 57 +_Ç 57 +æ£ 56 +Ôà 56 +_׶Ô 55 +? 55 +ý¹ 54 +ÃÕ 54 +ÙÆ 54 +»Õç 54 +_×´Ô 54 +´ÕÅâ 53 +»Õ¿_ 53 +_À 53 +ÕÅâ 53 +¡´ÕÅ 53 +ç¡´Õ 52 +ÇÀ 52 +ÙÄ_ 52 +× 51 +Ãœ_ 51 +'_ 51 +_ÇÀ 51 +Ô 51 +èì 51 +Âç 51 +î 51 +â¡ 51 +_èì 51 +À_ 51 +Õ¿Õ 51 +ÙÀ 51 +_ɧ 50 +ɧ 50 +½ý 50 +»Õ§_ 50 +×½Ô 50 +±ÅÕ 50 +»¾ 49 +à £_ 49 +Ôà 49 +õ´ª 49 +´¥ 49 +»Õ_ 49 +»ÕÀ 48 +¶Ô§ 48 +ç£ 48 +Õ¡´ 48 +Õ¤ 48 +ÕØ 48 +À£ 48 +ÀÕ¿ 48 +_Éç¡´ 47 +æ£_ 47 +ê 47 +èì_ 47 +׶ԧ 47 +Éç¡´ 47 +Â_ 47 +_èì_ 47 +?_ 46 +Ø¿ 46 +Ô¿ 46 +_Ø» 46 +¹õ 46 +_Ù 46 +õ´ª_ 46 +è£ 45 +´ÙÄ_ 45 +¢Ù» 45 +¢Ù 45 +»Ù 45 +_×½Ô 44 +½Ü 44 +Ô£_ 44 +ÕÆÕ 44 +´Ø 44 +á 44 +´£ 44 +½Ôâ 44 +ÃÔ£ 44 +Ã. 44 +_׶ԧ 43 +´ë¡ 43 diff --git a/libtextcat/data/new_fingerprints/lm/thai.lm b/libtextcat/data/new_fingerprints/lm/thai.lm new file mode 100644 index 000000000000..e4b65ecdad56 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/thai.lm @@ -0,0 +1,400 @@ +_ 6290 +Ã’ 5252 +à 4377 +¹ 3920 +¡ 3050 +è 2984 +à 2657 +§ 2522 +Ñ 2454 +à 2369 +é 2304 +à 2158 + 1981 +Ç 1908 +Õ 1693 +Ã… 1543 +Ô 1443 +´ 1422 +· 1398 +µ 1398 +» 1301 +Ë 1245 +à 1239 +¤ 1210 +º 1181 +Ê 1081 +¨ 1045 +ä 978 +Òà 951 +¾ 907 +ª 892 +èÒ 829 +á 795 +¡Ò 735 +ã 722 +¡Òà 710 +¢ 691 +Ò 688 +× 668 +ç 595 +. 588 +ç 553 +Ø 543 +ÃÑ 532 +Õè 528 +Ãà 522 +»Ã 522 +·Õ 477 +Ù 452 +·Õè 451 +èà 443 +¹Ò 441 +Ó 422 +Ò§ 419 +éÒ 414 +ì 388 +¹_ 378 +Ãà 367 +Ã’Ã 360 +° 354 +Ѻ 349 +Ã’_ 347 +éà 345 +»Ãà 340 +Çè 337 +ÃÒ 337 +Ãè 336 +ÇèÒ 336 +à » 334 +Ò¹ 333 +Ñé 332 +¡Ã 328 +´é 325 +Ö 322 +_à 320 +ç¹ 314 +À 313 +ÃÃ’ 299 +ѧ 297 +Ò¡ 297 +×à 296 +äà 295 +Ñ° 293 +ÃÑ° 293 +×è 290 +â 289 +äÃè 288 +¸ 285 +ã¹ 280 +¼ 273 +è§ 268 +¹Ò 268 +é¹ 266 +¾Ã 263 +ѹ 263 +³ 263 +áÅ 256 +¹¡ 254 +ËÒ 253 +§¡ 252 +×èà 251 +ÃÕ 251 +ä´ 248 + 248 +ä´é 248 +¨à 248 +»ç¹ 247 +à »ç¹ 247 +ȍ 247 +à »ç 247 +á 245 +ÇÒ 245 +ãË 244 +¶ 244 +ªÒ 243 +µÔ 241 +_á 240 +¡Ñ 238 +È 237 +_¡ 236 +µè 234 +« 227 +éÇ 225 +é§ 221 +É 216 +¡ 214 +ä 213 +Õ 211 +Ñé§ 211 +Ãà 211 +¢à 210 +Ô¹ 209 +èÇ 206 +Õé 202 +Ã…Ã 202 +˹ 201 +¹Ñ 201 +_¹ 200 +µÃ 197 +Ëé 196 +§_ 193 +ù 191 +Ãà 191 +ÀÒ 188 +à188 +ÒÇ 186 +¢Ã§ 184 +¹Õ 184 +ãËé 183 +¤Ç 183 +ÃÕ 181 +§à 180 +¡Å 179 +áÅà 178 +èÒ§ 177 +èÒ_ 176 +Ãä 176 +_¾ 176 +¤ÇÒà 174 +¹µ 174 +ÇÒà 174 +¤ÇÒ 174 +Ñ´ 172 +Ô´ 172 +ÇèÒ_ 171 +¾Ãà 167 +¨Ò 167 +ù 167 +Òµ 167 +¹Õé 167 +¾Ãä 166 +¡ç 166 +¤Ã 165 +¹à 165 +è¹ 163 +ºÒ 162 +¢é 161 +§ã 161 +Õ_ 161 +ì_ 154 +ËÅ 154 +Ã× 154 +éç 151 +¹¡Òà 151 +¹¡Ò 151 +ä» 150 +Ñ¡ 150 +é_ 148 +ÃÒ 147 +µÑ 146 +¹· 146 +ÅÑ 146 +Â_ 146 +à ¾ 145 +Åé 140 +à ¡ 139 +¨Ò¡ 139 +à à 138 +ÃÔ 138 +¾Å 137 +Ã× 136 +·Ñ 135 +¡Ñº 134 +Ò¡ 133 +_à 132 +ºÃ 132 +§ä 132 +Ãà 131 +à · 130 +Åè 129 +ÒµÔ 129 +_· 128 +¡Ô 128 +µÃÕ 128 +ÃѺ 128 +Õ¡ 128 +à Ë 127 +¹à 127 +µé 126 +_¹Ò 126 +ªÒµÔ 126 +Ã…Ã’ 126 +ªÒµ 126 +¹Ç 126 +Ã’Ã 126 +_áÅ 125 +§¹ 124 +§¤ 124 +¡ÒÃà 124 +ÒÃà 124 +Ãà 123 +Ò¤ 122 + 122 +áµ 122 +à Ê 121 +ÇÑ 121 +Ñé¹ 120 +ÃÃà 120 +Ò· 119 +¹µÃ 119 +èÒÇ 118 +áµè 118 +§· 117 +ǹ 117 +ÂÑ 117 +ùµÃ 117 +¹µÃÕ 117 +ùµ 117 +ùµÃÕ 117 +Ãà 116 +Ùé 116 +_¹Ò 116 +à Ã× 116 +°Ã¹ 115 +à à 115 +ÃѰù 115 +Ëà 115 +à ¢ 115 +Ñ°à 115 +Ã. 115 +Ѱùµ 115 +ÃÑ°à 115 +°Ã¹µ 115 +Ѱù 115 +°Ã¹µÃ 115 +°à 115 +_áÅà 113 +éà 113 +Ãè 113 +èà 112 +㨠112 +_Ê 112 +¹Ñé 111 +Ô» 111 +¹Ñé¹ 111 +èà 111 +èä 110 +_à 110 +Âà 110 +_¨ 110 +Ò¨ 109 +»ÃÒ 108 +¹Ò¡ 108 +_Ë 107 +Ñ 107 +éÒ¹ 107 +¨Ñ 106 +§¡Ò 106 +_¤ 106 +§¡Òà 105 +Ç 105 +Ôµ 105 +¹é 105 +Ã’Ã… 103 +´Â 102 +è_ 102 +ÃÃà 102 +à à 102 +¹¹ 102 +ÃÃ’ 101 +â´ 100 +悅 100 +ªè 100 +_ä 99 +ÇÅ 99 +µéç 99 +â´Â 99 +Ã_ 98 +ê 98 +¾Ñ 98 +»ÃÃà 98 +¡Ã 97 +Öè§ 97 +¡_ 97 +Öè 97 +Òª 97 +é¹_ 96 +ÅÔ 96 +Ñ°º 94 +Ò¾ 94 +ÃÑ°º 94 +¼Ù 94 +°º 94 +Âè 93 +¹ä 93 +·Ò 93 +°ºÒ 92 +ÅèÒ 92 +Ã…. 92 +Ñ°ºÒ 92 +¡Ñ¹ 92 +Ñ°ºÒÅ 92 +§Ã 92 +ÃÑ°ºÒ 92 +ºÒÅ 92 +°ºÒÅ 92 +Ö§ 92 +.à 91 +¢éÒ 91 +á 91 +_¾Å 90 +ÃÂè 90 +¾Å. 90 +Ãà 90 +ÃÂèÒ 90 +ÂèÒ 90 +¡Ãà 89 +.Ã. 89 +ÂèÒ§ 89 +Õé_ 89 +ÃÂèÒ§ 89 +儤 89 +ÊØ 89 +抅 88 +ú 88 +_â 88 +Ã….à 87 +èç 87 +Ãê 87 +Ãà 87 +ÂÇ 87 +Åѧ 87 +¾Å.Ã. 87 +Ã….Ã. 87 +¹Õé_ 87 +à ´ 87 +á 87 +¼Ùé 87 +ÇÔ 87 +¾Å.à 87 +»Ãê 87 +Ȅ 87 +' 86 +ÊÔ 86 +Êè 86 +Ãèä 86 +_¾Å. 86 +·Ò§ 86 +Çà 85 +¡à 85 +§ 85 +._ 85 +¡ÃÑ 85 +ÃÀ 85 +ªÇ 84 +Ã’Ã 84 +¹éÒ 84 +¡ÃÑ° 84 +ÀÔ 84 +·Ó 84 +ËÒà 84 +_¾Å.à 84 +Ãà 83 +§Ê 83 +ÃÀÔ 83 +äÃèä 83 +ѵ 82 +»ÃÒ 82 +¤¹ 82 +Ã_ 82 diff --git a/libtextcat/data/new_fingerprints/lm/turkish.lm b/libtextcat/data/new_fingerprints/lm/turkish.lm new file mode 100644 index 000000000000..553be45fd735 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/turkish.lm @@ -0,0 +1,400 @@ +_ 23226 +i 8957 +a 7675 +e 6219 +n 5169 +& 4950 +; 4950 +l 4674 +r 4464 +&i 3206 +&i; 3206 +i; 3206 +s 3206 +k 3009 +d 2977 +t 2442 +m 2010 +y 2001 +u 1885 +n_ 1725 +g 1584 +o 1567 +b 1470 +e_ 1426 +ü 1353 +ar 1273 +la 1236 +a_ 1209 +i_ 1169 +in 1124 +_b 1101 +an 1097 +er 1073 +le 1058 +s; 1036 +&s; 1036 +&s 1036 +;_ 1018 +de 936 +;n 918 +&i;n 917 +i;n 917 +,_ 872 +_, 872 +_,_ 872 +, 872 +i;_ 863 +&i;_ 863 +_k 816 +en 769 +r_ 762 +_y 759 +da 759 +il 759 +k_ 750 +z 739 +nd 725 +&g; 708 +g; 708 +&g 708 +ra 697 +_a 692 +_d 692 +_s 676 +' 673 +_i 644 +._ 641 +. 641 +_. 637 +_._ 637 +c 637 +ka 635 +v 632 +;& 624 +h 615 +_g 597 +ri 596 +ç 596 +lar 583 +li 580 +ma 559 +ya 555 +ler 553 +p 547 +re 529 +al 529 +ö 527 +_t 520 +ir 508 +ak 502 +bi 500 +;l 480 +in_ 478 +di 477 +r& 468 +el 453 +et 449 +ek 445 +n& 439 +_o 439 +ol 437 +da_ 434 +n&i 433 +n&i; 433 +ni 429 +ti 428 +de_ 425 +an_ 422 +eri 421 +r&i 417 +r&i; 417 +s& 413 +s&i 413 +s&i; 413 +ar& 412 +me 407 +te 405 +a& 404 +i& 390 +ay 387 +ne 380 +_bi 373 +_ka 368 +ar&i; 367 +ar&i 367 +u_ 363 +as 363 +_e 362 +ta 359 +&i;l 352 +i;l 352 +nda 350 +ki 347 +na 346 +si 343 +_v 337 +;&i; 334 +;&i 334 +ve 334 +ara 333 +en_ 332 +;i 331 +on 328 +un 326 +l&i; 322 +l& 322 +l&i 322 +leri 322 +ba 318 +_m 318 +ik 315 +mi 315 +f 306 +lar& 302 +lar&i 302 +sa 298 +_h 297 +ld 296 +&i;& 290 +i;& 290 +_ve 288 +l_ 287 +ge 286 +is 285 +ed 285 +i&s; 284 +i&s 284 +;r 282 +_ya 279 +_ol 279 +d&i; 278 +d& 278 +d&i 278 +nl 277 +kl 275 +;k 274 +&i;n_ 271 +;n_ 271 +i;n_ 271 +ile 270 +or 269 +iy 267 +a&s 264 +a&s; 264 +y&i; 262 +ad 262 +y& 262 +y&i 262 +ye 259 +ha 258 +es 258 +t& 257 +t&i 257 +t&i; 257 +ini 253 +;nd 253 +i;nd 253 +ür 253 +&i;nd 253 +se 248 +_ge 248 +i;nda 248 +;nda 248 +;n&i; 247 +i;n& 247 +;n& 247 +&i;n& 247 +i;n&i 247 +;n&i 247 +bu 245 +_' 245 +_ba 244 +as&i 242 +as&i; 242 +_de 242 +as& 242 +at 240 +am 240 +nda_ 239 +ar_ 231 +ve_ 230 +rin 230 +_ve_ 228 +_bu 227 +im 227 +&i;r 226 +i;r 226 +ur 221 +g;&i 220 +g;& 220 +&g;& 220 +yo 220 +&g;&i 220 +g;&i; 220 +ul 215 +ak_ 215 +ke 213 +nu 213 +erin 211 +g;i 208 +&g;i 208 +lan 207 +bir 205 +r&i;n 205 +nde 202 +rl 202 +n&i;_ 201 +ko 201 +ca 200 +m_ 197 +rd 196 +t_ 194 +er_ 194 +st 193 +em 193 +_sa 190 +lm 189 +rt 188 +_ü 187 +i;k 187 +ün 187 +ola 187 +&i;k 187 +lerin 185 +ce 185 +'_ 185 +;m 183 +az 183 +rk 182 +yü 182 +;la 181 +_bir 181 +ir_ 180 +n&i;n 180 +ru 180 +lu 180 +;nda_ 178 +e& 177 +_ç 176 +_ha 175 +_ko 173 +esi 171 +_ö 170 +ap 170 +ni_ 168 +tü 167 +den 164 +ind 161 +di_ 161 +be 161 +s&i;n 160 +nin 159 +üz 158 +ri_ 155 +y&i;l 155 +_p 154 +nin_ 153 +&s;_ 152 +_y& 152 +edi 152 +s;_ 152 +_y&i; 152 +_y&i 152 +yl 151 +le_ 151 +inde 150 +eti 150 +ala 150 +&i;&s 149 +ele 149 +i;&s; 149 +;&s; 149 +;&s 149 +i;&s 149 +ek_ 148 +ere 148 +çi 147 +du 145 +ön 145 +z_ 144 +na_ 144 +eri_ 143 +ec 142 +gö 142 +i;&g; 141 +s&i;_ 141 +bir_ 141 +&i;&g 141 +i;&g 141 +ah 141 +;&g; 141 +;&g 141 +_gö 140 +lar_ 140 +eli 140 +a&g; 140 +a&g 140 +dan 140 +ac 140 +iç 140 +an& 140 +u& 139 +;&g;& 138 +_yü 138 +an&i 138 +an&i; 138 +pa 138 +it 137 +_ola 137 +_bir_ 136 +;t 135 +ör 135 +ne_ 135 +ini_ 134 +lma 134 +kan 133 +ab 132 +to 131 +ba& 131 +kar 130 +r&i;_ 130 +_ar 129 +ili 129 +li_ 129 +ki_ 128 +bu_ 127 +anl 127 +dü 127 +ler_ 126 +_ba& 126 +kon 126 +ll 125 +tl 125 +ine 125 +e&g; 124 +e&g 124 +_il 124 +_bu_ 124 +re_ 124 +bil 123 +&s;i 123 +;&i;n 123 +s;i 123 +ede 123 +zd 123 +'' 122 +_da 122 +_'' 122 +_tü 122 +ret 122 +_-_ 121 +_''_ 121 +mas 121 +''_ 121 +- 121 +dan_ 121 +leri_ 121 +;u 121 +_- 121 +ev 121 diff --git a/libtextcat/data/new_fingerprints/lm/ukrainian.lm b/libtextcat/data/new_fingerprints/lm/ukrainian.lm new file mode 100644 index 000000000000..438bbdabae46 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/ukrainian.lm @@ -0,0 +1,400 @@ +_ +о +и +а +â•• +в +н +Ñ€ +Ñ‚ +е +д +л +к +у +п +Ñ +_п +м +и_ +Ñ +_в +з +â••_ +а_ +б +о_ +е_ +ÑŒ +г +ч +_Ñ‚ +ов +_з +_д +у_ +в╕ +. +_Ñ +ли +ро +в_ +на +по +ш +ÑŽ +ти +й +ж +ор +Ñ… +_к +ит +ин +щ +╕д +" +_б +Ñ_ +до +ви +ц +О +ом +ко +_н +пр +â•“ +ра +ни +С +._ +то +од +ка +ло +_по +ри +м_ +ÑŽ_ +ки +_пр +ал +Ð +н╕ +Ñ‚ÑŒ +_м +ен +ол +ер +, +го +ÑŒ_ +ою +"_ +_до +ар +за +й_ +на_ +_в_ +во +ил +бу +та +â•– +_щ +ав +ки_ +_â•• +що +И +об +да +уд +д╕ +мо +_бу +_â••_ +Ñ‚ÑŒ_ +Т +ог +Ð +_за +,_ +╕в +_ц +Ð +╕н +ою_ +╕л +ÑÑ‚ +п╕ +де +ат +Ð¾Ñ +те +ну +не +_що +_в╕ +в╕д +_о +дн +ти_ +ла +а╓ +ли_ +ого +Ñ‚â•• +он +П +о╖ +хо +ик +_ч +ле +_Ñ€ +â•–_ +л╕ +ц╕ +_П +ом_ +що_ +но +р╕ +ду +ить +_п╕ +ьк +н╕_ +_" +же +з_ +_не +ÑÑ +аж +Я +З +Ð’ +_г +â•“_ +Ñ…_ +_ви +Ð¸Ñ +_то +оро +ва +нн +_л +ов╕ +_що_ +ди +про +_мо +ль +му +ем +н_ +ий_ +_Ñ‚â•• +ати +Я_ +ще +_про +К +оди +оло +рт +ак +ить_ +ад +о╖_ +ив +Ð»Ñ +ий +_Ñ +_Ð +го_ +до_ +_З +_ка +п╕д +Ð½Ñ +_ко +_на +че +чи +_Ñо +_ÑÑ‚ +а╓_ +_з_ +же_ +при +ÑÑ_ +ови +б╕ +ка_ +╕й +ого_ +пе +би +╕ль +â••Ñ‚ +к_ +_буд +ма +Ñо +Ñи +буд +Ñв +пов +оз +ок +Л +_при +Г +Д +оч +тор +ур +га +уде +аз +ел +ан +их_ +╕льк +ити +ен╕ +к╕ +ому +их +ен╕_ +льк +_до_ +_Ñ… +ве +ОС +му_ +_п╕д +не_ +Ð½Ð½Ñ +зн +буде +ча +ому_ +али +â••Ñ +ц╕_ +ин_ +_буде +â••Ñ€ +â••Ñ +ннÑ_ +д_ +! +чен +Ñ„ +Й +ов╕д +_пра +дов +льки +ув +ру +ре +гр +_пер +_не_ +╕да +тер +рон +Й_ +енн +рн +пер +им +ши +╕льки +ла_ +льки_ +шо +ба +_й +." +в╕р +_т╕ль +ход +ьки +_Ñоб +иш +дем +Ñто +_С +_К +ви_ +арти +_Ñто +_Ð’ +СТ +Ð²Ñ +нк +вч +Ð²Ñ +дно +_л╕ +т╕л +!_ +_ки +_у +╕й_ +ван +ьки_ +нÑ_ +т╕ль +чо +рти +бит +ÐµÐ½Ð½Ñ +ину +_Ñв +вин +д╕_ +пра +ну_ +каж +_па +_пе +_за_ +удем +будем +т╕льк +арт +кою +ьо +па +I +зна +але +щен +."_ +аже +пов╕ +за_ +_ро +_гр +ми +_т╕л +Ñоб +РО +_в╕д +карт +каже +* +ЗР+Ч +_Ñтор +╕нк +ож +Б diff --git a/libtextcat/data/new_fingerprints/lm/vietnamese.lm b/libtextcat/data/new_fingerprints/lm/vietnamese.lm new file mode 100644 index 000000000000..14221268dc1e --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/vietnamese.lm @@ -0,0 +1,400 @@ +_ 88044 +n 17000 +h 12823 +t 9071 +i 8490 +c 8394 +g 8035 +ng 6718 +_t 6352 +_c 5234 +a 5083 +g_ 4883 +ng_ 4882 +_n 4379 +n_ 4365 +i_ 4365 +u 4149 +m 3648 +à 3635 +nh 3480 +o 3451 +Ç 3193 +_Ç 3168 +r 3011 +l 2692 +Ãœ 2659 +, 2328 +,_ 2295 +c_ 2279 +_l 2263 +ch 2226 +v 2161 +th 2158 +a_ 2132 +_th 2113 +y 2111 +. 2107 +t_ 2106 +h_ 2074 +_v 2057 +_h 1888 +_m 1834 +_ch 1813 +s 1749 +nh_ 1715 +u_ 1679 +á 1668 +à _ 1649 +tr 1611 +b 1589 +_tr 1581 +_nh 1541 +_b 1530 +m_ 1505 +p 1483 +._ 1455 +k 1429 +_s 1415 +o_ 1380 +y_ 1371 +_k 1367 +_ng 1348 +© 1343 +e 1296 +ó 1208 +‰ 1199 +ô 1181 +¶ 1139 +Ã¥ 1132 +T 1126 +hi 1121 +â 1081 +ì 1058 +ê 1055 +d 1040 +_g 1039 +kh 1034 +_kh 1027 +_T 994 +©i 967 +là 943 +_là 941 +" 926 +ä 916 +Ü© 901 +an 893 +_d 888 +Ó 877 +©i_ 859 +on 853 +à 848 +N 846 +Š 820 +ó_ 813 +ÂŒ 801 +ôn 795 +ph 776 +_p 772 +_ph 752 +û 752 +§ 734 +hÃœ 728 +ho 715 +và 709 +_và 703 +gÃœ 702 +® 700 +_r 696 +H 693 +_là _ 691 +là _ 691 +Ü©i 684 +à n 679 +ên 677 +ông 676 +ha 672 +gi 663 +C 656 +_gi 655 +Ö 654 +gÜ©i 646 +gÜ© 646 +Ü©i_ 624 +ông_ 610 +ngÃœ 610 +_ngÃœ 608 +ác 606 +ú 601 +ngÜ© 600 +ngÜ©i 600 +_ngÜ© 598 +gÜ©i_ 594 +ûa 587 +cû 586 +_cû 585 +¶t 585 +cûa 585 +ên_ 585 +_cûa 584 +có 583 +ûa_ 582 +_có 581 +cûa_ 581 +ã 581 +_cûa_ 580 +¶t_ 578 +_N 574 +có_ 574 +» 573 +_có_ 572 +iÂŒ 568 +À 562 +‹ 562 +m¶ 557 +_C 553 +_m¶ 551 +p_ 540 +Æ 540 +m¶t 538 +m¶t_ 534 +_m¶t 532 +ìn 528 +_m¶t_ 528 +ti 526 +i‰ 525 +Ù 517 +ÃŽ 512 +ình 500 +.. 500 +† 497 +và _ 497 +_và _ 496 +æ 491 +q 490 +qu 490 +_H 487 +_q 484 +_qu 484 +ong 481 +ong_ 471 +há 471 +x 470 +hô 468 +¿ 466 +_" 460 +Ü® 456 +ro 453 +ình_ 445 +ì_ 434 +_x 434 +ã_ 427 +à 423 +_ti 423 +in 422 +ân 421 +"_ 418 +iŠ 415 +Ön 413 +ron 412 +V 411 +rong 410 +§i 410 +rong_ 409 +ác_ 405 +ª 400 +‰n 399 +hôn 398 +Ã¥i 395 +ay 390 +_V 387 +h» 382 +hà 380 +à y 377 +Ãt 376 +uy 374 +ÇÜ 374 +_ÇÜ 373 +§i_ 372 +cá 371 +_cá 367 +nà 366 +‹_ 366 +hú 366 +_nà 365 +ra 363 +hông 362 +ho_ 359 +ân_ 356 +án 356 +° 355 +Ã¥i_ 354 +ai 352 +hu 352 +cho 352 +Ü®c 351 +®c 351 +Ón 351 +_cho 349 +tro 347 +‰t 347 +à o 347 +_tro 346 +_tron 346 +ñ 346 +tron 346 +trong 346 +M 345 +khô 340 +âu 338 +_khô 338 +cho_ 337 +_cho_ 336 +hông_ 336 +ay_ 333 +ch_ 332 +Çã 331 +Ãt_ 331 +( 331 +_( 330 +Ã’ 329 +_Çã 329 +ÇÜ® 328 +_ÇÜ®c 328 +) 328 +ÇÜ®c 328 +_ÇÜ® 328 +khôn 324 +_ñ 324 +_khôn 322 +Çã_ 322 +_Çã_ 320 +ò 318 +Ã¥n 318 +không 316 +à o_ 316 +Ü®c_ 316 +®c_ 316 +nhÃœ 315 +Ãœ_ 315 +»ng 313 +»ng_ 313 +»n 313 +_nhÃœ 313 +Th 312 +hì 311 +Û 310 +h»n 310 +h»ng_ 310 +h»ng 310 +iê 309 +gh 309 +Šu 307 +ta 307 +anh 307 +¡ 307 +ai_ 306 +à ng 306 +à y_ 304 +ÇÜ®c_ 298 +ÜÖ 295 +S 295 +: 294 +‰t_ 294 +e_ 294 +:_ 290 +ÃŽ_ 289 +ua 288 +æn 288 +à i 286 +Šu_ 285 +_nh» 285 +nh» 285 +nh»ng 285 +nh»n 285 +_nh»n 285 +_Th 285 +‰n_ 284 +ÂŒn 283 +Ø 281 +_M 281 +A 281 +úc 278 +L 277 +ø 277 +ÜÖn 276 +_ha 276 +n, 275 +Öng 275 +ÜÖng 275 +an_ 272 +à i_ 271 +iŠu 269 +sÓ 269 +n,_ 268 +¿_ 268 +on_ 267 +_sÓ 267 +các 265 +à ng_ 265 +_các 264 +anh_ 264 +ngh 264 +_ta 263 +hi_ 262 +hà n 261 +Š_ 261 +âu_ 261 +Àn 260 +ù 260 +_ngh 257 +ia 255 +¢ 252 +... 252 +êu 251 +Ùc 251 +i, 249 +iŠu_ 248 +nhi 247 +B 246 +i,_ 245 +Óng 244 +ª_ 244 +co 244 +_nhi 244 +Â¥ 244 +ܧ 244 +iên 243 +D 243 +Tr 241 +_S 240 +hÃœ_ 239 +òn 237 +hà 236 +hÆ 233 +K 233 +Öng_ 232 +ôi 232 +ÜÖng_ 232 +Àn_ 231 +_co 231 +ÂŒn_ 229 +hÃ¥ 229 +äi 229 +yê 229 +Ûn 229 +¢n 228 +Ûng 228 +_L 227 +Ûng_ 226 +inh 226 +Çi 225 +mà 225 +_Çi 225 +ng, 224 +ang 224 +P 224 +Šn 224 +g, 224 +g,_ 223 +_v§ 223 diff --git a/libtextcat/data/new_fingerprints/lm/welsh.lm b/libtextcat/data/new_fingerprints/lm/welsh.lm new file mode 100644 index 000000000000..c25d4a410bef --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/welsh.lm @@ -0,0 +1,400 @@ +_ 78614 +a 18113 +d 17890 +y 16837 +e 14108 +n 13902 +r 13869 +i 12928 +o 9972 +l 9188 +h 7454 +g 7165 +w 7010 +t 6296 +f 6080 +u 5400 +n_ 5159 +_y 5051 +dd 4946 +s 4884 +r_ 4547 +m 4364 +d_ 4223 +yn 4082 +_a 4020 +c 3707 +th 3301 +u_ 2900 +yd 2900 +ae 2649 +l_ 2588 +_g 2547 +. 2460 +' 2458 +._ 2408 +wy 2334 +yn_ 2331 +_d 2307 +b 2267 +ym 2262 +ra 2261 +an 2235 +ar 2228 +ol 2218 +ia 2212 +i_ 2199 +ed 2191 +_yn 2038 +_i 2020 +y_ 1983 +_c 1934 +_yn_ 1861 +ei 1815 +it 1796 +g_ 1787 +ith 1779 +dd_ 1745 +h_ 1741 +ydd 1731 +ai 1709 +au 1708 +'r_ 1655 +'r 1655 +_y_ 1621 +di 1615 +, 1613 +,_ 1608 +da 1585 +od 1546 +_o 1535 +ad 1532 +th_ 1510 +ll 1494 +au_ 1492 +_s 1415 +er 1359 +o_ 1350 +io 1311 +a_ 1299 +ol_ 1294 +_e 1273 +_h 1249 +cy 1246 +yr 1243 +en 1228 +p 1217 +_f 1212 +_cy 1203 +mr 1200 +gy 1193 +ymr 1185 +ha 1172 +hy 1164 +eg 1159 +do 1151 +edd 1147 +G 1134 +we 1107 +C 1089 +_ar 1078 +_m 1075 +yf 1057 +de 1048 +fe 1047 +_G 1045 +ch 1043 +rae 1023 +el 1016 +no 1010 +ni 1010 +_b 1009 +ne 1001 +ef 992 +ri 983 +et 978 +_C 957 +_gy 952 +_a_ 943 +eth 943 +_. 940 +_._ 939 +eit 930 +eith 927 +ys 921 +wyd 912 +ga 901 +_i_ 893 +s_ 887 +mrae 878 +mraeg 878 +raeg 878 +ymrae 878 +aeg 878 +ymra 878 +mra 878 +_p 856 +aet 850 +aeth 850 +_n 844 +un 838 +on 836 +ait 833 +aith 833 +nn 830 +at 827 +oe 823 +li 805 +_r 801 +ddi 795 +gw 789 +ma 783 +le 777 +nt 772 +ho 769 +ff 766 +yr_ 764 +na 753 +la 748 +rh 747 +eg_ 734 +si 733 +ng 732 +dol 732 +ro 718 +al 712 +_dd 706 +wn 705 +oed 705 +Gy 704 +dy 701 +Cy 696 +o' 692 +ar_ 691 +ny 690 +Gym 678 +wr 677 +id 676 +_Gy 672 +_o_ 668 +Gymr 667 +if 662 +ith_ 662 +_ar_ 660 +iad 657 +_w 657 +fo 656 +eu 655 +aeg_ 650 +raeg_ 650 +aith_ 648 +or 648 +_Gym 646 +fa 642 +re 638 +_Cy 638 +_Gymr 635 +_gw 633 +fy 633 +oedd 633 +edd_ 629 +rd 627 +od_ 622 +ac 619 +ddo 612 +an_ 607 +Gymra 606 +er_ 605 +A 604 +eth_ 601 +hw 596 +ydd_ 591 +o'r 586 +o'r_ 586 +es 583 +ir 579 +dw 573 +go 559 +yl 548 +rw 545 +aeth_ 545 +wydd 543 +aw 539 +_rh 539 +dr 537 +ly 537 +fn 534 +dau 533 +_hy 531 +t_ 531 +sg 529 +'n 529 +* 528 +'n_ 523 +_* 522 +nyd 521 +nydd 521 +M 519 +st 518 +Y 516 +sy 515 +yd_ 513 +lw 512 +_ga 508 +iai 503 +il 502 +_l 499 +rt 494 +ad_ 493 +_yr 493 +_yr_ 492 +as 492 +dol_ 492 +f_ 491 +dda 491 +ig 490 +og 484 +wa 479 +he 478 +iaith 477 +iait 477 +oedd_ 475 +_ma 473 +c_ 472 +Cym 470 +te 469 +_ym 467 +am 467 +_M 465 +_ia 462 +efn 462 +i' 460 +ie 458 +_Cym 458 +_ac 457 +dau_ 456 +yw 455 +ew 453 +fr 441 +fod 441 +_A 441 +du 437 +_sy 434 +e_ 432 +wi 426 +Cymr 426 +se 425 +B 424 +D 424 +_Cymr 423 +bl 423 +lu 420 +in 417 +_t 417 +tr 414 +ac_ 413 +wed 410 +os 410 +_iai 407 +_iait 407 +el_ 405 +_ac_ 405 +rha 404 +m_ 404 +is 403 +on_ 401 +eu_ 393 +hi 393 +rdd 393 +id_ 389 +_Y 388 +ry 387 +odd 387 +rwy 387 +rf 386 +io_ 380 +ynn 380 +cyf 380 +hr 380 +_cyf 379 +yddi 379 +cyn 372 +_de 372 +rth 371 +ru 368 +S 363 +wei 363 +ysg 362 +_B 362 +u' 361 +yddo 360 +wn_ 360 +so 359 +dio 359 +_ei 358 +N 356 +dwy 355 +_da 353 +me 353 +gan 353 +gyf 353 +w_ 352 +_o' 351 +fer 349 +nol 347 +hyn 346 +ddy 346 +af 346 +ta 343 +ddol 343 +_fe 340 +nd 340 +mae 338 +_cyn 338 +efnyd 337 +fnydd 337 +fny 337 +efny 337 +fnyd 337 +iad_ 335 +_mae 333 +ion 333 +_ll 330 +def 330 +_gyf 327 +nt_ 326 +i'r_ 326 +- 326 +i'r 326 +weith 325 +weit 325 +defn 325 +defny 325 +bo 324 +hyd 323 +by 322 +_si 321 +ir_ 321 +hau 318 +nod 318 +edi 315 +I 314 +fyd 313 +wyr 313 +ada 311 +ddio 310 +rif 309 +sia 307 +sa 306 +fel 305 +tha 305 +_S 302 +_ne 302 +_u 301 +fod_ 300 +_o'r 300 +yg 300 +_o'r_ 300 +_i' 299 +ge 299 +dia 299 diff --git a/libtextcat/data/new_fingerprints/lm/yiddish_utf.lm b/libtextcat/data/new_fingerprints/lm/yiddish_utf.lm new file mode 100644 index 000000000000..e3386a3c1d86 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/yiddish_utf.lm @@ -0,0 +1,400 @@ +× 29767 +_ 13552 +_× 6516 +Ö 4273 + 3670 +× 3670 +×¢ 3261 +¢ 3261 +¢× 2826 +×¢× 2826 +×Ö 2785 +Ö 2785 +×™ 2565 +™ 2565 +¨ 2082 +ר 2082 +™× 2062 +×™× 2062 +˜ 1857 +ט 1857 +· 1828 +Ö· 1828 +Ÿ 1793 +ן 1793 +_× 1537 +Ÿ_ 1532 +ן_ 1532 +·× 1527 +Ö·× 1527 +×Ö· 1517 +Ö· 1517 +×¨× 1355 +¨× 1355 +œ 1270 +ל 1270 +¸ 1268 +×Ö¸ 1268 +Ö¸ 1268 +Ö¸ 1268 +Ö·× 1240 +×Ö·× 1240 +× 1232 + 1232 +×Ö¸× 1229 +Ö¸× 1229 +¸× 1229 +Ö¸× 1229 +× × 1181 + × 1181 +ו 1116 +• 1116 +“ 1031 +ד 1031 +×˜× 1012 +˜× 1012 +×œ× 1007 +œ× 1007 +×•× 975 +•× 975 +×“× 965 +“× 965 +×¤Ö 929 +¤ 929 +פ 929 +¤Ö 929 +¢×¨ 873 +ער 873 +×× 870 +× 870 +×– 868 +– 868 +¡ 832 +ס 832 +§ 797 +ק 797 +_×× 784 +מ 766 +ž 766 +_×Ö 752 +ž× 747 +×ž× 747 +’ 742 +×’ 742 +© 725 +ש 725 +ט_ 699 +˜_ 699 +×° 691 +° 691 +×°× 674 +°× 674 +² 667 +ײ 667 +’× 664 +×’× 664 +¿ 661 +Ö¿ 661 +×§× 654 +§× 654 +¿× 652 +Ö¿× 652 +×©× 651 +©× 651 +–× 643 +×–× 643 +פֿ 641 +¤Ö¿ 641 +×¤Ö¿× 638 +¤Ö¿× 638 +_×Ö· 638 +ר_ 592 +¨_ 592 +_ד 560 +_×“× 554 +, 551 +,_ 534 +ב 517 +‘ 517 +_×¤Ö 502 +_פ 502 +×‘× 472 +‘× 472 +×¡× 436 +¡× 436 +צ 436 +¦ 436 +×¦× 433 +¦× 433 +_×° 430 +_×°× 429 +” 410 +×” 410 +¢×¨_ 404 +_פֿ 404 +ער_ 404 +_מ 400 +_×ž× 396 +×’×¢ 390 +’×¢ 390 +_×– 390 +_×–× 386 +’×¢× 384 +ען 384 +×’×¢× 384 +¢×Ÿ 384 +×¢×¨× 382 +¢×¨× 382 +. 374 +××™ 372 +×™ 372 +Ö¼ 366 +¼ 366 +”× 363 +×”× 363 +×™× 362 +××™× 362 +_×’ 359 +×± 357 +± 357 +_×’× 356 +ון 349 +•×Ÿ 349 +×™_ 346 +™_ 346 +×¢_ 343 +¢_ 343 +•×Ÿ_ 337 +ון_ 337 +“×¢ 334 +דע 334 +_××™ 331 +ך 330 +š 330 +“×¢× 323 +¼× 323 +×“×¢× 323 +Ö¼× 323 +²× 322 +×²× 322 +ען_ 321 +¢×Ÿ_ 321 +±× 320 +×±× 320 +“×™ 317 +די 317 +²Ö· 311 +ַר 311 +×²Ö 311 +·×¨ 311 +ַר 311 +ײַ 311 +²Ö 311 +לע 310 +œ×¢ 310 +._ 307 +- 305 +·_ 297 +Ö·_ 297 +› 294 +×› 294 + ×¢ 290 +× ×¢ 290 +ס_ 289 +¡_ 289 +פּ 288 +¤Ö¼ 288 +×²Ö·× 287 +²Ö·× 287 +ך_ 283 +š_ 283 +Ö·_ 274 +×Ö·_ 274 +Ö·×¨× 272 +·×¨× 272 +×œ×¢× 261 +œ×¢× 261 +_×”× 260 +_×” 260 +_צ 259 +™×© 257 +יש 257 +_×¦× 256 +×–×™ 254 +–×™ 254 +×¤Ö¼× 252 +¤Ö¼× 252 +מע 248 +ž×¢ 248 +_ק 247 +× ×¢× 247 + ×¢× 247 +טע 245 +˜×¢ 245 +_×§× 245 +×ו 244 +×•× 244 +ו 244 +××•× 244 +¢×œ 238 +על 238 +”× 235 +×”× 235 +×”×Ö 235 +שט 235 +©×˜ 235 +”×Ö 235 +×›× 234 +›× 234 +_×‘× 231 +“ער 231 +_ב 231 +Ö·× 229 +·× 229 +·× × 228 +Ö·× × 228 +_×ו 227 +_×’×¢ 226 + 219 +× 219 +" 218 +ž×¢× 217 +×ž×¢× 217 +°×¢ 217 +×°×¢ 217 +·×œ 216 +ַל 216 +_×–×™ 215 +_× 215 +×¢×œ× 212 +¢×œ× 212 +¨× 211 +×¨× 211 +_× × 211 +×˜×¢× 211 +˜×¢× 211 +×°× 210 +°× 210 +_דע 210 +°×Ö 209 +×°×¢× 209 +×°×Ö 209 +°×¢× 209 +™×©× 208 +×™×©× 208 +™×§ 207 +יק 207 +ר×Ö 206 +¨×Ö 206 +–_ 205 +×–_ 205 +ž×™ 196 +מי 196 +_ש 195 +×ž×™× 195 +ž×™× 195 +ַל 193 +_×©× 191 +Ö¿× 189 +¿× 189 +¤Ö¿× 189 +Ö¿×Ö 188 +¿×• 188 +Ö¿×•× 188 +ֿו 188 +¿×•× 188 +¿×Ö 188 +ון 187 +”×Ö¸ 186 +_×”× 185 +¤Ö¿×• 184 +_×¢ 179 +_די 178 +˜× 176 +×˜× 176 +ט×Ö 175 +˜×Ö 175 +ָס 174 +ָס 174 +יט 174 +™×˜ 174 +¸×¡ 174 +ל_ 173 +œ_ 173 +“×™_ 173 +די_ 173 +×_ 171 +·×œ× 171 +_ 171 +Ö·×œ× 171 +_×¢× 171 +ָט 169 +¸×˜ 169 +ָט 169 +יך 168 +™×š 168 +ָר 166 +–×™× 166 +ָר 166 +¸×¨ 166 +×–×™× 166 +× ×™ 164 +×¢× 164 + ×™ 164 +¢× 164 +¨×™ 163 +רי 163 +יך_ 163 +™×š_ 163 +°×Ö¸ 162 +×¢× × 162 +¢× × 162 +¿×Ö· 160 +¢×˜ 160 +עט 160 +_×™ 158 +¨×™× 157 +™×¨ 157 +×¨×™× 157 +יר 157 +-× 156 +ָס_ 155 +¸×¡_ 155 +œ×™ 154 +_מי 154 +לי 154 +קע 153 +×•× 153 +§×¢ 153 +•× 153 +_×°× 152 + ×™× 152 +× ×™× 152 +™×Ÿ 151 +ין 151 +××± 151 +×± 151 +×™Ö 150 +·×˜ 150 +´ 150 +™Ö 150 +×™Ö´ 150 +™Ö´ 150 +Ö´ 150 +ַט 150 +Ö´× 149 +™Ö´× 149 +´× 149 +×™Ö´× 149 +œ×™× 148 +_×™× 148 +×œ×™× 148 +×±× 146 +××±× 146 +Ö·× 146 +_××± 145 +¿×•×Ÿ 145 +×™×– 143 +™×– 143 +“×™× 142 +×“×™× 142 diff --git a/libtextcat/data/new_fingerprints/lm/zulu.lm b/libtextcat/data/new_fingerprints/lm/zulu.lm new file mode 100644 index 000000000000..f30c09ced93f --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/zulu.lm @@ -0,0 +1,400 @@ +_ +a +e +i +n +u +o +l +k +h +s +a_ +b +t +m +g +w +z +e_ +i_ +ng +ku +d +y +la +an +_n +th +le +_u +o_ +el +ba +_k +en +in +wa +p +_e +zi +. +hi +si +al +ha +uk +ab +_i +r +is +ka +_a +kh +we +li +ni +ma +_ng +he +ul +._ +ga +thi +la_ +be +ak +c +on +nd +na +ok +am +lo +ho +, +se +ph +hi_ +ut +es +nga +_ku +,_ +ez +thi_ +un +uth +le_ +uku +hu +f +u_ +um +ek +ne +go +q +_uk +at +aba +_l +sh +lu +M +_uku +ol +_b +hl +ni_ +ngo +kw +- +N +ik +oku +em +nt +as +ge +az +ya +iz +sa +_o +S +uthi +A +za +_w +wa_ +_s +mb +kut +kuth +ela +ye +_y +uthi_ +il +ay +ele +ba_ +I +dl +nge +ath +ub +ke +U +zo +na_ +yi +us +kuthi +esi +ob +v +om +ama +it +lo_ +bu +L +ezi +j +ny +im +ing +li_ +_ab +eni +no +de +ela_ +ze +ang +ko +ala +lw +yo +zin +_U +lel +eng +mi +_ngo +eb +uz +me +gi +ti +ukut +so +ukuth +bo +da +_ba +nz +_aba +the +eli +akh +eni_ +E +ban +s_ +aka +_kw +ma_ +ap +_ukut +he_ +ini +di +K +ka_ +ib +kwa +ulu +ele_ +kho +nj +bi +_z +khu +we_ +lal +enz +ho_ +et +C +gu +zi_ +and +hla +ngi +pha +_um +_ka +isi +_nge +isa +aph +ung +izi +dla +ala_ +zw +nde +to +n_ +ne_ +nk +ke_ +_I +athi +_no +lan +_wa +kul +B +ind +fu +wen +ikh +azi +ule +kub +e. +_S +x +o. +ona +kha +_iz +je +bh +_M +er +kwe +oba +ane +O +_N +sa_ +a. +lwa +_ez +kus +ki +mu +od +" +ebe +P +_nga +hul +_m +ase +ben +_be +T +ic +nda +_si +_na +/ +ant +ngu +ad +anga +nje +ith +a._ +ye_ +athi_ +R +os +alo +tha +za_ +eth +_es +uma +ana +ile +te +ale +aban +: +_A +oba_ +hat +kun +ha_ +phe +be_ +ali +_am +si_ +wo +uy +sik +ise +kan +hath +dlal +_ne +zwe +aw +han +tu +nye +qe +_ko +ah +hel +thu +isa_ +gob +_K +_lo +ta +_ama +ot +ula +_em +ze_ +i. +ngob +_izi +hol +ar +ani +ole +uba +_in +up +eka +ini_ +goba +tho +hon +_ezi +ona_ +ezin +ngoba +lu_ +goba_ +ip +a, +eli_ +t_ +nya +ndl +sha +_is +the_ +i._ +amb diff --git a/libtextcat/libtextcat-2.2.patch b/libtextcat/libtextcat-2.2.patch new file mode 100644 index 000000000000..ca7a26cabfbf --- /dev/null +++ b/libtextcat/libtextcat-2.2.patch @@ -0,0 +1,4078 @@ +--- misc/libtextcat-2.2/configure Thu May 22 13:39:55 2003 ++++ misc/build/libtextcat-2.2/configure Mon Mar 31 11:29:14 2008 +@@ -3451,7 +3451,7 @@ + ;; + + # This must be Linux ELF. +-linux-gnu*) ++linux-gnu*|k*bsd*-gnu*) + case $host_cpu in + alpha* | hppa* | i*86 | mips | mipsel | powerpc* | sparc* | ia64*) + lt_cv_deplibs_check_method=pass_all ;; +@@ -5391,7 +5391,8 @@ + allow_undefined_flag= + no_undefined_flag= + need_lib_prefix=unknown +-need_version=unknown ++#need_version=unknown ++need_version=no + # when you set need_version to no, make sure it does not cause -set_version + # flags to be left without arguments + archive_cmds= +@@ -5785,7 +5786,7 @@ + # cross-compilation, but unfortunately the echo tests do not + # yet detect zsh echo's removal of \ escapes. Also zsh mangles + # `"' quotes if we put them in here... so don't! +- archive_cmds='$CC -r -keep_private_externs -nostdlib -o ${lib}-master.o $libobjs && $CC $(test .$module = .yes && echo -bundle || echo -dynamiclib) $allow_undefined_flag -o $lib ${lib}-master.o $deplibs$linker_flags $(test .$module != .yes && echo -install_name $rpath/$soname $verstring)' ++ archive_cmds='$CC -r -keep_private_externs -nostdlib -o ${lib}-master.o $libobjs && $CC $(test .$module = .yes && echo -bundle || echo -dynamiclib) $allow_undefined_flag -o $lib ${lib}-master.o $deplibs$compiler_flags $(test .$module != .yes && echo -install_name $rpath/$soname $verstring)' + # We need to add '_' to the symbols in $export_symbols first + #archive_expsym_cmds="$archive_cmds"' && strip -s $export_symbols' + hardcode_direct=yes +@@ -6280,7 +6281,7 @@ + ;; + + freebsd*) +- objformat=`test -x /usr/bin/objformat && /usr/bin/objformat || echo aout` ++ objformat=`test -x /usr/bin/objformat && /usr/bin/objformat || echo elf` + version_type=freebsd-$objformat + case $version_type in + freebsd-elf*) +@@ -6365,7 +6365,7 @@ + ;; + + # This must be Linux ELF. +-linux-gnu*) ++linux-gnu*|k*bsd*-gnu*) + version_type=linux + need_lib_prefix=no + need_version=no +--- misc/libtextcat-2.2/src/Makefile.in Thu May 22 13:39:52 2003 ++++ misc/build/libtextcat-2.2/src/Makefile.in Mon Mar 31 11:29:14 2008 +@@ -124,20 +124,20 @@ + target_vendor = @target_vendor@ + AUTOMAKE_OPTIONS = 1.4 foreign + +-WARNS = -W -Wall -Wshadow -Wpointer-arith +-IFLAGS = +-FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE ++#WARNS = -W -Wall -Wshadow -Wpointer-arith ++IFLAGS = ++#FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE + VERBOSE = -DVERBOSE + AM_CFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS) + AM_LDFLAGS = -g + + noinst_HEADERS = \ +- common.h constants.h fingerprint.h textcat.h wg_mempool.h ++ common.h constants.h fingerprint.h textcat.h wg_mempool.h utf8misc.h + + + lib_LTLIBRARIES = libtextcat.la + libtextcat_la_SOURCES = \ +- common.c fingerprint.c textcat.c wg_mempool.c ++ common.c fingerprint.c textcat.c wg_mempool.c utf8misc.c + + + bin_PROGRAMS = createfp +@@ -156,7 +156,7 @@ + libtextcat_la_LDFLAGS = + libtextcat_la_LIBADD = + am_libtextcat_la_OBJECTS = common.lo fingerprint.lo textcat.lo \ +- wg_mempool.lo ++ wg_mempool.lo utf8misc.lo + libtextcat_la_OBJECTS = $(am_libtextcat_la_OBJECTS) + bin_PROGRAMS = createfp$(EXEEXT) + noinst_PROGRAMS = testtextcat$(EXEEXT) +@@ -177,7 +177,8 @@ + @AMDEP_TRUE@DEP_FILES = ./$(DEPDIR)/common.Plo ./$(DEPDIR)/createfp.Po \ + @AMDEP_TRUE@ ./$(DEPDIR)/fingerprint.Plo \ + @AMDEP_TRUE@ ./$(DEPDIR)/testtextcat.Po ./$(DEPDIR)/textcat.Plo \ +-@AMDEP_TRUE@ ./$(DEPDIR)/wg_mempool.Plo ++@AMDEP_TRUE@ ./$(DEPDIR)/wg_mempool.Plo \ ++@AMDEP_TRUE@ ./$(DEPDIR)/utf8misc.Plo + COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) + LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) \ +@@ -213,7 +214,7 @@ + @rm -f stamp-h1 + cd $(top_builddir) && $(SHELL) ./config.status src/config.h + +-$(srcdir)/config.h.in: $(top_srcdir)/configure.ac $(ACLOCAL_M4) ++$(srcdir)/config.h.in: $(top_srcdir)/configure.ac $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOHEADER) + touch $(srcdir)/config.h.in + +@@ -247,8 +248,8 @@ + echo "rm -f \"$${dir}/so_locations\""; \ + rm -f "$${dir}/so_locations"; \ + done +-libtextcat.la: $(libtextcat_la_OBJECTS) $(libtextcat_la_DEPENDENCIES) +- $(LINK) -rpath $(libdir) $(libtextcat_la_LDFLAGS) $(libtextcat_la_OBJECTS) $(libtextcat_la_LIBADD) $(LIBS) ++libtextcat.la: $(libtextcat_la_OBJECTS) $(libtextcat_la_DEPENDENCIES) ++ $(LINK) -avoid-version -rpath $(libdir) $(libtextcat_la_LDFLAGS) $(libtextcat_la_OBJECTS) $(libtextcat_la_LIBADD) $(LIBS) + binPROGRAMS_INSTALL = $(INSTALL_PROGRAM) + install-binPROGRAMS: $(bin_PROGRAMS) + @$(NORMAL_INSTALL) +@@ -285,10 +286,10 @@ + echo " rm -f $$p $$f"; \ + rm -f $$p $$f ; \ + done +-createfp$(EXEEXT): $(createfp_OBJECTS) $(createfp_DEPENDENCIES) ++createfp$(EXEEXT): $(createfp_OBJECTS) $(createfp_DEPENDENCIES) + @rm -f createfp$(EXEEXT) + $(LINK) $(createfp_LDFLAGS) $(createfp_OBJECTS) $(createfp_LDADD) $(LIBS) +-testtextcat$(EXEEXT): $(testtextcat_OBJECTS) $(testtextcat_DEPENDENCIES) ++testtextcat$(EXEEXT): $(testtextcat_OBJECTS) $(testtextcat_DEPENDENCIES) + @rm -f testtextcat$(EXEEXT) + $(LINK) $(testtextcat_LDFLAGS) $(testtextcat_OBJECTS) $(testtextcat_LDADD) $(LIBS) + +@@ -304,6 +305,7 @@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/testtextcat.Po@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/textcat.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/wg_mempool.Plo@am__quote@ ++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/utf8misc.Plo@am__quote@ + + distclean-depend: + -rm -rf ./$(DEPDIR) +--- misc/libtextcat-2.2/src/common.c Thu May 22 13:32:43 2003 ++++ misc/build/libtextcat-2.2/src/common.c Mon Mar 31 11:29:14 2008 +@@ -3,23 +3,23 @@ + * + * Copyright (c) 2003, WiseGuys Internet B.V. + * All rights reserved. +- * ++ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +- * ++ * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. +- * ++ * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +- * ++ * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. +- * ++ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +@@ -114,11 +114,11 @@ + wgmem_error( "Error while strduping %u bytes.\n", strlen(s) ); + } + +- return( result ); ++ return( result ); + } + +-extern void* wg_realloc( void *ptr, size_t size ) +-{ ++extern void* wg_realloc( void *ptr, size_t size ) ++{ + void *result; + + if (!size) { +@@ -131,7 +131,7 @@ + wgmem_error( "Error while reallocing %u bytes.\n", size ); + } + +- return( result ); ++ return( result ); + } + + extern void wg_free( void *mem ) +@@ -148,12 +148,12 @@ + if ( fgets(line, size, fp) == NULL ) { + return NULL; + } +- ++ + /** kill term null **/ + if ( (p = strpbrk( line, "\n\r" )) ) { + *p = '\0'; +- } +- ++ } ++ + return line; + } + +@@ -164,39 +164,39 @@ + * + * ARGUMENTS: + * - result: +- * ++ * + * After the split, this array contains pointers to the start of each + * detected segment. Must be preallocated and at least as large as + * maxsegments. The pointers point into the dest buffer. +- * +- * - dest: +- * ++ * ++ * - dest: ++ * + * String into which result points as an index. Must be preallocated, and + * at least as big as src. You can use src as dest, but in that case src + * is overwritten! +- * +- * - src: +- * ++ * ++ * - src: ++ * + * The string to split. Sequences of whitespace are treated as separators, unless + * escaped. There are two ways to escape: by using single quotes (anything + * between single quotes is treated as one segment), or by using a backslash + * to escape the next character. The backslash escape works inside quotation + * as well. +- * ++ * + * Example: +- * ++ * + * "It\'s very\ easy 'to use WiseGuys\' wg_split()' function" is split into: +- * ++ * + * "It's" + * "very easy" + * "to use WiseGuys' wg_split()" + * "function" +- * +- * - maxsegments: +- * ++ * ++ * - maxsegments: ++ * + * The maximum number of segments. If the splitter runs out of segments, + * the remainder of the string is stored in the last segment. +- * ++ * + * RETURN VALUE: + * The number of segments found. + */ +@@ -218,12 +218,12 @@ + switch (state) { + case 0: + /*** Skip spaces ***/ +- while ( isspace((int) *p) ) { ++ while ( isspace((unsigned char) *p) ) { + p++; + } + state = 1; + +- case 1: ++ case 1: + /*** Start segment ***/ + result[cnt] = w; + cnt++; +@@ -232,12 +232,12 @@ + case 2: + /*** Unquoted segment ***/ + while (*p) { +- if ( isspace((int) *p) ) { ++ if ( isspace((unsigned char) *p) ) { + *w++ = '\0'; + p++; + state = 0; + break; +- } ++ } + else if ( *p == '\'' ) { + /*** Start quotation ***/ + p++; +@@ -292,17 +292,17 @@ + } + + ++#ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ + extern void wg_timerstart(wgtimer_t *t) + { +-#ifdef HAVE_GETTIMEOFDAY + gettimeofday( &(t->start), NULL ); +-#endif + } ++#endif /* TL : no struct timeval under Win32 */ + + ++#ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ + extern uint4 wg_timerstop(wgtimer_t *t) + { +-#ifdef HAVE_GETTIMEOFDAY + uint4 result; + gettimeofday( &(t->stop), NULL ); + result = (t->stop.tv_sec - t->start.tv_sec) * 1000000 + +@@ -312,25 +312,23 @@ + t->start.tv_usec = t->stop.tv_usec; + + return result; +-#else +- return 0; +-#endif + } ++#endif /* TL : no struct timeval under Win32 */ + + + /** + * wg_strgmov -- a guarded strcpy() variation +- * ++ * + * copies src to dest (including terminating zero), and returns + * pointer to position of terminating zero in dest. The function is + * guaranteed not to write past destlimit. If the copy couldn't be +- * finished, the function returns NULL after restoring the first +- * character in dest for your convenience (since this is usually a zero). ++ * finished, the function returns NULL after restoring the first ++ * character in dest for your convenience (since this is usually a zero). + */ + char *wg_strgmov( char *dest, const char *src, const char *destlimit ) + { + char tmp, *w; +- ++ + if ( !dest || dest >= destlimit ) { + return NULL; + } +@@ -355,7 +353,7 @@ + } + + /* +- * wg_trim() -- remove whitespace surrounding a string. ++ * wg_trim() -- remove whitespace surrounding a string. + * + * Example: " bla bla bla " becomes "bla bla bla" after trimming. + * +@@ -373,12 +371,12 @@ + char *lastnonspace = &dest[-1]; + const char *p = src; + char *w = dest; +- +- while ( isspace((int)*p) ) { ++ ++ while ( isspace((unsigned char)*p) ) { + p++; + } + while (*p) { +- if ( !isspace((int)*p) ) { ++ if ( !isspace((unsigned char)*p) ) { + lastnonspace = w; + } + *w++ = *p++; +--- misc/libtextcat-2.2/src/common.h Thu May 22 15:02:29 2003 ++++ misc/build/libtextcat-2.2/src/common.h Mon Mar 31 11:29:14 2008 +@@ -1,28 +1,28 @@ + #ifndef _COMMON_H_ + #define _COMMON_H_ + /** +- * common.h -- a mixed bag of helper functions ++ * common.h -- a mixed bag of helper functions + * + * Copyright (C) 2003 WiseGuys Internet B.V. + * + * THE BSD LICENSE +- * ++ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +- * ++ * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. +- * ++ * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +- * ++ * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. +- * ++ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +@@ -86,10 +86,12 @@ + typedef char boole; + #endif + ++#ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ + typedef struct wgtimer_s { + struct timeval start; + struct timeval stop; + } wgtimer_t; ++#endif /* TL : no struct timeval under Win32 */ + + + extern void *wg_malloc( size_t size ); +@@ -101,13 +103,15 @@ + + extern char *wg_getline( char *line, int size, FILE *fp ); + ++#ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ + extern void wg_timerstart(wgtimer_t *t); + extern uint4 wg_timerstop(wgtimer_t *t); ++#endif /* TL : no struct timeval under Win32 */ + + extern unsigned int wg_split( char **result, char *dest, char *src, int maxsegments ); + extern char *wg_strgmov( char *dest, const char *src, const char *destlimit ); + extern char *wg_trim( char *dest, const char *src ); + +- ++ + #endif + +--- misc/libtextcat-2.2/src/constants.h Thu May 22 13:32:43 2003 ++++ misc/build/libtextcat-2.2/src/constants.h Mon Mar 31 11:29:14 2008 +@@ -39,6 +39,8 @@ + */ + #include <limits.h> + ++#define _UTF8_ ++ + #define DESCRIPTION "out of place" + + /* Reported matches are those fingerprints with a score less than best +@@ -59,14 +61,21 @@ + /* Maximum number of n-grams in a fingerprint */ + #define MAXNGRAMS 400 + +-/* Maximum size of an n-gram? */ +-#define MAXNGRAMSIZE 5 ++/* Maximum number of character of an n-gram? */ ++#define MAXNGRAMSYMBOL 5 ++ ++/* Maximum size of the string representing an n-gram (must be greater than number of symbol) */ ++#ifdef _UTF8_ ++#define MAXNGRAMSIZE 20 ++#else ++#define MAXNGRAMSIZE MAXNGRAMSYMBOL ++#endif + + /* Which characters are not acceptable in n-grams? */ +-#define INVALID(c) (isspace((int)c) || isdigit((int)c)) ++#define INVALID(c) (isspace((unsigned char)c) || isdigit((unsigned char)c)) + + /* Minimum size (in characters) for accepting a document */ +-#define MINDOCSIZE 25 ++#define MINDOCSIZE 6 + + /* Maximum penalty for missing an n-gram in fingerprint */ + #define MAXOUTOFPLACE 400 +@@ -75,5 +84,8 @@ + #define TABLEPOW 13 + + #define MAXSCORE INT_MAX ++ ++/* where the fingerprints files are stored */ ++#define DEFAULT_FINGERPRINTS_PATH "" + + #endif +--- misc/libtextcat-2.2/src/fingerprint.c Thu May 22 13:32:43 2003 ++++ misc/build/libtextcat-2.2/src/fingerprint.c Mon Mar 31 11:29:14 2008 +@@ -6,23 +6,23 @@ + * All rights reserved. + * + * THE BSD LICENSE +- * ++ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +- * ++ * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. +- * ++ * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +- * ++ * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. +- * ++ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +@@ -51,7 +51,7 @@ + * The reason why we go through the trouble of doing a partial + * (heap)sort is that a full quicksort behaves horribly on the data: + * most n-grams have a very low count, resulting in a data set in +- * nearly-sorted order. This causes quicksort to behave very badly. ++ * nearly-sorted order. This causes quicksort to behave very badly. + * Heapsort, on the other hand, behaves handsomely: worst case is + * Mlog(N) for M n-grams filtered through a N-sized heap. + * +@@ -63,6 +63,10 @@ + * - put table/heap datastructure in a separate file. + */ + ++#ifndef _UTF8_ ++#define _UTF8_ ++#endif ++ + #include "config.h" + #include <stdio.h> + #ifdef HAVE_STDLIB_H +@@ -80,10 +84,12 @@ + #include "wg_mempool.h" + #include "constants.h" + ++#include "utf8misc.h" + + #define TABLESIZE (1<<TABLEPOW) + #define TABLEMASK ((TABLESIZE)-1) + ++ + typedef struct { + + sint2 rank; +@@ -96,7 +102,7 @@ + const char *name; + ngram_t *fprint; + uint4 size; +- ++ + } fp_t; + + typedef struct entry_s { +@@ -105,13 +111,13 @@ + struct entry_s *next; + } entry_t; + +-typedef struct table_s { ++typedef struct table_s { + void *pool; + entry_t **table; + entry_t *heap; + + struct table_s *next; +- ++ + uint4 heapsize; + uint4 size; + } table_t; +@@ -122,7 +128,7 @@ + * fast and furious little hash function + * + * (Note that we could use some kind of rolling checksum, and update it +- * during n-gram construction) ++ * during n-gram construction) + */ + static uint4 simplehash( const char *p, int len ) + { +@@ -134,29 +140,14 @@ + } + + +-/* checks if n-gram lex is a prefix of key and of length len */ +-inline int issame( char *lex, char *key, int len ) +-{ +- int i; +- for (i=0; i<len; i++) { +- if ( key[i] != lex[i] ) { +- return 0; +- } +- } +- if ( lex[i] != 0 ) { +- return 0; +- } +- return 1; +-} +- + + /* increases frequency of ngram(p,len) */ +-static inline int increasefreq( table_t *t, char *p, int len ) +-{ +- uint4 hash = simplehash( p, len ) & TABLEMASK; ++static int increasefreq( table_t *t, char *p, int len ) ++{ ++ uint4 hash = simplehash( p, len ) & TABLEMASK; + entry_t *entry = t->table[ hash ]; +- +- while ( entry ) { ++ ++ while ( entry ) { + if ( issame( entry->str, p, len ) ) { + /*** Found it! ***/ + entry->cnt++; +@@ -168,7 +159,7 @@ + } + + /*** Not found, so create ***/ +- entry = wgmempool_alloc( t->pool, sizeof(entry_t) ); ++ entry = (entry_t*)(wgmempool_alloc( t->pool, sizeof(entry_t) )); + strcpy( entry->str, p ); + entry->cnt = 1; + +@@ -181,12 +172,12 @@ + #if 0 + + /* looks up ngram(p,len) */ +-static entry_t *findfreq( table_t *t, char *p, int len ) +-{ +- uint4 hash = simplehash( p, len ) & TABLEMASK; ++static entry_t *findfreq( table_t *t, char *p, int len ) ++{ ++ uint4 hash = simplehash( p, len ) & TABLEMASK; + entry_t *entry = t->table[ hash ]; +- +- while ( entry ) { ++ ++ while ( entry ) { + if ( issame( entry->str, p, len ) ) { + return entry; + } +@@ -219,7 +210,7 @@ + #define GREATER(x,y) ((x).cnt > (y).cnt) + #define LESS(x,y) ((x).cnt < (y).cnt) + +-inline static void siftup( table_t *t, unsigned int child ) ++static void siftup( table_t *t, unsigned int child ) + { + entry_t *heap = t->heap; + unsigned int parent = (child-1) >> 1; +@@ -241,7 +232,7 @@ + } + + +-inline static void siftdown( table_t *t, unsigned int heapsize, uint4 parent ) ++static void siftdown( table_t *t, unsigned int heapsize, uint4 parent ) + { + entry_t *heap = t->heap; + unsigned int child = parent*2 + 1; +@@ -273,7 +264,7 @@ + if (t->size < t->heapsize) { + memcpy( &(heap[t->size]), item, sizeof(entry_t)); + siftup( t, t->size ); +- t->size++; ++ t->size++; + return 0; + } + +@@ -316,18 +307,18 @@ + + /*** Fill result heap ***/ + for (i=0; i<TABLESIZE; i++) { +- entry_t *p = t->table[i]; ++ entry_t *p = t->table[i]; + while (p) { + heapinsert(t, p); + p = p->next; + } +- } ++ } + return 1; + } + + + static table_t *inittable(uint4 maxngrams) +-{ ++{ + table_t *result = (table_t *)wg_zalloc( sizeof(table_t) ); + result->table = (entry_t **)wg_zalloc( sizeof(entry_t*) * TABLESIZE ); + result->pool = wgmempool_Init( 10000, 10 ); +@@ -347,14 +338,14 @@ + wgmempool_Done(t->pool); + wg_free(t->table); + wg_free(t->heap); +- wg_free(t); ++ wg_free(t); + } + + + extern void *fp_Init(const char *name) + { + fp_t *h = (fp_t *)wg_zalloc( sizeof(fp_t) ); +- ++ + if ( name ) { + h->name = wg_strdup(name); + } +@@ -458,21 +449,27 @@ + return dest; + } + +- ++/** ++* this function extract all n-gram from past buffer and put them into the table "t" ++* [modified] by Jocelyn Merand to accept utf-8 multi-character symbols to be used in OpenOffice ++*/ + static void createngramtable( table_t *t, const char *buf ) + { + char n[MAXNGRAMSIZE+1]; + const char *p = buf; + int i; ++ int pointer = 0; + + /*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/ +- for (;;p++) { ++ while(1) { + +- const char *q = p; ++ const char *q = &p[pointer]; /*[modified] previously p++ above (for(;;p++)) now, it's pointer wich is increased so we have to get the new pointer on the buffer*/ + char *m = n; + + /*** First char may be an underscore ***/ +- *m++ = *q++; ++ int decay = charcopy(q, m); /*[modified] previously *q++ = *m++*/ ++ q = &(p[pointer+decay]); /*[modified] the old copying method do not manage multi-character symbols*/ ++ m += decay; /*[modified]*/ + *m = '\0'; + + increasefreq( t, n, 1 ); +@@ -482,19 +479,22 @@ + } + + /*** Let the compiler unroll this ***/ +- for ( i=2; i<=MAXNGRAMSIZE; i++) { ++ for ( i=2; i<=MAXNGRAMSYMBOL; i++) { + +- *m++ = *q; ++ decay = charcopy(q, m); /*[modified] like above*/ ++ m += decay; + *m = '\0'; + + increasefreq( t, n, i ); + + if ( *q == '_' ) break; +- q++; ++ q += decay; + if ( *q == '\0' ) { + return; + } + } ++ ++ pointer = nextcharstart(p,pointer); /*[modified] p[pointer] must point on the next start of symbol, but whith utf next start is not surely next char*/ + } + return; + } +@@ -514,7 +514,7 @@ + { + ngram_t *x = (ngram_t *)a; + ngram_t *y = (ngram_t *)b; +- ++ + return mystrcmp( x->str, y->str ); + } + +@@ -522,12 +522,12 @@ + { + ngram_t *x = (ngram_t *)a; + ngram_t *y = (ngram_t *)b; +- ++ + return x->rank - y->rank; + } + + /** +- * Create a fingerprint: ++ * Create a fingerprint: + * - record the frequency of each unique n-gram in a hash table + * - take the most frequent n-grams + * - sort them alphabetically, recording their relative rank +@@ -544,20 +544,21 @@ + } + + /*** Throw out all invalid chars ***/ +- tmp = prepbuffer( buffer, bufsize ); ++ tmp = prepbuffer( buffer, bufsize ); ++ /*printf("Cleaned buffer : %s\n",tmp);*/ + if ( tmp == NULL ) { + return 0; + } +- + h = (fp_t*)handle; + t = inittable(maxngrams); ++ /*printf("Table initialized\n");*/ + + /*** Create a hash table containing n-gram counts ***/ + createngramtable(t, tmp); +- ++ /*printf("Table created\n");*/ + /*** Take the top N n-grams and add them to the profile ***/ +- table2heap(t); +- maxngrams = WGMIN( maxngrams, t->size ); ++ table2heap(t); ++ maxngrams = WGMIN( maxngrams, t->size ); + + h->fprint = (ngram_t *)wg_malloc( sizeof(ngram_t) * maxngrams ); + h->size = maxngrams; +@@ -568,7 +569,7 @@ + entry_t tmp2; + + heapextract(t, &tmp2); +- ++ + /*** the string and its rank is all we need ***/ + strcpy( h->fprint[i].str, tmp2.str ); + h->fprint[i].rank = i; +@@ -578,7 +579,7 @@ + wg_free(tmp); + + /*** Sort n-grams alphabetically, for easy comparison ***/ +- qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); ++ qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); + return 1; + } + +@@ -608,7 +609,7 @@ + #endif + return 0; + } +- ++ + h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t)); + + while (cnt < maxngrams && wg_getline(line,1024,fp)) { +@@ -635,7 +636,7 @@ + h->size = cnt; + + /*** Sort n-grams, for easy comparison later on ***/ +- qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); ++ qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); + + fclose(fp); + +@@ -648,14 +649,15 @@ + { + uint4 i; + fp_t *h = (fp_t *)handle; +- ngram_t *tmp = wg_malloc( sizeof(ngram_t) * h->size ); +- ++ ngram_t *tmp = (ngram_t*)wg_malloc( sizeof(ngram_t) * h->size ); ++ + /*** Make a temporary and sort it on rank ***/ + memcpy( tmp, h->fprint, h->size * sizeof(ngram_t) ); +- qsort( tmp, h->size, sizeof(ngram_t), ngramcmp_rank ); ++ qsort( tmp, h->size, sizeof(ngram_t), ngramcmp_rank ); + + for (i=0; i<h->size; i++) { +- fprintf( fp, "%s\n", tmp[i].str ); ++ /*fprintf( fp, "%s\t%i\n", tmp[i].str, tmp[i].rank );*/ ++ fprintf( fp, "%s\n", tmp[i].str); + } + wg_free( tmp ); + } +@@ -669,7 +671,7 @@ + uint4 i = 0; + uint4 j = 0; + sint4 sum = 0; +- ++ + /*** Compare the profiles in mergesort fashion ***/ + while ( i < c->size && j < u->size ) { + +@@ -705,7 +707,7 @@ + } + + return sum; +- ++ + } + + +--- misc/libtextcat-2.2/src/fingerprint.h Mon May 19 14:16:31 2003 ++++ misc/build/libtextcat-2.2/src/fingerprint.h Mon Mar 31 11:29:14 2008 +@@ -41,7 +41,13 @@ + extern int fp_Read( void *handle, const char *fname, int maxngrams ); + extern sint4 fp_Compare( void *cat, void *unknown, int cutoff ); + extern void fp_Show( void *handle ); ++#ifdef __cplusplus ++extern "C" { ++#endif + extern const char *fp_Name( void *handle ); ++#ifdef __cplusplus ++} ++#endif + extern void fp_Print( void *handle, FILE *fp ); + + #endif +--- misc/libtextcat-2.2/src/libtextcat.map Mon Mar 31 11:30:06 2008 ++++ misc/build/libtextcat-2.2/src/libtextcat.map Mon Mar 31 11:29:14 2008 +@@ -1 +1,40 @@ +-dummy ++{ ++ global: ++ charcopy ++ issame ++ nextcharstart ++ utfstrlen ++ wgmempool_Done ++ wgmempool_Init ++ wgmempool_Reset ++ wgmempool_alloc ++ wgmempool_getline ++ wgmempool_strdup ++ special_textcat_Init ++ textcat_Classify ++ textcat_Done ++ textcat_Init ++ textcat_Version ++ fp_Compare ++ fp_Create ++ fp_Debug ++ fp_Done ++ fp_Init ++ fp_Name ++ fp_Print ++ fp_Read ++ heapextract ++ wg_calloc ++ wg_free ++ wg_getline ++ wg_malloc ++ wg_split ++ wg_strdup ++ wg_strgmov ++ wg_trim ++ wg_zalloc ++ wgmem_error ++ ++ local: ++ *; ++} +--- misc/libtextcat-2.2/src/makefile.mk Mon Mar 31 11:30:06 2008 ++++ misc/build/libtextcat-2.2/src/makefile.mk Mon Mar 31 11:29:42 2008 +@@ -1 +1,87 @@ +-dummy ++#************************************************************************* ++# ++# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++# ++# Copyright 2000, 2010 Oracle and/or its affiliates. ++# ++# OpenOffice.org - a multi-platform office productivity suite ++# ++# This file is part of OpenOffice.org. ++# ++# OpenOffice.org is free software: you can redistribute it and/or modify ++# it under the terms of the GNU Lesser General Public License version 3 ++# only, as published by the Free Software Foundation. ++# ++# OpenOffice.org is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU Lesser General Public License version 3 for more details ++# (a copy is included in the LICENSE file that accompanied this code). ++# ++# You should have received a copy of the GNU Lesser General Public License ++# version 3 along with OpenOffice.org. If not, see ++# <http://www.openoffice.org/license.html> ++# for a copy of the LGPLv3 License. ++# ++#************************************************************************* ++ ++PRJ = ..$/..$/..$/..$/.. ++ ++PRJNAME = libtextcat ++TARGET = libtextcat ++CFLAGSCALL=gsd ++ ++USE_DEFFILE=TRUE ++EXTERNAL_WARNINGS_NOT_ERRORS := TRUE ++UWINAPILIB= ++ ++.INCLUDE : settings.mk ++ ++# --- Files -------------------------------------------------------- ++ ++# !! not to be compiled because those belong to a stand alone programs: !! ++# $(SLO)$/createfp.obj\ ++# $(SLO)$/testtextcat.obj ++ ++SLOFILES= \ ++ $(SLO)$/common.obj\ ++ $(SLO)$/fingerprint.obj\ ++ $(SLO)$/textcat.obj\ ++ $(SLO)$/wg_mempool.obj\ ++ $(SLO)$/utf8misc.obj ++ ++#SHL1TARGET= $(TARGET)$(UPD)$(DLLPOSTFIX) ++SHL1TARGET= $(TARGET) ++ ++SHL1STDLIBS= ++ ++# build DLL ++SHL1LIBS= $(SLB)$/$(TARGET).lib ++SHL1IMPLIB= i$(TARGET) ++SHL1DEPN= $(SHL1LIBS) ++SHL1DEF= $(MISC)$/$(SHL1TARGET).def ++ ++# build DEF file ++DEF1NAME= $(SHL1TARGET) ++DEF1DEPN=$(MISC)$/$(SHL1TARGET).flt ++ ++SHL1VERSIONMAP= libtextcat.map ++ ++# --- Targets ------------------------------------------------------ ++ ++.INCLUDE : target.mk ++ ++# copy hand supplied configuration file for Win32 builds to the file ++# which is included in the source code ++$(SLOFILES) : config.h ++config.h : ++ $(GNUCOPY) $(OUT)$/misc$/build$/libtextcat-2.2$/src$/win32_config.h $(OUT)$/misc$/build$/libtextcat-2.2$/src$/config.h ++ ++ ++$(MISC)$/$(SHL1TARGET).flt: makefile.mk ++ @echo ------------------------------ ++ @echo Making: $@ ++ @echo Imp>$@ ++ @echo __CT>>$@ ++ @echo _real>>$@ ++ @echo unnamed>>$@ +--- misc/libtextcat-2.2/src/textcat.c Thu May 22 13:32:43 2003 ++++ misc/build/libtextcat-2.2/src/textcat.c Mon Mar 31 11:29:14 2008 +@@ -4,23 +4,23 @@ + * Copyright (C) 2003 WiseGuys Internet B.V. + * + * THE BSD LICENSE +- * ++ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +- * ++ * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. +- * ++ * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +- * ++ * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. +- * ++ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +@@ -74,6 +74,7 @@ + typedef struct { + + void **fprint; ++ char *fprint_disable; + uint4 size; + uint4 maxsize; + +@@ -112,11 +113,21 @@ + fp_Done( h->fprint[i] ); + } + wg_free( h->fprint ); ++ wg_free( h->fprint_disable ); + wg_free( h ); + + } + +-extern void *textcat_Init( const char *conffile ) ++/** Replaces older function */ ++extern void *textcat_Init( const char *conffile ){ ++ return special_textcat_Init( conffile, DEFAULT_FINGERPRINTS_PATH ); ++} ++ ++/** ++ * Originaly this function had only one parameter (conffile) it has been modified since OOo use ++ * Basicaly prefix is the directory path where fingerprints are stored ++ */ ++extern void *special_textcat_Init( const char *conffile, const char *prefix ) + { + textcat_t *h; + char line[1024]; +@@ -134,11 +145,13 @@ + h->size = 0; + h->maxsize = 16; + h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize ); ++ h->fprint_disable = (char *)wg_malloc( sizeof(char*) * h->maxsize ); /*added to store the state of languages*/ + + while ( wg_getline( line, 1024, fp ) ) { + char *p; + char *segment[4]; +- int res; ++ char finger_print_file_name[512]; ++ int res; + + /*** Skip comments ***/ + #ifdef HAVE_STRCHR +@@ -156,17 +169,23 @@ + /*** Ensure enough space ***/ + if ( h->size == h->maxsize ) { + h->maxsize *= 2; +- h->fprint = (void *)wg_realloc( h->fprint, sizeof(void*) * h->maxsize ); ++ h->fprint = (void **)wg_realloc( h->fprint, sizeof(void*) * h->maxsize ); ++ h->fprint_disable = (char *)wg_realloc( h->fprint_disable, sizeof(char*) * h->maxsize ); + } + + /*** Load data ***/ + if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) { + goto ERROR; + } +- if ( fp_Read( h->fprint[h->size], segment[0], 400 ) == 0 ) { ++ finger_print_file_name[0] = '\0'; ++ strcat(finger_print_file_name, prefix); ++ strcat(finger_print_file_name, segment[0]); ++ ++ if ( fp_Read( h->fprint[h->size], finger_print_file_name, 400 ) == 0 ) { + textcat_Done(h); + goto ERROR; +- } ++ } ++ h->fprint_disable[h->size] = 0xF0; /*0xF0 is the code for enabled languages, 0x0F is for disabled*/ + h->size++; + } + +@@ -203,11 +222,18 @@ + result = _TEXTCAT_RESULT_SHORT; + goto READY; + } +- ++ + /*** Calculate the score for each category. ***/ + for (i=0; i<h->size; i++) { +- int score = fp_Compare( h->fprint[i], unknown, threshold ); +- candidates[i].score = score; ++ int score; ++ if(h->fprint_disable[i] & 0x0F){ /*if this language is disabled*/ ++ score = MAXSCORE; ++ } ++ else{ ++ score = fp_Compare( h->fprint[i], unknown, threshold ); ++ /*printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score);*/ ++ } ++ candidates[i].score = score; + candidates[i].name = fp_Name( h->fprint[i] ); + if ( score < minscore ) { + minscore = score; +@@ -218,7 +244,6 @@ + /*** Find the best performers ***/ + for (i=0; i<h->size; i++) { + if ( candidates[i].score < threshold ) { +- + if ( ++cnt == MAXCANDIDATES+1 ) { + break; + } +@@ -235,7 +260,7 @@ + else { + char *p = result; + char *plimit = result+MAXOUTPUTSIZE; +- ++ + qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates ); + + *p = '\0'; +@@ -247,7 +272,7 @@ + } + READY: + fp_Done(unknown); +-#ifdef SHOULD_FREE ++#ifdef SHOULD_FREE + free(candidates); + #undef SHOULD_FREE + #endif +--- misc/libtextcat-2.2/src/textcat.h Mon May 19 14:16:31 2003 ++++ misc/build/libtextcat-2.2/src/textcat.h Mon Mar 31 11:29:14 2008 +@@ -40,6 +40,9 @@ + #define _TEXTCAT_RESULT_UNKOWN "UNKNOWN" + #define _TEXTCAT_RESULT_SHORT "SHORT" + ++#ifdef __cplusplus ++extern "C" { ++#endif + + /** + * textcat_Init() - Initialize the text classifier. The textfile +@@ -51,10 +54,19 @@ + * Returns: handle on success, NULL on error. (At the moment, the + * only way errors can occur, is when the library cannot read the + * conffile, or one of the fingerprint files listed in it.) ++ * ++ * Replace older function (and has exacly the same behaviour) ++ * see below + */ + extern void *textcat_Init( const char *conffile ); + + /** ++ * Originaly this function had only one parameter (conffile) it has been modified since OOo must be able to load alternativ DB ++ * Basicaly prefix is the directory path where fingerprints are stored ++ */ ++extern void *special_textcat_Init( const char *conffile, const char *prefix ); ++ ++/** + * textcat_Done() - Free up resources for handle + */ + extern void textcat_Done( void *handle ); +@@ -77,4 +89,8 @@ + * textcat_Version() - Returns a string describing the version of this classifier. + */ + extern char *textcat_Version(); ++ ++#ifdef __cplusplus ++} ++#endif + #endif +--- misc/libtextcat-2.2/src/utf8misc.c Mon Mar 31 11:30:06 2008 ++++ misc/build/libtextcat-2.2/src/utf8misc.c Mon Mar 31 11:29:14 2008 +@@ -1 +1,132 @@ +-dummy ++/*************************************************************************** ++ * Copyright (C) 2006 by Jocelyn Merand * ++ * joc.mer@gmail.com * ++ * * ++ * THE BSD LICENSE ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the ++ * distribution. ++ * ++ * - Neither the name of the WiseGuys Internet B.V. nor the names of ++ * its contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ***************************************************************************/ ++ ++#ifndef _UTF8_MISC_H_ ++#include "utf8misc.h" ++#endif ++ ++ ++int nextcharstart(const char *str, int position){ ++ int pointer = position; ++ ++ if(str[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/ ++ ++ /*then str[pointer] is an escape character*/ ++ ++ char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /*and we use it to count (by bit translation) following characters (only the weightest part)*/ ++ ++ while(escape_char & ESCAPE_MASK && str[pointer]){/*every step, we move the byte of 1 bit left, when first bit is 0, it's finished*/ ++ escape_char = escape_char <<1; ++ ++pointer; ++ } ++ } ++ if(str[pointer]){ /*finaly, if we are not on the \0 character, we jump to the next character*/ ++ ++pointer; ++ } ++ return pointer; ++} ++ ++ ++int charcopy(const char *str, char *dest){ ++ ++ int pointer = 0; ++ if(str[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/ ++ ++ /*then str[pointer] is an escape character*/ ++ ++ char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /*and we use it to count following characters (only the weightest part)*/ ++ ++ while(escape_char & ESCAPE_MASK && str[pointer]){ /*every step, we move the byte of 1 bit left, when first bit is 0, it's finished*/ ++ dest[pointer] = str[pointer]; ++ escape_char = escape_char <<1; ++ ++pointer; ++ } ++ } ++ if(str[pointer]){ ++ dest[pointer] = str[pointer]; ++ ++pointer; ++ } ++ ++ return pointer; ++} ++ ++ ++int issame( char *lex, char *key, int len ) ++{ ++ /*printf("[%s] prefix of [%s] with length %i", lex, key, len);*/ ++ int char_counter = 0; ++ int pointer = 0; ++ while(char_counter < len) { ++ ++ if(key[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/ ++ ++ /*then key[pointer] is an escap character*/ ++ ++ char escape_char = ((key[pointer] & WEIGHT_MASK) << 1); /*and we use it to count (only the weightest part)*/ ++ ++ while(escape_char & ESCAPE_MASK && key[pointer] == lex[pointer] ){ ++ escape_char = escape_char <<1; ++ ++pointer; ++ } ++ } ++ ++char_counter; /*and we are on a new utf8 character*/ ++ if ( key[pointer] != lex[pointer] ) { ++ return 0; ++ /*printf(" NO\n", lex, key, len);*/ ++ } ++ ++pointer; ++ } ++ if ( lex[pointer] != '\0' ) { ++ return 0; ++ /*printf(" NO\n");*/ ++ } ++ ++ /*printf(" YES\n");*/ ++ ++ return 1; ++} ++ ++ ++extern int utfstrlen(const char* str){ ++ int char_counter = 0; ++ int pointer = 0; ++ while(str[pointer]) { ++ pointer = nextcharstart(str, pointer); ++ ++ ++char_counter; /*and we are on a new utf8 character*/ ++ } ++ return char_counter; ++} ++ +--- misc/libtextcat-2.2/src/utf8misc.h Mon Mar 31 11:30:06 2008 ++++ misc/build/libtextcat-2.2/src/utf8misc.h Mon Mar 31 11:29:14 2008 +@@ -1 +1,88 @@ +-dummy ++/*************************************************************************** ++ * Copyright (C) 2006 by Jocelyn Merand * ++ * joc.mer@gmail.com * ++ * * ++ * THE BSD LICENSE ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the ++ * distribution. ++ * ++ * - Neither the name of the WiseGuys Internet B.V. nor the names of ++ * its contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ***************************************************************************/ ++ ++#ifndef _UTF8_MISC_H_ ++#define _UTF8_MISC_H_ ++ ++/** ++ * These variables are used in character processing functions ++ * These have been added to manage utf-8 symbols, particularly escape chars ++ */ ++#ifdef _UTF8_ ++#define ESCAPE_MASK 0x80 ++#define WEIGHT_MASK 0xF0 ++#else ++#define ESCAPE_MASK 0xFF ++#define WEIGHT_MASK 0x00 ++#endif ++ ++ ++/* ++ * Is used to jump to the next start of char ++ * of course it's only usefull when encoding is utf-8 ++ * This function have been added by Jocelyn Merand to use libtextcat in OOo ++ */ ++int nextcharstart(const char *str, int position); ++ ++ ++/*Copy the char in str to dest ++ * of course it's only usefull when encoding is utf8 and the symbol is encoded with more than 1 char ++ * return the number of char jumped ++ * This function have been added by Jocelyn Merand to use libtextcat in OOo ++ */ ++int charcopy(const char *str, char *dest); ++ ++ ++/* checks if n-gram lex is a prefix of key and of length len ++* if _UTF8_ is defined, it uses escap characters and len is not realy the length of lex ++* in this case, len is the number of utf-8 char strlen("€") == 3 but len == 1 ++*/ ++int issame( char *lex, char *key, int len ); ++ ++ ++/* Counts the number of characters ++* if _UTF8_ is defined, it uses escap characters and the result is not realy the length of str ++* in this case, the result is the number of utf-8 char strlen("€") == 3 but utfstrlen("€") == 1 ++*/ ++#ifdef __cplusplus ++extern "C" { ++#endif ++extern int utfstrlen(const char* str); ++#ifdef __cplusplus ++} ++#endif ++ ++#endif ++ +--- misc/libtextcat-2.2/src/win32_config.h Mon Mar 31 11:30:06 2008 ++++ misc/build/libtextcat-2.2/src/win32_config.h Mon Mar 31 11:29:14 2008 +@@ -1 +1,136 @@ +-dummy ++/* src/config.h. Generated by configure. */ ++/* src/config.h.in. Generated from configure.ac by autoheader. */ ++ ++/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP ++ systems. This function is required for `alloca.c' support on those systems. ++ */ ++/* #undef CRAY_STACKSEG_END */ ++ ++/* Define to 1 if using `alloca.c'. */ ++/* #undef C_ALLOCA */ ++ ++/* Define to 1 if you have `alloca', as a function or macro. */ ++/* #undef HAVE_ALLOCA */ ++ ++/* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix). ++ */ ++/* #undef HAVE_ALLOCA_H */ ++ ++/* Define to 1 if you have the <dlfcn.h> header file. */ ++#define HAVE_DLFCN_H 1 ++ ++/* Define to 1 if you don't have `vprintf' but do have `_doprnt.' */ ++/* #undef HAVE_DOPRNT */ ++ ++/* Define to 1 if you have the `gettimeofday' function. */ ++/* #undef HAVE_GETTIMEOFDAY */ ++ ++/* Define to 1 if you have the <inttypes.h> header file. */ ++/* #undef HAVE_INTTYPES_H */ ++ ++/* Define to 1 if you have the <limits.h> header file. */ ++#define HAVE_LIMITS_H 1 ++ ++/* Define to 1 if your system has a GNU libc compatible `malloc' function, and ++ to 0 otherwise. */ ++#define HAVE_MALLOC 1 ++ ++/* Define to 1 if you have the <memory.h> header file. */ ++#define HAVE_MEMORY_H 1 ++ ++/* Define to 1 if you have the `memset' function. */ ++#define HAVE_MEMSET 1 ++ ++/* Define to 1 if your system has a GNU libc compatible `realloc' function, ++ and to 0 otherwise. */ ++#define HAVE_REALLOC 1 ++ ++/* Define to 1 if you have the <stdint.h> header file. */ ++/* #undef HAVE_STDINT_H */ ++ ++/* Define to 1 if you have the <stdlib.h> header file. */ ++#define HAVE_STDLIB_H 1 ++ ++/* Define to 1 if you have the `strchr' function. */ ++#define HAVE_STRCHR 1 ++ ++/* Define to 1 if you have the `strdup' function. */ ++#define HAVE_STRDUP 1 ++ ++/* Define to 1 if you have the <strings.h> header file. */ ++/* #undef HAVE_STRINGS_H */ ++ ++/* Define to 1 if you have the <string.h> header file. */ ++#define HAVE_STRING_H 1 ++ ++/* Define to 1 if you have the `strpbrk' function. */ ++#define HAVE_STRPBRK 1 ++ ++/* Define to 1 if you have the <sys/stat.h> header file. */ ++#define HAVE_SYS_STAT_H 1 ++ ++/* Define to 1 if you have the <sys/time.h> header file. */ ++/* #undef HAVE_SYS_TIME_H */ ++ ++/* Define to 1 if you have the <sys/types.h> header file. */ ++#define HAVE_SYS_TYPES_H 1 ++ ++/* Define to 1 if you have the <unistd.h> header file. */ ++#define HAVE_UNISTD_H 1 ++ ++/* Define to 1 if you have the `vprintf' function. */ ++#define HAVE_VPRINTF 1 ++ ++/* Name of package */ ++#define PACKAGE "libtextcat" ++ ++/* Define to the address where bug reports for this package should be sent. */ ++#define PACKAGE_BUGREPORT "" ++ ++/* Define to the full name of this package. */ ++#define PACKAGE_NAME "libtextcat" ++ ++/* Define to the full name and version of this package. */ ++#define PACKAGE_STRING "libtextcat 2.2" ++ ++/* Define to the one symbol short name of this package. */ ++#define PACKAGE_TARNAME "libtextcat" ++ ++/* Define to the version of this package. */ ++#define PACKAGE_VERSION "2.2" ++ ++/* If using the C implementation of alloca, define if you know the ++ direction of stack growth for your system; otherwise it will be ++ automatically deduced at run-time. ++ STACK_DIRECTION > 0 => grows toward higher addresses ++ STACK_DIRECTION < 0 => grows toward lower addresses ++ STACK_DIRECTION = 0 => direction of growth unknown */ ++/* #undef STACK_DIRECTION */ ++ ++/* Define to 1 if you have the ANSI C header files. */ ++#define STDC_HEADERS 1 ++ ++/* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */ ++#define TIME_WITH_SYS_TIME 1 ++ ++/* Define to 1 if your <sys/time.h> declares `struct tm'. */ ++/* #undef TM_IN_SYS_TIME */ ++ ++/* Version number of package */ ++#define VERSION "2.2" ++ ++/* Define to empty if `const' does not conform to ANSI C. */ ++/* #undef const */ ++ ++/* Define as `__inline' if that's what the C compiler calls it, or to nothing ++ if it is not supported. */ ++/* #undef inline */ ++ ++/* Define to rpl_malloc if the replacement function should be used. */ ++/* #undef malloc */ ++ ++/* Define to rpl_realloc if the replacement function should be used. */ ++/* #undef realloc */ ++ ++/* Define to `unsigned' if <sys/types.h> does not define. */ ++/* #undef size_t */ +--- misc/libtextcat-2.2/config.guess 2010-04-15 09:20:04.000000000 +0000 ++++ misc/build/libtextcat-2.2/config.guess 2010-04-15 09:20:41.000000000 +0000 +@@ -1,9 +1,10 @@ + #! /bin/sh + # Attempt to guess a canonical system name. + # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, +-# 2000, 2001, 2002 Free Software Foundation, Inc. ++# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 ++# Free Software Foundation, Inc. + +-timestamp='2002-10-21' ++timestamp='2009-12-30' + + # This file is free software; you can redistribute it and/or modify it + # under the terms of the GNU General Public License as published by +@@ -17,23 +18,25 @@ + # + # You should have received a copy of the GNU General Public License + # along with this program; if not, write to the Free Software +-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ++# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA ++# 02110-1301, USA. + # + # As a special exception to the GNU General Public License, if you + # distribute this file as part of a program that contains a + # configuration script generated by Autoconf, you may include it under + # the same distribution terms that you use for the rest of that program. + +-# Originally written by Per Bothner <per@bothner.com>. +-# Please send patches to <config-patches@gnu.org>. Submit a context +-# diff and a properly formatted ChangeLog entry. ++ ++# Originally written by Per Bothner. Please send patches (context ++# diff format) to <config-patches@gnu.org> and include a ChangeLog ++# entry. + # + # This script attempts to guess a canonical system name similar to + # config.sub. If it succeeds, it prints the system name on stdout, and + # exits with 0. Otherwise, it exits with 1. + # +-# The plan is that this can be called by configure scripts if you +-# don't specify an explicit build system type. ++# You can get the latest version of this script from: ++# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD + + me=`echo "$0" | sed -e 's,.*/,,'` + +@@ -53,8 +56,9 @@ + GNU config.guess ($timestamp) + + Originally written by Per Bothner. +-Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001 +-Free Software Foundation, Inc. ++Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, ++2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free ++Software Foundation, Inc. + + This is free software; see the source for copying conditions. There is NO + warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." +@@ -66,11 +70,11 @@ + while test $# -gt 0 ; do + case $1 in + --time-stamp | --time* | -t ) +- echo "$timestamp" ; exit 0 ;; ++ echo "$timestamp" ; exit ;; + --version | -v ) +- echo "$version" ; exit 0 ;; ++ echo "$version" ; exit ;; + --help | --h* | -h ) +- echo "$usage"; exit 0 ;; ++ echo "$usage"; exit ;; + -- ) # Stop option processing + shift; break ;; + - ) # Use stdin as input. +@@ -98,14 +102,18 @@ + # Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still + # use `HOST_CC' if defined, but it is deprecated. + +-# This shell variable is my proudest work .. or something. --bje ++# Portable tmp directory creation inspired by the Autoconf team. + +-set_cc_for_build='tmpdir=${TMPDIR-/tmp}/config-guess-$$ ; +-(old=`umask` && umask 077 && mkdir $tmpdir && umask $old && unset old) +- || (echo "$me: cannot create $tmpdir" >&2 && exit 1) ; +-dummy=$tmpdir/dummy ; +-files="$dummy.c $dummy.o $dummy.rel $dummy" ; +-trap '"'"'rm -f $files; rmdir $tmpdir; exit 1'"'"' 1 2 15 ; ++set_cc_for_build=' ++trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ; ++trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ; ++: ${TMPDIR=/tmp} ; ++ { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || ++ { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } || ++ { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } || ++ { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ; ++dummy=$tmp/dummy ; ++tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ; + case $CC_FOR_BUILD,$HOST_CC,$CC in + ,,) echo "int x;" > $dummy.c ; + for c in cc gcc c89 c99 ; do +@@ -113,15 +121,13 @@ + CC_FOR_BUILD="$c"; break ; + fi ; + done ; +- rm -f $files ; + if test x"$CC_FOR_BUILD" = x ; then + CC_FOR_BUILD=no_compiler_found ; + fi + ;; + ,,*) CC_FOR_BUILD=$CC ;; + ,*,*) CC_FOR_BUILD=$HOST_CC ;; +-esac ; +-unset files' ++esac ; set_cc_for_build= ;' + + # This is needed to find uname on a Pyramid OSx when run in the BSD universe. + # (ghazi@noc.rutgers.edu 1994-08-24) +@@ -156,6 +162,7 @@ + arm*) machine=arm-unknown ;; + sh3el) machine=shl-unknown ;; + sh3eb) machine=sh-unknown ;; ++ sh5el) machine=sh5le-unknown ;; + *) machine=${UNAME_MACHINE_ARCH}-unknown ;; + esac + # The Operating System including object format, if it has switched +@@ -164,7 +171,7 @@ + arm*|i386|m68k|ns32k|sh3*|sparc|vax) + eval $set_cc_for_build + if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ +- | grep __ELF__ >/dev/null ++ | grep -q __ELF__ + then + # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). + # Return netbsd for either. FIX? +@@ -178,144 +185,128 @@ + ;; + esac + # The OS release +- release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` ++ # Debian GNU/NetBSD machines have a different userland, and ++ # thus, need a distinct triplet. However, they do not need ++ # kernel version information, so it can be replaced with a ++ # suitable tag, in the style of linux-gnu. ++ case "${UNAME_VERSION}" in ++ Debian*) ++ release='-gnu' ++ ;; ++ *) ++ release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` ++ ;; ++ esac + # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: + # contains redundant information, the shorter form: + # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. + echo "${machine}-${os}${release}" +- exit 0 ;; +- amiga:OpenBSD:*:*) +- echo m68k-unknown-openbsd${UNAME_RELEASE} +- exit 0 ;; +- arc:OpenBSD:*:*) +- echo mipsel-unknown-openbsd${UNAME_RELEASE} +- exit 0 ;; +- hp300:OpenBSD:*:*) +- echo m68k-unknown-openbsd${UNAME_RELEASE} +- exit 0 ;; +- mac68k:OpenBSD:*:*) +- echo m68k-unknown-openbsd${UNAME_RELEASE} +- exit 0 ;; +- macppc:OpenBSD:*:*) +- echo powerpc-unknown-openbsd${UNAME_RELEASE} +- exit 0 ;; +- mvme68k:OpenBSD:*:*) +- echo m68k-unknown-openbsd${UNAME_RELEASE} +- exit 0 ;; +- mvme88k:OpenBSD:*:*) +- echo m88k-unknown-openbsd${UNAME_RELEASE} +- exit 0 ;; +- mvmeppc:OpenBSD:*:*) +- echo powerpc-unknown-openbsd${UNAME_RELEASE} +- exit 0 ;; +- pmax:OpenBSD:*:*) +- echo mipsel-unknown-openbsd${UNAME_RELEASE} +- exit 0 ;; +- sgi:OpenBSD:*:*) +- echo mipseb-unknown-openbsd${UNAME_RELEASE} +- exit 0 ;; +- sun3:OpenBSD:*:*) +- echo m68k-unknown-openbsd${UNAME_RELEASE} +- exit 0 ;; +- wgrisc:OpenBSD:*:*) +- echo mipsel-unknown-openbsd${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + *:OpenBSD:*:*) +- echo ${UNAME_MACHINE}-unknown-openbsd${UNAME_RELEASE} +- exit 0 ;; ++ UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` ++ echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE} ++ exit ;; ++ *:ekkoBSD:*:*) ++ echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE} ++ exit ;; ++ *:SolidBSD:*:*) ++ echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE} ++ exit ;; ++ macppc:MirBSD:*:*) ++ echo powerpc-unknown-mirbsd${UNAME_RELEASE} ++ exit ;; ++ *:MirBSD:*:*) ++ echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE} ++ exit ;; + alpha:OSF1:*:*) +- if test $UNAME_RELEASE = "V4.0"; then ++ case $UNAME_RELEASE in ++ *4.0) + UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` +- fi ++ ;; ++ *5.*) ++ UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` ++ ;; ++ esac ++ # According to Compaq, /usr/sbin/psrinfo has been available on ++ # OSF/1 and Tru64 systems produced since 1995. I hope that ++ # covers most systems running today. This code pipes the CPU ++ # types through head -n 1, so we only detect the type of CPU 0. ++ ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` ++ case "$ALPHA_CPU_TYPE" in ++ "EV4 (21064)") ++ UNAME_MACHINE="alpha" ;; ++ "EV4.5 (21064)") ++ UNAME_MACHINE="alpha" ;; ++ "LCA4 (21066/21068)") ++ UNAME_MACHINE="alpha" ;; ++ "EV5 (21164)") ++ UNAME_MACHINE="alphaev5" ;; ++ "EV5.6 (21164A)") ++ UNAME_MACHINE="alphaev56" ;; ++ "EV5.6 (21164PC)") ++ UNAME_MACHINE="alphapca56" ;; ++ "EV5.7 (21164PC)") ++ UNAME_MACHINE="alphapca57" ;; ++ "EV6 (21264)") ++ UNAME_MACHINE="alphaev6" ;; ++ "EV6.7 (21264A)") ++ UNAME_MACHINE="alphaev67" ;; ++ "EV6.8CB (21264C)") ++ UNAME_MACHINE="alphaev68" ;; ++ "EV6.8AL (21264B)") ++ UNAME_MACHINE="alphaev68" ;; ++ "EV6.8CX (21264D)") ++ UNAME_MACHINE="alphaev68" ;; ++ "EV6.9A (21264/EV69A)") ++ UNAME_MACHINE="alphaev69" ;; ++ "EV7 (21364)") ++ UNAME_MACHINE="alphaev7" ;; ++ "EV7.9 (21364A)") ++ UNAME_MACHINE="alphaev79" ;; ++ esac ++ # A Pn.n version is a patched version. + # A Vn.n version is a released version. + # A Tn.n version is a released field test version. + # A Xn.n version is an unreleased experimental baselevel. + # 1.2 uses "1.2" for uname -r. +- eval $set_cc_for_build +- cat <<EOF >$dummy.s +- .data +-\$Lformat: +- .byte 37,100,45,37,120,10,0 # "%d-%x\n" +- +- .text +- .globl main +- .align 4 +- .ent main +-main: +- .frame \$30,16,\$26,0 +- ldgp \$29,0(\$27) +- .prologue 1 +- .long 0x47e03d80 # implver \$0 +- lda \$2,-1 +- .long 0x47e20c21 # amask \$2,\$1 +- lda \$16,\$Lformat +- mov \$0,\$17 +- not \$1,\$18 +- jsr \$26,printf +- ldgp \$29,0(\$26) +- mov 0,\$16 +- jsr \$26,exit +- .end main +-EOF +- $CC_FOR_BUILD -o $dummy $dummy.s 2>/dev/null +- if test "$?" = 0 ; then +- case `$dummy` in +- 0-0) +- UNAME_MACHINE="alpha" +- ;; +- 1-0) +- UNAME_MACHINE="alphaev5" +- ;; +- 1-1) +- UNAME_MACHINE="alphaev56" +- ;; +- 1-101) +- UNAME_MACHINE="alphapca56" +- ;; +- 2-303) +- UNAME_MACHINE="alphaev6" +- ;; +- 2-307) +- UNAME_MACHINE="alphaev67" +- ;; +- 2-1307) +- UNAME_MACHINE="alphaev68" +- ;; +- 3-1307) +- UNAME_MACHINE="alphaev7" +- ;; +- esac +- fi +- rm -f $dummy.s $dummy && rmdir $tmpdir +- echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[VTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` +- exit 0 ;; ++ echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` ++ exit ;; + Alpha\ *:Windows_NT*:*) + # How do we know it's Interix rather than the generic POSIX subsystem? + # Should we change UNAME_MACHINE based on the output of uname instead + # of the specific Alpha model? + echo alpha-pc-interix +- exit 0 ;; ++ exit ;; + 21064:Windows_NT:50:3) + echo alpha-dec-winnt3.5 +- exit 0 ;; ++ exit ;; + Amiga*:UNIX_System_V:4.0:*) + echo m68k-unknown-sysv4 +- exit 0;; ++ exit ;; + *:[Aa]miga[Oo][Ss]:*:*) + echo ${UNAME_MACHINE}-unknown-amigaos +- exit 0 ;; ++ exit ;; + *:[Mm]orph[Oo][Ss]:*:*) + echo ${UNAME_MACHINE}-unknown-morphos +- exit 0 ;; ++ exit ;; + *:OS/390:*:*) + echo i370-ibm-openedition +- exit 0 ;; ++ exit ;; ++ *:z/VM:*:*) ++ echo s390-ibm-zvmoe ++ exit ;; ++ *:OS400:*:*) ++ echo powerpc-ibm-os400 ++ exit ;; + arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) + echo arm-acorn-riscix${UNAME_RELEASE} +- exit 0;; ++ exit ;; ++ arm:riscos:*:*|arm:RISCOS:*:*) ++ echo arm-unknown-riscos ++ exit ;; + SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) + echo hppa1.1-hitachi-hiuxmpp +- exit 0;; ++ exit ;; + Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*) + # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE. + if test "`(/bin/universe) 2>/dev/null`" = att ; then +@@ -323,29 +314,51 @@ + else + echo pyramid-pyramid-bsd + fi +- exit 0 ;; ++ exit ;; + NILE*:*:*:dcosx) + echo pyramid-pyramid-svr4 +- exit 0 ;; +- DRS?6000:UNIX_SV:4.2*:7*) ++ exit ;; ++ DRS?6000:unix:4.0:6*) ++ echo sparc-icl-nx6 ++ exit ;; ++ DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*) + case `/usr/bin/uname -p` in +- sparc) echo sparc-icl-nx7 && exit 0 ;; ++ sparc) echo sparc-icl-nx7; exit ;; + esac ;; ++ s390x:SunOS:*:*) ++ echo ${UNAME_MACHINE}-ibm-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` ++ exit ;; + sun4H:SunOS:5.*:*) + echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` +- exit 0 ;; ++ exit ;; + sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) + echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` +- exit 0 ;; +- i86pc:SunOS:5.*:*) +- echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` +- exit 0 ;; ++ exit ;; ++ i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*) ++ echo i386-pc-auroraux${UNAME_RELEASE} ++ exit ;; ++ i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*) ++ eval $set_cc_for_build ++ SUN_ARCH="i386" ++ # If there is a compiler, see if it is configured for 64-bit objects. ++ # Note that the Sun cc does not turn __LP64__ into 1 like gcc does. ++ # This test works for both compilers. ++ if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then ++ if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \ ++ (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ ++ grep IS_64BIT_ARCH >/dev/null ++ then ++ SUN_ARCH="x86_64" ++ fi ++ fi ++ echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` ++ exit ;; + sun4*:SunOS:6*:*) + # According to config.sub, this is the proper way to canonicalize + # SunOS6. Hard to guess exactly what SunOS6 will be like, but + # it's likely to be more like Solaris than SunOS4. + echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` +- exit 0 ;; ++ exit ;; + sun4*:SunOS:*:*) + case "`/usr/bin/arch -k`" in + Series*|S4*) +@@ -354,10 +367,10 @@ + esac + # Japanese Language versions have a version number like `4.1.3-JL'. + echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'` +- exit 0 ;; ++ exit ;; + sun3*:SunOS:*:*) + echo m68k-sun-sunos${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + sun*:*:4.2BSD:*) + UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` + test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3 +@@ -369,10 +382,10 @@ + echo sparc-sun-sunos${UNAME_RELEASE} + ;; + esac +- exit 0 ;; ++ exit ;; + aushp:SunOS:*:*) + echo sparc-auspex-sunos${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + # The situation for MiNT is a little confusing. The machine name + # can be virtually everything (everything which is not + # "atarist" or "atariste" at least should have a processor +@@ -383,37 +396,40 @@ + # be no problem. + atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) + echo m68k-milan-mint${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) + echo m68k-hades-mint${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) + echo m68k-unknown-mint${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; ++ m68k:machten:*:*) ++ echo m68k-apple-machten${UNAME_RELEASE} ++ exit ;; + powerpc:machten:*:*) + echo powerpc-apple-machten${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + RISC*:Mach:*:*) + echo mips-dec-mach_bsd4.3 +- exit 0 ;; ++ exit ;; + RISC*:ULTRIX:*:*) + echo mips-dec-ultrix${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + VAX*:ULTRIX*:*:*) + echo vax-dec-ultrix${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + 2020:CLIX:*:* | 2430:CLIX:*:*) + echo clipper-intergraph-clix${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + mips:*:*:UMIPS | mips:*:*:RISCos) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c +@@ -437,33 +453,33 @@ + exit (-1); + } + EOF +- $CC_FOR_BUILD -o $dummy $dummy.c \ +- && $dummy `echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` \ +- && rm -f $dummy.c $dummy && rmdir $tmpdir && exit 0 +- rm -f $dummy.c $dummy && rmdir $tmpdir ++ $CC_FOR_BUILD -o $dummy $dummy.c && ++ dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` && ++ SYSTEM_NAME=`$dummy $dummyarg` && ++ { echo "$SYSTEM_NAME"; exit; } + echo mips-mips-riscos${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + Motorola:PowerMAX_OS:*:*) + echo powerpc-motorola-powermax +- exit 0 ;; ++ exit ;; + Motorola:*:4.3:PL8-*) + echo powerpc-harris-powermax +- exit 0 ;; +- Night_Hawk:*:*:PowerMAX_OS) ++ exit ;; ++ Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*) + echo powerpc-harris-powermax +- exit 0 ;; ++ exit ;; + Night_Hawk:Power_UNIX:*:*) + echo powerpc-harris-powerunix +- exit 0 ;; ++ exit ;; + m88k:CX/UX:7*:*) + echo m88k-harris-cxux7 +- exit 0 ;; ++ exit ;; + m88k:*:4*:R4*) + echo m88k-motorola-sysv4 +- exit 0 ;; ++ exit ;; + m88k:*:3*:R3*) + echo m88k-motorola-sysv3 +- exit 0 ;; ++ exit ;; + AViiON:dgux:*:*) + # DG/UX returns AViiON for all architectures + UNAME_PROCESSOR=`/usr/bin/uname -p` +@@ -479,29 +495,29 @@ + else + echo i586-dg-dgux${UNAME_RELEASE} + fi +- exit 0 ;; ++ exit ;; + M88*:DolphinOS:*:*) # DolphinOS (SVR3) + echo m88k-dolphin-sysv3 +- exit 0 ;; ++ exit ;; + M88*:*:R3*:*) + # Delta 88k system running SVR3 + echo m88k-motorola-sysv3 +- exit 0 ;; ++ exit ;; + XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3) + echo m88k-tektronix-sysv3 +- exit 0 ;; ++ exit ;; + Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD) + echo m68k-tektronix-bsd +- exit 0 ;; ++ exit ;; + *:IRIX*:*:*) + echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'` +- exit 0 ;; ++ exit ;; + ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. +- echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id +- exit 0 ;; # Note that: echo "'`uname -s`'" gives 'AIX ' ++ echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id ++ exit ;; # Note that: echo "'`uname -s`'" gives 'AIX ' + i*86:AIX:*:*) + echo i386-ibm-aix +- exit 0 ;; ++ exit ;; + ia64:AIX:*:*) + if [ -x /usr/bin/oslevel ] ; then + IBM_REV=`/usr/bin/oslevel` +@@ -509,7 +525,7 @@ + IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} + fi + echo ${UNAME_MACHINE}-ibm-aix${IBM_REV} +- exit 0 ;; ++ exit ;; + *:AIX:2:3) + if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then + eval $set_cc_for_build +@@ -524,16 +540,19 @@ + exit(0); + } + EOF +- $CC_FOR_BUILD -o $dummy $dummy.c && $dummy && rm -f $dummy.c $dummy && rmdir $tmpdir && exit 0 +- rm -f $dummy.c $dummy && rmdir $tmpdir +- echo rs6000-ibm-aix3.2.5 ++ if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` ++ then ++ echo "$SYSTEM_NAME" ++ else ++ echo rs6000-ibm-aix3.2.5 ++ fi + elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then + echo rs6000-ibm-aix3.2.4 + else + echo rs6000-ibm-aix3.2 + fi +- exit 0 ;; +- *:AIX:*:[45]) ++ exit ;; ++ *:AIX:*:[456]) + IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` + if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then + IBM_ARCH=rs6000 +@@ -546,28 +565,28 @@ + IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} + fi + echo ${IBM_ARCH}-ibm-aix${IBM_REV} +- exit 0 ;; ++ exit ;; + *:AIX:*:*) + echo rs6000-ibm-aix +- exit 0 ;; ++ exit ;; + ibmrt:4.4BSD:*|romp-ibm:BSD:*) + echo romp-ibm-bsd4.4 +- exit 0 ;; ++ exit ;; + ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and + echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to +- exit 0 ;; # report: romp-ibm BSD 4.3 ++ exit ;; # report: romp-ibm BSD 4.3 + *:BOSX:*:*) + echo rs6000-bull-bosx +- exit 0 ;; ++ exit ;; + DPX/2?00:B.O.S.:*:*) + echo m68k-bull-sysv3 +- exit 0 ;; ++ exit ;; + 9000/[34]??:4.3bsd:1.*:*) + echo m68k-hp-bsd +- exit 0 ;; ++ exit ;; + hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*) + echo m68k-hp-bsd4.4 +- exit 0 ;; ++ exit ;; + 9000/[34678]??:HP-UX:*:*) + HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` + case "${UNAME_MACHINE}" in +@@ -624,16 +643,36 @@ + } + EOF + (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` +- if test -z "$HP_ARCH"; then HP_ARCH=hppa; fi +- rm -f $dummy.c $dummy && rmdir $tmpdir ++ test -z "$HP_ARCH" && HP_ARCH=hppa + fi ;; + esac ++ if [ ${HP_ARCH} = "hppa2.0w" ] ++ then ++ eval $set_cc_for_build ++ ++ # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating ++ # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler ++ # generating 64-bit code. GNU and HP use different nomenclature: ++ # ++ # $ CC_FOR_BUILD=cc ./config.guess ++ # => hppa2.0w-hp-hpux11.23 ++ # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess ++ # => hppa64-hp-hpux11.23 ++ ++ if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | ++ grep -q __LP64__ ++ then ++ HP_ARCH="hppa2.0w" ++ else ++ HP_ARCH="hppa64" ++ fi ++ fi + echo ${HP_ARCH}-hp-hpux${HPUX_REV} +- exit 0 ;; ++ exit ;; + ia64:HP-UX:*:*) + HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` + echo ia64-hp-hpux${HPUX_REV} +- exit 0 ;; ++ exit ;; + 3050*:HI-UX:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c +@@ -661,186 +700,248 @@ + exit (0); + } + EOF +- $CC_FOR_BUILD -o $dummy $dummy.c && $dummy && rm -f $dummy.c $dummy && rmdir $tmpdir && exit 0 +- rm -f $dummy.c $dummy && rmdir $tmpdir ++ $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` && ++ { echo "$SYSTEM_NAME"; exit; } + echo unknown-hitachi-hiuxwe2 +- exit 0 ;; ++ exit ;; + 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* ) + echo hppa1.1-hp-bsd +- exit 0 ;; ++ exit ;; + 9000/8??:4.3bsd:*:*) + echo hppa1.0-hp-bsd +- exit 0 ;; ++ exit ;; + *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) + echo hppa1.0-hp-mpeix +- exit 0 ;; ++ exit ;; + hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* ) + echo hppa1.1-hp-osf +- exit 0 ;; ++ exit ;; + hp8??:OSF1:*:*) + echo hppa1.0-hp-osf +- exit 0 ;; ++ exit ;; + i*86:OSF1:*:*) + if [ -x /usr/sbin/sysversion ] ; then + echo ${UNAME_MACHINE}-unknown-osf1mk + else + echo ${UNAME_MACHINE}-unknown-osf1 + fi +- exit 0 ;; ++ exit ;; + parisc*:Lites*:*:*) + echo hppa1.1-hp-lites +- exit 0 ;; ++ exit ;; + C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) + echo c1-convex-bsd +- exit 0 ;; ++ exit ;; + C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) + if getsysinfo -f scalar_acc + then echo c32-convex-bsd + else echo c2-convex-bsd + fi +- exit 0 ;; ++ exit ;; + C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) + echo c34-convex-bsd +- exit 0 ;; ++ exit ;; + C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) + echo c38-convex-bsd +- exit 0 ;; ++ exit ;; + C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) + echo c4-convex-bsd +- exit 0 ;; ++ exit ;; + CRAY*Y-MP:*:*:*) + echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' +- exit 0 ;; ++ exit ;; + CRAY*[A-Z]90:*:*:*) + echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \ + | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ + -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ + -e 's/\.[^.]*$/.X/' +- exit 0 ;; ++ exit ;; + CRAY*TS:*:*:*) + echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' +- exit 0 ;; +- CRAY*T3D:*:*:*) +- echo alpha-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' +- exit 0 ;; ++ exit ;; + CRAY*T3E:*:*:*) + echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' +- exit 0 ;; ++ exit ;; + CRAY*SV1:*:*:*) + echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' +- exit 0 ;; ++ exit ;; ++ *:UNICOS/mp:*:*) ++ echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' ++ exit ;; + F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) + FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` + FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` + echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" +- exit 0 ;; ++ exit ;; ++ 5000:UNIX_System_V:4.*:*) ++ FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` ++ FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` ++ echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" ++ exit ;; + i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) + echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + sparc*:BSD/OS:*:*) + echo sparc-unknown-bsdi${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + *:BSD/OS:*:*) + echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + *:FreeBSD:*:*) +- # Determine whether the default compiler uses glibc. +- eval $set_cc_for_build +- sed 's/^ //' << EOF >$dummy.c +- #include <features.h> +- #if __GLIBC__ >= 2 +- LIBC=gnu +- #else +- LIBC= +- #endif +-EOF +- eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep ^LIBC=` +- rm -f $dummy.c && rmdir $tmpdir +- echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`${LIBC:+-$LIBC} +- exit 0 ;; ++ case ${UNAME_MACHINE} in ++ pc98) ++ echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; ++ amd64) ++ echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; ++ *) ++ echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; ++ esac ++ exit ;; + i*:CYGWIN*:*) + echo ${UNAME_MACHINE}-pc-cygwin +- exit 0 ;; +- i*:MINGW*:*) ++ exit ;; ++ *:MINGW*:*) + echo ${UNAME_MACHINE}-pc-mingw32 +- exit 0 ;; ++ exit ;; ++ i*:windows32*:*) ++ # uname -m includes "-pc" on this system. ++ echo ${UNAME_MACHINE}-mingw32 ++ exit ;; + i*:PW*:*) + echo ${UNAME_MACHINE}-pc-pw32 +- exit 0 ;; +- x86:Interix*:3*) +- echo i386-pc-interix3 +- exit 0 ;; ++ exit ;; ++ *:Interix*:*) ++ case ${UNAME_MACHINE} in ++ x86) ++ echo i586-pc-interix${UNAME_RELEASE} ++ exit ;; ++ authenticamd | genuineintel | EM64T) ++ echo x86_64-unknown-interix${UNAME_RELEASE} ++ exit ;; ++ IA64) ++ echo ia64-unknown-interix${UNAME_RELEASE} ++ exit ;; ++ esac ;; ++ [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) ++ echo i${UNAME_MACHINE}-pc-mks ++ exit ;; ++ 8664:Windows_NT:*) ++ echo x86_64-pc-mks ++ exit ;; + i*:Windows_NT*:* | Pentium*:Windows_NT*:*) + # How do we know it's Interix rather than the generic POSIX subsystem? + # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we + # UNAME_MACHINE based on the output of uname instead of i386? +- echo i386-pc-interix +- exit 0 ;; ++ echo i586-pc-interix ++ exit ;; + i*:UWIN*:*) + echo ${UNAME_MACHINE}-pc-uwin +- exit 0 ;; ++ exit ;; ++ amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*) ++ echo x86_64-unknown-cygwin ++ exit ;; + p*:CYGWIN*:*) + echo powerpcle-unknown-cygwin +- exit 0 ;; ++ exit ;; + prep*:SunOS:5.*:*) + echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` +- exit 0 ;; ++ exit ;; + *:GNU:*:*) ++ # the GNU system + echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` +- exit 0 ;; ++ exit ;; ++ *:GNU/*:*:*) ++ # other systems with GNU libc and userland ++ echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu ++ exit ;; + i*86:Minix:*:*) + echo ${UNAME_MACHINE}-pc-minix +- exit 0 ;; ++ exit ;; ++ alpha:Linux:*:*) ++ case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in ++ EV5) UNAME_MACHINE=alphaev5 ;; ++ EV56) UNAME_MACHINE=alphaev56 ;; ++ PCA56) UNAME_MACHINE=alphapca56 ;; ++ PCA57) UNAME_MACHINE=alphapca56 ;; ++ EV6) UNAME_MACHINE=alphaev6 ;; ++ EV67) UNAME_MACHINE=alphaev67 ;; ++ EV68*) UNAME_MACHINE=alphaev68 ;; ++ esac ++ objdump --private-headers /bin/sh | grep -q ld.so.1 ++ if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi ++ echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} ++ exit ;; + arm*:Linux:*:*) ++ eval $set_cc_for_build ++ if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \ ++ | grep -q __ARM_EABI__ ++ then ++ echo ${UNAME_MACHINE}-unknown-linux-gnu ++ else ++ echo ${UNAME_MACHINE}-unknown-linux-gnueabi ++ fi ++ exit ;; ++ avr32*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu +- exit 0 ;; ++ exit ;; ++ cris:Linux:*:*) ++ echo cris-axis-linux-gnu ++ exit ;; ++ crisv32:Linux:*:*) ++ echo crisv32-axis-linux-gnu ++ exit ;; ++ frv:Linux:*:*) ++ echo frv-unknown-linux-gnu ++ exit ;; ++ i*86:Linux:*:*) ++ LIBC=gnu ++ eval $set_cc_for_build ++ sed 's/^ //' << EOF >$dummy.c ++ #ifdef __dietlibc__ ++ LIBC=dietlibc ++ #endif ++EOF ++ eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'` ++ echo "${UNAME_MACHINE}-pc-linux-${LIBC}" ++ exit ;; + ia64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu +- exit 0 ;; ++ exit ;; ++ m32r*:Linux:*:*) ++ echo ${UNAME_MACHINE}-unknown-linux-gnu ++ exit ;; + m68*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu +- exit 0 ;; +- mips:Linux:*:*) ++ exit ;; ++ mips:Linux:*:* | mips64:Linux:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #undef CPU +- #undef mips +- #undef mipsel ++ #undef ${UNAME_MACHINE} ++ #undef ${UNAME_MACHINE}el + #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) +- CPU=mipsel ++ CPU=${UNAME_MACHINE}el + #else + #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) +- CPU=mips ++ CPU=${UNAME_MACHINE} + #else + CPU= + #endif + #endif + EOF +- eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep ^CPU=` +- rm -f $dummy.c && rmdir $tmpdir +- test x"${CPU}" != x && echo "${CPU}-pc-linux-gnu" && exit 0 ++ eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'` ++ test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } + ;; +- ppc:Linux:*:*) +- echo powerpc-unknown-linux-gnu +- exit 0 ;; +- ppc64:Linux:*:*) +- echo powerpc64-unknown-linux-gnu +- exit 0 ;; +- alpha:Linux:*:*) +- case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in +- EV5) UNAME_MACHINE=alphaev5 ;; +- EV56) UNAME_MACHINE=alphaev56 ;; +- PCA56) UNAME_MACHINE=alphapca56 ;; +- PCA57) UNAME_MACHINE=alphapca56 ;; +- EV6) UNAME_MACHINE=alphaev6 ;; +- EV67) UNAME_MACHINE=alphaev67 ;; +- EV68*) UNAME_MACHINE=alphaev68 ;; +- esac +- objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null +- if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi +- echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} +- exit 0 ;; ++ or32:Linux:*:*) ++ echo or32-unknown-linux-gnu ++ exit ;; ++ padre:Linux:*:*) ++ echo sparc-unknown-linux-gnu ++ exit ;; ++ parisc64:Linux:*:* | hppa64:Linux:*:*) ++ echo hppa64-unknown-linux-gnu ++ exit ;; + parisc:Linux:*:* | hppa:Linux:*:*) + # Look for CPU level + case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in +@@ -848,82 +949,40 @@ + PA8*) echo hppa2.0-unknown-linux-gnu ;; + *) echo hppa-unknown-linux-gnu ;; + esac +- exit 0 ;; +- parisc64:Linux:*:* | hppa64:Linux:*:*) +- echo hppa64-unknown-linux-gnu +- exit 0 ;; ++ exit ;; ++ ppc64:Linux:*:*) ++ echo powerpc64-unknown-linux-gnu ++ exit ;; ++ ppc:Linux:*:*) ++ echo powerpc-unknown-linux-gnu ++ exit ;; + s390:Linux:*:* | s390x:Linux:*:*) + echo ${UNAME_MACHINE}-ibm-linux +- exit 0 ;; ++ exit ;; ++ sh64*:Linux:*:*) ++ echo ${UNAME_MACHINE}-unknown-linux-gnu ++ exit ;; + sh*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu +- exit 0 ;; ++ exit ;; + sparc:Linux:*:* | sparc64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu +- exit 0 ;; ++ exit ;; ++ vax:Linux:*:*) ++ echo ${UNAME_MACHINE}-dec-linux-gnu ++ exit ;; + x86_64:Linux:*:*) + echo x86_64-unknown-linux-gnu +- exit 0 ;; +- i*86:Linux:*:*) +- # The BFD linker knows what the default object file format is, so +- # first see if it will tell us. cd to the root directory to prevent +- # problems with other programs or directories called `ld' in the path. +- # Set LC_ALL=C to ensure ld outputs messages in English. +- ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \ +- | sed -ne '/supported targets:/!d +- s/[ ][ ]*/ /g +- s/.*supported targets: *// +- s/ .*// +- p'` +- case "$ld_supported_targets" in +- elf32-i386) +- TENTATIVE="${UNAME_MACHINE}-pc-linux-gnu" +- ;; +- a.out-i386-linux) +- echo "${UNAME_MACHINE}-pc-linux-gnuaout" +- exit 0 ;; +- coff-i386) +- echo "${UNAME_MACHINE}-pc-linux-gnucoff" +- exit 0 ;; +- "") +- # Either a pre-BFD a.out linker (linux-gnuoldld) or +- # one that does not give us useful --help. +- echo "${UNAME_MACHINE}-pc-linux-gnuoldld" +- exit 0 ;; +- esac +- # Determine whether the default compiler is a.out or elf +- eval $set_cc_for_build +- sed 's/^ //' << EOF >$dummy.c +- #include <features.h> +- #ifdef __ELF__ +- # ifdef __GLIBC__ +- # if __GLIBC__ >= 2 +- LIBC=gnu +- # else +- LIBC=gnulibc1 +- # endif +- # else +- LIBC=gnulibc1 +- # endif +- #else +- #ifdef __INTEL_COMPILER +- LIBC=gnu +- #else +- LIBC=gnuaout +- #endif +- #endif +-EOF +- eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep ^LIBC=` +- rm -f $dummy.c && rmdir $tmpdir +- test x"${LIBC}" != x && echo "${UNAME_MACHINE}-pc-linux-${LIBC}" && exit 0 +- test x"${TENTATIVE}" != x && echo "${TENTATIVE}" && exit 0 +- ;; ++ exit ;; ++ xtensa*:Linux:*:*) ++ echo ${UNAME_MACHINE}-unknown-linux-gnu ++ exit ;; + i*86:DYNIX/ptx:4*:*) + # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. + # earlier versions are messed up and put the nodename in both + # sysname and nodename. + echo i386-sequent-sysv4 +- exit 0 ;; ++ exit ;; + i*86:UNIX_SV:4.2MP:2.*) + # Unixware is an offshoot of SVR4, but it has its own version + # number series starting with 2... +@@ -931,7 +990,27 @@ + # I just have to hope. -- rms. + # Use sysv4.2uw... so that sysv4* matches it. + echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION} +- exit 0 ;; ++ exit ;; ++ i*86:OS/2:*:*) ++ # If we were able to find `uname', then EMX Unix compatibility ++ # is probably installed. ++ echo ${UNAME_MACHINE}-pc-os2-emx ++ exit ;; ++ i*86:XTS-300:*:STOP) ++ echo ${UNAME_MACHINE}-unknown-stop ++ exit ;; ++ i*86:atheos:*:*) ++ echo ${UNAME_MACHINE}-unknown-atheos ++ exit ;; ++ i*86:syllable:*:*) ++ echo ${UNAME_MACHINE}-pc-syllable ++ exit ;; ++ i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*) ++ echo i386-unknown-lynxos${UNAME_RELEASE} ++ exit ;; ++ i*86:*DOS:*:*) ++ echo ${UNAME_MACHINE}-pc-msdosdjgpp ++ exit ;; + i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*) + UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'` + if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then +@@ -939,15 +1018,16 @@ + else + echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL} + fi +- exit 0 ;; +- i*86:*:5:[78]*) ++ exit ;; ++ i*86:*:5:[678]*) ++ # UnixWare 7.x, OpenUNIX and OpenServer 6. + case `/bin/uname -X | grep "^Machine"` in + *486*) UNAME_MACHINE=i486 ;; + *Pentium) UNAME_MACHINE=i586 ;; + *Pent*|*Celeron) UNAME_MACHINE=i686 ;; + esac + echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION} +- exit 0 ;; ++ exit ;; + i*86:*:3.2:*) + if test -f /usr/options/cb.name; then + UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name` +@@ -965,76 +1045,86 @@ + else + echo ${UNAME_MACHINE}-pc-sysv32 + fi +- exit 0 ;; +- i*86:*DOS:*:*) +- echo ${UNAME_MACHINE}-pc-msdosdjgpp +- exit 0 ;; ++ exit ;; + pc:*:*:*) + # Left here for compatibility: + # uname -m prints for DJGPP always 'pc', but it prints nothing about +- # the processor, so we play safe by assuming i386. +- echo i386-pc-msdosdjgpp +- exit 0 ;; ++ # the processor, so we play safe by assuming i586. ++ # Note: whatever this is, it MUST be the same as what config.sub ++ # prints for the "djgpp" host, or else GDB configury will decide that ++ # this is a cross-build. ++ echo i586-pc-msdosdjgpp ++ exit ;; + Intel:Mach:3*:*) + echo i386-pc-mach3 +- exit 0 ;; ++ exit ;; + paragon:*:*:*) + echo i860-intel-osf1 +- exit 0 ;; ++ exit ;; + i860:*:4.*:*) # i860-SVR4 + if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then + echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4 + else # Add other i860-SVR4 vendors below as they are discovered. + echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4 + fi +- exit 0 ;; ++ exit ;; + mini*:CTIX:SYS*5:*) + # "miniframe" + echo m68010-convergent-sysv +- exit 0 ;; ++ exit ;; + mc68k:UNIX:SYSTEM5:3.51m) + echo m68k-convergent-sysv +- exit 0 ;; +- M68*:*:R3V[567]*:*) +- test -r /sysV68 && echo 'm68k-motorola-sysv' && exit 0 ;; +- 3[34]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0) ++ exit ;; ++ M680?0:D-NIX:5.3:*) ++ echo m68k-diab-dnix ++ exit ;; ++ M68*:*:R3V[5678]*:*) ++ test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;; ++ 3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0) + OS_REL='' + test -r /etc/.relid \ + && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ +- && echo i486-ncr-sysv4.3${OS_REL} && exit 0 ++ && { echo i486-ncr-sysv4.3${OS_REL}; exit; } + /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ +- && echo i586-ncr-sysv4.3${OS_REL} && exit 0 ;; ++ && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; + 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ +- && echo i486-ncr-sysv4 && exit 0 ;; ++ && { echo i486-ncr-sysv4; exit; } ;; ++ NCR*:*:4.2:* | MPRAS*:*:4.2:*) ++ OS_REL='.3' ++ test -r /etc/.relid \ ++ && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` ++ /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ ++ && { echo i486-ncr-sysv4.3${OS_REL}; exit; } ++ /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ ++ && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ++ /bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \ ++ && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; + m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) + echo m68k-unknown-lynxos${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + mc68030:UNIX_System_V:4.*:*) + echo m68k-atari-sysv4 +- exit 0 ;; +- i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*) +- echo i386-unknown-lynxos${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + TSUNAMI:LynxOS:2.*:*) + echo sparc-unknown-lynxos${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + rs6000:LynxOS:2.*:*) + echo rs6000-unknown-lynxos${UNAME_RELEASE} +- exit 0 ;; +- PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*) ++ exit ;; ++ PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*) + echo powerpc-unknown-lynxos${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + SM[BE]S:UNIX_SV:*:*) + echo mips-dde-sysv${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + RM*:ReliantUNIX-*:*:*) + echo mips-sni-sysv4 +- exit 0 ;; ++ exit ;; + RM*:SINIX-*:*:*) + echo mips-sni-sysv4 +- exit 0 ;; ++ exit ;; + *:SINIX-*:*:*) + if uname -p 2>/dev/null >/dev/null ; then + UNAME_MACHINE=`(uname -p) 2>/dev/null` +@@ -1042,64 +1132,94 @@ + else + echo ns32k-sni-sysv + fi +- exit 0 ;; ++ exit ;; + PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort + # says <Richard.M.Bartel@ccMail.Census.GOV> + echo i586-unisys-sysv4 +- exit 0 ;; ++ exit ;; + *:UNIX_System_V:4*:FTX*) + # From Gerald Hewes <hewes@openmarket.com>. + # How about differentiating between stratus architectures? -djm + echo hppa1.1-stratus-sysv4 +- exit 0 ;; ++ exit ;; + *:*:*:FTX*) + # From seanf@swdc.stratus.com. + echo i860-stratus-sysv4 +- exit 0 ;; ++ exit ;; ++ i*86:VOS:*:*) ++ # From Paul.Green@stratus.com. ++ echo ${UNAME_MACHINE}-stratus-vos ++ exit ;; + *:VOS:*:*) + # From Paul.Green@stratus.com. + echo hppa1.1-stratus-vos +- exit 0 ;; ++ exit ;; + mc68*:A/UX:*:*) + echo m68k-apple-aux${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + news*:NEWS-OS:6*:*) + echo mips-sony-newsos6 +- exit 0 ;; ++ exit ;; + R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) + if [ -d /usr/nec ]; then + echo mips-nec-sysv${UNAME_RELEASE} + else + echo mips-unknown-sysv${UNAME_RELEASE} + fi +- exit 0 ;; ++ exit ;; + BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. + echo powerpc-be-beos +- exit 0 ;; ++ exit ;; + BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only. + echo powerpc-apple-beos +- exit 0 ;; ++ exit ;; + BePC:BeOS:*:*) # BeOS running on Intel PC compatible. + echo i586-pc-beos +- exit 0 ;; ++ exit ;; ++ BePC:Haiku:*:*) # Haiku running on Intel PC compatible. ++ echo i586-pc-haiku ++ exit ;; + SX-4:SUPER-UX:*:*) + echo sx4-nec-superux${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + SX-5:SUPER-UX:*:*) + echo sx5-nec-superux${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + SX-6:SUPER-UX:*:*) + echo sx6-nec-superux${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; ++ SX-7:SUPER-UX:*:*) ++ echo sx7-nec-superux${UNAME_RELEASE} ++ exit ;; ++ SX-8:SUPER-UX:*:*) ++ echo sx8-nec-superux${UNAME_RELEASE} ++ exit ;; ++ SX-8R:SUPER-UX:*:*) ++ echo sx8r-nec-superux${UNAME_RELEASE} ++ exit ;; + Power*:Rhapsody:*:*) + echo powerpc-apple-rhapsody${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + *:Rhapsody:*:*) + echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + *:Darwin:*:*) +- echo `uname -p`-apple-darwin${UNAME_RELEASE} +- exit 0 ;; ++ UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown ++ case $UNAME_PROCESSOR in ++ i386) ++ eval $set_cc_for_build ++ if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then ++ if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ ++ (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ ++ grep IS_64BIT_ARCH >/dev/null ++ then ++ UNAME_PROCESSOR="x86_64" ++ fi ++ fi ;; ++ unknown) UNAME_PROCESSOR=powerpc ;; ++ esac ++ echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} ++ exit ;; + *:procnto*:*:* | *:QNX:[0123456789]*:*) + UNAME_PROCESSOR=`uname -p` + if test "$UNAME_PROCESSOR" = "x86"; then +@@ -1107,22 +1227,25 @@ + UNAME_MACHINE=pc + fi + echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + *:QNX:*:4*) + echo i386-pc-qnx +- exit 0 ;; +- NSR-[DGKLNPTVW]:NONSTOP_KERNEL:*:*) ++ exit ;; ++ NSE-?:NONSTOP_KERNEL:*:*) ++ echo nse-tandem-nsk${UNAME_RELEASE} ++ exit ;; ++ NSR-?:NONSTOP_KERNEL:*:*) + echo nsr-tandem-nsk${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + *:NonStop-UX:*:*) + echo mips-compaq-nonstopux +- exit 0 ;; ++ exit ;; + BS2000:POSIX*:*:*) + echo bs2000-siemens-sysv +- exit 0 ;; ++ exit ;; + DS/*:UNIX_System_V:*:*) + echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE} +- exit 0 ;; ++ exit ;; + *:Plan9:*:*) + # "uname -m" is not consistent, so use $cputype instead. 386 + # is converted to i386 for consistency with other x86 +@@ -1133,36 +1256,50 @@ + UNAME_MACHINE="$cputype" + fi + echo ${UNAME_MACHINE}-unknown-plan9 +- exit 0 ;; +- i*86:OS/2:*:*) +- # If we were able to find `uname', then EMX Unix compatibility +- # is probably installed. +- echo ${UNAME_MACHINE}-pc-os2-emx +- exit 0 ;; ++ exit ;; + *:TOPS-10:*:*) + echo pdp10-unknown-tops10 +- exit 0 ;; ++ exit ;; + *:TENEX:*:*) + echo pdp10-unknown-tenex +- exit 0 ;; ++ exit ;; + KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*) + echo pdp10-dec-tops20 +- exit 0 ;; ++ exit ;; + XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*) + echo pdp10-xkl-tops20 +- exit 0 ;; ++ exit ;; + *:TOPS-20:*:*) + echo pdp10-unknown-tops20 +- exit 0 ;; ++ exit ;; + *:ITS:*:*) + echo pdp10-unknown-its +- exit 0 ;; +- i*86:XTS-300:*:STOP) +- echo ${UNAME_MACHINE}-unknown-stop +- exit 0 ;; +- i*86:atheos:*:*) +- echo ${UNAME_MACHINE}-unknown-atheos +- exit 0 ;; ++ exit ;; ++ SEI:*:*:SEIUX) ++ echo mips-sei-seiux${UNAME_RELEASE} ++ exit ;; ++ *:DragonFly:*:*) ++ echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ++ exit ;; ++ *:*VMS:*:*) ++ UNAME_MACHINE=`(uname -p) 2>/dev/null` ++ case "${UNAME_MACHINE}" in ++ A*) echo alpha-dec-vms ; exit ;; ++ I*) echo ia64-dec-vms ; exit ;; ++ V*) echo vax-dec-vms ; exit ;; ++ esac ;; ++ *:XENIX:*:SysV) ++ echo i386-pc-xenix ++ exit ;; ++ i*86:skyos:*:*) ++ echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//' ++ exit ;; ++ i*86:rdos:*:*) ++ echo ${UNAME_MACHINE}-pc-rdos ++ exit ;; ++ i*86:AROS:*:*) ++ echo ${UNAME_MACHINE}-pc-aros ++ exit ;; + esac + + #echo '(No uname command or uname output not recognized.)' 1>&2 +@@ -1194,7 +1331,7 @@ + #endif + + #if defined (__arm) && defined (__acorn) && defined (__unix) +- printf ("arm-acorn-riscix"); exit (0); ++ printf ("arm-acorn-riscix\n"); exit (0); + #endif + + #if defined (hp300) && !defined (hpux) +@@ -1283,12 +1420,12 @@ + } + EOF + +-$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && $dummy && rm -f $dummy.c $dummy && rmdir $tmpdir && exit 0 +-rm -f $dummy.c $dummy && rmdir $tmpdir ++$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` && ++ { echo "$SYSTEM_NAME"; exit; } + + # Apollos put the system type in the environment. + +-test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit 0; } ++test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; } + + # Convex versions that predate uname can use getsysinfo(1) + +@@ -1297,22 +1434,22 @@ + case `getsysinfo -f cpu_type` in + c1*) + echo c1-convex-bsd +- exit 0 ;; ++ exit ;; + c2*) + if getsysinfo -f scalar_acc + then echo c32-convex-bsd + else echo c2-convex-bsd + fi +- exit 0 ;; ++ exit ;; + c34*) + echo c34-convex-bsd +- exit 0 ;; ++ exit ;; + c38*) + echo c38-convex-bsd +- exit 0 ;; ++ exit ;; + c4*) + echo c4-convex-bsd +- exit 0 ;; ++ exit ;; + esac + fi + +@@ -1323,7 +1460,9 @@ + the operating system you are using. It is advised that you + download the most up to date version of the config scripts from + +- ftp://ftp.gnu.org/pub/gnu/config/ ++ http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD ++and ++ http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD + + If the version you run ($0) is already up to date, please + send the following data and any information you think might be +--- misc/libtextcat-2.2/config.sub 2010-04-15 09:20:04.000000000 +0000 ++++ misc/build/libtextcat-2.2/config.sub 2010-04-15 09:20:41.000000000 +0000 +@@ -1,9 +1,10 @@ + #! /bin/sh + # Configuration validation subroutine script. + # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, +-# 2000, 2001, 2002 Free Software Foundation, Inc. ++# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 ++# Free Software Foundation, Inc. + +-timestamp='2002-09-05' ++timestamp='2010-01-22' + + # This file is (in principle) common to ALL GNU software. + # The presence of a machine in this file suggests that SOME GNU software +@@ -21,22 +22,26 @@ + # + # You should have received a copy of the GNU General Public License + # along with this program; if not, write to the Free Software +-# Foundation, Inc., 59 Temple Place - Suite 330, +-# Boston, MA 02111-1307, USA. +- ++# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA ++# 02110-1301, USA. ++# + # As a special exception to the GNU General Public License, if you + # distribute this file as part of a program that contains a + # configuration script generated by Autoconf, you may include it under + # the same distribution terms that you use for the rest of that program. + ++ + # Please send patches to <config-patches@gnu.org>. Submit a context +-# diff and a properly formatted ChangeLog entry. ++# diff and a properly formatted GNU ChangeLog entry. + # + # Configuration subroutine to validate and canonicalize a configuration type. + # Supply the specified configuration type as an argument. + # If it is invalid, we print an error message on stderr and exit with code 1. + # Otherwise, we print the canonical config type on stdout and succeed. + ++# You can get the latest version of this script from: ++# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD ++ + # This file is supposed to be the same for all GNU packages + # and recognize all the CPU types, system types and aliases + # that are meaningful with *any* GNU software. +@@ -70,8 +75,9 @@ + version="\ + GNU config.sub ($timestamp) + +-Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001 +-Free Software Foundation, Inc. ++Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, ++2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free ++Software Foundation, Inc. + + This is free software; see the source for copying conditions. There is NO + warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." +@@ -83,11 +89,11 @@ + while test $# -gt 0 ; do + case $1 in + --time-stamp | --time* | -t ) +- echo "$timestamp" ; exit 0 ;; ++ echo "$timestamp" ; exit ;; + --version | -v ) +- echo "$version" ; exit 0 ;; ++ echo "$version" ; exit ;; + --help | --h* | -h ) +- echo "$usage"; exit 0 ;; ++ echo "$usage"; exit ;; + -- ) # Stop option processing + shift; break ;; + - ) # Use stdin as input. +@@ -99,7 +105,7 @@ + *local*) + # First pass through any local machine types. + echo $1 +- exit 0;; ++ exit ;; + + * ) + break ;; +@@ -118,7 +124,10 @@ + # Here we must recognize all the valid KERNEL-OS combinations. + maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` + case $maybe_os in +- nto-qnx* | linux-gnu* | freebsd*-gnu* | storm-chaos* | os2-emx* | windows32-* | rtmk-nova*) ++ nto-qnx* | linux-gnu* | linux-dietlibc | linux-newlib* | linux-uclibc* | \ ++ uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | knetbsd*-gnu* | netbsd*-gnu* | \ ++ kopensolaris*-gnu* | \ ++ storm-chaos* | os2-emx* | rtmk-nova*) + os=-$maybe_os + basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'` + ;; +@@ -144,10 +153,13 @@ + -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\ + -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \ + -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \ +- -apple | -axis) ++ -apple | -axis | -knuth | -cray | -microblaze) + os= + basic_machine=$1 + ;; ++ -bluegene*) ++ os=-cnk ++ ;; + -sim | -cisco | -oki | -wec | -winbond) + os= + basic_machine=$1 +@@ -169,6 +181,10 @@ + -hiux*) + os=-hiuxwe2 + ;; ++ -sco6) ++ os=-sco5v6 ++ basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ++ ;; + -sco5) + os=-sco3.2v5 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` +@@ -185,6 +201,10 @@ + # Don't forget version if it is 3.2v4 or newer. + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; ++ -sco5v6*) ++ # Don't forget version if it is 3.2v4 or newer. ++ basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ++ ;; + -sco*) + os=-sco3.2v2 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` +@@ -228,51 +248,71 @@ + | a29k \ + | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \ + | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \ +- | arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr \ +- | clipper \ ++ | am33_2.0 \ ++ | arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \ ++ | bfin \ ++ | c4x | clipper \ + | d10v | d30v | dlx | dsp16xx \ +- | fr30 | frv \ ++ | fido | fr30 | frv \ + | h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \ + | i370 | i860 | i960 | ia64 \ +- | ip2k \ +- | m32r | m68000 | m68k | m88k | mcore \ ++ | ip2k | iq2000 \ ++ | lm32 \ ++ | m32c | m32r | m32rle | m68000 | m68k | m88k \ ++ | maxq | mb | microblaze | mcore | mep | metag \ + | mips | mipsbe | mipseb | mipsel | mipsle \ + | mips16 \ + | mips64 | mips64el \ +- | mips64vr | mips64vrel \ ++ | mips64octeon | mips64octeonel \ + | mips64orion | mips64orionel \ ++ | mips64r5900 | mips64r5900el \ ++ | mips64vr | mips64vrel \ + | mips64vr4100 | mips64vr4100el \ + | mips64vr4300 | mips64vr4300el \ + | mips64vr5000 | mips64vr5000el \ ++ | mips64vr5900 | mips64vr5900el \ + | mipsisa32 | mipsisa32el \ ++ | mipsisa32r2 | mipsisa32r2el \ + | mipsisa64 | mipsisa64el \ ++ | mipsisa64r2 | mipsisa64r2el \ + | mipsisa64sb1 | mipsisa64sb1el \ + | mipsisa64sr71k | mipsisa64sr71kel \ + | mipstx39 | mipstx39el \ + | mn10200 | mn10300 \ ++ | moxie \ ++ | mt \ ++ | msp430 \ ++ | nios | nios2 \ + | ns16k | ns32k \ +- | openrisc | or32 \ ++ | or32 \ + | pdp10 | pdp11 | pj | pjl \ + | powerpc | powerpc64 | powerpc64le | powerpcle | ppcbe \ + | pyramid \ +- | sh | sh[1234] | sh3e | sh[34]eb | shbe | shle | sh[1234]le | sh3ele \ ++ | rx \ ++ | score \ ++ | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \ + | sh64 | sh64le \ +- | sparc | sparc64 | sparc86x | sparclet | sparclite | sparcv9 | sparcv9b \ +- | strongarm \ +- | tahoe | thumb | tic80 | tron \ ++ | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \ ++ | sparcv8 | sparcv9 | sparcv9b | sparcv9v \ ++ | spu | strongarm \ ++ | tahoe | thumb | tic4x | tic80 | tron \ ++ | ubicom32 \ + | v850 | v850e \ + | we32k \ +- | x86 | xscale | xstormy16 | xtensa \ +- | z8k) ++ | x86 | xc16x | xscale | xscalee[bl] | xstormy16 | xtensa \ ++ | z8k | z80) + basic_machine=$basic_machine-unknown + ;; +- m6811 | m68hc11 | m6812 | m68hc12) ++ m6811 | m68hc11 | m6812 | m68hc12 | picochip) + # Motorola 68HC11/12. + basic_machine=$basic_machine-unknown + os=-none + ;; + m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k) + ;; ++ ms1) ++ basic_machine=mt-unknown ++ ;; + + # We use `pc' rather than `unknown' + # because (1) that's what they normally are, and +@@ -292,50 +332,69 @@ + | alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \ + | alphapca5[67]-* | alpha64pca5[67]-* | arc-* \ + | arm-* | armbe-* | armle-* | armeb-* | armv*-* \ +- | avr-* \ +- | bs2000-* \ +- | c[123]* | c30-* | [cjt]90-* | c4x-* | c54x-* \ +- | clipper-* | cydra-* \ ++ | avr-* | avr32-* \ ++ | bfin-* | bs2000-* \ ++ | c[123]* | c30-* | [cjt]90-* | c4x-* | c54x-* | c55x-* | c6x-* \ ++ | clipper-* | craynv-* | cydra-* \ + | d10v-* | d30v-* | dlx-* \ + | elxsi-* \ +- | f30[01]-* | f700-* | fr30-* | frv-* | fx80-* \ ++ | f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \ + | h8300-* | h8500-* \ + | hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \ + | i*86-* | i860-* | i960-* | ia64-* \ +- | ip2k-* \ +- | m32r-* \ ++ | ip2k-* | iq2000-* \ ++ | lm32-* \ ++ | m32c-* | m32r-* | m32rle-* \ + | m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \ +- | m88110-* | m88k-* | mcore-* \ ++ | m88110-* | m88k-* | maxq-* | mcore-* | metag-* | microblaze-* \ + | mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \ + | mips16-* \ + | mips64-* | mips64el-* \ +- | mips64vr-* | mips64vrel-* \ ++ | mips64octeon-* | mips64octeonel-* \ + | mips64orion-* | mips64orionel-* \ ++ | mips64r5900-* | mips64r5900el-* \ ++ | mips64vr-* | mips64vrel-* \ + | mips64vr4100-* | mips64vr4100el-* \ + | mips64vr4300-* | mips64vr4300el-* \ + | mips64vr5000-* | mips64vr5000el-* \ ++ | mips64vr5900-* | mips64vr5900el-* \ + | mipsisa32-* | mipsisa32el-* \ ++ | mipsisa32r2-* | mipsisa32r2el-* \ + | mipsisa64-* | mipsisa64el-* \ ++ | mipsisa64r2-* | mipsisa64r2el-* \ + | mipsisa64sb1-* | mipsisa64sb1el-* \ + | mipsisa64sr71k-* | mipsisa64sr71kel-* \ +- | mipstx39 | mipstx39el \ ++ | mipstx39-* | mipstx39el-* \ ++ | mmix-* \ ++ | mt-* \ ++ | msp430-* \ ++ | nios-* | nios2-* \ + | none-* | np1-* | ns16k-* | ns32k-* \ + | orion-* \ + | pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \ + | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* | ppcbe-* \ + | pyramid-* \ +- | romp-* | rs6000-* \ +- | sh-* | sh[1234]-* | sh3e-* | sh[34]eb-* | shbe-* \ ++ | romp-* | rs6000-* | rx-* \ ++ | sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \ + | shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \ +- | sparc-* | sparc64-* | sparc86x-* | sparclet-* | sparclite-* \ +- | sparcv9-* | sparcv9b-* | strongarm-* | sv1-* | sx?-* \ +- | tahoe-* | thumb-* | tic30-* | tic4x-* | tic54x-* | tic80-* | tron-* \ ++ | sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \ ++ | sparclite-* \ ++ | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | strongarm-* | sv1-* | sx?-* \ ++ | tahoe-* | thumb-* \ ++ | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \ ++ | tile-* | tilegx-* \ ++ | tron-* \ ++ | ubicom32-* \ + | v850-* | v850e-* | vax-* \ + | we32k-* \ +- | x86-* | x86_64-* | xps100-* | xscale-* | xstormy16-* \ +- | xtensa-* \ ++ | x86-* | x86_64-* | xc16x-* | xps100-* | xscale-* | xscalee[bl]-* \ ++ | xstormy16-* | xtensa*-* \ + | ymp-* \ +- | z8k-*) ++ | z8k-* | z80-*) ++ ;; ++ # Recognize the basic CPU types without company name, with glob match. ++ xtensa*) ++ basic_machine=$basic_machine-unknown + ;; + # Recognize the various machine names and aliases which stand + # for a CPU type and a company and sometimes even an OS. +@@ -353,6 +412,9 @@ + basic_machine=a29k-amd + os=-udi + ;; ++ abacus) ++ basic_machine=abacus-unknown ++ ;; + adobe68k) + basic_machine=m68010-adobe + os=-scout +@@ -367,6 +429,12 @@ + basic_machine=a29k-none + os=-bsd + ;; ++ amd64) ++ basic_machine=x86_64-pc ++ ;; ++ amd64-*) ++ basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'` ++ ;; + amdahl) + basic_machine=580-amdahl + os=-sysv +@@ -390,6 +458,10 @@ + basic_machine=m68k-apollo + os=-bsd + ;; ++ aros) ++ basic_machine=i386-pc ++ os=-aros ++ ;; + aux) + basic_machine=m68k-apple + os=-aux +@@ -398,10 +470,26 @@ + basic_machine=ns32k-sequent + os=-dynix + ;; ++ blackfin) ++ basic_machine=bfin-unknown ++ os=-linux ++ ;; ++ blackfin-*) ++ basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'` ++ os=-linux ++ ;; ++ bluegene*) ++ basic_machine=powerpc-ibm ++ os=-cnk ++ ;; + c90) + basic_machine=c90-cray + os=-unicos + ;; ++ cegcc) ++ basic_machine=arm-unknown ++ os=-cegcc ++ ;; + convex-c1) + basic_machine=c1-convex + os=-bsd +@@ -426,12 +514,27 @@ + basic_machine=j90-cray + os=-unicos + ;; ++ craynv) ++ basic_machine=craynv-cray ++ os=-unicosmp ++ ;; ++ cr16) ++ basic_machine=cr16-unknown ++ os=-elf ++ ;; + crds | unos) + basic_machine=m68k-crds + ;; ++ crisv32 | crisv32-* | etraxfs*) ++ basic_machine=crisv32-axis ++ ;; + cris | cris-* | etrax*) + basic_machine=cris-axis + ;; ++ crx) ++ basic_machine=crx-unknown ++ os=-elf ++ ;; + da30 | da30-*) + basic_machine=m68k-da30 + ;; +@@ -454,6 +557,14 @@ + basic_machine=m88k-motorola + os=-sysv3 + ;; ++ dicos) ++ basic_machine=i686-pc ++ os=-dicos ++ ;; ++ djgpp) ++ basic_machine=i586-pc ++ os=-msdosdjgpp ++ ;; + dpx20 | dpx20-*) + basic_machine=rs6000-bull + os=-bosx +@@ -604,6 +715,14 @@ + basic_machine=m68k-isi + os=-sysv + ;; ++ m68knommu) ++ basic_machine=m68k-unknown ++ os=-linux ++ ;; ++ m68knommu-*) ++ basic_machine=m68k-`echo $basic_machine | sed 's/^[^-]*-//'` ++ os=-linux ++ ;; + m88k-omron*) + basic_machine=m88k-omron + ;; +@@ -615,10 +734,17 @@ + basic_machine=ns32k-utek + os=-sysv + ;; ++ microblaze) ++ basic_machine=microblaze-xilinx ++ ;; + mingw32) + basic_machine=i386-pc + os=-mingw32 + ;; ++ mingw32ce) ++ basic_machine=arm-unknown ++ os=-mingw32ce ++ ;; + miniframe) + basic_machine=m68000-convergent + ;; +@@ -632,10 +758,6 @@ + mips3*) + basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown + ;; +- mmix*) +- basic_machine=mmix-knuth +- os=-mmixware +- ;; + monitor) + basic_machine=m68k-rom68k + os=-coff +@@ -648,6 +770,9 @@ + basic_machine=i386-pc + os=-msdos + ;; ++ ms1-*) ++ basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'` ++ ;; + mvs) + basic_machine=i370-ibm + os=-mvs +@@ -723,9 +848,12 @@ + basic_machine=hppa1.1-oki + os=-proelf + ;; +- or32 | or32-*) ++ openrisc | openrisc-*) + basic_machine=or32-unknown +- os=-coff ++ ;; ++ os400) ++ basic_machine=powerpc-ibm ++ os=-os400 + ;; + OSE68000 | ose68000) + basic_machine=m68000-ericsson +@@ -743,6 +871,14 @@ + basic_machine=i860-intel + os=-osf + ;; ++ parisc) ++ basic_machine=hppa-unknown ++ os=-linux ++ ;; ++ parisc-*) ++ basic_machine=hppa-`echo $basic_machine | sed 's/^[^-]*-//'` ++ os=-linux ++ ;; + pbd) + basic_machine=sparc-tti + ;; +@@ -752,24 +888,36 @@ + pc532 | pc532-*) + basic_machine=ns32k-pc532 + ;; ++ pc98) ++ basic_machine=i386-pc ++ ;; ++ pc98-*) ++ basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'` ++ ;; + pentium | p5 | k5 | k6 | nexgen | viac3) + basic_machine=i586-pc + ;; + pentiumpro | p6 | 6x86 | athlon | athlon_*) + basic_machine=i686-pc + ;; +- pentiumii | pentium2) ++ pentiumii | pentium2 | pentiumiii | pentium3) + basic_machine=i686-pc + ;; ++ pentium4) ++ basic_machine=i786-pc ++ ;; + pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*) + basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentiumpro-* | p6-* | 6x86-* | athlon-*) + basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; +- pentiumii-* | pentium2-*) ++ pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*) + basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; ++ pentium4-*) ++ basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'` ++ ;; + pn) + basic_machine=pn-gould + ;; +@@ -802,6 +950,10 @@ + basic_machine=i586-unknown + os=-pw32 + ;; ++ rdos) ++ basic_machine=i386-pc ++ os=-rdos ++ ;; + rom68k) + basic_machine=m68k-rom68k + os=-coff +@@ -828,6 +980,14 @@ + sb1el) + basic_machine=mipsisa64sb1el-unknown + ;; ++ sde) ++ basic_machine=mipsisa32-sde ++ os=-elf ++ ;; ++ sei) ++ basic_machine=mips-sei ++ os=-seiux ++ ;; + sequent) + basic_machine=i386-sequent + ;; +@@ -835,6 +995,12 @@ + basic_machine=sh-hitachi + os=-hms + ;; ++ sh5el) ++ basic_machine=sh5le-unknown ++ ;; ++ sh64) ++ basic_machine=sh64-unknown ++ ;; + sparclite-wrs | simso-wrs) + basic_machine=sparclite-wrs + os=-vxworks +@@ -901,10 +1067,6 @@ + basic_machine=i386-sequent + os=-dynix + ;; +- t3d) +- basic_machine=alpha-cray +- os=-unicos +- ;; + t3e) + basic_machine=alphaev5-cray + os=-unicos +@@ -913,14 +1075,27 @@ + basic_machine=t90-cray + os=-unicos + ;; +- tic4x | c4x*) +- basic_machine=tic4x-unknown +- os=-coff +- ;; + tic54x | c54x*) + basic_machine=tic54x-unknown + os=-coff + ;; ++ tic55x | c55x*) ++ basic_machine=tic55x-unknown ++ os=-coff ++ ;; ++ tic6x | c6x*) ++ basic_machine=tic6x-unknown ++ os=-coff ++ ;; ++ # This must be matched before tile*. ++ tilegx*) ++ basic_machine=tilegx-unknown ++ os=-linux-gnu ++ ;; ++ tile*) ++ basic_machine=tile-unknown ++ os=-linux-gnu ++ ;; + tx39) + basic_machine=mipstx39-unknown + ;; +@@ -934,6 +1109,10 @@ + tower | tower-32) + basic_machine=m68k-ncr + ;; ++ tpf) ++ basic_machine=s390x-ibm ++ os=-tpf ++ ;; + udi29k) + basic_machine=a29k-amd + os=-udi +@@ -977,9 +1156,9 @@ + basic_machine=hppa1.1-winbond + os=-proelf + ;; +- windows32) +- basic_machine=i386-pc +- os=-windows32-msvcrt ++ xbox) ++ basic_machine=i686-pc ++ os=-mingw32 + ;; + xps | xps100) + basic_machine=xps100-honeywell +@@ -992,6 +1171,10 @@ + basic_machine=z8k-unknown + os=-sim + ;; ++ z80-*-coff) ++ basic_machine=z80-unknown ++ os=-sim ++ ;; + none) + basic_machine=none-none + os=-none +@@ -1011,6 +1194,9 @@ + romp) + basic_machine=romp-ibm + ;; ++ mmix) ++ basic_machine=mmix-knuth ++ ;; + rs6000) + basic_machine=rs6000-ibm + ;; +@@ -1027,13 +1213,10 @@ + we32k) + basic_machine=we32k-att + ;; +- sh3 | sh4 | sh3eb | sh4eb | sh[1234]le | sh3ele) ++ sh[1234] | sh[24]a | sh[24]aeb | sh[34]eb | sh[1234]le | sh[23]ele) + basic_machine=sh-unknown + ;; +- sh64) +- basic_machine=sh64-unknown +- ;; +- sparc | sparcv9 | sparcv9b) ++ sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v) + basic_machine=sparc-sun + ;; + cydra) +@@ -1080,6 +1263,9 @@ + # First match some system type aliases + # that might get confused with valid system types. + # -solaris* is a basic system type, with this one exception. ++ -auroraux) ++ os=-auroraux ++ ;; + -solaris1 | -solaris1.*) + os=`echo $os | sed -e 's|solaris1|sunos4|'` + ;; +@@ -1100,24 +1286,30 @@ + # Each alternative MUST END IN A *, to match a version number. + # -sysv* is not here because it comes later, after sysvr4. + -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ +- | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\ +- | -hpux* | -unos* | -osf* | -luna* | -dgux* | -solaris* | -sym* \ ++ | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\ ++ | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \ ++ | -sym* | -kopensolaris* \ + | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ +- | -aos* \ ++ | -aos* | -aros* \ + | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \ + | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \ +- | -hiux* | -386bsd* | -netbsd* | -openbsd* | -freebsd* | -riscix* \ +- | -lynxos* | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \ ++ | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \ ++ | -openbsd* | -solidbsd* \ ++ | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \ ++ | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \ + | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \ + | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \ +- | -chorusos* | -chorusrdb* \ ++ | -chorusos* | -chorusrdb* | -cegcc* \ + | -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \ +- | -mingw32* | -linux-gnu* | -uxpv* | -beos* | -mpeix* | -udk* \ +- | -interix* | -uwin* | -rhapsody* | -darwin* | -opened* \ ++ | -mingw32* | -linux-gnu* | -linux-newlib* | -linux-uclibc* \ ++ | -uxpv* | -beos* | -mpeix* | -udk* \ ++ | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \ + | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \ + | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \ + | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \ +- | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* | -powermax*) ++ | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \ ++ | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \ ++ | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es*) + # Remember, each alternative MUST END IN *, to match a version number. + ;; + -qnx*) +@@ -1129,16 +1321,21 @@ + ;; + esac + ;; ++ -nto-qnx*) ++ ;; + -nto*) +- os=-nto-qnx ++ os=`echo $os | sed -e 's|nto|nto-qnx|'` + ;; + -sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \ +- | -windows* | -osx | -abug | -netware* | -os9* | -beos* \ ++ | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \ + | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*) + ;; + -mac*) + os=`echo $os | sed -e 's|mac|macos|'` + ;; ++ -linux-dietlibc) ++ os=-linux-dietlibc ++ ;; + -linux*) + os=`echo $os | sed -e 's|linux|linux-gnu|'` + ;; +@@ -1151,6 +1348,9 @@ + -opened*) + os=-openedition + ;; ++ -os400*) ++ os=-os400 ++ ;; + -wince*) + os=-wince + ;; +@@ -1172,6 +1372,9 @@ + -atheos*) + os=-atheos + ;; ++ -syllable*) ++ os=-syllable ++ ;; + -386bsd) + os=-bsd + ;; +@@ -1194,6 +1397,9 @@ + -sinix*) + os=-sysv4 + ;; ++ -tpf*) ++ os=-tpf ++ ;; + -triton*) + os=-sysv3 + ;; +@@ -1224,6 +1430,20 @@ + -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) + os=-mint + ;; ++ -aros*) ++ os=-aros ++ ;; ++ -kaos*) ++ os=-kaos ++ ;; ++ -zvmoe) ++ os=-zvmoe ++ ;; ++ -dicos*) ++ os=-dicos ++ ;; ++ -nacl*) ++ ;; + -none) + ;; + *) +@@ -1246,6 +1466,12 @@ + # system, and we'll never get to this point. + + case $basic_machine in ++ score-*) ++ os=-elf ++ ;; ++ spu-*) ++ os=-elf ++ ;; + *-acorn) + os=-riscix1.2 + ;; +@@ -1255,6 +1481,9 @@ + arm*-semi) + os=-aout + ;; ++ c4x-* | tic4x-*) ++ os=-coff ++ ;; + # This must come before the *-dec entry. + pdp10-*) + os=-tops20 +@@ -1280,6 +1509,9 @@ + m68*-cisco) + os=-aout + ;; ++ mep-*) ++ os=-elf ++ ;; + mips*-cisco) + os=-elf + ;; +@@ -1298,9 +1530,15 @@ + *-be) + os=-beos + ;; ++ *-haiku) ++ os=-haiku ++ ;; + *-ibm) + os=-aix + ;; ++ *-knuth) ++ os=-mmixware ++ ;; + *-wec) + os=-proelf + ;; +@@ -1403,7 +1641,7 @@ + -sunos*) + vendor=sun + ;; +- -aix*) ++ -cnk*|-aix*) + vendor=ibm + ;; + -beos*) +@@ -1433,9 +1671,15 @@ + -mvs* | -opened*) + vendor=ibm + ;; ++ -os400*) ++ vendor=ibm ++ ;; + -ptx*) + vendor=sequent + ;; ++ -tpf*) ++ vendor=ibm ++ ;; + -vxsim* | -vxworks* | -windiss*) + vendor=wrs + ;; +@@ -1460,7 +1704,7 @@ + esac + + echo $basic_machine$os +-exit 0 ++exit + + # Local variables: + # eval: (add-hook 'write-file-hooks 'time-stamp) diff --git a/libtextcat/makefile.mk b/libtextcat/makefile.mk new file mode 100644 index 000000000000..01a2a6eadc36 --- /dev/null +++ b/libtextcat/makefile.mk @@ -0,0 +1,85 @@ +#************************************************************************* +# +# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +# +# Copyright 2000, 2010 Oracle and/or its affiliates. +# +# OpenOffice.org - a multi-platform office productivity suite +# +# This file is part of OpenOffice.org. +# +# OpenOffice.org is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License version 3 +# only, as published by the Free Software Foundation. +# +# OpenOffice.org is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License version 3 for more details +# (a copy is included in the LICENSE file that accompanied this code). +# +# You should have received a copy of the GNU Lesser General Public License +# version 3 along with OpenOffice.org. If not, see +# <http://www.openoffice.org/license.html> +# for a copy of the LGPLv3 License. +# +#************************************************************************* + +PRJ=. + +PRJNAME=libtextcat +TARGET=libtextcat + +# --- Settings ----------------------------------------------------- + +.INCLUDE : settings.mk + +# --- Files -------------------------------------------------------- + +TARFILE_NAME=libtextcat-2.2 +TARFILE_MD5=128cfc86ed5953e57fe0f5ae98b62c2e +TARFILE_ROOTDIR=libtextcat-2.2 + +PATCH_FILES=libtextcat-2.2.patch + + +ADDITIONAL_FILES= \ + src$/utf8misc.h \ + src$/utf8misc.c \ + src$/win32_config.h \ + src$/makefile.mk \ + src$/libtextcat.map + +.IF "$(GUI)"=="UNX" +#CONFIGURE_DIR=$(BUILD_DIR) + +#relative to CONFIGURE_DIR +CONFIGURE_ACTION=configure CFLAGS="$(ARCH_FLAGS) $(EXTRA_CFLAGS)" +CONFIGURE_FLAGS=$(eq,$(OS),MACOSX CPPFLAGS="$(EXTRA_CDEFS)" $(NULL)) + +BUILD_ACTION=make + +OUT2LIB=$(BUILD_DIR)$/src$/.libs$/libtextcat*$(DLLPOST) + +.ENDIF # "$(GUI)"=="UNX" + + +.IF "$(GUI)"=="WNT" || "$(GUI)"=="OS2" +BUILD_ACTION=cd src && dmake $(MAKEMACROS) +.ENDIF # "$(GUI)"=="WNT" || "$(GUI)"=="OS2" + + +OUT2INC= \ + $(BUILD_DIR)$/src$/config.h \ + $(BUILD_DIR)$/src$/common.h \ + $(BUILD_DIR)$/src$/fingerprint.h \ + $(BUILD_DIR)$/src$/textcat.h \ + $(BUILD_DIR)$/src$/wg_mempool.h + + +# --- Targets ------------------------------------------------------ + +.INCLUDE : set_ext.mk +.INCLUDE : target.mk +.INCLUDE : tg_ext.mk + diff --git a/libtextcat/prj/build.lst b/libtextcat/prj/build.lst new file mode 100644 index 000000000000..da155db3d291 --- /dev/null +++ b/libtextcat/prj/build.lst @@ -0,0 +1,3 @@ +ltc libtextcat : stlport soltools solenv NULL +ltc libtextcat usr1 - all ltc_mkout NULL +ltc libtextcat nmake - all ltc_libtextcat NULL diff --git a/libtextcat/prj/d.lst b/libtextcat/prj/d.lst new file mode 100644 index 000000000000..0e7f5636bdc0 --- /dev/null +++ b/libtextcat/prj/d.lst @@ -0,0 +1,12 @@ + +..\%__SRC%\lib\lib*.* %_DEST%\lib%_EXT%\lib*.* +..\%__SRC%\lib\ilib*.* %_DEST%\lib%_EXT%\ilib*.* +..\%__SRC%\bin\l*.dll %_DEST%\bin%_EXT%\*.dll + +mkdir: %_DEST%\inc%_EXT%\libtextcat +..\%__SRC%\misc\build\libtextcat-2.2\src\*.h %_DEST%\inc%_EXT%\libtextcat\*.h + +# data for language guessing +..\data\new_fingerprints\fpdb.conf %COMMON_DEST%\pck%_EXT%\fpdb.conf +..\data\new_fingerprints\lm\*.lm %COMMON_DEST%\pck%_EXT%\*.lm + |