diff options
Diffstat (limited to 'i18npool/source/breakiterator/data/count_word.txt')
-rw-r--r-- | i18npool/source/breakiterator/data/count_word.txt | 125 |
1 files changed, 0 insertions, 125 deletions
diff --git a/i18npool/source/breakiterator/data/count_word.txt b/i18npool/source/breakiterator/data/count_word.txt deleted file mode 100644 index 4ba882bb3b..0000000000 --- a/i18npool/source/breakiterator/data/count_word.txt +++ /dev/null @@ -1,125 +0,0 @@ -# -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. -# -# file: count_word.txt -# -# ICU Word Break Rules -# See Unicode Standard Annex #29. -# These rules are based on Version 4.0.0, dated 2003-04-17 -# - - - -#################################################################################### -# -# Character class definitions from TR 29 -# -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - - -$dash = \u002d; - -$ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:] - [:P:] [:S:] [:LineBreak = Numeric:] - - $dash - - $Katakana - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$TheZWSP = \u200b; - -# -# Character Class Definitions. -# The names are those from TR29. -# -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; - - - - -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. -# -#################################################################################### - -$Format = [[:Cf:] - $TheZWSP]; - - - -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# - - -# -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. -# -$ALetterEx = $ALetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$FormatEx = $Format $Extend*; - -# -# Numbers. Rules 8, 11, 12 form the TR. -# - -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $ALetterEx ($FormatEx* $ALetterEx)*; # rules #6, #7 -$LetterSequence {200}; - -$ALetterEx* $dash+ {200}; -$ALetterEx* ($dash $LetterSequence)+ $dash* {200}; - -# -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; - -# -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -# [:IDEOGRAPHIC:] $Extend* {400}; - -# -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; - -# -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) -# - -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. -# -$NonStarters = [$ALetter $Katakana $Extend $Format]; - -#!.*; -! ($NonStarters* | \n \r) .; - |