#
#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
#       All Rights Reserved.
#
#   file:  dict_word.txt
#
#   ICU Word Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on Version 4.0.0, dated 2003-04-17
#

####################################################################################
#
#  Character class definitions from TR 29
#
####################################################################################

# Katakana: every character of the Katakana script, plus four sound marks that are
# added to the class individually by name.
$Katakana = [
    [:Script = KATAKANA:]
    [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
    [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
    [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
    [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]
];

# Characters carrying the Unicode "Ideographic" property.
$Ideographic = [:Ideographic:];

# Every character of the Hangul script.
$Hangul = [:Script = HANGUL:];

# List of dashes or hyphens that should be accepted as part of the word if a single one
# of them prefixes or postfixes a word.  E.g. in German: "Arbeits-" or "-nehmer", where
# the hyphen needs to be part of the word in order to have it properly spell checked etc.
$PrePostDashHyphen = [
    [:name = HYPHEN-MINUS:]
    [:name = EN DASH:]
    [:name = EM DASH:]
];

# Letters: U+0002, all Alphabetic characters, COMMERCIAL AT and HEBREW PUNCTUATION
# GERESH, with ideographs, Katakana, Hangul, Thai, Lao and Hiragana removed again.
# The subtractions apply left to right, so the order of the "-" terms is kept as is.
# NOTE(review): why the control code point U+0002 counts as a letter is not documented
# in this file - confirm with the callers before changing it.
$ALetter = [
    \u0002
    [:Alphabetic:]
    [:name= COMMERCIAL AT:]
    [:name= HEBREW PUNCTUATION GERESH:]
    - $Ideographic
    - $Katakana
    - $Hangul
    - [:Script = Thai:]
    - [:Script = Lao:]
    - [:Script = Hiragana:]
];

# Characters that may sit between two letters without splitting the word.
# The three dashes from $PrePostDashHyphen are repeated at the end of this set.
# NOTE(review): U+0084 is a C1 control code point; its purpose here is not documented
# in this file - confirm before changing it.
$MidLetter = [
    [:name = APOSTROPHE:]
    [:name = GRAVE ACCENT:]
    \u0084
    [:name = SOFT HYPHEN:]
    [:name = MIDDLE DOT:]
    [:name = GREEK TONOS:]
    [:name= FULL STOP:]
    [:name = HEBREW PUNCTUATION GERSHAYIM:]
    [:name = DOUBLE VERTICAL LINE:]
    [:name = LEFT SINGLE QUOTATION MARK:]
    [:name = RIGHT SINGLE QUOTATION MARK:]
    [:name = HYPHENATION POINT:]
    [:name = PRIME:]
    [:name = HYPHEN-MINUS:]
    [:name = EN DASH:]
    [:name = EM DASH:]
];

# A single FULL STOP; the word rule accepts one of these at the very end of a word.
$SufixLetter = [:name= FULL STOP:];

# Characters that may sit between two digits without splitting the number.
# NOTE(review): COMMERCIAL AT is a member of both $ALetter and $MidNum - presumably
# intentional; verify before "fixing" the overlap.
$MidNum = [
    [:LineBreak = Infix_Numeric:]
    [:name= COMMERCIAL AT:]
    \u0084
    [:name = GREEK TONOS:]
    [:name = ARABIC DECIMAL SEPARATOR:]
    [:name = LEFT SINGLE QUOTATION MARK:]
    [:name = RIGHT SINGLE QUOTATION MARK:]
    [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
    [:name = PRIME:]
];

# Digits, selected through the Line_Break property value "Numeric".
$Numeric = [:LineBreak = Numeric:];

# ZERO WIDTH SPACE.  It is carved out of $Control and $Format below.
$TheZWSP = \u200b;

#
#  Character Class Definitions.
#    The names are those from TR29.
#
$CR = \u000d;
$LF = \u000a;

# Line separators, paragraph separators, control and format characters,
# with the zero width space excluded.
$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];

# Characters with the Grapheme_Extend property (they attach to a preceding base).
$Extend = [[:Grapheme_Extend = TRUE:]];

####################################################################################
#
#  Word Break Rules.    Definitions and Rules specific to word break begin here.
#
####################################################################################

# Format characters (general category Cf), again without the zero width space.
$Format = [[:Cf:] - $TheZWSP];

# Rule 3:  Treat a grapheme cluster as if it were a single character.
#          Hangul syllables are easier to deal with here than they are in grapheme
#          clusters, because we don't need to find the boundaries between adjacent
#          syllables - they won't be word boundaries.
#

#
#  "Extended" definitions.  Grapheme Cluster + Format Chars, treated like the base char.
#
# Each "Ex" variable is the base class followed by any number of $Extend characters.
$ALetterEx     = $ALetter     $Extend*;
$NumericEx     = $Numeric     $Extend*;
$MidNumEx      = $MidNum      $Extend*;
$MidLetterEx   = $MidLetter   $Extend*;
$SufixLetterEx = $SufixLetter $Extend*;
$KatakanaEx    = $Katakana    $Extend*;
$IdeographicEx = $Ideographic $Extend*;
$HangulEx      = $Hangul      $Extend*;
$FormatEx      = $Format      $Extend*;

#
#  Numbers.  Rules 8, 11, 12 from the TR.
#    Digits, optionally joined by a single $MidNum character; format characters
#    may appear on either side of the joiner.  Tagged {100}.
#    NOTE(review): the tags {100}/{200}/{300}/{400} used in this file appear to
#    line up with ICU's word-break status ranges (number / letter / kana / ideo) -
#    confirm against the ICU version in use.
#
$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
$NumberSequence {100};

#
#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
#     - must include at least one letter.
#     - may include both letters and numbers.
#     - may include MidLetter, MidNumber punctuation.
#
#  At most one leading or trailing dash/hyphen should be accepted as well.
#  E.g. in German: "Arbeits-" or "-nehmer", where that hyphen needs to
#  be part of the word in order to have it properly spell checked etc.
$LetterSequence = $PrePostDashHyphen?
                  $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*
                  $PrePostDashHyphen?;                                       # rules #6, #7

# A word: an optional leading number, one letter sequence, any further mix of number
# and letter sequences, and at most one trailing $SufixLetter.  Tagged {200}.
($NumberSequence $FormatEx*)?
    $LetterSequence
    ($FormatEx* ($NumberSequence | $LetterSequence))*
    $SufixLetterEx? {200};

# A run of punctuation and symbol characters, with no tag.
[[:P:][:S:]]*;

#
#  Do not break between Katakana.  Rule #13.
#  A Hiragana character (with its $Extend characters) is matched one at a time
#  and carries the same {300} tag.
#
$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
[:Hiragana:] $Extend* {300};

#
#  Ideographic Characters.  Stand by themselves as words.
#                           Separated from the "Everything Else" rule, below, only so
#                           that they can be tagged with a return value.
#                           TODO:  is this what we want?
#  Hangul runs are handled the same way and share the {400} tag.
#
$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
$HangulEx      ($FormatEx* $HangulEx)*      {400};

#
#  Everything Else, with no tag.
#                   Non-Control chars combine with $Extend (combining) chars.
#                   Control characters do not.
#
[^$Control [:Ideographic:]] $Extend*;

# A CR immediately followed by an LF is matched as one unit.
$CR $LF;

#
#  Reverse Rules.   Back up over any of the chars that can group together.
#                   (Reverse rules do not need to be exact; they can back up too far,
#                   but must back up at least enough, and must stop on a boundary.)
#

# NonStarters are the set of all characters that can appear at the 2nd - nth position
#    of a word.  (They may also be the first.)  The reverse rule skips over these,
#    until it reaches something that can only be the start (and probably only) char
#    in a "word".  A space or punctuation meets the test.
#
$NonStarters = [
    $Numeric $ALetter $Katakana $Ideographic $Hangul
    [:P:] [:S:]
    $MidLetter $MidNum $SufixLetter
    $Extend $Format
];

# The next line is a disabled alternative reverse rule, kept exactly as found.
#!.*;

# Active reverse rule (the leading "!" marks it as one).  The "\n \r" alternative is
# the forward "$CR $LF" pair written in the opposite order, since the text is walked
# back to front here.  The final "." takes the one character the match stops on.
! ($NonStarters* | \n \r) .;