From 93ea507dcc9db5dc0da4191997b7a80cd2249a01 Mon Sep 17 00:00:00 2001 From: Eike Rathke Date: Fri, 11 Jun 2010 01:50:25 +0200 Subject: locales33a: #i111152# treat Indic aksaras with virama as one grapheme cluster for character iterator; patch from --- i18npool/source/breakiterator/data/char_in.txt | 112 +++++++++++++++++++------ 1 file changed, 86 insertions(+), 26 deletions(-) (limited to 'i18npool') diff --git a/i18npool/source/breakiterator/data/char_in.txt b/i18npool/source/breakiterator/data/char_in.txt index 72c4a44720cd..5e1ed67596c0 100644 --- a/i18npool/source/breakiterator/data/char_in.txt +++ b/i18npool/source/breakiterator/data/char_in.txt @@ -1,48 +1,108 @@ # -# Copyright (C) 2002-2003, International Business Machines Corporation and others. +# Copyright (C) 2002-2009, International Business Machines Corporation and others. # All Rights Reserved. # # file: char.txt # # ICU Character Break Rules, also known as Grapheme Cluster Boundaries # See Unicode Standard Annex #29. -# These rules are based on TR29 Version 4.0.0 +# These rules are based on TR29 Revision 13, for Unicode Version 5.1 # # # Character Class Definitions. -# The names are those from TR29. # -$CR = \r; -$LF = \n; -$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]]; - -# add Japanese Half Width voicing marks to $Extend -$VoiceMarks = [\uff9e\uff9f]; -$cmcextend = [ \u0903 \u093e-\u0940 \u0949-\u094C \u09bf-\u09c0 \u09c7-\u09c8 \u09cb-\u09cc \u0bc1-\u0bc2 \u0bc6-\u0bc8 \u0bca-\u0bcc \u0c01-\u0c03 \u0c41-\u0c44]; -$Extend = [[:Grapheme_Extend = TRUE:] $VoiceMarks $cmcextend]; +$CR = [\p{Grapheme_Cluster_Break = CR}]; +$LF = [\p{Grapheme_Cluster_Break = LF}]; +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$Prepend = [\p{Grapheme_Cluster_Break = Prepend}]; +$Extend = [\p{Grapheme_Cluster_Break = Extend}]; +$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}]; +$BengaliLetter = [\u0985-\u09B9 \u09CE \u09DC-\u09E1 \u09F0-\u09F1]; +$BengaliSignVirama = \u09CD; +$GujaratiLetter = [\u0A85-\u0A8C \u0A8F-\u0A90 \u0A93-\u0AB9 \u0AE0-\u0AE1]; +$GujaratiSignVirama = \u0ACD; +$DevanagariLetter = [\u0904-\u0939 \u0958-\u0961 \u0972-\u097F]; +$DevanagariSignVirama = \u094D; +$KannadaLetter = [\u0C85-\u0CB9 \u0CDE-\u0CE1]; +$KannadaSignVirama = \u0CCD; +$MalayalamLetter = [\u0D05-\u0D39 \u0D60-\u0D61 \u0D7A-\u0D7F]; +$MalayalamSignVirama = \u0D4D; +$OriyaLetter = [\u0B05-\u0B39 \u0B5C-\u0B61 \u0B71]; +$OriyaSignVirama = \u0B4D; +$GurmukhiLetter = [\u0A05-\u0A39 \u0A59-\u0A5E]; +$GurmukhiSignVirama = \u0A4D; +$TamilLetter = [\u0B85-\u0BB9]; +$TamilSignVirama = \u0BCD; +$TeluguLetter = [\u0C05-\u0C39 \u0C58-\u0C61]; +$TeluguSignVirama = \u0C4D; # # Korean Syllable Definitions # -$L = [:Hangul_Syllable_Type = L:]; -$V = [:Hangul_Syllable_Type = V:]; -$T = [:Hangul_Syllable_Type = T:]; +$L = [\p{Grapheme_Cluster_Break = L}]; +$V = [\p{Grapheme_Cluster_Break = V}]; +$T = [\p{Grapheme_Cluster_Break = T}]; -$LV = [:Hangul_Syllable_Type = LV:]; -$LVT = [:Hangul_Syllable_Type = LVT:]; +$LV = [\p{Grapheme_Cluster_Break = LV}]; +$LVT = [\p{Grapheme_Cluster_Break = LVT}]; -$HangulSyllable = $L+ | ($L* ($LV? $V+ | $LV | $LVT) $T*) | $T+; -# -# Forward Break Rules -# +## ------------------------------------------------- +!!chain; + +!!forward; + $CR $LF; -([^$Control] | $HangulSyllable) $Extend*; -.; +$BengaliLetter ($BengaliSignVirama $BengaliLetter?)+; +$GujaratiLetter ($GujaratiSignVirama $GujaratiLetter?)+; +$DevanagariLetter ($DevanagariSignVirama $DevanagariLetter?)+; +$KannadaLetter ($KannadaSignVirama $KannadaLetter?)+; +$MalayalamLetter ($MalayalamSignVirama $MalayalamLetter?)+; +$OriyaLetter ($OriyaSignVirama $OriyaLetter?)+; +$GurmukhiLetter ($GurmukhiSignVirama $GurmukhiLetter?)+; +$TamilLetter ($TamilSignVirama $TamilLetter?)+; +$TeluguLetter ($TeluguSignVirama $TeluguLetter?)+; + +$L ($L | $V | $LV | $LVT); +($LV | $V) ($V | $T); +($LVT | $T) $T; + +[^$Control $CR $LF] $Extend; + +[^$Control $CR $LF] $SpacingMark; +$Prepend [^$Control $CR $LF]; + + +## ------------------------------------------------- + +!!reverse; +$LF $CR; +($BengaliLetter? $BengaliSignVirama)+ $BengaliLetter; +($GujaratiLetter? $GujaratiSignVirama)+ $GujaratiLetter; +($DevanagariLetter? $DevanagariSignVirama)+ $DevanagariLetter; +($KannadaLetter? $KannadaSignVirama)+ $KannadaLetter; +($MalayalamLetter? $MalayalamSignVirama)+ $MalayalamLetter; +($OriyaLetter? $OriyaSignVirama)+ $OriyaLetter; +($GurmukhiLetter? $GurmukhiSignVirama)+ $GurmukhiLetter; +($TamilLetter? $TamilSignVirama)+ $TamilLetter; +($TeluguLetter? $TeluguSignVirama)+ $TeluguLetter; +($L | $V | $LV | $LVT) $L; +($V | $T) ($LV | $V); +$T ($LVT | $T); + +$Extend [^$Control $CR $LF]; +$SpacingMark [^$Control $CR $LF]; +[^$Control $CR $LF] $Prepend; + + +## ------------------------------------------------- + +!!safe_reverse; + + +## ------------------------------------------------- + +!!safe_forward; -# -# Reverse Rule, back up to the beginning of some preceding grapheme cluster. -# -! ($Extend | $V | $T )* ($LF $CR | ($LV | $LVT)?$L* | .); -- cgit v1.2.3