summaryrefslogtreecommitdiff
path: root/bin
diff options
context:
space:
mode:
authorJeroen Nijhof <jeroen@nijhof.co.uk>2014-05-01 19:32:17 +0200
committerMichael Meeks <michael.meeks@collabora.com>2014-05-02 09:15:51 +0000
commitcda4ee0c50d2a4d3b471e63fe115b1a90b285f89 (patch)
tree6e8f5a7c675ec6e3379300cadb04a4d1686aa60e /bin
parent6efd9725912ae67bafbfe613220d9c09443ea0ba (diff)
Speed up find-german-comments: speed up text_cat -s
follow-up commit to https://gerrit.libreoffice.org/#/c/9226/ . text_cat -s was documented as "Not very efficient yet, because language models are re-loaded after each line." So if we want to use text_cat -s for thousands of lines, better read the language models only once and store them. When tested on svl/source, the speed-up was a further factor of 1.5 (reduced from 6 s to 4 s). Change-Id: I654a250b0e369e01c5eac5970b64df1390f0ef35 Reviewed-on: https://gerrit.libreoffice.org/9227 Reviewed-by: Michael Meeks <michael.meeks@collabora.com> Tested-by: Michael Meeks <michael.meeks@collabora.com>
Diffstat (limited to 'bin')
-rwxr-xr-xbin/text_cat/text_cat81
1 files changed, 46 insertions, 35 deletions
diff --git a/bin/text_cat/text_cat b/bin/text_cat/text_cat
index c907f50b96bb..74dae861d802 100755
--- a/bin/text_cat/text_cat
+++ b/bin/text_cat/text_cat
@@ -8,6 +8,8 @@ use Getopt::Std;
use Benchmark;
my $non_word_characters='0-9\s';
+my @languages; # languages (sorted by name)
+my %ngram_for; # map language x ngram => rang
# OPTIONS
getopts('a:d:f:hi:lnst:u:v');
@@ -94,55 +96,64 @@ if ($opt_n) {
classify(input());
}
-# CLASSIFICATION
-sub classify {
- my ($input)=@_;
- my %results=();
- my $maxp = $opt_t;
+sub read_model {
+ my ($file) = @_;
+ open(LM,"$file") or die "cannot open $file: $!\n";
+ my %ngram;
+ my $rang = 1;
+ while (<LM>) {
+ chomp;
+ # only use lines starting with appropriate character. Others are
+ # ignored.
+ if (/^[^$non_word_characters]+/o) {
+ $ngram{$&} = $rang++;
+ }
+ }
+ return \%ngram;
+}
+
+sub read_models {
# open directory to find which languages are supported
opendir DIR, "$opt_d" or die "directory $opt_d: $!\n";
- my @languages = sort(grep { s/\.lm// && -r "$opt_d/$_.lm" } readdir(DIR));
+ @languages = sort(grep { s/\.lm// && -r "$opt_d/$_.lm" } readdir(DIR));
closedir DIR;
@languages or die "sorry, can't read any language models from $opt_d\n" .
"language models must reside in files with .lm ending\n";
+ foreach my $language (@languages) {
+ $ngram_for{$language} = read_model("$opt_d/$language.lm");
+ }
+}
+
+# CLASSIFICATION
+sub classify {
+ my ($input)=@_;
+ my %results=();
+ my $maxp = $opt_t;
+ read_models() if !@languages;
# create ngrams for input. Note that hash %unknown is not used;
# it contains the actual counts which are only used under -n: creating
# new language model (and even then they are not really required).
my @unknown=create_lm($input);
- # load model and count for each language.
- my $language;
+
my $t1 = new Benchmark;
- foreach $language (@languages) {
- # loads the language model into hash %$language.
- my %ngram=();
- my $rang=1;
- open(LM,"$opt_d/$language.lm") || die "cannot open $language.lm: $!\n";
- while (<LM>) {
- chomp;
- # only use lines starting with appropriate character. Others are
- # ignored.
- if (/^[^$non_word_characters]+/o) {
- $ngram{$&} = $rang++;
- }
- }
- close(LM);
- #print STDERR "loaded language model $language\n" if $opt_v;
-
- # compares the language model with input ngrams list
- my ($i,$p)=(0,0);
- while ($i < @unknown) {
- if ($ngram{$unknown[$i]}) {
- $p=$p+abs($ngram{$unknown[$i]}-$i);
- } else {
- $p=$p+$maxp;
+ foreach my $language (@languages) {
+ # compares the language model with input ngrams list
+ my $ngram = $ngram_for{$language} or die "no ngrams for $language";
+
+ my ($i,$p)=(0,0);
+ while ($i < @unknown) {
+ if ($ngram->{$unknown[$i]}) {
+ $p=$p+abs($ngram->{$unknown[$i]}-$i);
+ } else {
+ $p=$p+$maxp;
+ }
+ ++$i;
}
- ++$i;
- }
- #print STDERR "$language: $p\n" if $opt_v;
+ #print STDERR "$language: $p\n" if $opt_v;
- $results{$language} = $p;
+ $results{$language} = $p;
}
print STDERR "read language models done (" .
timestr(timediff(new Benchmark, $t1)) .