#! /usr/local/bin/perl
#
if(grep(/^\-(h|\-help)$/i, @ARGV))
{
print STDOUT << "ENDOFHELP";
Read N-gram tables and calculate the probability
of a POS tagging of the sentence.

use:
ngramprobability [--verbose] [--count <k>] [--match-factor <m>] \
                 [--license] [--help] <POStable> "<sentence>"

--verbose
-v
Print all contributions

--count <k>
-c <k>
Maximum count for Good-Turing smoothing. Strictly, the method
used is Katz smoothing, a variant of Good-Turing discounting.
Default 5 (also the suggested value).

--match-factor <m>
-m <m>
The Language Model Match Factor adjusts the weighting of
the language model transition probability in the Viterbi
search. The updated cost is -log(P(sentence(t))) = -log(P(sentence(t-1)))
+ -log(P(word|POS)) + m * -log(P(POS|POS(t-1)))
Default 1.

--license
-l
Print license information

--help
-h
This message

<POStable> is a list of files (or - for STDIN) with whitespace-
delimited lines, each an N-gram count followed by the N-gram with
the lowercase words in reverse order. Lines starting with '#' are
skipped. From an N-gram table (N <= 4) all smaller N-grams are
reconstructed. You can enter several tables, even mixing different
N-gram orders.
For example:
     435	not	do	i

<sentence> is a list of words enclosed in quotes. It MUST follow
the tables. All non-alphabetic characters are removed.
At the start of the sentence, the largest possible N-gram is used,
i.e., unigrams for the first word, bigrams for the second, etc.

ENDOFHELP
exit;
}
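#
# Example invocation (illustrative; "postags.2grams" and the sentence are
# placeholders, not files supplied with this script):
#
#     ngramprobability --count 5 --match-factor 1 postags.2grams "i do not know"
#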
#
#
###############################################################################
if(grep(/^\-(l|\-license)$/i, @ARGV))
{
print STDOUT << "ENDOFLICENSE" ;

Copyright R.J.J.H. van Son © 2002

Author Rob van Son
Institute of Phonetic Sciences & ACLC
University of Amsterdam
Herengracht 338
NL-1016CG Amsterdam, The Netherlands
Email: Rob.van.Son\@hum.uva.nl
	   rob.van.son\@workmail.com
WWW  : http://www.fon.hum.uva.nl/rob/
mail:  Institute of Phonetic Sciences
	   University of Amsterdam
	   Herengracht 338
	   NL-1016CG Amsterdam
	   The Netherlands
	   tel +31 205252183
	   fax +31 205252197

License for use and disclaimers

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.


ENDOFLICENSE
exit;
};
#######################################################
#
# Get Smoothing factors (Good-Turing)
# Default is 5
my $k=5;
my $i;
for($i=0; $i<scalar(@ARGV); ++$i)
{
	if($ARGV[$i] =~ /^\-(c|\-count)$/i)
	{
		$k = $ARGV[$i+1];
		# Remove option
		splice(@ARGV, $i, 2);
	};
};

# Get Language Model Match Factor
# Default is 1
my $lmf=1;
for($i=0; $i<scalar(@ARGV); ++$i)
{
	if($ARGV[$i] =~ /^\-(m|\-match-factor)$/i)
	{
		$lmf = $ARGV[$i+1];
		# Remove option
		splice(@ARGV, $i, 2);
	};
};

# Print all? Verbose
my $verbose = scalar grep(/^\-(v|\-verbose)$/i, @ARGV);
@ARGV = grep(!/^\-(v|\-verbose)$/i, @ARGV);

# Check all (internal consistency checks)?
my $CheckAll = scalar grep(/^\-\-CheckAll$/i, @ARGV);
@ARGV = grep(!/^\-\-CheckAll$/i, @ARGV);

#########################################################
#
# Start handling input
#
#########################################################
# Get Sentence(s)
my $Sentence = pop;
# Read sentences from a file if the last argument is an existing file (or STDIN)
my $SentenceInput = "";
if(-s $Sentence || $Sentence eq '-' || $Sentence eq 'STDIN'
    || $Sentence eq 'stdin')
{
	$SentenceInput = $Sentence;
	$Sentence = "";
};

# Get table
my @NgramtableList = @ARGV;

# Define Ngramtables
my @NgramTables = ({});
$NgramTables[0]->{0} = 0;

# Define emission tables
my @EmissionTables = ({});
$EmissionTables[2] = {};
$EmissionTables[1] = {};
$EmissionTables[0] = {};

# N-gram depth
$n = 2;

# Read in the N-gram tables
my $NgramInputTable;
foreach $NgramInputTable (@NgramtableList)
{
	# Allow standard input
	$NgramInputTable = "STDIN" if $NgramInputTable eq '-';
	
	# Open table or stop with error message
	open(INPUTTABLE, "<$NgramInputTable") || die "$NgramInputTable: $!\n" unless $NgramInputTable eq 'STDIN';
	open(INPUTTABLE, "<&STDIN") || die "$NgramInputTable: $!\n" if $NgramInputTable eq 'STDIN';

	# Read table
	while(<INPUTTABLE>)
	{
		# Skip empty lines
		next unless /\S/;
		# Skip comments
		next if /^\s*\#/;

		# Split input (input stored in $_, split on whitespace: split(' ', $_))
		my ($Count, $Observable, @W) = split;

		# Store Count, if all relevant words are present
		$n = scalar(@W) unless $n > 0;
		my $NgramPower = $n < scalar(@W) ? $n : scalar(@W);
		my $i;
		for($i=$NgramPower; $i>=0; --$i)
		{
			$NgramTables[$i] = {} unless ref($NgramTables[$i]) eq 'HASH';
			
			$Key = $i ? join(" ", reverse(@W[0..$i-1])) : 'Total';
			$NgramTables[$i]->{$Key} += $Count;
		};
		
		# Handle observables
		$EmissionTables[2]->{"$Observable $W[0]"} += $Count;
		$EmissionTables[1]->{$Observable} += $Count;
		$EmissionTables[0]->{'Total'} += $Count;

	};
};
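
# A sketch of how the --verbose flag could summarise the tables just read
# (illustrative; the flag is parsed above but not used elsewhere in this script):
if($verbose)
{
	print STDERR "Read ", scalar(keys(%{$NgramTables[1]})), " unigrams, ",
		scalar(keys(%{$NgramTables[2]})), " bigrams and ",
		scalar(keys(%{$EmissionTables[2]})), " emission pairs\n";
};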


# Calculate the maximum number of N-grams
# and the class size distribution of counts Nc
my @TotalNgrams = ();
my @MaxNgrams = ();
my @NgramNc = ();
for($i=0; $i<=$n; ++$i)
{
	$TotalNgrams[$i] = scalar(keys(%{$NgramTables[$i]}));
	$MaxNgrams[$i] = $i ? $TotalNgrams[1]**$i : 1;
	$NgramNc[$i][0] = ($MaxNgrams[$i] - $TotalNgrams[$i]);
};

my @TotalObservables = (1);
my @MaxObservables = (1);
my @ObservablesNc = ();
$TotalObservables[1] = scalar(keys(%{$EmissionTables[1]}));
$TotalObservables[2] = scalar(keys(%{$EmissionTables[2]}));
$MaxObservables[1] = $TotalObservables[1];
$MaxObservables[2] = $TotalObservables[1] * $TotalNgrams[1];
$ObservablesNc[1][0] = ($MaxObservables[1] - $TotalObservables[1]);
$ObservablesNc[2][0] = ($MaxObservables[2] - $TotalObservables[2]);
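
# Worked example of the zero-class size N0 above (illustrative numbers): with
# 10000 distinct observables (words) and 50 unigram keys (tags) there are
# 10000*50 = 500000 possible pairs; if 40000 of them were observed, the
# remaining 460000 unseen pairs form $ObservablesNc[2][0].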


# Distribution of counts from N1-Nk
# Iterate over Ngram depth
for($i=0;$i<=$n; ++$i)
{
	my $NgramEntry;
	foreach $NgramEntry (keys(%{$NgramTables[$i]}))
	{
		my $count = $NgramTables[$i]->{$NgramEntry};
		if($count && $count <= $k + 1)
		{
			++$NgramNc[$i][$count];
		};
	};
};

# Iterate over Observables
for($i=0;$i<=2; ++$i)
{
	my $EmissionEntry;
	foreach $EmissionEntry (keys(%{$EmissionTables[$i]}))
	{
		my $count = $EmissionTables[$i]->{$EmissionEntry};
		if($count && $count <= $k + 1)
		{
			++$ObservablesNc[$i][$count];
		};
	};
};

# Good-Turing discounting. 0* = N1 / N0
# Actually, this is Katz smoothing, a variant of Good-Turing
# discounting
sub GoodTuringCount	# ($count, $Nc, $Nc1, $k, $N1, $Nk1) -> $newcount
{
	my $count = shift;
	my $Nc = shift;
	my $Nc1 = shift; 
	my $k = shift;
	my $N1 = shift;
	my $Nk1 = shift;
	
	# Check for existence of counts
	return 0 unless($Nc && $N1 && (1 - ($k+1)*$Nk1/$N1));
	
	return $count ? (($count+1)*$Nc1/$Nc - $count*($k+1)*$Nk1/$N1) 
			/ (1 - ($k+1)*$Nk1/$N1) : $Nc1 / $Nc;
}
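
# Worked example of the discounting above (illustrative numbers, not taken
# from any table): with $k = 5 and class sizes N1 = 100, N2 = 40, N6 = 5,
# a count of 1 is discounted to
#   c* = ((c+1)*Nc+1/Nc - c*(k+1)*Nk+1/N1) / (1 - (k+1)*Nk+1/N1)
#      = (2*40/100 - 1*6*5/100) / (1 - 6*5/100) = 0.5/0.7 = 0.71...
# and an unseen event (c = 0) gets the Good-Turing estimate N1/N0.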

# Smoothed N-gram counts
my @SmoothedNgramCounts = ();
for($j=0; $j<=$k; ++$j)
{
	for($i=0;$i<=$n; ++$i)
	{
		# Smooth counts when there is a need for it, i.e., $NgramNc[$i][0] > 0
		$SmoothedNgramCounts[$i][$j] = $NgramNc[$i][0] ?
					GoodTuringCount($j, $NgramNc[$i][$j], $NgramNc[$i][$j+1], 
									$k, $NgramNc[$i][1], $NgramNc[$i][$k+1])
					: $j;
	};
};

# Smoothed Observable counts
my @SmoothedObservableCounts = ();
for($j=0; $j<=$k; ++$j)
{
	for($i=0;$i<=2; ++$i)
	{
		# Smooth counts when there is a need for it, i.e., $ObservablesNc[$i][0] > 0
		$SmoothedObservableCounts[$i][$j] = $ObservablesNc[$i][0] ?
					GoodTuringCount($j, $ObservablesNc[$i][$j], $ObservablesNc[$i][$j+1], 
									$k, $ObservablesNc[$i][1], $ObservablesNc[$i][$k+1])
					: $j;
	};
};

# Store smoothed N-gram counts
for($i=0;$i<=$n; ++$i)
{
	foreach $NgramEntry (keys(%{$NgramTables[$i]}))
	{
		my $count = $NgramTables[$i]->{$NgramEntry};
		if($count <= $k)
		{
			$NgramTables[$i]->{$NgramEntry} = $SmoothedNgramCounts[$i][$count];
		};
	};
};

# Store smoothed emission counts
for($i=0;$i<=2; ++$i)
{
	foreach $ObservableEntry (keys(%{$EmissionTables[$i]}))
	{
		my $count = $EmissionTables[$i]->{$ObservableEntry};
		if($count <= $k)
		{
			$EmissionTables[$i]->{$ObservableEntry} = $SmoothedObservableCounts[$i][$count];
		};
	};
};

# Check smoothed N-gram counts
for($i=0;$i<=$n && $CheckAll; ++$i)
{
	my $sumcount = 0;
	foreach $NgramEntry (keys(%{$NgramTables[$i]}))
	{
		$sumcount += $NgramTables[$i]->{$NgramEntry};
	};
	print "Total $i: $NgramTables[0]->{'Total'} = ", $sumcount + $NgramNc[$i][0]*$SmoothedNgramCounts[$i][0], 
	" = $sumcount + $NgramNc[$i][0]*$SmoothedNgramCounts[$i][0] (Counts* + 0 counts)\n";
};

# Check smoothed emission counts
for($i=0;$i<=2 && $CheckAll; ++$i)
{
	my $sumcount = 0;
	foreach $ObservableEntry (keys(%{$EmissionTables[$i]}))
	{
		$sumcount += $EmissionTables[$i]->{$ObservableEntry};
	};
	print "Total $i: $EmissionTables[0]->{'Total'} = ", $sumcount + $ObservablesNc[$i][0]*$SmoothedObservableCounts[$i][0], 
	" = $sumcount + $ObservablesNc[$i][0]*$SmoothedObservableCounts[$i][0] (Counts* + 0 counts)\n";
};

#############################################################
#
# The preparations are done. The language model has been constructed,
#  now we are going to use it.
#
#############################################################

# Handle file input
if($SentenceInput)
{
	if($SentenceInput eq '-' || $SentenceInput eq 'STDIN'  || $SentenceInput eq 'stdin')
	{
		open(TEXTINPUT, "<&STDIN") || die "STDIN $SentenceInput: $!\n";
	}
	else
	{
		open(TEXTINPUT, "<$SentenceInput") || die "$SentenceInput: $!\n";
	};
};

# Iterate over input
while($Sentence || ($Sentence = <TEXTINPUT>))
{
	# Skip empty lines and comments
	next unless $Sentence =~ /\S/;
	next if $Sentence =~ /^\s*\#/;
	
	# Clean up sentence (lowercase, only alphabetic characters)
	$Sentence = lc($Sentence);
	
	# Protect word-internal '
	$Sentence =~ s/([a-z])\'([a-z])/$1__$2/ig;
	
	# Remove non-sentence-ending periods (abbreviations)
	$Sentence =~ s/\b(dr|mr|mrs|drv|blvd)\./$1/ig;
	
	# Restore word-internal '
	$Sentence =~ s/([a-z])\_\_([a-z])/$1\'$2/ig;
	
	# Remove unwanted characters
	$Sentence =~ s/[^a-z\']+/ /ig;
	
	# Add enough sentence start symbols (.) and a sentence END symbol
	my @WordSequence = split(' ', (". "x($n-1)).$Sentence.' .');
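	# For example, with the bigram depth $n = 2 the cleaned input
	# "i do not know" becomes ('.', 'i', 'do', 'not', 'know', '.').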
	
	# Clean out $Sentence
	$Sentence = "";
	
	# Calculate probabilities of the words
	my @ViterbiCost = ();
	$ViterbiCost[0]->{'.'} = 0;
	my @ViterbiBackPointer = ();
	$ViterbiBackPointer[0]->{'.'} = '';
	my @POStags = keys(%{$NgramTables[1]});
	my @Vocabulary = keys(%{$EmissionTables[1]});
	my $j;
	for($j=$n-1; $j<scalar(@WordSequence); ++$j)
	{
		print STDERR $WordSequence[$j], "\n";
		foreach $POStag (@POStags)
		{
			# Search for the cheapest previous tag separately for each current tag
			my $LowestCost = 1e100;
			my $LowestPointer = -1;
			# We need for each combination of possible previous and current tags
			# The transition cost (-log(probability)) and the emission cost
			
			# The emission costs
			# Smoothed emission counts
			my $EmissionCount = $EmissionTables[2]->{"$WordSequence[$j] $POStag"} || $SmoothedObservableCounts[2][0];
			# Smoothed sum counts for normalization
			my $SumCount = 0;
			foreach $NewEmission (@Vocabulary)
			{
				$SumCount += $EmissionTables[2]->{"$NewEmission $POStag"} || $SmoothedObservableCounts[2][0];
			};
			# Total emission cost is log Probability
			my $EmissionCost = -(log($EmissionCount) - log($SumCount)) / log(2);
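			# E.g. (illustrative numbers): a word/tag pair with smoothed count 10
			# against a smoothed sum of 1000 costs -log2(10/1000) = 6.64... bits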

			# Get the transition log probability of the previous tag to the current tag
			foreach $PrevTag (keys(%{$ViterbiCost[$j-1]}))
			{
				my $PreviousCost = $ViterbiCost[$j-1]->{$PrevTag};
				
				# Smoothed transition counts
				my $TransitionCount = $NgramTables[2]->{"$PrevTag $POStag"} || $SmoothedNgramCounts[2][0];
				# Smoothed sum counts for normalization
				my $SumCount = 0;
				foreach $NewPOStag (@POStags)
				{
					$SumCount += $NgramTables[2]->{"$PrevTag $NewPOStag"} || $SmoothedNgramCounts[2][0];
				};
				# Total transition cost is log Probability
				my $TransitionCost = $SumCount ? -(log($TransitionCount) - log($SumCount)) : 0;
				
				# Get the lowest
				my $CurrentCost = $PreviousCost + $lmf * $TransitionCost/log(2);
				if($CurrentCost < $LowestCost)
				{
					$LowestCost = $CurrentCost;
					$LowestPointer = $PrevTag;
				};
			};
			
			# Store the current lowest cost and backpointer
			$ViterbiCost[$j]->{$POStag} = $LowestCost + $EmissionCost;
			$ViterbiBackPointer[$j]->{$POStag} = $LowestPointer;
		};
	};
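	
	# A sketch of how --verbose could print the contributions promised in the
	# help text (illustrative; the flag is not used elsewhere): dump the
	# accumulated cost of every POS tag at the final position.
	if($verbose)
	{
		my $last = scalar(@WordSequence) - 1;
		foreach $POStag (sort(keys(%{$ViterbiCost[$last]})))
		{
			print STDERR "final\t$POStag\t", $ViterbiCost[$last]->{$POStag}, "\n";
		};
	};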
	
	# Print POS tags and words (costs)
	my @TaggedWordList;
	$POStag = '.';
	$PathCost = 0;
	for($j=scalar(@WordSequence) - 1; $j >= $n-1; --$j)
	{
		# Get next POStag down.
		$NextPOStag = $ViterbiBackPointer[$j]->{$POStag};
		# Use next POS tag to get the next cost
		$PathCost = $ViterbiCost[$j-1]->{$NextPOStag};
		push(@TaggedWordList, "$WordSequence[$j]\t$POStag\t(".($ViterbiCost[$j]->{$POStag} - $PathCost).")");
		# Update POS tag
		$POStag = $NextPOStag;
	};
	
	my $TotalCostOfTagging = $ViterbiCost[scalar(@WordSequence) - 1]->{'.'};
	print join("\n", reverse(@TaggedWordList)), "\nCost (-log2 probability): $TotalCostOfTagging (", 
	           $TotalCostOfTagging/(scalar(@WordSequence) - $n + 1),"/word)\n";
	
	
};
