#! /usr/local/bin/perl
#
if(grep(/^\-(h|\-help)$/i, @ARGV))
{
print STDOUT << "ENDOFHELP";
Read N-gram tables and calculate the probability
of a POS tagging of the sentence.

use:
ngramprobability [--verbose] [--count <k>] [--match-factor <m>] \
                 [--license] [--help] <POStable> "<sentence>"

--verbose
-v
Print all contributions

--count <k>
-c <k>
Maximum count for Good-Turing smoothing. Strictly, the method
used is Katz smoothing, a variant of Good-Turing discounting.
Default 5 (also the suggested value).

--match-factor <m>
-m <m>
The Language Model Match Factor adjusts the weighting of
the language model transition probability in the Viterbi
search. The updated cost is -log(P(sentence(t))) = -log(P(sentence(t-1)))
+ -log(P(word|POS)) + m * -log(P(POS|POS(t-1)))
Default 1.

--license
-l
Print license information

--help
-h
This message

<POStable> is a list of files (or - for STDIN) with whitespace-
delimited lines, each an N-gram count followed by the N-gram with
the lowercase words in reverse order. Lines starting with '#' are
skipped. From an N-gram table (N <= 4) all smaller N-grams are
reconstructed. You can enter several tables, even mixing different
N-gram orders.
For example:
     435	not	do	i

<sentence> is a list of words enclosed in quotes. It MUST follow
the tables. All non-alphabetic characters are removed.
At the start of the sentence, the largest possible N-gram is used,
i.e., unigrams for the first word, bigrams for the second, etc.

ENDOFHELP
exit;
}
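#
# Example invocation (illustrative; "postags.2grams" and the sentence are
# placeholders, not files supplied with this script):
#
#     ngramprobability --count 5 --match-factor 1 postags.2grams "i do not know"
#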
#
#
###############################################################################
if(grep(/^\-(l|\-license)$/i, @ARGV))
{
print STDOUT << "ENDOFLICENSE" ;

Copyright R.J.J.H. van Son © 2002

Author Rob van Son
Institute of Phonetic Sciences & ACLC
University of Amsterdam
Herengracht 338
NL-1016CG Amsterdam, The Netherlands
Email: Rob.van.Son\@hum.uva.nl
	   rob.van.son\@workmail.com
WWW  : http://www.fon.hum.uva.nl/rob/
mail:  Institute of Phonetic Sciences
	   University of Amsterdam
	   Herengracht 338
	   NL-1016CG Amsterdam
	   The Netherlands
	   tel +31 205252183
	   fax +31 205252197

License for use and disclaimers

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.


ENDOFLICENSE
exit;
};
#######################################################
#
# Get Smoothing factors (Good-Turing)
# Default is 5
my $k=5;
my $i;
for($i=0; $i<scalar(@ARGV); ++$i)
{
	if($ARGV[$i] =~ /^\-(c|\-count)$/i)
	{
		$k = $ARGV[$i+1];
		# Remove option
		splice(@ARGV, $i, 2);
	};
};

# Get Language Model Match Factor
# Default is 1
my $lmf=1;
for($i=0; $i<scalar(@ARGV); ++$i)
{
	if($ARGV[$i] =~ /^\-(m|\-match-factor)$/i)
	{
		$lmf = $ARGV[$i+1];
		# Remove option
		splice(@ARGV, $i, 2);
	};
};

# Print all? Verbose
my $verbose = scalar grep(/^\-(v|\-verbose)$/i, @ARGV);
@ARGV = grep(!/^\-(v|\-verbose)$/i, @ARGV);

# Check all (internal consistency checks)?
my $CheckAll = scalar grep(/^\-\-CheckAll$/i, @ARGV);
@ARGV = grep(!/^\-\-CheckAll$/i, @ARGV);

#########################################################
#
# Start handling input
#
#########################################################
# Get Sentence(s)
my $Sentence = pop;
# Read sentences from a file if the last argument is an existing file (or STDIN)
my $SentenceInput = "";
if(-s $Sentence || $Sentence eq '-' || $Sentence eq 'STDIN'
    || $Sentence eq 'stdin')
{
	$SentenceInput = $Sentence;
	$Sentence = "";
};

# Get table
my @NgramtableList = @ARGV;

# Define Ngramtables
my @NgramTables = ({});
$NgramTables[0]->{0} = 0;

# Define emission tables
my @EmissionTables = ({});
$EmissionTables[2] = {};
$EmissionTables[1] = {};
$EmissionTables[0] = {};

# N-gram depth
$n = 2;

# Read in the N-gram tables
my $NgramInputTable;
foreach $NgramInputTable (@NgramtableList)
{
	# Allow standard input
	$NgramInputTable = "STDIN" if $NgramInputTable eq '-';
	
	# Open table or stop with error message
	open(INPUTTABLE, "<$NgramInputTable") || die "$NgramInputTable: $!\n" unless $NgramInputTable eq 'STDIN';
	open(INPUTTABLE, "<&STDIN") || die "$NgramInputTable: $!\n" if $NgramInputTable eq 'STDIN';

	# Read table
	while(<INPUTTABLE>)
	{
		# Skip empty lines
		next unless /\S/;
		# Skip comments
		next if /^\s*\#/;

		# Split input (input stored in $_, split on whitespace: split(' ', $_))
		my ($Count, $Observable, @W) = split;

		# Store Count, if all relevant words are present
		$n = scalar(@W) unless $n > 0;
		my $NgramPower = $n < scalar(@W) ? $n : scalar(@W);
		my $i;
		for($i=$NgramPower; $i>=0; --$i)
		{
			$NgramTables[$i] = {} unless ref($NgramTables[$i]) eq 'HASH';
			
			$Key = $i ? join(" ", reverse(@W[0..$i-1])) : 'Total';
			$NgramTables[$i]->{$Key} += $Count;
		};
		
		# Handle observables
		$EmissionTables[2]->{"$Observable $W[0]"} += $Count;
		$EmissionTables[1]->{$Observable} += $Count;
		$EmissionTables[0]->{'Total'} += $Count;

	};
};
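
# A sketch of how the --verbose flag could summarise the tables just read
# (illustrative; the flag is parsed above but not used elsewhere in this script):
if($verbose)
{
	print STDERR "Read ", scalar(keys(%{$NgramTables[1]})), " unigrams, ",
		scalar(keys(%{$NgramTables[2]})), " bigrams and ",
		scalar(keys(%{$EmissionTables[2]})), " emission pairs\n";
};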


# Calculate the maximum number of N-grams
# and the class size distribution of counts Nc
my @TotalNgrams = ();
my @MaxNgrams = ();
my @NgramNc = ();
for($i=0; $i<=$n; ++$i)
{
	$TotalNgrams[$i] = scalar(keys(%{$NgramTables[$i]}));
	$MaxNgrams[$i] = $i ? $TotalNgrams[1]**$i : 1;
	$NgramNc[$i][0] = ($MaxNgrams[$i] - $TotalNgrams[$i]);
};

my @TotalObservables = (1);
my @MaxObservables = (1);
my @ObservablesNc = ();
$TotalObservables[1] = scalar(keys(%{$EmissionTables[1]}));
$TotalObservables[2] = scalar(keys(%{$EmissionTables[2]}));
$MaxObservables[1] = $TotalObservables[1];
$MaxObservables[2] = $TotalObservables[1] * $TotalNgrams[1];
$ObservablesNc[1][0] = ($MaxObservables[1] - $TotalObservables[1]);
$ObservablesNc[2][0] = ($MaxObservables[2] - $TotalObservables[2]);
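
# Worked example of the zero-class size N0 above (illustrative numbers): with
# 10000 distinct observables (words) and 50 unigram keys (tags) there are
# 10000*50 = 500000 possible pairs; if 40000 of them were observed, the
# remaining 460000 unseen pairs form $ObservablesNc[2][0].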


# Distribution of counts from N1-Nk
# Iterate over Ngram depth
for($i=0;$i<=$n; ++$i)
{
	my $NgramEntry;
	foreach $NgramEntry (keys(%{$NgramTables[$i]}))
	{
		my $count = $NgramTables[$i]->{$NgramEntry};
		if($count && $count <= $k + 1)
		{
			++$NgramNc[$i][$count];
		};
	};
};

# Iterate over Observables
for($i=0;$i<=2; ++$i)
{
	my $EmissionEntry;
	foreach $EmissionEntry (keys(%{$EmissionTables[$i]}))
	{
		my $count = $EmissionTables[$i]->{$EmissionEntry};
		if($count && $count <= $k + 1)
		{
			++$ObservablesNc[$i][$count];
		};
	};
};

# Good-Turing discounting. 0* = N1 / N0
# Actually, this is Katz smoothing, a variant of Good-Turing
# discounting
sub GoodTuringCount	# ($count, $Nc, $Nc1, $k, $N1, $Nk1) -> $newcount
{
	my $count = shift;
	my $Nc = shift;
	my $Nc1 = shift; 
	my $k = shift;
	my $N1 = shift;
	my $Nk1 = shift;
	
	# Check for existence of counts
	return 0 unless($Nc && $N1 && (1 - ($k+1)*$Nk1/$N1));
	
	return $count ? (($count+1)*$Nc1/$Nc - $count*($k+1)*$Nk1/$N1) 
			/ (1 - ($k+1)*$Nk1/$N1) : $Nc1 / $Nc;
}
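
# Worked example of the discounting above (illustrative numbers, not taken
# from any table): with $k = 5 and class sizes N1 = 100, N2 = 40, N6 = 5,
# a count of 1 is discounted to
#   c* = ((c+1)*Nc+1/Nc - c*(k+1)*Nk+1/N1) / (1 - (k+1)*Nk+1/N1)
#      = (2*40/100 - 1*6*5/100) / (1 - 6*5/100) = 0.5/0.7 = 0.71...
# and an unseen event (c = 0) gets the Good-Turing estimate N1/N0.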

# Smoothed N-gram counts
my @SmoothedNgramCounts = ();
for($j=0; $j<=$k; ++$j)
{
	for($i=0;$i<=$n; ++$i)
	{
		# Smooth counts when there is a need for it, i.e., $NgramNc[$i][0] > 0
		$SmoothedNgramCounts[$i][$j] = $NgramNc[$i][0] ?
					GoodTuringCount($j, $NgramNc[$i][$j], $NgramNc[$i][$j+1], 
									$k, $NgramNc[$i][1], $NgramNc[$i][$k+1])
					: $j;
	};
};

# Smoothed Observable counts
my @SmoothedObservableCounts = ();
for($j=0; $j<=$k; ++$j)
{
	for($i=0;$i<=2; ++$i)
	{
		# Smooth counts when there is a need for it, i.e., $ObservablesNc[$i][0] > 0
		$SmoothedObservableCounts[$i][$j] = $ObservablesNc[$i][0] ?
					GoodTuringCount($j, $ObservablesNc[$i][$j], $ObservablesNc[$i][$j+1], 
									$k, $ObservablesNc[$i][1], $ObservablesNc[$i][$k+1])
					: $j;
	};
};

# Store smoothed N-gram counts
for($i=0;$i<=$n; ++$i)
{
	foreach $NgramEntry (keys(%{$NgramTables[$i]}))
	{
		my $count = $NgramTables[$i]->{$NgramEntry};
		if($count <= $k)
		{
			$NgramTables[$i]->{$NgramEntry} = $SmoothedNgramCounts[$i][$count];
		};
	};
};

# Store smoothed emission counts
for($i=0;$i<=2; ++$i)
{
	foreach $ObservableEntry (keys(%{$EmissionTables[$i]}))
	{
		my $count = $EmissionTables[$i]->{$ObservableEntry};
		if($count <= $k)
		{
			$EmissionTables[$i]->{$ObservableEntry} = $SmoothedObservableCounts[$i][$count];
		};
	};
};

# Check smoothed N-gram counts
for($i=0;$i<=$n && $CheckAll; ++$i)
{
	my $sumcount = 0;
	foreach $NgramEntry (keys(%{$NgramTables[$i]}))
	{
		$sumcount += $NgramTables[$i]->{$NgramEntry};
	};
	print "Total $i: $NgramTables[0]->{'Total'} = ", $sumcount + $NgramNc[$i][0]*$SmoothedNgramCounts[$i][0], 
	" = $sumcount + $NgramNc[$i][0]*$SmoothedNgramCounts[$i][0] (Counts* + 0 counts)\n";
};

# Check smoothed emission counts
for($i=0;$i<=2 && $CheckAll; ++$i)
{
	my $sumcount = 0;
	foreach $ObservableEntry (keys(%{$EmissionTables[$i]}))
	{
		$sumcount += $EmissionTables[$i]->{$ObservableEntry};
	};
	print "Total $i: $EmissionTables[0]->{'Total'} = ", $sumcount + $ObservablesNc[$i][0]*$SmoothedObservableCounts[$i][0], 
	" = $sumcount + $ObservablesNc[$i][0]*$SmoothedObservableCounts[$i][0] (Counts* + 0 counts)\n";
};

#############################################################
#
# The preparations are done. The language model has been constructed,
#  now we are going to use it.
#
#############################################################

# Handle file input
if($SentenceInput)
{
	if($SentenceInput eq '-' || $SentenceInput eq 'STDIN'  || $SentenceInput eq 'stdin')
	{
		open(TEXTINPUT, "<&STDIN") || die "STDIN $SentenceInput: $!\n";
	}
	else
	{
		open(TEXTINPUT, "<$SentenceInput") || die "$SentenceInput: $!\n";
	};
};

# Iterate over input
while($Sentence || ($Sentence = <TEXTINPUT>))
{
	# Skip empty lines and comments
	next unless $Sentence =~ /\S/;
	next if $Sentence =~ /^\s*\#/;
	
	# Clean up sentence (lowercase, only alphabetic characters)
	$Sentence = lc($Sentence);
	
	# Protect word-internal '
	$Sentence =~ s/([a-z])\'([a-z])/$1__$2/ig;
	
	# Remove non-sentence-ending periods (abbreviations)
	$Sentence =~ s/\b(dr|mr|mrs|drv|blvd)\./$1/ig;
	
	# Restore word-internal '
	$Sentence =~ s/([a-z])\_\_([a-z])/$1\'$2/ig;
	
	# Remove unwanted characters
	$Sentence =~ s/[^a-z\']+/ /ig;
	
	# Add enough sentence start symbols (.) and a sentence END symbol
	my @WordSequence = split(' ', (". "x($n-1)).$Sentence.' .');
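	# For example, with the bigram depth $n = 2 the cleaned input
	# "i do not know" becomes ('.', 'i', 'do', 'not', 'know', '.').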
	
	# Clean out $Sentence
	$Sentence = "";
	
	# Calculate probabilities of the words
	my @ViterbiCost = ();
	$ViterbiCost[0]->{'.'} = 0;
	my @ViterbiBackPointer = ();
	$ViterbiBackPointer[0]->{'.'} = '';
	my @POStags = keys(%{$NgramTables[1]});
	my @Vocabulary = keys(%{$EmissionTables[1]});
	my $j;
	for($j=$n-1; $j<scalar(@WordSequence); ++$j)
	{
		print STDERR $WordSequence[$j], "\n";
		foreach $POStag (@POStags)
		{
			# Search for the cheapest previous tag separately for each current tag
			my $LowestCost = 1e100;
			my $LowestPointer = -1;
			# We need for each combination of possible previous and current tags
			# The transition cost (-log(probability)) and the emission cost
			
			# The emission costs
			# Smoothed emission counts
			my $EmissionCount = $EmissionTables[2]->{"$WordSequence[$j] $POStag"} || $SmoothedObservableCounts[2][0];
			# Smoothed sum counts for normalization
			my $SumCount = 0;
			foreach $NewEmission (@Vocabulary)
			{
				$SumCount += $EmissionTables[2]->{"$NewEmission $POStag"} || $SmoothedObservableCounts[2][0];
			};
			# Total emission cost is log Probability
			my $EmissionCost = -(log($EmissionCount) - log($SumCount)) / log(2);
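			# E.g. (illustrative numbers): a word/tag pair with smoothed count 10
			# against a smoothed sum of 1000 costs -log2(10/1000) = 6.64... bits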

			# Get the transition log probability of the previous tag to the current tag
			foreach $PrevTag (keys(%{$ViterbiCost[$j-1]}))
			{
				my $PreviousCost = $ViterbiCost[$j-1]->{$PrevTag};
				
				# Smoothed transition counts
				my $TransitionCount = $NgramTables[2]->{"$PrevTag $POStag"} || $SmoothedNgramCounts[2][0];
				# Smoothed sum counts for normalization
				my $SumCount = 0;
				foreach $NewPOStag (@POStags)
				{
					$SumCount += $NgramTables[2]->{"$PrevTag $NewPOStag"} || $SmoothedNgramCounts[2][0];
				};
				# Total transition cost is log Probability
				my $TransitionCost = $SumCount ? -(log($TransitionCount) - log($SumCount)) : 0;
				
				# Get the lowest
				my $CurrentCost = $PreviousCost + $lmf * $TransitionCost/log(2);
				if($CurrentCost < $LowestCost)
				{
					$LowestCost = $CurrentCost;
					$LowestPointer = $PrevTag;
				};
			};
			
			# Store the current lowest cost and backpointer
			$ViterbiCost[$j]->{$POStag} = $LowestCost + $EmissionCost;
			$ViterbiBackPointer[$j]->{$POStag} = $LowestPointer;
		};
	};
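	
	# A sketch of how --verbose could print the contributions promised in the
	# help text (illustrative; the flag is not used elsewhere): dump the
	# accumulated cost of every POS tag at the final position.
	if($verbose)
	{
		my $last = scalar(@WordSequence) - 1;
		foreach $POStag (sort(keys(%{$ViterbiCost[$last]})))
		{
			print STDERR "final\t$POStag\t", $ViterbiCost[$last]->{$POStag}, "\n";
		};
	};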
	
	# Print POS tags and words (costs)
	my @TaggedWordList;
	$POStag = '.';
	$PathCost = 0;
	for($j=scalar(@WordSequence) - 1; $j >= $n-1; --$j)
	{
		# Get next POStag down.
		$NextPOStag = $ViterbiBackPointer[$j]->{$POStag};
		# Use next POS tag to get the next cost
		$PathCost = $ViterbiCost[$j-1]->{$NextPOStag};
		push(@TaggedWordList, "$WordSequence[$j]\t$POStag\t(".($ViterbiCost[$j]->{$POStag} - $PathCost).")");
		# Update POS tag
		$POStag = $NextPOStag;
	};
	
	my $TotalCostOfTagging = $ViterbiCost[scalar(@WordSequence) - 1]->{'.'};
	print join("\n", reverse(@TaggedWordList)), "\nCost (-log2 probability): $TotalCostOfTagging (", 
	           $TotalCostOfTagging/(scalar(@WordSequence) - $n + 1),"/word)\n";
	
	
};
