#! /usr/local/bin/perl
#
# Print the usage message and exit when -h or --help (matched
# case-insensitively) appears anywhere on the command line.
# The default depth stated in the text must agree with the initial
# value of $ngram further down in this script, which is 1.
if(grep(/^\-(h|\-help)$/i, @ARGV))
{
print STDOUT << "ENDOFHELP";
POSngramcount.pl [depth] TaggedTextFile ...

Read POS tagged texts (CGN format)

depth is the POS N-gram size, default is 1.

TaggedTextFiles are CGN (Spoken Dutch Corpus) style tagged
files of the form:
Word	POS(....) ....
POSngramcount uses only the Word and the major POS tag.

Write POS n-grams to STDOUT of the form: 
Count	Word	POS		POS-1	...

--license
-l
Print license information

--help
-h
This message

ENDOFHELP
exit;
}
#
#
###############################################################################
# Print the copyright/license notice and exit when -l or --license
# (matched case-insensitively) appears anywhere on the command line.
# The heredoc is double-quoted, hence the escaped \@ in the e-mail addresses.
# NOTE(review): the copyright year "20002" below looks like a typo
# (presumably 2002) - confirm with the author before changing the notice.
if(grep(/^\-(l|\-license)$/i, @ARGV))
{
print STDOUT << "ENDOFLICENSE" ;

Copyright R.J.J.H. van Son © 20002

Author Rob van Son
Institute of Phonetic Sciences & ACLC
University of Amsterdam
Herengracht 338
NL-1016CG Amsterdam, The Netherlands
Email: Rob.van.Son\@hum.uva.nl
	   rob.van.son\@workmail.com
WWW  : http://www.fon.hum.uva.nl/rob/
mail:  Institute of Phonetic Sciences
	   University of Amsterdam
	   Herengracht 338
	   NL-1016CG Amsterdam
	   The Netherlands
	   tel +31 205252183
	   fax +31 205252197

License for use and disclaimers

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.


ENDOFLICENSE
# Nothing else is processed once the license has been shown
exit;
};
#######################################################
#
# N-gram depth: number of POS tags per entry (the current tag plus
# depth-1 preceding tags). Taken from the first command line argument
# when that argument consists of digits only; otherwise it stays 1.
my $ngram = 1;
# Test @ARGV first: with no arguments $ARGV[0] is undef, which would
# otherwise pass a "contains no non-digit" test and set the depth to undef.
$ngram = int(shift) if @ARGV && $ARGV[0] =~ /^\d+\z/;
# A depth below 1 carries no context, which is exactly what depth 1 does
$ngram = 1 if $ngram < 1;

# Read in words
# %NgramCounts maps "word POS POS-1 ..." (single-space separated) to its count
my %NgramCounts;
# Context of the depth-1 preceding tags, most recent first. It starts out
# filled with "." so the first token is treated as following a period.
my @previous = (".") x ($ngram-1);
while(<>)
{
	# Skip comment lines (leading '#') and lines starting with '<'
	next if /^\s*(\#|\<)/;
	
	my @List = split;
	# Skip blank lines: they carry no token and must not disturb the context
	next unless @List;
	
	# Count words
	my $CurrentWord = lc($List[0]);
	# Major POS tag: the first run of word characters in the second field.
	# Declared unconditionally; "my $x = ... if COND" is undefined behaviour
	# in Perl and could carry over the tag of an earlier line.
	my $CurrentPOStag = "";
	$CurrentPOStag = $1 if defined($List[1]) && $List[1] =~ /([\w]+)/;
	# Any word containing . ? or ! gets the tag "."
	$CurrentPOStag = "." if $CurrentWord =~ /[\.\?\!]/;
	
	# Count the entry unless the current tag contains no letter AND the
	# context already holds a tag starting with a period
	if($CurrentPOStag =~ /[a-z]/i || !grep(/^[\.]/, @previous))
	{
		my $Entry = join(" ", $CurrentWord, $CurrentPOStag, @previous);
		++$NgramCounts{$Entry};
	};
	
	# Store current POStag in context: drop the oldest, prepend the newest
	if($ngram-1>0)
	{
		pop(@previous);
		unshift(@previous, $CurrentPOStag);
	};
	
};

# Sort and write out the results
# Print header: count, word, POS and one POS-i column per context tag
print STDOUT "# N\tWord\tPOS\t";
for my $i (1 .. $ngram-1)
{
	print STDOUT "POS-$i\t";
};
print "\n";

my $types = 0;		# number of distinct entries
my $tokens = 0;		# sum of all counts
my $sumCiLogCi = 0;	# sum over entries of Ci*ln(Ci)
my $sumI = 0;		# sum over ranks i of 1/i, normalises the 1/i distribution
my $sumCilogI = 0;	# sum over entries of Ci*ln(rank i)

# Rank the entries by descending count. Equal counts are ordered by the
# entry text so the listing does not depend on hash ordering.
my @RankedEntries = sort { $NgramCounts{$b} <=> $NgramCounts{$a} || $a cmp $b }
	keys(%NgramCounts);

my @OutputList=();
foreach my $Entry (@RankedEntries)
{
	my $Currentcount = $NgramCounts{$Entry};
	# Split on single spaces and keep empty fields (limit -1) so that an
	# empty word or tag still occupies its own column
	push(@OutputList, join("\t", $Currentcount, split(/ /, $Entry, -1))."\n");
	# Store count and order information; $types doubles as the rank
	++$types;
	$tokens += $Currentcount;
	$sumCiLogCi += $Currentcount*log($Currentcount);
	$sumI += 1.0/$types;
	$sumCilogI += $Currentcount*log($types);
};

# H: entropy of the relative frequencies Ci/tokens, in bits.
# KLdist: cross entropy against a distribution proportional to 1/rank,
# minus H. Both stay 0 for empty input, where log(0) would be fatal.
my $H = 0;
my $KLdist = 0;
if($tokens > 0)
{
	$H = (log($tokens) - $sumCiLogCi / $tokens)/log(2);
	$KLdist = (log($sumI) + $sumCilogI / $tokens)/log(2) - $H;
};

print STDOUT @OutputList;
print "# ${ngram}-gram types = $types, tokens = $tokens\n";
print "# H = $H bits/type, Cross H(Zipfs dist) - H = $KLdist bits/type\n";