#! /usr/local/bin/perl
#
if(grep(/^\-(h|\-help)$/i, @ARGV))
{
print STDOUT << "ENDOFHELP";
Ngramcount.pl [depth] textfiles...

Construct a table with N-gram counts from the textfiles and write to STDOUT.
depth is the N-gram size.
textfiles are plain ASCII text files.

--license
-l
Print license information

--help
-h
This message

ENDOFHELP
exit;
}
#
#
###############################################################################
if(grep(/^\-(l|\-license)$/i, @ARGV))
{
print STDOUT << "ENDOFLICENSE" ;

Copyright R.J.J.H. van Son © 20002

Author Rob van Son
Institute of Phonetic Sciences & ACLC
University of Amsterdam
Herengracht 338
NL-1016CG Amsterdam, The Netherlands
Email: Rob.van.Son\@hum.uva.nl
	   rob.van.son\@workmail.com
WWW  : http://www.fon.hum.uva.nl/rob/
mail:  Institute of Phonetic Sciences
	   University of Amsterdam
	   Herengracht 338
	   NL-1016CG Amsterdam
	   The Netherlands
	   tel +31 205252183
	   fax +31 205252197

License for use and disclaimers

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.


ENDOFLICENSE
exit;
};
#######################################################
#
my $ngram = 1;
$ngram = shift if $ARGV[0] !~ /[^\d]/;

# Read in words
my %NgramCounts;
my @previous = split(//, "."x($ngram-1));
while(<>)
{
	# Skip comments
	next if /^\s*\#/;
	
	# Protect word internal '
	s/([a-z])\'([a-z])/\1__\2/ig;
	
	# Remove all unwanted characters
	s/[^a-zA-Z\.\?\!\_]+/ /ig;
	
	# Skip empty lines
	next unless /\S/;
	
	# Remove non-sentence ending periods
	s/\b(dr|mr|mrs|drv|blvd)\./\1/ig;
	
	# Replace all periods by multiple periods
	s/[\.\?\!]/ . /ig;
	s/\s+\.\s+/{" . "x$ngram}/eg;
	
	# Restore word internal '
	s/([a-z])\_\_([a-z])/\1\'\2/ig;
	
	# Split into words
	@Words = split(' ', lc($_));
	
	# Count words
	my $CurrentWord;
	foreach $CurrentWord (@Words)
	{
		next unless $CurrentWord =~ /\S/;
		# Either the first or the second word must NOT be a period
		if($CurrentWord =~ /[a-z]/i || !grep(/^[\.]/, @previous))
		{
			my $Entry = join(" ", $CurrentWord, @previous);
			++$NgramCounts{$Entry};
		};
		
		# Store current word in context
		if($ngram-1>0)
		{
			pop(@previous);
			unshift(@previous, $CurrentWord);
		};
	};
	
};

# Sort and write out the results
# Print header
print STDOUT "# N\tWord\t";
for($i=1;$i<$ngram;++$i)
{
	print STDOUT "W-$i\t";
};
print "\n";

my $types = 0;
my $tokens = 0;
my $sumCiLogCi = 0;
my $sumI = 0;
my $sumCilogI = 0;

my @OutputList=();
my $Entry;
foreach $Entry (keys(%NgramCounts))
{
	my $Currentcount = $NgramCounts{$Entry};
	push(@OutputList, "$Currentcount\t".(join("\t", split(' ',$Entry)))."\n");
};
@OutputList = sort {$b<=>$a} @OutputList;
foreach $Entry (@OutputList)
{
	my $Currentcount = [split(' ',$Entry)]->[0];
	# Store count and order information
	++$types;
	$tokens += $Currentcount;
	$sumCiLogCi += $Currentcount*log($Currentcount);
	$sumI += 1.0/$types;
	$sumCilogI += $Currentcount*log($types);
};
my $H = (log($tokens) - $sumCiLogCi / $tokens)/log(2);
my $KLdist = (log($sumI) + $sumCilogI / $tokens)/log(2) - $H;

print STDOUT @OutputList;
print "# ${ngram}-gram types = $types, tokens = $tokens\n";
print "# H = $H bits/type, Cross H(Zipfs dist) - H = $KLdist bits/type\n";
