#! /bin/bash # # Count word N-grams of the input # # The awk lines are for manipulating words # on a line (storing, reordering). # The tr lines are used as a tokenizer and for some normalization. # The sed lines manage word normalization. # The egrep lines remove rubbish. # # A long pipe which performs (in order): # Get the input files # All in lower case # Replace \' quotes with ` for easier manipulation # Replace 's with _s # Remove periods following Dr, Mr, Mrs, etc # Isolate sentence ends (.?!) # Remove all non-alphabetic characters (except \.), split into lines # Insert the required sentence boundary markers (.) # Restore \' quotes (from 's) # Remove rubbish # Add previous N-1 words to line (reverse order) N-gram # Sort tokens # Count tokens of word types # Sort types on frequency # Remove rubbish (entries without alphabetic characters) # Append total type and token counts, the entropy per entry # and the difference between the cross entropy with a 1/f (Zipf's law) # distribution and the real entropy. # # Copyright (C) 2002 R.J.J.H. van Son # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # HELP!!! if [[ $# < 1 || $1 == '-h' || $1 == '--help' ]]; then echo Ngramcount [depth] textfiles... 
echo -e "\nConstruct a table with N-gram counts from the textfiles and write to STDOUT" echo "depth is the N-gram size" exit fi; # Get Ngram depth if [[ -e $1 ]]; then Ngram=1; else Ngram=$1; # Remove first parameter shift; fi; # Check Ngram if (( $Ngram < 1 )); then Ngram=1; fi; export Ngram # Start Pipe cat "$@" \ | tr "[:upper:]" "[:lower:]" \ | tr "\'" '\`' \ | sed 's/\([a-z]\)\`s/\1\_s/g' \ | sed 's/\([dm]rs*\)[\.]/\1/g' \ | sed 's/[\.\?\!]/ \. /g' \ | tr -cs "[:alpha:]_\." "[\n*]" \ | gawk 'BEGIN {ngram=ENVIRON["Ngram"]} {if($1 ~ /[\.]/){for(i=1;i<=ngram;++i)print "\."} else print $0;}' \ | tr '_' "\'" \ | sed "s/^\'//g" | sed "s/\'$//g" \ | egrep -v '^[x]*$' \ | gawk 'BEGIN {ngram=ENVIRON["Ngram"];for(i=1;i1;--i)prev[i] = prev[i-1]; prev[1] = $1}' \ | sort \ | uniq -c \ | sort -nr \ | egrep '[a-z\.]' \ | gawk 'BEGIN{p="";ngram=ENVIRON["Ngram"];for(i=1;i