#! /usr/bin/perl
#
# Special purpose program for Unix/Linux:
# Construct a list of corresponding Sentences.
#
# Prints out resulting list
#
# Use:
# ./ConstructCorrespondingSentenceList.pl DirectoryPath [TableGlob IndexAttribute]
#
# DirectoryPath: The path to the directory that contains the list-files
# (defaults to: $HomeDir/DatabaseFiles/phoneme)
# TableGlob: A glob for the list-files (defaults to '*TRANSLITphoneme.txt.gz')
# IndexAttribute: The name of the column which should be indexed (defaults to 'id')
#
# If you want to enter an optional value, you must specify all preceding optional
# values.
#
# The list-file format:
# Start with a series of lines of the following form (one for each column):
# "#> ColumnName\t[TYPE]\tDescription"
#
# Type should be one of the official PostgreSQL types (e.g., INT4, CHAR(2), TEXT)
# The description can be any kind of text, as long as it doesn't contain tabs.
#
# Then you can add comments. Every (other) line that starts with a plain '#'
# will be ignored, empty lines are ignored too.
#
# Data lines are tab delimited relational table-records of the following form:
# "ValueColumn1\tValueColumn2\tValueColumn3\tValueColumn4\tValueColumn5\t.....\n";
#
# All values of a single record should fit on the same line.
#
# It is advisable to have a (unique) indexing column which can be used to
# identify individual records.
#
###############################################################################
#
# Copyright R.J.J.H. van Son © 2000, 2001
#
# Author Rob van Son
# Institute of Phonetic Sciences & ACLC
# University of Amsterdam
# Herengracht 338
# NL-1016CG Amsterdam, The Netherlands
# Email: Rob.van.Son@hum.uva.nl
# rob.van.son@workmail.com
# WWW : http://www.fon.hum.uva.nl/rob/
# mail: Institute of Phonetic Sciences
# University of Amsterdam
# Herengracht 338
# NL-1016CG Amsterdam
# The Netherlands
# tel +31 205252183
# fax +31 205252197
#
# License for use and disclaimers
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
#
#######################################################
#
# Root directory used to locate the required file and the default database
# directory. Deliberately left as a package variable (no 'my'):
# NOTE(review): Links.pl may refer to $HomeDir -- confirm before making it lexical.
$HomeDir = "..";
require "$HomeDir/Links.pl";

# First argument: the directory that contains the list-files
my $DatabaseDir = shift(@ARGV) || "$HomeDir/DatabaseFiles/phoneme";
$DatabaseDir =~ s@/+\s*$@@g;	# remove final /

# More arguments or Default Values
my $TableGlob = shift(@ARGV) || '*TRANSLITphoneme.txt.gz';	# Glob selecting the list-files
# The column on which the index should be created.
# (Read from the command line, but not referenced again in the visible code.)
my $IndexAttribute = shift(@ARGV) || 'id';

# The delimiter of the table lines
my $Delimiter = "\t";

# Column names, in header order
my @Columns = ();
# A hash table of (lower-cased) column names versus column numbers
my %ColumnsTable = ();
# The type found between [ ] for each column
my @ValueTypes = ();

# All files in the database directory that match the glob
my @FileList = glob("$DatabaseDir/$TableGlob");

# Values to be calculated once per database!
# False until the column list has been built from the first file's header;
# after that it holds the number of columns every record must have.
my $HeaderListCreated = 0;

# Loop variable for the files and the list of accepted records
my $FilePath;
my @result = ();
# Read every list-file: collect the column definitions from the header of the
# first file, then parse the tab-delimited records, append a normalized
# "SORTstring" to each record and store accepted records in @result.
foreach $FilePath (@FileList)
{
    print STDERR "File: $FilePath\n";

    # Open through gunzip for compressed files. The list form of the pipe
    # open and the three-argument file open hand the path over verbatim, so
    # blanks or shell metacharacters in a file name cannot be misinterpreted.
    my $Input;
    if($FilePath =~ /\.gz$/)
    {
        open($Input, '-|', 'gunzip', '-qc', $FilePath) || die "gunzip -qc $FilePath|: $!\n";
    }
    else
    {
        open($Input, '<', $FilePath) || die "<$FilePath: $!\n";
    };

    # Get the first line. A file without any line has no header and no
    # records; skip it so it cannot define a bogus column list.
    my $Header = <$Input>;
    unless(defined($Header))
    {
        print STDERR "ERROR: empty file $FilePath\n";
        close($Input);
        next;
    };
    chomp($Header);
    # Remove leading # or #>
    $Header =~ s/^\s*\#[\>]?\s*//g;

    # Skip comments and empty lines, appending every further "#>" line to the
    # header. The defined() test is essential: at end of file the read
    # returns undef, which would otherwise match the empty-line pattern and
    # keep this loop running forever.
    my $Line;
    while(defined($Line = <$Input>) && $Line =~ /^\s*(\#[\>]?\s*|$)/)
    {
        if($Line =~ /^\s*\#[\>]\s*/)
        {
            # $' is the text that follows the "#>" marker
            $Header .= "$Delimiter$'";
            chomp($Header);
        };
    };

    # Create Header (column) list, unless this has already been done before
    unless($HeaderListCreated)
    {
        # Split the collected header into its entries
        @Columns = split(/[$Delimiter]+/, $Header);
        # The type is the text between [ ], the name is what precedes it
        @ValueTypes = map {/\[([^\]]+)\]/; $1;} @Columns;
        @Columns = map {/\s+\[([^\]]+)\]/i; $`;} @Columns;

        # Every record gets one extra, calculated, column
        push(@Columns, 'SORTstring');
        for my $i (0 .. $#Columns)
        {
            $Columns[$i] = "EndTime" if $Columns[$i] eq 'End';
            $ColumnsTable{lc($Columns[$i])} = $i;
        };
        $HeaderListCreated = scalar(@Columns);
    };

    # Fill Table (NOTE: the first line has already been read!)
    while(defined($Line))
    {
        # Skip comments and empty lines
        if($Line =~ /^\s*(\#|$)/)
        {
            $Line = <$Input>;
            next;
        };
        chomp($Line);
        $Line =~ s/\s+$//g;

        # Split the lines
        my @Values = split(/$Delimiter/, $Line);
        my $String = lc($Values[$ColumnsTable{'value'}]);

        # Construct a string representation for sorting: drop a fixed list of
        # short words, drop words marked with *a or *x, drop the *v mark,
        # replace punctuation by blanks and normalize the white space.
        # NOTE(review): each of the first three substitutions also consumes
        # the white space after the removed word, so of two adjacent
        # removable words only the first is removed -- confirm this is intended.
        $String =~ s/(^|\s+)(\'t|\'n|\'m|z\'n|\'r|d\'r|m\'n|de)(\s+|$)/ /isg;
        $String =~ s/(^|\s+)(uhm|enne|uh|hmm)(\s+|$)/ /isg;
        $String =~ s/(^|\s+)([^\s\.\,]+\*[ax])(\s+|$)/ /isg;
        $String =~ s/\*v//isg;
        $String =~ s/\s*[\.\,\'\"\?\?]\s*/ /isg;
        $String =~ s/\s+/ /isg;
        $String =~ s/^\s+//isg;
        $String =~ s/\s+$//isg;
        push(@Values, $String);

        # Read next line (done here so every 'next' below continues with it)
        $Line = <$Input>;

        # Make sure NO values are entered that do NOT have the first two entries
        next unless $Values[0] && $Values[1];

        # The record must have exactly one value per column
        if(scalar(@Values) != $HeaderListCreated)
        {
            print STDERR "ERROR: ", scalar(@Values), " found where $HeaderListCreated expected\n";
            print STDERR join("\t", @Columns), "\n";
            print STDERR join("\t", @Values), "\n";
            next;
        };

        # Ignore empty or broken records
        # (i.e., with an undefined value like '', or no value at all)
        if(grep(/^(\'\'|\s*)$/, @Values[0..2]))
        {
            print STDERR "ERROR: broken record\n";
            print STDERR join("\t", @Values), "\n";
            next;
        };

        # Insert the record
        push(@result, \@Values);
    };

    # For a pipe, close also reports a failing gunzip; do not ignore that
    close($Input) || print STDERR "ERROR: closing $FilePath failed: $! (exit status $?)\n";
};
# Sort the result.
# The records themselves are not moved around. For every record a
# tab-separated key is built from its SORTstring, speaker, texttype and
# chunkid values, followed by the position of the record in @result.
# The keys are sorted as plain strings; the trailing position is used
# afterwards to find the record that belongs to a key.
my @SortColumns = @ColumnsTable{qw(sortstring speaker texttype chunkid)};

# $i counts the records, i.e., it is the position appended to each key
my $i = 0;
my @OrigResult = map { join("\t", @{$_}[@SortColumns], $i++) } @result;

# Default sort order is the string comparison
my @SortedResult = sort @OrigResult;
# Print output: for every sorted key, the id, speaker and value of the
# record it belongs to, each followed by a tab, one record per line.
my @PrintColumns = @ColumnsTable{qw(id speaker value)};

# Not referenced in the visible code
my $StringBlock;
my $PrevValue = "";

# Column number of the calculated sort string
my $SortStringColumn = $ColumnsTable{'sortstring'};

for my $SortKey (@SortedResult)
{
    # The last field of the key is the position of the record in @result
    my $RecordNumber = (split(/\t/, $SortKey))[-1];
    my $Record = $result[$RecordNumber];

    # Records without any non-blank character in the sort string are not printed
    next unless $Record->[$SortStringColumn] =~ /\S/;

    # Print
    print join("", map { $Record->[$_] . "\t" } @PrintColumns) . "\n";
};