#! /usr/bin/perl
#
# Special purpose program for Unix/Linux:
# Read all arguments as aligned praat Celex label files with 
# translit, words, POS, wordfrequency, syllables, and syllable frequency tiers.
#
# Construct a Database table for each tier.
#
# Use:
# ./ConstructCelexTables.pl <FileGlob>
#
# This will read in all files from <FileGlob> (e.g., ../F*/sentences/F*)
# and constructs Table files with the names <Speaker><tier>.txt
# in the the DatabaseFiles/ directory
#
#
###############################################################################
#
# Copyright R.J.J.H. van Son © 2000, 2001
#
# Author Rob van Son
# Institute of Phonetic Sciences & ACLC
# University of Amsterdam
# Herengracht 338
# NL-1016CG Amsterdam, The Netherlands
# Email: Rob.van.Son@hum.uva.nl
#        rob.van.son@workmail.com
# WWW  : http://www.fon.hum.uva.nl/rob/
# mail:  Institute of Phonetic Sciences
#        University of Amsterdam
#        Herengracht 338
#        NL-1016CG Amsterdam
#        The Netherlands
#        tel +31 205252183
#        fax +31 205252197
#
# License for use and disclaimers
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
#
#######################################################
#
$HomeDir = '../../..';
require "$HomeDir/Links.pl";
require "$Scripts/CGN_phonemes.pl";  # For the phoneme classes
require "$Scripts/SentenceLabel.pl";

#
# Where are the files
my $ShadowingDir = "$HomeDir/home/Shadowing";
my $TypeExtension = 'shadow';
my $TableDirectory = "$ShadowingDir/DBMStables";
my $SpeakerDirs = "$ShadowingDir/";

#
# Tiers to handle
@TierList = ('PHONEMES', 'SYLLPARTS', 'SYLLABLES', 'LEXSYLL', 'WORDS', 'CGN', 'LEXCGN', 'TRANSLIT');

# Postediting routines for all tiers
# Given a Sentence Label and a Record list, they will change/add the
# values in the Record.
# 
# ColumnHeaderPostEditing list will change column header names (should work on $_)
my @ColumnHeaderPostEditing = (
's/Value\s+\[TEXT\]/Value [FLOAT4]/g if $Tier =~ /FREQ/i',
's/TRANSLIT/SENTENCE/g',
's/WORDFREQ/WORDS/g',
's/LEXSYLLFREQ/SYLLABLES/g',
's/SYLLFREQ/SYLLABLES/g',
's/POS/WORDS/g',
's/(LEX)?CGN/WORDS/g',
's/LEXSYLL/SYLLABLES/g',
);


# Extract the difference between the start of the current item with the Shadow 
# Tier. If $End is true, align the end points
sub getShaddowDelay  # ($Label, $Tier, $End)
{
	my $Label = shift || return 0;
	my $Tier = shift || return 0;
	my $End = shift || 0;
	#
	# Make sure the PHONEMES tier is not shifted
	$Label->storeItem('PHONEMES');
	my $Delay = 0;
	unless($End)
	{
		my $ItemNumber = $Label->StartAlignedinTier($Tier, 'PHONEMES');
		return 0 unless $ItemNumber > -1;
		$Label->currentItem('SHADOW', $ItemNumber);
		$Delay = $Label->currentStart('SHADOW') - $Label->currentStart($Tier);
	}
	else
	{
		my $ItemNumber = $Label->EndAlignedinTier($Tier, 'PHONEMES');
		return 0 unless $ItemNumber > -1;
		$Label->currentItem('SHADOW', $ItemNumber);
		$Delay = $Label->currentEnd('SHADOW') - $Label->currentEnd($Tier);
	};
	
	# Make sure the PHONEMES tier is not shifted
	$Label->restoreItem('PHONEMES');
	
	#
	# Return delay
	return $Delay;
};

# The Column Headers list will add the corresponding column headers.
my @PHONEMEScolumnHeaders = (
'Manner [CHAR(1)] Manner of Articulation', 
'Articulator [CHAR(1)] Primary Articulator', 
'Voicing [CHAR(1)] Voiced (+) / Unvoiced (-)',
'prevManner [CHAR(1)] Manner for previous item', 
'prevArticulator [CHAR(1)] Articulator for previous item', 
'prevVoicing [CHAR(1)] Voicing of previous item',
'nextManner [CHAR(1)] Manner for next item', 
'nextArticulator [CHAR(1)] Articulator for next item', 
'nextVoicing [CHAR(1)] Voicing of next item',
'startdelay [FLOAT4] The shadowing start delay',
'enddelay [FLOAT4] The shadowing end delay'
);
sub PHONEMESedit        # ($Label, \@Records)
{
    my $Label = shift || return 0;
    my $Record = shift || return 0;
    
    # Phoneme information of current, previous and next Phoneme
    my $i;
    foreach $i (5, 13, 15)
    {
        # Get Manner, Articulator and Voicing
        my $Manner = $CGN_phonemes{$Record->[$i]}[0];
        my $Articulator = $CGN_phonemes{$Record->[$i]}[1];
        my $Voicing = $CGN_phonemes{$Record->[$i]}[2];
        
        push(@$Record, $Manner, $Articulator, $Voicing);
    };
    
    # Get delay
    my $startdelay = getShaddowDelay($Label, 'PHONEMES', 0);
    my $enddelay = getShaddowDelay($Label, 'PHONEMES', 1);
    push(@$Record, $startdelay, $enddelay);
    
    return @$Record;
}

my @SYLLPARTScolumnHeaders = (
'NumPhon [INT2] Number of phonemes in syllable part',
'startdelay [FLOAT4] The shadowing start delay',
'enddelay [FLOAT4] The shadowing end delay'
);
sub SYLLPARTSedit       # ($Label, \@Records)  
{
    my $Label = shift || return 0;
    my $Record = shift || return 0;
    
    my $Value = $Record->[5];
    $Value =~ s/[\+\-\~\:]//g;
    push(@$Record, length($Value));
    
    # Get delay
    my $startdelay = getShaddowDelay($Label, 'SYLLPARTS', 0);
    my $enddelay = getShaddowDelay($Label, 'SYLLPARTS', 1);
    push(@$Record, $startdelay, $enddelay);
    
    return 1;
}

my @SYLLABLEScolumnHeaders = (
'Stress [CHAR(1)] Syllable stress (+/-)', 
'prevStress [CHAR(1)] Stress of previous item', 
'nextStress [CHAR(1)] Stress of next item', 
'NumPhon [INT2] Number of phonemes in syllable',
'startdelay [FLOAT4] The shadowing start delay',
'enddelay [FLOAT4] The shadowing end delay'
);
sub SYLLABLESedit       # ($Label, \@Record)  
{
    my $Label = shift || return 0;
    my $Record = shift || return 0;
    
    # The syllable stress marker
    my $StressedSyllable = ($Record->[5] =~ s/\"//g) ? '+' : '-';
    my $prevStressedSyllable = ($Record->[13] =~ s/\"//g) ? '+' : '-';
    my $nextStressedSyllable = ($Record->[15] =~ s/\"//g) ? '+' : '-';
    
    push(@$Record, $StressedSyllable, $prevStressedSyllable, $nextStressedSyllable);
    
    my $Value = $Record->[5];
    $Value =~ s/[\+\-\~\:]//g;
    push(@$Record, length($Value));
    
    # Get delay
    my $startdelay = getShaddowDelay($Label, 'SYLLABLES', 0);
    my $enddelay = getShaddowDelay($Label, 'SYLLABLES', 1);
    push(@$Record, $startdelay, $enddelay);
    
    return 1;
}

my @SYLLFREQcolumnHeaders = (
'Stress [CHAR(1)] Syllable stress (+/-)', 
'prevStress [CHAR(1)] Stress of previous item', 
'nextStress [CHAR(1)] Stress of next item', 
'NumPhon [INT2] Number of phonemes in syllable',
'startdelay [FLOAT4] The shadowing start delay',
'enddelay [FLOAT4] The shadowing end delay'
);
sub SYLLFREQedit        # ($Label, \@Record)  
{
    my $Label = shift || return 0;
    my $Record = shift || return 0;
    
    # The syllable stress marker
    my $StressedSyllable = ($Record->[5] =~ s/\"//g) ? '+' : '-';
    my $prevStressedSyllable = ($Record->[13] =~ s/\"//g) ? '+' : '-';
    my $nextStressedSyllable = ($Record->[15] =~ s/\"//g) ? '+' : '-';
    
    push(@$Record, $StressedSyllable, $prevStressedSyllable, $nextStressedSyllable);
    
    my $Value = $Record->[5];
    $Value =~ s/[\+\-\~\:]//g;
    push(@$Record, length($Value));
    
    # Get delay
    my $startdelay = getShaddowDelay($Label, 'SYLLFREQ', 0);
    my $enddelay = getShaddowDelay($Label, 'SYLLFREQ', 1);
    push(@$Record, $startdelay, $enddelay);
    
    return 1;
}

my @LEXSYLLFREQcolumnHeaders = (
'Stress [CHAR(1)] Syllable stress (+/-)', 
'prevStress [CHAR(1)] Stress of previous item', 
'nextStress [CHAR(1)] Stress of next item', 
'NumPhon [INT2] Number of phonemes in syllable',
'startdelay [FLOAT4] The shadowing start delay',
'enddelay [FLOAT4] The shadowing end delay'
);
sub LEXSYLLFREQedit     # ($Label, \@Record)  
{
    my $Label = shift || return 0;
    my $Record = shift || return 0;
    
    # The syllable stress marker
    my $StressedSyllable = ($Record->[5] =~ s/\"//g) ? '+' : '-';
    my $prevStressedSyllable = ($Record->[13] =~ s/\"//g) ? '+' : '-';
    my $nextStressedSyllable = ($Record->[15] =~ s/\"//g) ? '+' : '-';
    
    push(@$Record, $StressedSyllable, $prevStressedSyllable, $nextStressedSyllable);
    
    my $Value = $Record->[5];
    $Value =~ s/[\+\-\~\:]//g;
    push(@$Record, length($Value));
    
    # Get delay
    my $startdelay = getShaddowDelay($Label, 'LEXSYLLFREQ', 0);
    my $enddelay = getShaddowDelay($Label, 'LEXSYLLFREQ', 1);
    push(@$Record, $startdelay, $enddelay);
    
    return 1;
}

my @CGNcolumnHeaders = (
'NumSyll [INT2] Number of syllables in word', 
'StressPatt [TEXT] Stress patter of word', 
'NumPhon [INT2] Number of phonemes in word',
'startdelay [FLOAT4] The shadowing start delay',
'enddelay [FLOAT4] The shadowing end delay'
);
sub CGNedit     # ($Label, \@Records)  
{
    my $Label = shift || return 0;
    my $Record = shift || return 0;
    
    my $Value = $Record->[5];
    # Get the number of Syllables and the stress pattern
    my @Syllables = map {/\"/ ? 1 : 0;} split(/[\-\_]/, $Value);
    push(@$Record, scalar(@Syllables), join("", @Syllables));
    # Get the number of phonemes
    $Value =~ s/[\+\-\~\:]//g;
    push(@$Record, length($Value));
    
    # Get delay
    my $startdelay = getShaddowDelay($Label, 'CGN', 0);
    my $enddelay = getShaddowDelay($Label, 'CGN', 1);
    push(@$Record, $startdelay, $enddelay);
    
    return 1;
}

my @WORDScolumnHeaders = ('Marker [CHAR(1)] Star marker of transliteration (a/u/v/x)',
'startdelay [FLOAT4] The shadowing start delay',
'enddelay [FLOAT4] The shadowing end delay'
);
sub WORDSedit   # ($Label, \@Records)  
{
    my $Label = shift || return 0;
    my $Record = shift || return 0;
    
    # Split of transcription markers like *a, *u, *v, and *x
    my $Value = $Record->[5];
    my ($Word, $Marker) = ($Value, 'NULL');
    if($Word =~ /\*(\S)/)
    {
        $Marker = uc($1);
        $Word = $`;
    };
    $Record->[5] = $Word;
    
    push(@$Record, $Marker);
    
    # Remove markers from other words
    $Record->[13] =~ s/\*[\w]+//g;
    $Record->[15] =~ s/\*[\w]+//g;
    
    # Get delay
    my $startdelay = getShaddowDelay($Label, 'WORDS', 0);
    my $enddelay = getShaddowDelay($Label, 'WORDS', 1);
    push(@$Record, $startdelay, $enddelay);
    
    return 1;
}

my @WORDFREQcolumnHeaders = (
'startdelay [FLOAT4] The shadowing start delay',
'enddelay [FLOAT4] The shadowing end delay'
);
sub WORDFREQedit        # ($Label, \@Records)  
{
    my $Label = shift || return 0;
    my $Record = shift || return 0;
    
    # Get delay
    my $startdelay = getShaddowDelay($Label, 'WORDFREQ', 0);
    my $enddelay = getShaddowDelay($Label, 'WORDFREQ', 1);
    push(@$Record, $startdelay, $enddelay);
    
    return 1;
}

my @POScolumnHeaders = (
'startdelay [FLOAT4] The shadowing start delay',
'enddelay [FLOAT4] The shadowing end delay'
);
sub POSedit     # ($Label, \@Records)  
{
    my $Label = shift || return 0;
    my $Record = shift || return 0;
    
    # Get delay
    my $startdelay = getShaddowDelay($Label, 'POS', 0);
    my $enddelay = getShaddowDelay($Label, 'POS', 1);
    push(@$Record, $startdelay, $enddelay);
    
    return 1;
}

my @TRANSLITcolumnHeaders = (
'NrWords [INT2] Number of words in transliteration', 
'NrSyllables [INT2] Number of syllables in transliteration', 
'NrPhonemes [INT2] Number of phonemes in transliteration',
'WordRate [FLOAT4] Rate in words per second', 
'SyllableRate [FLOAT4] Rate in syllables per second', 
'PhonemeRate [FLOAT4] Rate in phonemes per second',
'startdelay [FLOAT4] The shadowing start delay',
'enddelay [FLOAT4] The shadowing end delay'
);
sub TRANSLITedit        # ($Label, \@Records) 
{
    my $Label = shift || return 0;
    my $Record = shift || return 0;
    
    # The word tier
    my $WordTier = 'CGN';
    
    # Remove unwanted items
    splice(@$Record, 12, 4);
    
    # Initialize
    my ($WordNumber, $SyllableNumber, $PhonemeNumber) = (0, 0, 0);
    my $Duration = 0;
    
    # Get the mean words, syllables and phonemes per second from CGN
    # First, count the non-pause words etc. and sum their durations
    $Label->presetItem($WordTier);
    while($Label->nextStartininterval($WordTier, 'TRANSLIT') > -1)
    {
        my $Word = $Label->currentValue($WordTier);
        
        # Skip (filled) pauses (Silence, uh, uhm, enne)
        next if $Word =~ /^(\*|m|\@[mh]?|E[\-n]+\@)$/;
        
        # Get the duration
        $Duration += $Label->currentEnd($WordTier) - $Label->currentStart($WordTier);
        
        # Get Syllables
        my @SyllableList = split(/[\-\_]/, $Word);
        my $Phonemes = $Word;
        # Remove all superfluous chars
        $Phonemes =~ s/[\"\'\-\:\~\+]//g;
        
        # Count words, syllables and phonemes
        ++$WordNumber;
        $SyllableNumber += scalar(@SyllableList);
        $PhonemeNumber += length($Phonemes);
    };
    
    # Calculate the rates (in items per second)
    my ($WordRate, $SyllableRate, $PhonemeRate) = (0, 0, 0);
    if($Duration > 0)
    {
        $WordRate = $WordNumber / $Duration;
        $SyllableRate = $SyllableNumber / $Duration;
        $PhonemeRate = $PhonemeNumber / $Duration;
    };
    
    # Add numbers and rates
    push(@$Record, $WordNumber, $SyllableNumber, $PhonemeNumber, $WordRate, $SyllableRate, $PhonemeRate);
    
    # Get delay
    my $startdelay = getShaddowDelay($Label, 'TRANSLIT', 0);
    my $enddelay = getShaddowDelay($Label, 'TRANSLIT', 1);
    push(@$Record, $startdelay, $enddelay);
    
    return 1;
}


my %PostEditors = (
'PHONEMES' => \&PHONEMESedit, 
'SYLLPARTS' => \&SYLLPARTSedit, 
'SYLLFREQ' => \&SYLLFREQedit, 
'LEXSYLLFREQ' => \&LEXSYLLFREQedit, 
'SYLLABLES' => \&SYLLABLESedit, 
'CGN' => \&CGNedit, 
'POS' => \&POSedit, 
'WORDFREQ' => \&WORDFREQedit, 
'WORDS' => \&WORDSedit, 
'TRANSLIT' => \&TRANSLITedit
);

my %PostHeader = (
'PHONEMES' => \@PHONEMEScolumnHeaders, 
'SYLLPARTS' => \@SYLLPARTScolumnHeaders, 
'SYLLFREQ' => \@SYLLFREQcolumnHeaders, 
'LEXSYLLFREQ' => \@LEXSYLLFREQcolumnHeaders, 
'SYLLABLES' => \@SYLLABLEScolumnHeaders, 
'CGN' => \@CGNcolumnHeaders, 
'POS' => \@POScolumnHeaders, 
'WORDS' => \@WORDScolumnHeaders, 
'WORDFREQ' => \@WORDFREQcolumnHeaders, 
'TRANSLIT' => \@TRANSLITcolumnHeaders
);


#
# Major loop, process all speakers
my $FileGlob;

foreach $FileGlob (@ARGV)
{
    # Get the label filenames
    @PhonemeFileList = glob("$FileGlob");
    # Process all files  
    my $File;
    foreach $File (@PhonemeFileList)
    {
        # Get the ID of the current speaker
        my $CurrentSpeaker = "";
	my $CurrentFileID = "";
	my $OrigID = "";
	my $OrigSpeaker = "";
        if($File =~ m@/([^/]+)/sentences/(([^/\.\+]+)\+([^/\.\+]+))\.$TypeExtension$@)
        {
            $CurrentSpeaker = $1;
	    $CurrentFileID = $2;
	    $OrigID = $4;
	    $OrigID =~ /^([FM][\d]+[A-Z]+)/;
	    $OrigSpeaker = $1;
        }
        else
        {
            die "No speaker found: $File\n";
        };
        
        # Get the ID of the current author
        my $CurrentAuthor = 'NULL';
        
        my $Label = new SentenceLabel;
        next unless $Label->ReadLabelFile($File);
        
        $Label->presetItem('TRANSLIT');
        while($Label->nextItem('TRANSLIT') > -1)
        {
            # Skip empty sentences
            next unless $Label->currentValue('TRANSLIT') =~ /[^\#\*\s]/;
            
            # Correct some odd errors (SentenceID =! TranslitID)
            $Label->currentItem('SENTENCE', $Label->currentItem('TRANSLIT'));
            $Label->currentIDcode('SENTENCE', $Label->currentIDcode('TRANSLIT'));
            
            my $Tier;
            foreach $Tier (@TierList)
            {
                my @Table = $Label->TierToTable($Tier, 'TRANSLIT', $PostEditors{$Tier}, 
		$CurrentSpeaker, $CurrentFileID, $CurrentAuthor);
                print STDERR "Error with '$Tier' from $File\n" unless @Table;
                
                # Print result to file
                my $PrintHeader = (-e "$TableDirectory/$CurrentSpeaker$Tier$TypeExtension.txt") ? 0 : 1;
                open(OUTPUT, ">>$TableDirectory/$CurrentSpeaker$Tier$TypeExtension.txt") 
                || die ">>$TableDirectory/$CurrentSpeaker$Tier$TypeExtension.txt: $!\n";
                
                # Print the column headers
                if($PrintHeader)
                {
                    # Make sure the tier is somewhere
                    $Label->currentItem($Tier, 1) unless $Tier eq 'TRANSLIT';
                    
                    # Construct the list with column headers
                    my @CurrentHeaders = $Label->TierToTableColumnheaders($Tier, @{$PostHeader{$Tier}}, 
                    'ShadowSpeaker [TEXT] ID of shadow speaker', 
                    'ShadowFile [TEXT] ID of shadow label file', 
		    'Author [TEXT] ID of annotator');
                    # Remove some items if using the TRANSLIT tier
                    splice(@CurrentHeaders, 12, 4) if $Tier eq 'TRANSLIT';
                    
                    # Post editing of the column headers
                    @CurrentHeaders = map {eval join(";", @ColumnHeaderPostEditing); $_;} @CurrentHeaders;
                    
                    # Print the column headers
                    print OUTPUT "\#> ", join("\n\#> ", @CurrentHeaders), "\n";
                    
                    # Print some extra information
                    print OUTPUT "\# ";
                    print OUTPUT "Tier: $Tier\n\# Speaker: $CurrentSpeaker\n\# Source: $TypeExtension\n";
                    print OUTPUT "\#\n";

		    # Print the copyright license
		    if(open(LICENSE, "<./licenseterms.txt"))
		    {
			    my $Date = localtime();
			    $Date =~ /\s([\d]+)\s*$/;
			    my $Year = $1;
			    while(<LICENSE>)
			    {
				    if(/\©/ && ! /$Year/)
				    {
					    s/(\d)\s{6}(\s+\#)/\1, $Year\2/ig;
				    };
				    print OUTPUT $_;
			    };
			    close(LICENSE)
		    };

                    
                    # Header is printed ONLY once
                    $PrintHeader = 0;
                };
                
                my $Record;
                foreach $Record (@Table)
                {
                    print OUTPUT join("\t", @$Record), "\n";
                };
                close(OUTPUT);
            };
        };
    };
    
};



