use strict;
use warnings;

my $VERSION;
$VERSION = '1.01';

# This script converts Praat (http://www.fon.hum.uva.nl/praat/) annotation
# format (.TextGrid) file to HTK (http://htk.eng.cam.ac.uk/) annotation format
# (.lab) file.

# See the POD documentation at the end of this file
# or run `perl tg2lab.pl --man'
# for more information.

use Getopt::Long;
use Pod::Usage;

my %opt = (
           help => 0,
           man => 0,
           i => undef,
           script => undef,
           ext => 'lab',
           verbose => 0,
           version => 0,
);

GetOptions(\%opt,
           'help|?',
           'man',
           'i=s',
           'script|S=s',
           'ext|x=s',
           'verbose|v',
           'version',
) or pod2usage(1);

pod2usage(1) if $opt{help};
pod2usage(-exitstatus => 0, -verbose => 2) if $opt{man};
print "$VERSION\n" and exit(0) if $opt{version};

if (@ARGV == 0 && !$opt{script}) {
    pod2usage(1);
    exit(0);
}

my $tgFileName = '';
my $labFileName = '';
my $mlfFileName = $opt{i};

my $mlfFile;
if ($mlfFileName) {
    if (!open($mlfFile, ">$mlfFileName")) {
        my $msg = "ERROR: Cannot create file $mlfFileName: $!.\n";
        die $msg;
    }
    print $mlfFile "#!MLF!#\n";
    close($mlfFile);
}

if (@ARGV > 0) {
    $tgFileName = $ARGV[0];
    if (@ARGV > 1) {
        $labFileName = $ARGV[1];
        if (@ARGV > 2) {
            warn "WARNING: Too many arguments is given to the script.\n";
        }
    }
    else {
        $labFileName = ChangeFileExt($tgFileName, $opt{ext});
    }
    
    print "\nConverting $tgFileName -> $labFileName\n";
    tg2lab($tgFileName, $labFileName, $mlfFileName);
    print "Done.\n";
}

if ($opt{script}) {
    my $scriptFileName = $opt{script};
    if (-e $scriptFileName) {
        print "\nProcessing $scriptFileName. Please wait...\n";
        my $scriptFile;
        if (!open($scriptFile, "<$scriptFileName")) {
            my $msg = "ERROR: Can't open file $scriptFileName for reading: $!.\n";
            die $msg;
        }
        my $lineNo = 0;
        my $numFiles = 0;
        while (<$scriptFile>) {
            chomp;
            $lineNo++;
            if ($_ !~ /^\s*$/) {        # if current line is not empty, then ...
                my @fields = ();
                @fields = ($_ =~ /^\s*("[^"]+"|\S+)\s*("[^"]+"|\S+)?s*$/);
                my $tgFileName = '';
                my $labFileName = '';
                if ($fields[0]) {
                    $tgFileName = $fields[0];
                    $tgFileName =~ s/"//g;
                    if ($fields[1]) {
                        $labFileName = $fields[1];
                        $labFileName =~ s/"//g;
                    }
                    else {
                        $labFileName = ChangeFileExt($tgFileName, $opt{ext});
                    }
                }
                else {
                    my $msg = "Bad format of file $scriptFileName at line $lineNo.\n";
                    die $msg;
                }
                $numFiles++;
                if ($opt{verbose}) {
                    print "$numFiles: Converting $tgFileName -> $labFileName\n";
                }
                tg2lab($tgFileName, $labFileName, $mlfFileName);
            }
        }
        close($scriptFile);
        print "Done. $numFiles files processed.\n";
    }
    else {
        my $msg = "ERROR: Can't find file $scriptFileName.\n";
        die $msg;
    }
}

sub tg2lab
{
    my $tgFileName = shift;
    my $labFileName = shift;
    my $mlfFileName = shift;

    # Reading TextGrid file
    my $tgFile;
    if (!open($tgFile, "<$tgFileName")) {
        my $msg = "ERROR: Can't open file $tgFileName for reading: $!.\n";
        die $msg;
    }
    my @lines = <$tgFile>;
    close($tgFile);
    
    # Analyzing TextGrid file
    my $l = 0;  # index of current line
    chomp(@lines);
    if (not exists($lines[$l]) or $lines[$l++] !~ /^\s*File\s+type\s*=\s*"ooTextFile"\s*$/) {
        my $msg = "ERROR: Bad header of file $tgFileName: 'File type = \"ooTextFile\"' not found.\n";
        die $msg;
    }
    if (not exists($lines[$l]) or $lines[$l++] !~ /^\s*Object\s+class\s*=\s*"TextGrid"\s*$/) {
        my $msg = "ERROR: Bad header of file $tgFileName: 'Object class = \"TextGrid\"' not found.\n";
        die $msg;
    }
    
    if (not exists($lines[$l])) {
        my $msg = "ERROR: File $tgFileName is corrupt.\n";
        die $msg;
    }
    
    if ($lines[$l] =~ /^\s*$/) {
        $l++;
    }
    
    my $xmin_pattern = '^\s*xmin\s*=\s*([\d\.,eE\+-]+)\s*$';
    my $xmax_pattern = '^\s*xmax\s*=\s*([\d\.,eE\+-]+)\s*$';
    
    if (not exists($lines[$l]) or $lines[$l++] !~ /$xmin_pattern/) {
        my $msg = "ERROR: Cannot read global xmin value.\n";
        die $msg;
    }
    
    if (not exists($lines[$l]) or $lines[$l++] !~ /$xmax_pattern/) {
        my $msg = "ERROR: Cannot read global xmax value.\n";
        die $msg;
    }
    
    if (not exists($lines[$l]) or $lines[$l++] !~ /^\s*tiers\?\s*<exists>\s*$/) {
        my $msg = "ERROR: 'tiers? <exists>' not found.\n";
        die $msg;
    }
    
    my $items_size;
    if (not exists($lines[$l]) or $lines[$l++] !~ /^\s*size\s*=\s*(\d+)\s*$/) {
        my $msg = "ERROR: Cannot read tiers size value.\n";
        die $msg;
    }
    $items_size = $1;
    
    if (not exists($lines[$l]) or $lines[$l++] !~ /^\s*item\s*\[\]:\s*$/) {
        my $msg = "ERROR: 'item []: ' not found.\n";
        die $msg;
    }
    
    my @items = ();
    
    my $i = 1;  # index of current item
    while ($i <= $items_size) {
        if (not exists($lines[$l]) or $lines[$l++] !~ /^\s*item\s*\[$i\]:\s*$/) {
            my $msg = "ERROR: 'item [$i]:' not found.\n";
            die $msg;
        }
        if (not exists($lines[$l]) or $lines[$l++] !~ /^\s*class\s*=\s*"IntervalTier"\s*$/) {
            my $msg = "ERROR: 'class = \"IntervalTier\"' of 'item [$i]' not found.\n";
            die $msg;
        }
        if (not exists($lines[$l]) or $lines[$l++] !~ /^\s*name\s*=\s*".+"\s*$/) {
            my $msg = "ERROR: Cannot read 'name' of 'item [$i]'.\n";
            die $msg;
        }
        
        if (not exists($lines[$l]) or $lines[$l++] !~ /$xmin_pattern/) {
            my $msg = "ERROR: Cannot read xmin value of 'item [$i]'.\n";
            die $msg;
        }
        
        if (not exists($lines[$l]) or $lines[$l++] !~ /$xmax_pattern/) {
            my $msg = "ERROR: Cannot read xmax value of 'item [$i]'.\n";
            die $msg;
        }
        
        my $intervals_size = 0;
        if (not exists($lines[$l]) or $lines[$l++] !~ /^\s*intervals:\s*size\s*=\s*(\d+)\s*$/) {
            my $msg = "ERROR: Cannot read 'intervals: size' of 'item [$i]'.\n";
            die $msg;
        }
        $intervals_size = $1;
        
        my @intervals = ();
        
        my $j = 1;  # index of current interval in item
        while ($j <= $intervals_size) {
            if (not exists($lines[$l]) or $lines[$l++] !~ /^\s*intervals\s*\[$j\]:\s*$/) {
                my $msg = "ERROR: 'intervals [$j]:' of 'item [$i]:' not found.\n";
                die $msg;
            }
            my $xmin = 0;
            my $xmax = 0;
            my $text = 0;
            
            if (not exists($lines[$l]) or $lines[$l++] !~ /$xmin_pattern/) {
                my $msg = "ERROR: Cannot read xmin value of 'intervals [$j]:' of 'item [$i]'.\n";
                die $msg;
            }
            $xmin = $1;
            
            if (not exists($lines[$l]) or $lines[$l++] !~ /$xmax_pattern/) {
                my $msg = "ERROR: Cannot read xmax value of 'intervals [$j]:' of 'item [$i]'.\n";
                die $msg;
            }
            $xmax = $1;
            
            if (not exists($lines[$l]) or $lines[$l++] !~ /^\s*text\s*=\s*"(.+)"\s*$/) {
                my $msg = "ERROR: Cannot read xmax value of 'intervals [$j]:' of 'item [$i]'.\n";
                die $msg;
            }
            $text = $1;
            $text =~ s/""/"/g;
            
            my %interval = (
                xmin => $xmin,
                xmax => $xmax,
                text => $text,
            );
            
            push @intervals, { %interval };
            
            $j++;
        }
        
        push @items, [ @intervals ];
        
        $i++;
    }
    
    # sort items by intervals size
    if ($#items > 0) {
        for ($i = 0; $i < $#items; $i++) {
            for (my $j = $#items; $j > $i; $j--) {
                if ( $#{$items[$i]} < $#{$items[$j]} ) {
                    my $tmp_item = $items[$i];
                    $items[$i] = $items[$j];
                    $items[$j] = $tmp_item;
                }
            }
        }
    }
    
    my @k = (); # auxiliary array
    for ($i = 0; $i <= $#items; $i++) {
        $k[$i] = 0;
    }
    
    my ($labFile, $mlfFile);
    if (!open($labFile, ">$labFileName")) {
        my $msg = "ERROR: Can't create file $labFileName: $!.\n";
        die $msg;
    }
    
    if ($mlfFileName) {
        if (!open($mlfFile, ">>$mlfFileName")) {
            my $msg = "ERROR: Cannot open file $mlfFileName for appending: $!.\n";
            die $msg;
        }
        print $mlfFile "\"$labFileName\"\n";
    }
    
    my ($diff, $next_diff) = 0;
    for (my $j = 0; $j <= $#{$items[0]}; $j++) {
        my $xmin = sprintf "%0.0f", $items[0][$j]{xmin} * 1.0e7;
        my $xmax = sprintf "%0.0f", $items[0][$j]{xmax} * 1.0e7;
        my $t = "$xmin $xmax $items[0][$j]{text}";
        print $labFile $t;
        if ($mlfFileName) {
            print $mlfFile $t;
        }
        for ($i = 1; $i <= $#items; $i++) {
            if ($k[$i] <= $#{$items[$i]}) {
                $diff = abs($items[$i][$k[$i]]{xmin} - $items[0][$j]{xmin});
                if ($j+1 <= $#{$items[0]}) {
                    $next_diff = abs($items[$i][$k[$i]]{xmin} - $items[0][$j+1]{xmin});
                }
                if ($diff <= $next_diff) {
                    $t = " $items[$i][$k[$i]]{text}";
                    print $labFile $t;
                    if ($mlfFileName) {
                        print $mlfFile $t;
                    }
                    $k[$i]++;
                }
            }
        }
        print $labFile "\n";
        if ($mlfFileName) {
            print $mlfFile "\n";
        }
    }
    
    if ($mlfFileName) {
        print $mlfFile ".\n";
    }
    
    close($labFile);
    if ($mlfFileName) {
        close($mlfFile);
    }
}

sub ChangeFileExt
{
    my $fileName = shift;
    my $ext = shift;
    
    if ($fileName =~ s/\.[-\w\ ]*$/\.$ext/) {
    }
    else {
        $fileName = $fileName.'.'.$ext;
    }
    return $fileName;
}

__END__

=head1 NAME

tg2lab.pl - convert Praat (http://www.fon.hum.uva.nl/praat/) annotation format
(.TextGrid) file to HTK (http://htk.eng.cam.ac.uk/) annotation format (.lab)
file.

=head1 SYNOPSIS

=over

=item B<tg2lab.pl> [I<options>] I<textGridFile> [I<labFile>]

=item B<tg2lab.pl> [I<options>] I<--script f> [I<textGridFile>] [I<labFile>]

=back

=head1 DESCRIPTION

Converts Praat (http://www.fon.hum.uva.nl/praat/) annotation format (.TextGrid)
file I<textGridFile> to HTK (http://htk.eng.cam.ac.uk/) annotation format (.lab)
file I<labFile>. If I<labFile> (output) file name is not provided,
I<textGridFile> (source) file name will be used but with different extension
(C<.lab> by default). Script file I<f> can be used for batch conversion of
multiple files. In this case, a list of each source file and (optional) its
corresponding output file should be provided in the script file.

=head1 OPTIONS

=over

=item B<-i> I<s>

Output transcriptions to Master Label File (MLF) I<s>. The default is off.

=item B<-S> I<f>, B<--script> I<f>

Set script file to I<f>. The script file I<f> can be used for batch conversion
of multiple files. In this case, a list of each source file and (optional) its
corresponding output file should be provided in the script file. The default is
none.

=item B<-x> I<ext>, B<--ext> I<ext>

Set default TextGrid output file extension to I<ext>. The default is
C<.TextGrid>.

=item B<-v>, B<--verbose>

Verbose output to the screen. The default is off.

=item B<-?>, B<--help>

Prints the B<SYNOPSIS> and B<OPTIONS> sections.

=item B<--man>

Prints the tg2lab.pl manual.

=item B<--version>

Prints the current version number of tg2lab.pl and exits.

=back

=head1 HISTORY

v1.01 (20090701):
  Fixed SCRIPT CATEGORIES section of POD documentation.

v1.00 (20090629):
  First public release.

=head1 AUTHOR

Mark Filipovic <F<markfi@cpan.org>>

=head1 COPYRIGHT

  Copyright (c) 2009 Mark Filipovic.  All rights reserved.
  This program is free software; you can redistribute it and/or modify it
  under the same terms as Perl itself.

=begin CPAN

=head1 README

This script converts Praat (http://www.fon.hum.uva.nl/praat/) annotation format
(.TextGrid) file to HTK (http://htk.eng.cam.ac.uk/) annotation format (.lab)
file.

=head1 PREREQUISITES

This script requires C<strict>, C<warnings>, C<Getopt::Long>, and C<Pod::Usage>
modules.

=head1 OSNAMES

any

=head1 SCRIPT CATEGORIES

Speech/Annotation
Speech/Labelling

=end CPAN

=cut
