#!/usr/bin/perl -w
# aif2sent: convert aif file (from Callisto) to sentences with tags
# usage: aif2sent < file
# 20060103 erikt@science.uva.nl

# --- fixed settings ---------------------------------------------------
$command = $0;                          # program name, shown in error messages
$noTag   = "O";                         # tag for words outside of spans
$tmpFile = "aif2sent.$$";               # prefix (with process id) of temporary files
$base64  = "/home/erikt/bin/base64";    # external base64 decoder

# --- state filled in while reading the aif file -----------------------
($id, $type, $textFile) = ("", "", ""); # most recent id, most recent type, text file name
%parameters = ();                       # point name -> number found in its Parameter element
%starts     = ();                       # start points of spans
%ends       = ();                       # end points of spans
%types      = ();                       # types of spans

# Read the aif file from STDIN line by line and collect:
#   $textFile   - value of the first xlink:href attribute encountered
#   %parameters - point name -> number preceding </Parameter>
#   %starts     - start point of each span (inclusive)
#   %ends       - end point of each span (non-inclusive)
#   %types      - type of each span
# The matching is purely line-based: role= and Parameter lines are attached
# to the most recent id= / type= attribute seen so far.
# A Base64-encoded <body> element, if present, is decoded to $tmpFile.txt.
while (defined(my $line = <STDIN>)) {
   chomp($line);
   # name of the file containing the text; it should be in the current directory
   if ($textFile eq "" and $line =~ /xlink:href=\"([^\"]*)\"/) {
      $textFile = $1;
   }
   if ($line =~ /^ *<body encoding=.Base64.>(.*)<.body>/) {
      # keep a copy: $1 does not survive further pattern matches
      my $encoded = $1;
      if (eval { require MIME::Base64; 1 }) {
         # decode in-process, no dependency on an external program
         open(my $out, '>', "$tmpFile.txt") or
            die "$command: cannot open $tmpFile.txt: $!\n";
         binmode($out);
         print {$out} MIME::Base64::decode_base64($encoded);
         close($out) or die "$command: cannot write $tmpFile.txt: $!\n";
      }
      else {
         # fallback: external decoder
         open(my $out, '>', "$tmpFile.base64") or
            die "$command: cannot open $tmpFile.base64";
         print {$out} "$encoded\n";
         close($out);
         if (system("$base64 -d $tmpFile.base64 > $tmpFile.txt") != 0) {
            # The shell redirect creates $tmpFile.txt even when decoding
            # fails; remove it so that it is not mistaken for the text.
            warn "$command: $base64 failed, ignoring embedded text\n";
            unlink("$tmpFile.txt");
         }
      }
   }
   # id for names of points, and start points and end points of spans
   if ($line =~ / id=\"([^\"]*)\"/) { $id = $1; }
   # type for types of spans
   if ($line =~ / type=\"([^\"]*)\"/) { $type = $1; }
   # %parameters contains the names of points in the text document
   if ($line =~ /(\d+)<\/Parameter>/) { $parameters{$id} = $1; }
   # %ends: end point of spans (non-inclusive)
   if ($line =~ /role=\"end\"/ and $line =~ /xlink:href=\"\#([^\"]*)\"/) {
      $ends{$id} = $parameters{$1};
   }
   # %starts: starting points of spans (inclusive)
   if ($line =~ /role=\"start\"/ and $line =~ /xlink:href=\"\#([^\"]*)\"/) {
      $starts{$id} = $parameters{$1};
   }
   # %types: types of spans, keyed by the span the text-extent refers to
   if ($line =~ /role=\"text-extent\"/ and $line =~ /xlink:href=\"\#([^\"]*)\"/) {
      $types{$1} = $type;
   }
}

# read the text document and convert it to a character array
# The decoded Base64 body ($tmpFile.txt) is preferred; if it cannot be
# opened, the file named in the aif file ($textFile) is used instead.
$text = "";
{
   my $in;
   # three-argument open: the file name comes from the input and must not
   # be interpreted as an open mode or a command
   open($in, '<', "$tmpFile.txt") or
      open($in, '<', $textFile) or
      die "$command: cannot open $textFile: $!\n";
   $text = join("", <$in>);
   close($in);
}
@chars = split(//,$text);

# mark the locations in the text document where a tag should be placed
# (behind each word within a span): $tags[$i] holds the type to print
# in front of character $i
@tags = ();
foreach my $key (keys %starts) {
   my ($from, $to, $label) = ($starts{$key}, $ends{$key}, $types{$key});
   # skip incomplete spans: an undefined offset would otherwise be used
   # as 0 and could put a tag at the wrong position
   next unless defined $from and defined $to and defined $label;
   # whitespace inside the span ends a word of the span
   for (my $i = $from; $i < $to and $i <= $#chars; $i++) {
      if ($chars[$i] =~ /\s/) { $tags[$i] = $label; }
   }
   # the last word of the span: tag goes right behind the span, unless the
   # span ends in whitespace (then the loop above has placed it already);
   # the range test avoids looking at $chars[-1] or beyond the text
   if ($to > 0 and $to <= @chars and $chars[$to-1] !~ /\s/) {
      $tags[$to] = $label;
   }
}

# output tagged text document: a pending span tag is printed in front of
# its character; whitespace without a span tag gets the outside tag
for (my $i = 0; $i <= $#chars; $i++) {
   if (defined $tags[$i]) { print "/$tags[$i]"; }
   elsif ($chars[$i] =~ /\s/) { print "/$noTag"; }
   print $chars[$i];
}
# if the final word received a tag that has not been printed yet
# (the tag sits one position past the last character: index $#chars+1)
if (@chars and $chars[$#chars] !~ /\s/ and defined $tags[$#chars+1]) {
   print "/",$tags[$#chars+1];
}

# finished: remove the temporary files used for the embedded text body
# and leave with exit status 0
foreach my $leftover ("$tmpFile.base64","$tmpFile.txt") {
   unlink($leftover);
}
exit(0);
