#!/usr/bin/perl -w
# recap: give sentence-initial words their most frequent capitalization
#        (typically: change initial upper case to lower case)
# usage: cat tokenized-files | recap > file.stats
#        recap file.stats < filetocheck
# notes: tokenized files contain one sentence per line
#        based on casestats
# version 20030819
# 20030717 erikt@uia.ua.ac.be

use strict;
use locale;

# Mode selection: with at least one argument the arguments are statistics
# files to apply to STDIN; without arguments statistics are collected
# from STDIN.
if (defined $ARGV[0]) {
   useUpperCase();
}
else {
   findUpperCase();
}
exit(0);

# _bestCaseVariant: pick the most frequent spelling from a count table.
# argument: reference to a hash mapping spelling => number of occurrences
# returns:  (spelling, count) of the winner; ("", -1) for an empty table
sub _bestCaseVariant {
   my ($counts) = @_;
   my ($bestW,$bestScore) = ("",-1);

   foreach my $word (keys %$counts) {
      if ($counts->{$word} > $bestScore or
          # tie breaker: choose last in alphabet (favors upper in en_US)
          ($counts->{$word} == $bestScore and $word gt $bestW)) {
         $bestW = $word;
         $bestScore = $counts->{$word};
      }
   }
   return ($bestW,$bestScore);
}

# findUpperCase: collect capitalization statistics from the text on STDIN.
# input:  one sentence per line. Lines containing <...> are treated as
#         markup and are never counted; a <head> or <text> tag on such a
#         line sets the current context, and lines read while the context
#         is <head> are skipped. Lines without a blank followed by a
#         lower case letter are skipped as well.
# counts: the sentence-initial word is dropped; every remaining token
#         whose lower case form contains [a-z] is counted per spelling,
#         and every pair of adjacent such tokens is counted as a bigram.
# output: on STDOUT, first one line "Spelling count" per word with its
#         most frequent spelling, then one line "Word1 Word2 count" per
#         bigram whose most frequent spelling has both words different
#         from their lower case forms and differs from the spellings
#         chosen for the two separate words.
#         Lines are printed in sorted key order so that the same input
#         always produces the same statistics file.
sub findUpperCase {
   my (%unigram,   # $unigram{"new"}{"New"} = 32
       %bigram,    # $bigram{"new york"}{"New\tYork"} = 12
       %bestForm); # $bestForm{"new"} = "new"
   my $context = "";

   LOOPL: while (my $line = <STDIN>) {
      chomp($line);
      # skip html
      if ($line =~ /<.*>/) {
         if ($line =~ /<(head|text)>/) { $context = $line; }
         next LOOPL;
      }
      # remove quotes, etc.
      $line =~ s/^\W*//;
      # skip headlines
      if ($context =~ /<head>/) { next LOOPL; }
      # skip lines without a lower case word after the first token
      if ($line !~ / [a-z]/) { next LOOPL; }

      my @words = split(/\s+/,$line);
      # do not use sentence-initial word: drop everything up to and
      # including the first token that contains a word character
      while (@words) {
         my $ignore = shift(@words);
         if ($ignore =~ /\w/) { last; }
      }
      my @lower = map { lc($_) } @words;
      LOOPW: for my $i (0..$#words) {
         if ($lower[$i] !~ /[a-z]/) { next LOOPW; }
         $unigram{$lower[$i]}{$words[$i]}++;
         # a bigram needs a counted word directly to the left
         if ($i == 0 or $lower[$i-1] !~ /[a-z]/) { next LOOPW; }
         $bigram{"$lower[$i-1] $lower[$i]"}{"$words[$i-1]\t$words[$i]"}++;
      }
   }
   foreach my $lower (sort keys %unigram) {
      my ($bestW,$bestScore) = _bestCaseVariant($unigram{$lower});
      # remembered for the bigram filter below
      $bestForm{$lower} = $bestW;
      print "$bestW $bestScore\n";
   }
   foreach my $lower (sort keys %bigram) {
      my ($bestW,$bestScore) = _bestCaseVariant($bigram{$lower});
      my ($lower1,$lower2) = split(/ +/,$lower);
      my ($word1,$word2) = split(/\t+/,$bestW);
      # both parts of a bigram were also counted as single words, so
      # $bestForm holds an entry for each of them
      if ($word1 ne $lower1 and $word2 ne $lower2 and
          ($word1 ne $bestForm{$lower1} or $word2 ne $bestForm{$lower2})) {
         print "$word1 $word2 $bestScore\n";
      }
   }
}

# useUpperCase: apply capitalization statistics to the text on STDIN.
# input:  every element of @ARGV names a statistics file with one entry
#         per line: a word or a word pair, followed by a count. The count
#         is dropped and the entry is stored under its lower case form;
#         if a key occurs more than once the last entry wins.
#         The text to correct is read from STDIN, one line at a time.
# output: the corrected text on STDOUT. Lines containing <...> are
#         printed unchanged; a <head> or <text> tag on such a line sets
#         the current context. In other lines tokens are respelled from
#         left to right, preferring the bigram with the next token, then
#         the bigram with the previous token, then the single word.
#         Outside <head> context, and unless the line contains "hke"
#         followed by a digit or lacks a blank followed by a lower case
#         letter, processing stops after the first token that contains a
#         word character. Tokens are joined with single blanks.
# dies:   when a statistics file cannot be opened, or when an entry
#         contains no letter a-z after lower casing.
sub useUpperCase {
   my %case;   # $case{"new york"} = "New York"

   foreach my $file (@ARGV) {
      # three-argument open: the name is used literally, so blanks or
      # characters like > and | in it cannot change how it is opened
      open(my $stats,"<",$file) or die "cannot open $file: $!\n";
      while (my $entry = <$stats>) {
         chomp($entry);
         # remove the count
         $entry =~ s/\s\d+$//;
         my $lower = lc($entry);
         if ($lower !~ /[a-z]/) {
            die "cannot happen: dictionary contains $entry\n";
         }
         $case{$lower} = $entry;
      }
      close($stats);
   }
   my $context = "";
   while (my $line = <STDIN>) {
      chomp($line);
      # skip html
      if ($line =~ /<.*>/) {
         if ($line =~ /<(head|text)>/) { $context = $line; }
         print "$line\n";
         next;
      }
      # leading quotes, etc. are kept: they are part of the output
      my @words = split(/\s+/,$line);
      LOOPL: for my $i (0..$#words) {
         my $lower = lc($words[$i]);
         # "" never is a dictionary key, so it stands for "no neighbor"
         my $bigramR = ($i == $#words) ? "" : "$lower ".lc($words[$i+1]);
         my $bigramL = ($i == 0) ? "" : lc($words[$i-1])." $lower";
         if (defined $case{$bigramR}) {
            my ($word1,$word2) = split(/\s+/,$case{$bigramR});
            $words[$i] = $word1;
         } elsif (defined $case{$bigramL}) {
            my ($word1,$word2) = split(/\s+/,$case{$bigramL});
            $words[$i] = $word2;
         } elsif (defined $case{$lower}) {
            $words[$i] = $case{$lower};
         }
         # not head line: skip rest
         if ($words[$i] =~ /\w/ and $context !~ /<head>/ and
             $line !~ /hke\d/ and
             $line =~ / [a-z]/) { last LOOPL; }
      }
      print join(" ",@words),"\n";
   }
}
 
