#!/usr/bin/perl -w
# smurfs: tranlate tokenized English text to Smurf langauge
# usage: smurfs dictionary < text
# 20061102 erikt@xs4all.nl

$command = $0;

$smurf = "smurf";  # the smurf word; replace it to get different texts
$factor = 12;      # target maximum number of words without smurf
$noSmurfTail = 8;  # add extra smurf word if sentence ends with this many words
$dictSize = 10000; # keep this many words in the dictionary
$compSize = 50000; # keep this many words in the compound dictionary

# read dictionary
$dict = shift(@ARGV);
if (not defined $dict or not -f $dict) {
   die "$command: cannot find dictionary\n"; 
}
# get alternative word
if (defined $ARGV[0] and $ARGV[0] ne "") { $smurf = shift(@ARGV); }

$i = 0;
%dict = ();
$dictValue = 1; # 1: valid word; 0: use only for compounds
open(INFILE,$dict) or die "$command: cannot open $dict\n";
while (<INFILE>) {
   $line = $_;
   chomp($line);
   $line =~ s/^\s*//;
   ($_,$word) = split(/\s+/,$line);
   $dict{$word} = $dictValue;
   $i++;
   if ($i >= $dictSize) { 
      $dictValue = 0;
      if ($i >= $compSize) { last; }
   }
}
close(INFILE);

# read and process text
$lastWord = "";
while (<STDIN>) {
   $line = $_;
   chomp($line);
   @words = split(/\s+/,$line);
   $newline = "";
   $smurfWords = 0;
   $noSmurf = 0;
   for ($i=0;$i<=$#words;$i++) {
      # replacement rule:
      # word must contain an alphabetic character and the 
      # case-insensitive version of the word must not be 
      # present in the dictionary
      # addition: replace all words preceded by infinitive marker (te)
      if ($words[$i] =~ /[a-zA-Z]/ and
          (((not defined $dict{$words[$i]} or 
             $dict{$words[$i]} <= 0) and
            (not defined $dict{lc($words[$i])} or
             $dict{lc($words[$i])} <= 0)) or 
           ($i > 0 and $words[$i-1] =~ /^te$/i))) {
         $nextWord = &smurf($words[$i]);
         # forbidden: two successive unchanged smurf words
         if ($lastWord !~ /^$smurf$/i or $nextWord !~ /^$smurf$/i) {
            # forbidden: two successive capitalized smurf words
            if ($lastWord =~ /^[A-Z]/ and $lastWord =~ /$smurf/i and
                $nextWord =~ /^[A-Z]/ and $nextWord =~ /$smurf/i) {
               $newline =~ s/\S+\s$//;
               $newline .= "$words[$i-1] ";
               $smurfWords--;
               $nextWord = &caps($smurf);
            } 
            $newline .= "$nextWord ";
            $smurfWords++;
            $noSmurf = 0;
         }
         $lastWord = $nextWord;
      } else { 
         $newline .= "$words[$i] ";
         $lastWord = $words[$i];
         $noSmurf++;
      }
   }
   # add smurf word to the end of the line if necessary
   if ($newline !~ /$smurf/i or 
       $smurfWords < $#words/$factor or
       $noSmurf >= $noSmurfTail) {
      @words = split(/\s+/,$newline);
      $i = $#words;
      # never replace non-words or prepositions
      while ($i > 0 and 
             ($words[$i] !~ /[a-z]/i or
              $words[$i] =~ /^(aan|af|door|hij|in|is|mee|niet|op|over|terug|toe|uit|voor|weg|ze|zij")$/)) { $i--; }
      if ($i >= 0 and $words[$i] !~ /$smurf/i) { 
         $words[$i] = &smurf($words[$i]); 
         $smurfWords++;
      }
      $newline = join(" ",@words);
   }
   # add smurf word to the beginning of the line if necessary
   if ($smurfWords < $#words/$factor) {
      @words = split(/\s+/,$newline);
      $i = 0;
      # look for noun
      while ($i < $#words-1 and 
             ($words[$i] !~ /^(de|een|het)$/ or $words[$i+1] =~ /$smurf/i)) {
         $i++; 
      }
      if ($words[$i] =~ /^(de|een|het)$/ and $words[$i+1] !~ /$smurf/i) {
         $words[$i+1] = &smurf($words[$i+1]); 
         $smurfWords++;
      }
      $newline = join(" ",@words);  
   }
   $newline =~ s/ (de|een|het) ${smurf}t / $1 $smurf /g;
   $newline =~ s/het ($smurf) /de $1 /ig;
   print "$newline\n";
}

exit(0);

# convert word to variant of smurf
sub smurf {
   my ($word) = shift(@_);
   my ($nextWord) = "$smurf";
   my ($suffix,$prefix,$caps,$c);
   my (@suffix,@prefix);

   # 1: look for common prefixes
   if ($word !~ /[A-Z].*[a-z]/ and
       $word =~ /^(aange|aan|achterge|achter|afge|af|be|bijge|bij|binnenge|binnen|bovenge|boven|buitenge|buiten|contrage|contra|doorge|door|ge|heenge|heen|inter|inte|inge|in|langsge|langs|meege|mee|nage|na|ont|onge|onver|onderge|onder|onge|on|openge|open|opge|op|overge|over|rondge|rond|tegenge|tegen|terechtge|terecht|terugge|terug|toege|toe|tussenge|tussen|uitge|uit|vastge|vast|verge|ver|voorge|voor|wegge|weg)..../i) { 
      $nextWord = "$1$nextWord"; 
   # 2: check for compounds
   } elsif (length($word) >= 8) {
      @suffix = split(//,$word);
      @prefix = ();
      $suffix = join("",@suffix);
      while ($suffix ne "" and
             (not defined $dict{$suffix} or $#prefix <= 1)) {
         $c = shift(@suffix);
         push(@prefix,$c);
         $suffix = join("",@suffix);
      }
      if (defined $dict{$suffix} and length($suffix) >= 4) {
         $prefix = join("",@prefix);
         $nextWord = $prefix =~ /[A-Z]/ ? &caps($smurf) : "$smurf";
         $nextWord =~ s/(t)$//;
         $nextWord .= $suffix;
         return($nextWord);
      }
   }
   # 3: check for common suffixes
   if ($word =~ /....isch$/i) { $nextWord .= "ig"; } 
   elsif ($word =~ /....[td]e$/i) { $nextWord .= "te"; } 
   elsif ($word =~ /....ers$/i) { $nextWord .= "en"; } 
   elsif ($word =~ /ge.*...[td]en$/i) { $nextWord .= "t"; } 
   elsif ($word =~ /....[td]en$/i) { $nextWord .= "ten"; } 
   elsif ($word =~ /ge.*...end$/i) { $nextWord .= "t"; } 
   elsif ($word =~ /....(baar|end|en|heid|ige|ig|ing|jes|ije|je|lijk|schap|stad|burg|dorp|land|staat|rijk|weg|straat|dreef|meer|zee|i|s|t)$/i) { 
      $nextWord .= $1; 
   } elsif ($word =~ /....ie$/i) { $nextWord .= "ie"; } 
   elsif ($word =~ /....e$/i) { $nextWord .= "se"; } 
   elsif ($word =~ /(be|ge|ver).....*d$/i) { $nextWord .= "t"; } 
   elsif ($word =~ /..en$/i or 
          $word =~ /^(bestaan|gaan|heengaan|slaan|staan|verslaan|weerstaan|zijn)$/i) { $nextWord .= "en"; }
   elsif ($word =~ /^(riep|vond|zei)$/i) { $nextWord .= "te"; }
   # 4: capitalize words
   if ($word =~ /^[A-Z]+$/) { $nextWord =~ tr/a-z/A-Z/; }
   elsif ($word =~ /[A-Z]/) { $nextWord = &caps($nextWord); }
   return($nextWord);
}

# convert initial character to capital
sub caps {
   my ($word) = shift(@_);
   my ($caps);

   $word =~ s/^(.)//;
   $caps = $1;
   $caps =~ tr/a-z/A-Z/;
   $word = "$caps$word";
   return($word);
}
