#!/usr/bin/perl -w
# anonymize: remove user names from twitter data
# usage: anonymize files
# 20120308 erikt(at)xs4all.nl

use strict;
use warnings;

# %ids and %reverse are package globals (declared with "our") because the
# getId subroutine reads and updates them by name.
our (%ids, %reverse);

my $command = $0;
my @files = @ARGV;

# do not replace party names: they are required for the counts
# %ids contains the id for each (lower-cased) Twitter user name;
# party names are mapped to themselves so that they are never replaced
%ids = qw(pvv pvv vvd vvd cda cda pvda pvda sp sp gl gl d66 d66 cu cu pvdd pvdd sgp sgp 50plus 50plus christenunie christenunie groenlinks groenlinks 50+ 50+);
# %reverse records which ids are already in use: getId adds one entry per
# generated id, keyed by the numeric part of the id, with the user name
# as value; it starts out as a copy of the party name table
%reverse = %ids;
# process all data files
foreach my $file (@files) {
   # three-argument open: the file name is never interpreted as a mode
   # or as a pipe command, whatever characters it contains
   open(my $inFh,"<",$file) or die "$command: cannot read $file: $!\n";
   # store anonymized tweets in file $file.out
   my $outFile = "$file.out";
   open(my $outFh,">",$outFile) or die "$command: cannot write to $outFile: $!\n";
   while (my $line = <$inFh>) {
      chomp($line);
      # the capturing group keeps the delimiters as separate tokens,
      # so that join("",...) below restores the line exactly
      my @words = split(/([\s\@()?.!,:;'"])/,$line);
      for my $i (0 .. $#words) {
         # split produces an empty token when a line starts with a
         # delimiter or when "@" is directly followed by another
         # delimiter: an empty token is not a user name, leave it alone
         next if ($words[$i] eq "");
         # a user name is the first token of a line or follows "@"
         if ($i == 0 or $words[$i-1] eq "@") {
            $words[$i] = getId($words[$i]);
         }
      }
      print {$outFh} join("",@words),"\n"
         or die "$command: cannot write to $outFile: $!\n";
   }
   # buffered write errors only show up when the file is closed
   close($outFh) or die "$command: cannot write to $outFile: $!\n";
   close($inFh);
}

exit(0);

# getId: return the anonymous id for a Twitter user name
# argument: a user name; it is lower-cased before the lookup, so names
#           that differ only in case share one id
# returns:  the value stored in %ids for the name; if there is none yet,
#           a new id "USER" followed by a random 7-digit number is
#           created, stored in %ids and returned
# globals:  %ids (name -> id) and %reverse (number of the id -> name)
sub getId {
   my ($name) = @_;
   my $key = lc($name);

   # known name: reuse the id that was handed out earlier
   return $ids{$key} if (defined $ids{$key});

   # draw random numbers from 1000000..9999999 until an unused one is found
   my $number;
   do {
      $number = int(1000000+rand(9000000));
   } while (defined $reverse{$number});

   # remember the number as taken and store the new id for the name
   $reverse{$number} = $key;
   $ids{$key} = "USER$number";
   return $ids{$key};
}
