#!/usr/bin/perl -w
# logfile.annotate.select: select and count tweets annotated twice
# usage: bin/select < logfile
# 20120205 erikt(at)xs4all.nl

use strict;
use warnings;

# The two annotators whose labels are compared.
my $ANNOTATOR1 = "\@erik";
my $ANNOTATOR2 = "\@Johan";

# Expected line format: annotator class user tweet
# (whitespace-separated; everything after the class is treated as the tweet,
# with runs of whitespace collapsed to single spaces).
my %class_of    = ();   # $class_of{$annotator}{$tweet}    = first label seen
my %conflicting = ();   # $conflicting{$annotator}{$tweet} = 1 if the same
                        # annotator gave this tweet two different labels
while (my $line = <STDIN>) {
   chomp($line);
   # split(' ') skips leading whitespace, so an indented line does not
   # produce an empty first field that would shift annotator and class.
   my ($annotator, $class, @rest) = split(' ', $line);
   # Skip blank or truncated lines: without tweet text there is nothing
   # to compare, and an empty tweet would match other truncated lines.
   next unless @rest;
   my $tweet = join(" ", @rest);

   my $previous = $class_of{$annotator}{$tweet};
   if (not defined $previous) {
      $class_of{$annotator}{$tweet} = $class;
   }
   elsif ($previous ne $class) {
      # Same annotator, same tweet, different label: remember the conflict
      # separately so it can never be mistaken for a real class label.
      $conflicting{$annotator}{$tweet} = 1;
   }
}

# Per-annotator views; fall back to empty hashes when an annotator never
# appears in the input.
my $labels1   = $class_of{$ANNOTATOR1}    || {};
my $labels2   = $class_of{$ANNOTATOR2}    || {};
my $conflict1 = $conflicting{$ANNOTATOR1} || {};
my $conflict2 = $conflicting{$ANNOTATOR2} || {};

my $found = 0;   # tweets labelled by both annotators
my $equal = 0;   # of those, tweets on which both agree
# Sorted so the output order is reproducible between runs.
foreach my $tweet (sort keys %$labels1) {
   next unless defined $labels2->{$tweet};
   $found++;
   # A tweet labelled inconsistently by either annotator counts as found
   # but never as agreed.
   next if $conflict1->{$tweet} or $conflict2->{$tweet};
   if ($labels1->{$tweet} eq $labels2->{$tweet}) {
      $equal++;
      print $tweet . " " . $labels1->{$tweet} . "\n";
   }
}

print STDERR "found = $found\nequal = $equal\n";

exit(0);
