#!/usr/bin/perl -w
# r: estimate correlation coefficient per row of numbers
# usage: r [-m number] [-s number] [-y year] [-d] < file
# note: compared and checked r values at
# * http://www.alcula.com/calculators/statistics/correlation-coefficient/
# * http://www.socscistatistics.com/tests/pearson/Default2.aspx
# option -d invokes debug mode
# number option sets maximum number size on window of months used
# data files: http://www.let.rug.nl/gosse/Ngrams/download.html
# 20160219 name(at)domain

use strict;
use Getopt::Std;

my $command = $0;
my %options = ();
getopts("dm:s:y:",\%options);

my $MAX; # for sliding window of MAX+1 time points; set by number option
if ($options{"m"} and $options{"m"} =~ /^\d+$/) { $MAX = $options{"m"}; }
my $smooth = 0; # for tests with data smoothing
if ($options{"s"} and $options{"s"} =~ /^\d+$/) { $smooth = $options{"s"}; $smooth = int($smooth/2); }
my $nbrOfValues; # number of columns in data file
my @corpusSize = (); # total number of tokens per column
my $unknown = "????";
my $firstYear = $unknown; # first year in which the word is observed
my $lastYear = $unknown; # last year in which the word is observed
my $startYear = 1837; # oldest publication year in collection
if ($options{"y"} and $options{"y"} =~ /^\d+$/) { $startYear = $options{"y"}; }
# the first line in the data file specifies these: CORPUSSIZE tab NUMBER
LOOP: while (<STDIN>) {
   my $line = $_;
   chomp($line);
   $firstYear = $unknown;
   $lastYear = 0;
   my $topYear = -1;
   my $topValue = -1;
   my ($word,@values) = split(/\t/,$line);
   if (not defined $nbrOfValues) { 
      $nbrOfValues = $#values; 
      if (not defined $MAX) { $MAX = $nbrOfValues; }
   }
   if ($#values != $nbrOfValues) {
      die "$command: unexpected number of values (1+$#values vs $nbrOfValues) in line: $line\n";
   }
   if (not defined $corpusSize[0]) {
      # first line in the data should specify the number of tokens per column
      # CORPUSSIZE tab NUMBER tab NUMBER tab ...
      if (not $word eq "CORPUSSIZE") {
         die "$command: expected CORPUSSIZE on first line: $line\n";
      }
      @corpusSize = @values;
      next LOOP;
   }
   my @relative = (); # relative frequencies
   my $min = $values[0]; # minimum value on this data line
   my $max = $values[0]; # maximum value on this data line
   # minimum and maximum values of corrlation coefficients (T) and deltas (R)
   # only with sliding windows will max be different from min
   my ($maxT,$minT,$maxR,$minR); 
   for (my $start=0;$start<=$#values-$MAX;$start++) {
      # convert to relative frequency
      $relative[$start] = $values[$start]/$corpusSize[$start]; 
      if ($firstYear eq $unknown and $values[$start] > 0) { $firstYear = $start; }
      # smoothing includes the s previous and next time points, computes average 
      if (defined $smooth) {
         my $j = 1;
         while ($j <= $smooth and $start-$j >= 0) {
            $relative[$start] += $values[$start-$j]/$corpusSize[$start-$j];
            $j++;
         }
         $j = 1;
         while ($j <= $smooth and $start+$j <= $#values) {
            $relative[$start] += $values[$start+$j]/$corpusSize[$start+$j];
            $j++;
         }
         $relative[$start] /= $j;
      }
      # compute average of data values on this line
      my $averageX = $relative[$start];
      for (my $i=$start+1;$i<=$start+$MAX;$i++) {
         if ($values[$i] < $min) { $min = $values[$i]; } 
         if ($values[$i] > $max) { $max = $values[$i]; }
         if ($firstYear eq $unknown and $values[$i] > 0) { $firstYear = $i; }
         if ($lastYear < $i and $values[$i] > 0) { $lastYear = $i; }
         # convert to relative frequency
         $relative[$i] = $values[$i]/$corpusSize[$i]; 
         if ($relative[$i] > $topValue) { 
            $topValue = $relative[$i];
            $topYear = $i;
         }
         if (defined $smooth) {
            my $j = 1;
            while ($j <= $smooth and $i-$j >= 0) {
               $relative[$i] += $values[$i-$j]/$corpusSize[$i-$j];
               $j++;
            }
            $j = 1;
            while ($j <= $smooth and $i+$j <= $#values) {
               $relative[$i] += $values[$i+$j]/$corpusSize[$i+$j];
               $j++;
            }
            $relative[$i] /= $j;
         }
         $averageX += $relative[$i];
      }
      $averageX /= ($MAX+1);
      # average of the time point numbers
      my $averageY = $start+$MAX/2;
      # compute sds for data values and time point numbers
      my $sdX = 0;
      my $sdY = 0;
      for (my $i=$start;$i<=$start+$MAX;$i++) {
         $sdX += ($relative[$i]-$averageX)*($relative[$i]-$averageX);
         $sdY += ($i-$averageY)*($i-$averageY);
      }
      if (defined $options{"d"}) { printf "x avg: %6.3f\ny xvg: %6.3f\nx sd2: %6.3f\ny sd2: %6.3f\n",$averageX,$averageY,$sdX,$sdY; }
      $sdX = sqrt($sdX/($MAX+1));
      $sdY = sqrt($sdY/($MAX+1));
      # compute correlation coefficient ($total) and delta score ($rising)
      my $total = 0;
      my $rising = 0;
      if ($max != $min) {
         for (my $i=$start;$i<=$start+$MAX;$i++) {
            if (defined $options{"d"}) { printf "x$i-xavg: %6.3f\ny$i-yavg: %6.3f\nproduct: %6.3f\n",($relative[$i]-$averageX),($i-$averageY),($relative[$i]-$averageX)*($i-$averageY); }
            $total += ($relative[$i]-$averageX)*($i-$averageY);
            if ($i > $start and $relative[$i] > $relative[$i-1]) { $rising++; }
            if ($i > $start and $relative[$i] < $relative[$i-1]) { $rising--; }
         }
      }
      if (defined $options{"d"}) { printf "total: %6.3f\n",$total; }
      if ($sdX != 0 and $sdY != 0) { $total /= $sdX*$sdY*($MAX+1); }
      if (not defined $maxT or $total > $maxT) { $maxT = $total; }
      if (not defined $minT or $total < $minT) { $minT = $total; }
      if (not defined $maxR or $rising > $maxR) { $maxR = $rising; }
      if (not defined $minR or $rising < $minR) { $minR = $rising; }
   }
   # overrule values of $firstYear and $lastYear: base them on highest peak
   # problem: selectinf FY1837=0 and LY2009=0
   # $firstYear = $topYear;
   # $lastYear = $topYear;
   # while ($firstYear-1 >= 0 and $values[$firstYear-1] > 0) { $firstYear--; }
   # while ($lastYear+1 <= $#values and $values[$lastYear+1] > 0) { $lastYear++; }
   # show results for this data line:
   # correlationcoefficient deltascore [] fraction min max word
   # where fraction = max/min, and
   # [] are the minimum values for correlationcoefficient deltascore
   # [] is only non-empty when a sliding window is used ($MAX)
   if ($firstYear =~ /^\d+$/) { 
      $firstYear += $startYear; 
      if ($firstYear >= 1945) { $firstYear++; }
      if ($firstYear >= 2003) { $firstYear++; $firstYear++ }
      if ($firstYear >= 2008) { $firstYear++; }
   }
   if ($lastYear =~ /^\d+$/) { 
      $lastYear += $startYear; 
       if ($lastYear >= 1945) { $lastYear++; }
       if ($lastYear >= 2003) { $lastYear++; $lastYear++ }
       if ($lastYear >= 2008) { $lastYear++; }
   }
   my $lastMinusFirst = log(($relative[$#values]+(0.5/$corpusSize[$#values]))/
                            ($relative[0]+(0.5/$corpusSize[0])))/log(2);
   printf "%8.5f\t%d\t%8.5f\tFY=%s\tLY=%s",$lastMinusFirst,$maxR,$maxT,$firstYear,$lastYear;
   if ($MAX < $nbrOfValues) { printf "\t%d\t%8.5f",$minR,$minT; }
   my $fraction = $min > 0 ? $max/$min : $max/0.5;
   printf "\t%8.5f\t%d\t%d\t%s\n",$fraction,$min,$max,$word;
}
exit(0);
