#! /usr/bin/perl
#
#
# Perform a Wilcoxon two sample Test on ARGV[0] vs ARGV[1]
# or on two columns in ARGV[0] ("\n" separated). Returns the level
# of significance.
#
# If ARGV[0] is a filename (or '-')
# the corresponding file is read (or STDIN).
#
# E.g., 
# > echo -e '1 2\n2 3\n3 4\n4 3\n'| WilcoxonTest.pl -
# => 0.9429*  (the * indicates that the real level of significance
# could not be calculated reliably).
#
# Uses 
#
# 
# Copyright (C) 1996, 2001  Rob van Son
# 
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
# 
#
# The absolute function
sub abs{
($_[0] >= 0) ? $_[0] : -$_[0];
}
# Calculate k out n
sub k_out_n {
  # k out n = n!/(k!*(n-k)! which is equal to the PROD(i=k; 1){(n-i+1)/i}
  local($k, $n, $kn);
  $k = $_[0];
  $n = $_[1];
  $kn = 1.0;
  for(; $k>0; --$k, --$n)
  { $kn *= $n/$k;};
  $kn; 
}
#
# This routine recursively counts the number of distributions of ranks over two
# samples for which the sum of the ranks in the smaller sample is smaller than or
# equal to a given upper bound W.
# $W = the bound, $Sum = the sum of ranks upto now, $m-1 = one less than the
# number of elements in the smaller sample that still have to be done, 
# $Start = the current position in the ranks list, *RankList = the array
# with all the ranks (this is NOT just the numbers from 1 - N because of ties).
# The list with ranks MUST be sorted in INCREASING order.
sub CountSmallerRanks{		# CountSmallerRanks($W, $Sum, $m-1, $Start, *RankList) -> Smaller
  local($W, $Sum, $m, $Start, *RankList) = @_;
  local($i, $Temp, $Smaller, $End, $mminus1) = (0, 0, 0, 0, 0);
  # There are no values smaller than W
  if($Sum > $W){ return 0;};
  $End = $#RankList;
  # Check all subsets of the remaining of RankList
  if($m > 0)
  { 
    $mminus1 = $m-1;
    for($i = $Start; $i <= $End-$m; ++$i)
    { 
      $Temp = $Sum + $RankList[$i];
      if($Temp > $W){ return $Smaller;};	# No smaller values expected anymore
      $Smaller += &CountSmallerRanks($W, $Temp, $mminus1, $i+1, *RankList);
    };
  }
  else
  { 
    # If even adding the highest rank doesn't reach $W, 
    # return the remaining number of items
    if($Sum + $End + 1 <= $W){ return $End - $Start + 1;};
    for($i = $Start; $i <= $End; ++$i)
    { 
      $Temp = $Sum + $RankList[$i];
      if($Temp <= $W)
      { ++$Smaller;}
      else	# No smaller values expected anymore
      { return $Smaller;};
    };
  };
  $Smaller;
}
#
#
# Get input
#
# Get STDIN if needed
if($ARGV[0] eq '-')
{
	$ARGV[0] = join("\n", <STDIN>);
}
elsif(-e $ARGV[0])
{
	open(INPUT, "<$ARGV[0]") || die "<$ARGV[0]: $!";
	$ARGV[0] = join("\n", <INPUT>);
};

# The second argument contains alphanumeric characters (i.e., exists)
if($ARGV[1] =~ /[\w]+/)
{
  @A = sort {$a <=> $b} split(' ', $ARGV[0]);
  @B = sort {$a <=> $b} split(' ', $ARGV[1]);
}
else	# Use a double column from the first entry. Note that the FIRST column
{ 
  @AB = split('\n', $ARGV[0]);	# must be the longer one OR null entries must be
  foreach (@AB)			# entered with a '-' character
  { @temp = split(' ');
    if($temp[0] =~ /\w/){ push(@Aunsorted, $temp[0]);};
    if($temp[1] =~ /\w/){ push(@Bunsorted, $temp[1]);};
  };
  @A = sort {$a <=> $b} @Aunsorted;
  @B = sort {$a <=> $b} @Bunsorted;
};
#
@TotalList = sort {$a <=> $b} (@A, @B);
#
# Initialize some values
$nA = $#A+1;
$nB = $#B+1;
$N = $nA+$nB;
$MaxSum = $N*($N+1)/2;
$H0 = $MaxSum/2.0;
#
# Replace values by ranks
$previous = undef;
$start = 0;
for($i=0; $i <= $#TotalList; ++$i)
{ if($TotalList[$i] == $previous)
  { $mean_rank = ($start+$i+2)/2.0;
    for($j=$start; $j<=$i; ++$j)
    { $Total_rank[$j] = $mean_rank;
    };
  }
  else
  { $Total_rank[$i] = $i+1;
    $previous = $TotalList[$i];
    $start = $i;
  };
};
#
# Determine the shortest list
@shortest = ($#A < $#B) ? @A : @B;
$nShortest = $#shortest+1;
# Summ the ranks in the shortest list
$W = 0;
foreach $Value (@shortest)
{
  $i = 0;
  until($i > $#TotalList || $Value == $TotalList[$i]){ ++$i;};
  $W += $Total_rank[$i];
};
#
# Use the smallest value of $W
$Nz = ($W > $H0) ? $N - $nShortest : $nShortest;
$W = ($W > $H0) ? $MaxSum - $W : $W;
#
# Determine the two-tailed level of significance
$p = 0;
# 
# First calculate the Normal approximation. This can be used to
# check whether a significant result is plausable for larger N.
$Permutations = &k_out_n($nA, $N);
if($Permutations >= 25000 || $#shortest+1 >= 10)	
{
   $Continuity = ($W >= $H0) ? -0.5 : 0.5;
   $Z = ($W+$Continuity-$Nz*($N+1.0)/2.0)/sqrt($nA*$nB*($N+1)/12.0);
   $Z = &abs($Z);
   $p = NormalZ($Z);
};
#
# The exact level of significance, for large N, first check whether a
# significant result is plausable, i.e., the Normal Approximation gives
# a $p < 0.25.
if($#shortest+1 < 10 && $p < 0.25 && $Permutations < 60000)	# Exact
{
   # Remember that $W must be SMALLER than $MaxSum/2=$H0
   $Less = &CountSmallerRanks($W, 0, $#shortest, 0, *Total_rank);
   # If $Less < $Permutations/2, we have obviously calculated the 
   # wrong way. We should have calculated UPWARD (higher than W)
   # We can't do that, but we can calculate $Less for $W-1 and
   # subtract it from $Permutations
   if(2*$Less > $Permutations)
   {
       $Less = CountSmallerRanks($W - 1, 0, $#shortest, 0, \@Total_rank);
       $Less = $Permutations - $Less;
   };
   
   $SumFrequencies = $Permutations;
   $p = 2.0*$Less/$SumFrequencies;
};

#
printf("%5.4g", $p);
if($#shortest+1 < 10 && ($p >= 0.25 || $Permutations >= 60000))
{ print "*";
};

#
# Calculate two-tailed significance level associated with Z(x) with x = ARGV[0]
#
# i.e., P(|Z|>=x|Normal Distribution), error(x) < 7.5*10(-8)
#
# Probabilities are calculated according to: 
# Abramowitz and Stegun,
# Handbook of mathematical functions
# Ninth printing, Dover publications, Inc., 1970                  
# p932, 26.2.17
# 
#
sub NormalZ   # ($Z) -> $p
{
	$x = shift;
	#
	# P(x) = 1 - Z(x)(b1*t+b2*t**2+b3*t**3+b4*t**4+b5*t**5)
	# Z(x) = exp(-$x*$x/2.0)/(sqrt(2*3.14159265358979323846))
	# t = 1/(1+p*x)
	#
	# Parameters
	@b = (0.319381530, -0.356563782, 1.781477937, -1.821255978, 1.330274429);
	$p = 0.2316419;
	$t = 1/(1+$p*$x);
	# Initialize variables
	$fact = $t;
	$Sum = 0;
	# Sum polynomial
	for($i=0; $i <= $#b; ++$i)
	{ 
  		$Sum += $b[$i]*$fact;
  		$fact *= $t;
	};
	# Calculate probability
	$p = 2*$Sum*exp(-$x*$x/2.0)/(sqrt(2*3.14159265358979323846));
	#
	return $p;
};

