#! /usr/bin/perl # # # Perform a Wilcoxon two sample Test on ARGV[0] vs ARGV[1] # or on two columns in ARGV[0] ("\n" separated). Returns the level # of significance. # # If ARGV[0] is a filename (or '-') # the corresponding file is read (or STDIN). # # E.g., # > echo -e '1 2\n2 3\n3 4\n4 3\n'| WilcoxonTest.pl - # => 0.9429* (the * indicates that the real level of significance # could not be calculated reliably). # # Uses # # # Copyright (C) 1996, 2001 Rob van Son # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # # # The absolute function sub abs{ ($_[0] >= 0) ? $_[0] : -$_[0]; } # Calculate k out n sub k_out_n { # k out n = n!/(k!*(n-k)! which is equal to the PROD(i=k; 1){(n-i+1)/i} local($k, $n, $kn); $k = $_[0]; $n = $_[1]; $kn = 1.0; for(; $k>0; --$k, --$n) { $kn *= $n/$k;}; $kn; } # # This routine recursively counts the number of distributions of ranks over two # samples for which the sum of the ranks in the smaller sample is smaller than or # equal to a given upper bound W. # $W = the bound, $Sum = the sum of ranks upto now, $m-1 = one less than the # number of elements in the smaller sample that still have to be done, # $Start = the current position in the ranks list, *RankList = the array # with all the ranks (this is NOT just the numbers from 1 - N because of ties). # The list with ranks MUST be sorted in INCREASING order. sub CountSmallerRanks{ # CountSmallerRanks($W, $Sum, $m-1, $Start, *RankList) -> Smaller local($W, $Sum, $m, $Start, *RankList) = @_; local($i, $Temp, $Smaller, $End, $mminus1) = (0, 0, 0, 0, 0); # There are no values smaller than W if($Sum > $W){ return 0;}; $End = $#RankList; # Check all subsets of the remaining of RankList if($m > 0) { $mminus1 = $m-1; for($i = $Start; $i <= $End-$m; ++$i) { $Temp = $Sum + $RankList[$i]; if($Temp > $W){ return $Smaller;}; # No smaller values expected anymore $Smaller += &CountSmallerRanks($W, $Temp, $mminus1, $i+1, *RankList); }; } else { # If even adding the highest rank doesn't reach $W, # return the remaining number of items if($Sum + $End + 1 <= $W){ return $End - $Start + 1;}; for($i = $Start; $i <= $End; ++$i) { $Temp = $Sum + $RankList[$i]; if($Temp <= $W) { ++$Smaller;} else # No smaller values expected anymore { return $Smaller;}; }; }; $Smaller; } # # # Get input # # Get STDIN if needed if($ARGV[0] eq '-') { $ARGV[0] = join("\n", ); } elsif(-e $ARGV[0]) { open(INPUT, "<$ARGV[0]") || die "<$ARGV[0]: $!"; $ARGV[0] = join("\n", ); }; # The second argument contains alphanumeric characters (i.e., exists) if($ARGV[1] =~ /[\w]+/) { @A = sort {$a <=> $b} split(' ', $ARGV[0]); @B = sort {$a <=> $b} split(' ', $ARGV[1]); } else # Use a double column from the first entry. Note that the FIRST column { @AB = split('\n', $ARGV[0]); # must be the longer one OR null entries must be foreach (@AB) # entered with a '-' character { @temp = split(' '); if($temp[0] =~ /\w/){ push(@Aunsorted, $temp[0]);}; if($temp[1] =~ /\w/){ push(@Bunsorted, $temp[1]);}; }; @A = sort {$a <=> $b} @Aunsorted; @B = sort {$a <=> $b} @Bunsorted; }; # @TotalList = sort {$a <=> $b} (@A, @B); # # Initialize some values $nA = $#A+1; $nB = $#B+1; $N = $nA+$nB; $MaxSum = $N*($N+1)/2; $H0 = $MaxSum/2.0; # # Replace values by ranks $previous = undef; $start = 0; for($i=0; $i <= $#TotalList; ++$i) { if($TotalList[$i] == $previous) { $mean_rank = ($start+$i+2)/2.0; for($j=$start; $j<=$i; ++$j) { $Total_rank[$j] = $mean_rank; }; } else { $Total_rank[$i] = $i+1; $previous = $TotalList[$i]; $start = $i; }; }; # # Determine the shortest list @shortest = ($#A < $#B) ? @A : @B; $nShortest = $#shortest+1; # Summ the ranks in the shortest list $W = 0; foreach $Value (@shortest) { $i = 0; until($i > $#TotalList || $Value == $TotalList[$i]){ ++$i;}; $W += $Total_rank[$i]; }; # # Use the smallest value of $W $Nz = ($W > $H0) ? $N - $nShortest : $nShortest; $W = ($W > $H0) ? $MaxSum - $W : $W; # # Determine the two-tailed level of significance $p = 0; # # First calculate the Normal approximation. This can be used to # check whether a significant result is plausable for larger N. $Permutations = &k_out_n($nA, $N); if($Permutations >= 25000 || $#shortest+1 >= 10) { $Continuity = ($W >= $H0) ? -0.5 : 0.5; $Z = ($W+$Continuity-$Nz*($N+1.0)/2.0)/sqrt($nA*$nB*($N+1)/12.0); $Z = &abs($Z); $p = NormalZ($Z); }; # # The exact level of significance, for large N, first check whether a # significant result is plausable, i.e., the Normal Approximation gives # a $p < 0.25. if($#shortest+1 < 10 && $p < 0.25 && $Permutations < 60000) # Exact { # Remember that $W must be SMALLER than $MaxSum/2=$H0 $Less = &CountSmallerRanks($W, 0, $#shortest, 0, *Total_rank); # If $Less < $Permutations/2, we have obviously calculated the # wrong way. We should have calculated UPWARD (higher than W) # We can't do that, but we can calculate $Less for $W-1 and # subtract it from $Permutations if(2*$Less > $Permutations) { $Less = CountSmallerRanks($W - 1, 0, $#shortest, 0, \@Total_rank); $Less = $Permutations - $Less; }; $SumFrequencies = $Permutations; $p = 2.0*$Less/$SumFrequencies; }; # printf("%5.4g", $p); if($#shortest+1 < 10 && ($p >= 0.25 || $Permutations >= 60000)) { print "*"; }; # # Calculate two-tailed significance level associated with Z(x) with x = ARGV[0] # # i.e., P(|Z|>=x|Normal Distribution), error(x) < 7.5*10(-8) # # Probabilities are calculated according to: # Abramowitz and Stegun, # Handbook of mathematical functions # Ninth printing, Dover publications, Inc., 1970 # p932, 26.2.17 # # sub NormalZ # ($Z) -> $p { $x = shift; # # P(x) = 1 - Z(x)(b1*t+b2*t**2+b3*t**3+b4*t**4+b5*t**5) # Z(x) = exp(-$x*$x/2.0)/(sqrt(2*3.14159265358979323846)) # t = 1/(1+p*x) # # Parameters @b = (0.319381530, -0.356563782, 1.781477937, -1.821255978, 1.330274429); $p = 0.2316419; $t = 1/(1+$p*$x); # Initialize variables $fact = $t; $Sum = 0; # Sum polynomial for($i=0; $i <= $#b; ++$i) { $Sum += $b[$i]*$fact; $fact *= $t; }; # Calculate probability $p = 2*$Sum*exp(-$x*$x/2.0)/(sqrt(2*3.14159265358979323846)); # return $p; };