## Code to accompany lecture on the details of the sampling distribution
## for rank based tests (Wilcox and Squared Ranks)
##
## Nonparametric Statistics, Virginia Tech
## Robert B. Gramacy
library(FLSSS)
## dranks:
##
## mass (density) of ranks, following the Mann-Whitney test description
## in the Conover book. This can be very slow if N = nx + ny is much
## bigger than 20
dranks <- function(k, nx, ny)
{
  ## total number of observations in the combined sample
  N <- nx + ny
  ## count the size-nx subsets of the ranks 1:N whose sum is exactly k
  count <- length(FLSSS(nx, 1:N, k, 0, sizeNeeded="all"))
  ## each of the choose(N, nx) rank assignments is equally likely
  count / choose(N, nx)
}
## pranks:
##
## cumulative distribution function for ranks, following the Mann-Whitney
## description in the Conover book. This can be very slow if N = nx + ny
## is much bigger than 20
pranks <- function(k, nx, ny)
{
  ## the CDF at k = 0 is zero by convention
  if (k == 0) return(0)
  ## accumulate the probability mass at each value 1, ..., k
  masses <- vapply(seq_len(k), function(i) dranks(i, nx, ny), numeric(1))
  sum(masses)
}
## mann.whitney.test:
##
## This is a re-coding of the wilcox.test function in R, following the
## description of the (equivalent) Mann-Whitney test description in the
## Conover book, using the pranks function above for the distribution
## calculations. It is only appropriate if nx = length(x) and ny = length(y)
## are small and there are not many tied ranks. Otherwise, the Gaussian
## approximation given in class is best. Note that this function is slower
## than wilcox.test because the density/distribution evaluation (dwilcox/pwilcox)
## therein is much faster than our FLSSS implementation in dranks/pranks above
mann.whitney.test <- function(x, y, alternative=c("two-tailed", "less", "greater"))
{
  ## check alternative argument
  alternative <- match.arg(alternative)
  ## extract lengths
  nx <- length(x)
  ny <- length(y)
  N <- nx + ny
  ## joint ranks of the combined sample; rx holds the ranks of the x sample
  r <- rank(c(x, y))
  rx <- r[seq_along(x)]
  ## check for ties and possibly warn; the exact null distribution in
  ## pranks assumes distinct (integer) ranks
  if(length(r) - length(unique(r)) > 0.1*N)
    warning("more than 10% are ties, suggest using normal approximation")
  ## test statistic: sum of the x-sample ranks
  t <- sum(rx)
  ## p-value under each alternative: P(T <= t) = pranks(t, ...) and
  ## P(T >= t) = 1 - pranks(t-1, ...)
  if(alternative == "less") phi <- pranks(t, nx, ny)
  else if(alternative == "greater") phi <- 1 - pranks(t-1, nx, ny)
  else { ## two-sided: double the smaller tail, capped at 1 (as wilcox.test does,
         ## since 2*min(...) can otherwise exceed 1 near the center)
    phi <- min(1, 2*min(pranks(t, nx, ny), 1 - pranks(t-1, nx, ny)))
  }
  ## return statistic, p-value, and which alternative was tested
  return(list(stat=t, p.value=phi, alternative=alternative))
}
## dsqranks:
##
## mass (density) of squared ranks, following the squared ranks test description
## in the Conover book. This can be very slow if N = nx + ny is much
## bigger than 20
dsqranks <- function(k, nx, ny)
{
  ## squared ranks 1^2, ..., N^2 of the combined sample
  N <- nx + ny
  r2 <- (1:N)^2
  ## count the size-nx subsets of the squared ranks summing exactly to k
  numer <- length(FLSSS(nx, r2, k, 0, sizeNeeded="all"))
  ## every one of the choose(N, nx) equally likely subsets has exactly one
  ## sum, so the counts over all possible sums total choose(N, nx); no need
  ## to enumerate every candidate sum with FLSSS (which was O(sum(r2)) calls)
  denom <- choose(N, nx)
  return(numer/denom)
}
## psqranks:
##
## cumulative distribution function for squared ranks, following the squared
## ranks test description in the Conover book. This can be very slow if
## N = nx + ny is much bigger than 20. Since the denominator involves summing
## dsqranks calculations, this function does not use dsqranks directly as a subroutine
psqranks <- function(k, nx, ny)
{
  ## the CDF at k = 0 is zero by convention
  if(k == 0) return(0)
  ## squared ranks 1^2, ..., N^2 of the combined sample
  N <- nx + ny
  r2 <- (1:N)^2
  ## common denominator for all density evaluations: each of the
  ## choose(N, nx) equally likely size-nx subsets has exactly one sum, so
  ## the counts total choose(N, nx) -- no need for a loop of FLSSS calls
  denom <- choose(N, nx)
  ## smallest achievable statistic: the sum of the nx smallest squared
  ## ranks; sums below it have zero count, so skip those FLSSS calls
  kmin <- sum(r2[seq_len(nx)])
  if(k < kmin) return(0)
  ## sum up the mass at every achievable value kmin, ..., k
  p <- 0
  for(i in kmin:k) p <- p + length(FLSSS(nx, r2, i, 0, sizeNeeded="all"))/denom
  ## done
  return(p)
}
## sqranks.test:
##
## This an implementation of the squared ranks test described in the Conover book,
## in the style of the Mann-Whitney test function above. It is only appropriate
## if nx = length(x) and ny = length(y) are small and there are not many ties (in the
## ranks of absolute deviations). Otherwise, the Gaussian approximation given in
## class is best.
sqranks.test <- function(x, y, alternative=c("two-tailed", "less", "greater"))
{
  ## check alternative argument
  alternative <- match.arg(alternative)
  ## calculate the absolute discrepancies from the (estimated) means
  nx <- length(x)
  ny <- length(y)
  N <- nx + ny
  xbar <- mean(x)
  ybar <- mean(y)
  u <- abs(x - xbar)
  v <- abs(y - ybar)
  ## joint ranks of the absolute deviations; ru/rv split them by sample
  r <- rank(c(u, v))
  ru <- r[seq_along(u)]
  rv <- r[-seq_along(u)]
  ## check for ties and possibly warn; the exact null distribution in
  ## psqranks assumes distinct (integer) ranks
  if(length(r) - length(unique(r)) > 0.1*N)
    warning("more than 10% are ties, suggest using normal approximation")
  ## test statistic: sum of the squared x-sample deviation ranks
  t <- sum(ru^2)
  ## p-value under each alternative: P(T <= t) = psqranks(t, ...) and
  ## P(T >= t) = 1 - psqranks(t-1, ...)
  if(alternative == "less") phi <- psqranks(t, nx, ny)
  else if(alternative == "greater") phi <- 1-psqranks(t-1, nx, ny)
  else { ## two-sided: double the smaller tail, capped at 1 since
         ## 2*min(...) can otherwise exceed 1 near the center
    phi <- min(1, 2*min(psqranks(t, nx, ny), 1-psqranks(t-1, nx, ny)))
  }
  ## return statistic, p-value, and which alternative was tested
  return(list(stat=t, p.value=phi, alternative=alternative))
}