## Code to accompany lecture on the details of the sampling distribution
## for rank based tests (Wilcoxon and Squared Ranks)
##
## Nonparametric Statistics, Virginia Tech
## Robert B. Gramacy

library(FLSSS)

## dranks:
##
## mass (density) of ranks, following the Mann-Whitney test description
## in the Conover book.  This can be very slow if N = nx + ny is much
## bigger than 20

dranks <- function(k, nx, ny)
{
  N <- nx + ny

  ## numerator: the number of size-nx subsets of the ranks 1:N whose
  ## elements sum exactly to k (margin of error 0)
  ## NOTE(review): current FLSSS versions document this argument as
  ## "solutionNeed"; confirm "sizeNeeded" against the installed version
  numer <- length(FLSSS(nx, 1:N, k, 0, sizeNeeded="all"))

  ## denominator: total number of ways to choose nx of the N ranks
  denom <- choose(N, nx)

  return(numer/denom)
}

## pranks:
##
## cumulative distribution function for ranks, following the Mann-Whitney
## description in the Conover book.  This can be very slow if N = nx + ny
## is much bigger than 20

pranks <- function(k, nx, ny)
{
  ## nothing to do?
  if(k == 0) return(0)

  ## sum up all density evaluations from 1:k
  p <- 0
  for(i in 1:k) p <- p + dranks(i, nx, ny)

  ## done
  return(p)
}

## mann.whitney.test:
##
## This is a re-coding of the wilcox.test function in R, following the
## description of the (equivalent) Mann-Whitney test description in the
## Conover book, using the pranks function above for the distribution
## calculations.  It is only appropriate if nx = length(x) and ny = length(y)
## are small and there are not many tied ranks.  Otherwise, the Gaussian
## approximation given in class is best.
## Note that this function is slower than wilcox.test because the
## density/distribution evaluation (dwilcox/pwilcox) therein is much
## faster than our FLSSS implementation in dranks/pranks above

mann.whitney.test <- function(x, y, alternative=c("two-tailed", "less", "greater"))
{
  ## check alternative argument
  alternative <- match.arg(alternative)

  ## extract lengths
  nx <- length(x)
  ny <- length(y)
  N <- nx + ny

  ## get joint ranks; rx holds the ranks belonging to the x sample
  r <- rank(c(x, y))
  rx <- r[seq_along(x)]

  ## check for ties and possibly warn, since the exact null distribution
  ## assumes no (or few) tied ranks
  if(length(r) - length(unique(r)) > 0.1*N)
    warning("more than 10% are ties, suggest using normal approximation")

  ## calculate test statistic: the sum of the x-sample ranks
  t <- sum(rx)

  ## go through alternatives for p-value; t-1 in the upper tail makes the
  ## comparison inclusive, i.e., P(T >= t) = 1 - P(T <= t-1)
  if(alternative == "less") phi <- pranks(t, nx, ny)
  else if(alternative == "greater") phi <- 1 - pranks(t-1, nx, ny)
  else { ## two-sided: twice the smaller tail probability
    phi <- 2*min(pranks(t, nx, ny), 1 - pranks(t-1, nx, ny))
  }

  ## return results
  return(list(stat=t, p.value=phi, alternative=alternative))
}

## dsqranks:
##
## mass (density) of squared ranks, following the squared ranks test
## description in the Conover book.  This can be very slow if N = nx + ny
## is much bigger than 20

dsqranks <- function(k, nx, ny)
{
  N <- nx + ny
  r2 <- (1:N)^2

  ## numerator: the number of size-nx subsets of the squared ranks
  ## summing exactly to k (margin of error 0)
  ## NOTE(review): current FLSSS versions document this argument as
  ## "solutionNeed"; confirm "sizeNeeded" against the installed version
  numer <- length(FLSSS(nx, r2, k, 0, sizeNeeded="all"))

  ## denominator: the total number of size-nx subsets of the N squared
  ## ranks.  Each subset has exactly one sum, so summing the FLSSS counts
  ## over every achievable total (as a brute-force implementation would)
  ## gives exactly choose(N, nx) -- computed directly here, which is
  ## vastly faster and matches dranks above
  denom <- choose(N, nx)

  return(numer/denom)
}

## psqranks:
##
## cumulative distribution function for squared ranks, following the
## squared ranks test description in the Conover book.  This can be very
## slow if N = nx + ny is much bigger than 20.  The common denominator is
## computed once up front rather than repeatedly inside dsqranks calls

psqranks <- function(k, nx, ny)
{
  ## nothing to do?
  if(k == 0) return(0)

  N <- nx + ny
  r2 <- (1:N)^2

  ## the common denominator to all density calculations: the total number
  ## of size-nx subsets of the N squared ranks (each subset has exactly
  ## one sum, so the counts over all totals add up to choose(N, nx))
  denom <- choose(N, nx)

  ## sum up all density evaluations from 1:k
  p <- 0
  for(i in 1:k)
    p <- p + length(FLSSS(nx, r2, i, 0, sizeNeeded="all"))/denom

  ## done
  return(p)
}

## sqranks.test:
##
## This is an implementation of the squared ranks test described in the
## Conover book, in the style of the Mann-Whitney test function above.  It
## is only appropriate if nx = length(x) and ny = length(y) are small and
## there are not many ties (in the ranks of absolute deviations).
## Otherwise, the Gaussian approximation given in class is best.

sqranks.test <- function(x, y, alternative=c("two-tailed", "less", "greater"))
{
  ## check alternative argument
  alternative <- match.arg(alternative)

  ## calculate the absolute discrepancies from the (estimated) means
  nx <- length(x)
  ny <- length(y)
  N <- nx + ny
  xbar <- mean(x)
  ybar <- mean(y)
  u <- abs(x - xbar)
  v <- abs(y - ybar)

  ## calculate the observed joint ranks of the absolute deviations
  r <- rank(c(u, v))
  ru <- r[seq_along(u)]
  rv <- r[-seq_along(u)]

  ## check for ties and possibly warn, since the exact null distribution
  ## assumes no (or few) tied ranks
  if(length(r) - length(unique(r)) > 0.1*N)
    warning("more than 10% are ties, suggest using normal approximation")

  ## calculate test statistic: the sum of squared u-sample ranks
  t <- sum(ru^2)

  ## go through alternatives for p-value; t-1 in the upper tail makes the
  ## comparison inclusive, i.e., P(T >= t) = 1 - P(T <= t-1)
  if(alternative == "less") phi <- psqranks(t, nx, ny)
  else if(alternative == "greater") phi <- 1 - psqranks(t-1, nx, ny)
  else { ## two-sided: twice the smaller tail probability
    phi <- 2*min(psqranks(t, nx, ny), 1 - psqranks(t-1, nx, ny))
  }

  ## return results
  return(list(stat=t, p.value=phi, alternative=alternative))
}