## code to accompany Lecture 8 of ## Statistical computing with R by ## Robert B. Gramacy ## so the code will wrap appropriately in the slides options(width=50) ## hitmiss: ## ## integration by hit and miss method; assumes ## f is vectorized over rows of X so that sapply ## is not needed hitmiss <- function(f, a, b, c, d, n, plotit=FALSE) { X <- runif(n, a, b) Y <- runif(n, c, d) Z <- Y <= f(X) if(plotit) { I <- (b-a)*c + (cumsum(Z)/(1:n))*(b-a)*(d-c) plot(2:n, I[-1], type="l") return(I[n]) } else return((b-a)*c + (sum(Z)/n)*(b-a)*(d-c)) } ## test hitmiss f <- function(x) x^3 - 7*x^2 + 1 hitmiss(f, 0, 1, -6, 2, 1000000, plotit=TRUE) hitmiss(f, 0, 1, -6, 2, 1000000) ## mci: ## ## Monte carlo integration mci <- function(g, a, b, n) return(mean(f(runif(n, a, b)))*(b-a)) ## testing mci(f, 0, 1, 100000) ## probability calculation for A & S n <- 1000 S <- runif(n, 10, 11.5) A <- runif(n, 10.5, 12) prob <- mean(A < S) prob ## plotting the acceptance region plot(S[1:1000], A[1:1000]) polygon(c(10.5, 11.5, 11.5, 10.5), c(10.5, 10.5, 11.5, 10.5), density=10, angle=135) ## (sqrt) variance of estimate se <- sqrt(prob*(1-prob)/n) ## normal approximation to binomial prob + c(-1,1)*1.96*se ## expected differences diff <- A - S d <- c(mean(diff), mean(abs(diff))) se <- c(sd(diff), sd(abs(diff)))/sqrt(n) d[1] + c(-1,1)*1.96*se[1] d[2] + c(-1,1)*1.96*se[2] ## bootstrap mean ## set up a Gamma(alpha,beta) distribution alpha <- 4; ibeta <- 1/4 n <- 30 x <- rgamma(n, alpha, ibeta) ## for overlaying the truth xt <- seq(0,qgamma(0.999, alpha, ibeta), length=1000) f <- dgamma(xt, alpha, ibeta) ## plot a histogram of the samples hist(x, freq=FALSE, xlim=range(xt), ylim=c(0,1.5*max(f)), main="") lines(xt, f, col=2, lwd=2, lty=2) legend("topright", "true pdf", lty=2, col=2, lwd=2, cex=1.5, bty="n") ## normal approximation xbar <- mean(x) s2.norm <- var(x) za <- qnorm(0.975, 0, sqrt(s2.norm)) q.norm <- xbar + c(-1,1)*za/sqrt(n) points(xbar, 0, col=3, cex=1.5) text(q.norm, c(0,0), c("[", "]"), col=3, cex=1.5) ## now via bootstrap B <- 199 X.star <- matrix(NA, nrow=B, ncol=n) for(b in 1:B) { X.star[b,] <- sample(x, n, replace=TRUE) } ## calculate the bootstrap mean-interval xbar.boot <- apply(X.star, 1, mean) q.boot <- quantile(xbar.boot, c(0.025, 0.975)) c(mean(xbar.boot), q.boot) ## adding bootstrap interval to plot text(q.boot, c(0,0), c("[", "]"), col=4, cex=1.5) ## we can also add information about the entire ## bootstrap distribution for the mean to the plot lines(density(xbar.boot), col=4, lty=3, lwd=2) legend("right", "boot mean distn", cex=1.5, col=4, lty=3, lwd=2, bty="n") ## try/not in sides hist(xbar.boot) ## bootstrapped linear regression n <- 200 X <- 1/cbind(rt(n, df=1), rt(n, df=1), rt(n,df=1)) beta <- c(1,2,3,0) Y <- beta[1] + X %*% beta[-1] + rnorm(100, sd=3) fit <- lm(Y~X) coef(fit) ## try/not in slides summary(fit)\$cov.unscaled ## now take the bootstrap samples B <- 10000 beta.hat.boot <- matrix(NA, nrow=B, ncol=length(beta)) for(b in 1:B) { indices <- sample(1:nrow(X), nrow(X), replace=TRUE) beta.hat.boot[b,] <- coef(lm(Y[indices]~X[indices,])) } ## compare to cov/unscaled above cov(beta.hat.boot) ## comparing marginals par(mfrow=c(2,2)) for(i in 1:4) { m <- coef(fit)[i] s <- sqrt(cov(beta.hat.boot)[i,i]) x <- m + seq(-4*s, 4*s, length=1000) xlim <- range(c(x, beta.hat.boot[,i])) hist(beta.hat.boot[,i], freq=FALSE, xlim=xlim, xlab=i, main="", breaks=20) lines(x, dnorm(x, m, s), col=2, lwd=2) } ## generating cross-validation folds cv.folds <- function (n, folds = 10) split(sample(1:n), rep(1:folds, length = n)) ## demonstrating folds <- cv.folds(30, folds=5) folds[[1]] folds[[5]] ## a cartoon CV example using our rgamma sample pab <- pmean <- pmed <- rep(NA, length(x)) for(k in 1:length(folds)) { fk <- folds[[k]] pmean[fk] <- mean(x[-fk]) pmed[fk] <- median(x[-fk]) pab[fk] <- alpha/ibeta } ## calculate rmse and mad c(sqrt(mean((pmean - x)^2)), mean(abs(pmean-x))) c(sqrt(mean((pmed - x)^2)), mean(abs(pmed-x))) c(sqrt(mean((pab - x)^2)), mean(abs(pab-x))) ## spam MC comparison ## source("spam_mc.R") load("spam_0.RData"); spam <- read.csv("../data/spam.csv") hit <- function(x, y=spam\$spam) mean(x == y, na.rm=TRUE) df <- data.frame(null=apply(pnull, 1, hit), full=apply(pfull, 1, hit), fwd=apply(pfwd, 1, hit), fwdi=apply(pfwdi, 1, hit), lda=apply(plda, 1, hit), qda=apply(pqda, 1, hit), fda=apply(pfda, 1, hit), rp=apply(prp, 1, hit), rf=apply(prf, 1, hit)) boxplot(df, ylab="hit rate") ## try/not in slides boxplot(df[,-1], ylab="hit rate") ## the rest of this analysis is in the full collection ## script: spam_mc_collect.R ## mulinks: ## ## simple mutual links function mutlinks <- function(A) { n <- ncol(A) if(nrow(A) != n) stop("A must be square") S <- 0 for(i in 1:(n-1)) for(j in (i+1):n) for(k in 1:n) S <- S + A[i,k]*A[j,k] S/choose(n,2) } ## testing mutual links function A <- matrix(sample(0:1, 16, replace=TRUE), nrow=4) A <- matrix(sample(0:1, (8^2)^2, replace=TRUE), nrow=8^2) system.time(mutlinks(A)) A <- matrix(sample(0:1, (16^2)^2, replace=TRUE), nrow=16^2) system.time(mutlinks(A)) A <- matrix(sample(0:1, (24^2)^2, replace=TRUE), nrow=24^2) system.time(mutlinks(A)) ## a new version that handles chunks of i, and takes ## a matrix-vectorization shortcut mtl <- function(ichunk, A) { n <- ncol(A) if(nrow(A) != n) stop("A must be square") S <- 0 for(i in ichunk) { if(i < n) { rowi <- A[i,] S <- S + sum(A[(i+1):n,] %*% rowi) } } S } ## checking output and time mutlinks(A) system.time(Amtl <- mtl(1:nrow(A), A)) Amtl/choose(nrow(A), 2) A <- matrix(sample(0:1, (32^2)^2, replace=TRUE), nrow=32^2) system.time(mtl(1:nrow(A), A)) ## function to divy up into chunks for snow mutlinks.snow <- function(cls, A) { n <- ncol(A) if(nrow(A) != n) stop("A must be square") nc <- length(cls) suppressWarnings(ichunks <- split(1:n, 1:nc)) Ss <- clusterApply(cls, ichunks, mtl, A) do.call(sum, Ss) / choose(n, 2) } ## run on 2 processors library(parallel) cl <- makeCluster(type="PSOCK", c("localhost", "localhost")) system.time(mutlinks.snow(cl, A)) stopCluster(cl) ## try even bigger, both A-size and number of instances A <- matrix(sample(0:1, (40^2)^2, replace=TRUE), nrow=40^2) system.time(mtl(1:nrow(A), A)) cl <- makeCluster(type="PSOCK", rep("localhost", 4)) system.time(mutlinks.snow(cl, A)) stopCluster(cl)