## sampling.R -- to accompany the third lecture of ## Intermediate Data Analytics and Machine Learning ## ## by Robert B. Gramacy, Department of Statistics ## Virginia Tech ## ## see e.g., linear.R for examples of how to use the functions ## contained in this file ## linf: ## ## generate from a standard linear model with covariate ## x, intercept alpha and slope beta -- then fit a linear ## model (ML) to the result. return the sample, the ## fit and a descriptive name, n is a vector of ones to be ## compatible with logitf linf <- function(x, alpha, beta) { epsilon <- rnorm(length(x)) y <- alpha + beta*x + epsilon fit <- lm(y ~ x) return(list(y=y, fit=fit, n=rep(1,length(y)), name="linear")) } ## loginf: ## ## generate from a log-linear model with covariate ## x, intercept alpha and slope beta -- then fit a linear ## model (ML) to the result. return the sample, the ## fit and a descriptive name, n is a vector of ones to be ## compatible with logitf loglinf <- function(x, alpha, beta) { mu <- exp(alpha + beta*x) y <- rpois(length(mu), mu) fit <- glm(y ~ x, family="poisson") return(list(y=y, fit=fit, n=rep(1,length(y)), name="log-linear")) } ## logit: ## ## generate from a binomial-logit model with covariate ## x, intercept alpha and slope beta -- then fit a linear ## model (ML) to the result. return the sample, the ## fit, the total counts, and a descriptive name logitf <- function(x, alpha, beta) { n <- rpois(length(x), lambda=2)+1 p <- exp(alpha + beta*x)/(1+exp(alpha + beta*x)) y <- rbinom(length(n), n, prob=p) fit <- glm(y/n ~ x, family="binomial", weights=n) return(list(y=y, fit=fit, n=n, name="logit")) } ## three.ex: ## ## sample (plot) data from a standard linear model, a log-linear ## model and a binomial-logit model, with options to add the ## MLE regression line to the plot (and regression coeffs) three.ex <- function(x, alpha, beta, infer=FALSE) { ## for plotting on a 2x2 grid par(mfrow=c(2,2), cex.main=2, cex.lab=2, cex.axis=2) ## simple linear model lin <- linf(x, alpha, beta) plot(x,lin\$y, ylab="y", cex=2) ## plot ML inference for the linear model if(infer) { p.lin <- predict(lin\$fit, se.fit=TRUE) lines(x, p.lin\$fit, lwd=2) lines(x, p.lin\$fit+2*p.lin\$se.fit, col=2, lty=2, lwd=2) lines(x, p.lin\$fit-2*p.lin\$se.fit, col=2, lty=2, lwd=2) title(paste(lin\$name, ": m = a + b*x, (a,b)=(", signif(coef(lin\$fit)[1],2), ",", signif(coef(lin\$fit)[2],2), ")", sep="")) } else title(lin\$name) ## simple log-linear model loglin <- loglinf(x, alpha, beta) plot(x, loglin\$y, ylab="y", cex=2) ## ML inference for the log-linear model if(infer) { p.lin <- predict(loglin\$fit, type="response", se.fit=TRUE) lines(x, p.lin\$fit, lwd=2) lines(x, p.lin\$fit+2*p.lin\$se.fit, col=2, lty=2, lwd=2) lines(x, p.lin\$fit-2*p.lin\$se.fit, col=2, lty=2, lwd=2) title(paste(loglin\$name, ": log(m) = a + b*x, (a,b)=(", signif(coef(loglin\$fit)[1],2), ",", signif(coef(loglin\$fit)[2],2), ")", sep="")) } else title(loglin\$name) ## simple logit model logit <- logitf(x, alpha, beta) plot(x, logit\$y/logit\$n, type="n", ylab="y") text(x, logit\$y/logit\$n, logit\$n, cex=2) ## ML inference for the logit model if(infer) { p.logit <- predict(logit\$fit, type="response", se.fit=TRUE) lines(x, p.logit\$fit, lwd=2) lines(x, p.logit\$fit+2*p.logit\$se.fit, col=2, lty=2, lwd=2) lines(x, p.logit\$fit-2*p.logit\$se.fit, col=2, lty=2, lwd=2) title(paste(logit\$name, ": log(m/(1-m)) = a + b*x, (a,b)=(", signif(coef(logit\$fit)[1],2), ",", signif(coef(logit\$fit)[2],2), ")", sep="")) } else title("logit") } ## ex.mc: ## ## monte-carlo sampling from the provided sampling function ## (one of the three GLM functions above) with ML fits, ## and finally summarize the estimates with means and ## errorbars ex.mc <- function(x, alpha, beta, sampf=linf) { ## initialization i <- 0 par(mfrow=c(1,1), cex.main=1.5, cex.lab=1.5, cex.axis=1.5) l <- ab <- NULL ## while the user wishes to continue while(TRUE) { ## sample with the sampling function sampf i <- i+1 samp <- sampf(x, alpha, beta) l <- cbind(l, predict(samp\$fit, type="response")) ## plot the points if(any(samp\$n != 1)) { plot(x, samp\$y/samp\$n, cex=2, ylim=range(as.vector(l)), type="n", ylab="y") text(x, samp\$y/samp\$n, samp\$n, cex=2, col=i) } else { plot(x, samp\$y/samp\$n, cex=2, col=i, ylim=range(as.vector(l)), ylab="y") } ## plot the accumulated regression lines matplot(x, l, lwd=2, col=1:i, type="l", lty=1, add=TRUE, ylab="y") ab <- rbind(ab, coef(samp\$fit)) title(paste(samp\$name, ": m = a + b*x, average (a,b)=(", signif(mean(ab[,1]),2), ",", signif(mean(ab[,2]),2), ")", sep="")) ## continue sampling? if(readline("press RETURN to continue, q to stop: ") == "q") break } ## plot the resulting mean regression line and quantiles m <- apply(l, 1, mean); plot(x, m, type="l", lwd=2, ylab="y") title(paste(samp\$name, ": predictive mean and interval", sep="")) q1 <- apply(l, 1, quantile, 0.05) lines(x, q1, lwd=2, col=2) q2 <- apply(l, 1, quantile, 0.95) lines(x, q2, lwd=2, col=2) }