## code to accompany Lecture 6 of
## Statistical computing with R by
## Robert B. Gramacy

## so the code will wrap appropriately in the slides
options(width=50)

## summary stats
load("../data/dow30.RData")
mean(dow30$Open)
min(dow30$Open)
max(dow30$Open)
range(dow30$Open)
sd(dow30$Open)

## empirical distributions
quantile(dow30$Open, probs=c(0, 0.25, 0.5, 0.75, 1))
fivenum(dow30$Open)
IQR(dow30$Open)
summary(dow30$Open)

## try/not in slides
summary(dow30)

## birthweight: keep singleton births past 35 weeks gestation
## with non-missing weight gain and birth weight
b6 <- read.csv("../data/births2006.csv")
b6c <- b6[!is.na(b6$WTGAIN) & !is.na(b6$DBWT) &
          b6$DPLURAL == "1 Single" & b6$ESTGEST > 35,]
## FIX: was b6c$DBW (typo) -- the column is DBWT, as the two calls below use
cor(b6c$WTGAIN, b6c$DBWT)

## via spearman and kendall
cor(b6c$WTGAIN, b6c$DBWT, method="spearman")
cor(b6c$WTGAIN, b6c$DBWT, method="kendall")

## PCA on batting-team counting stats
bt0008 <- read.csv("../data/batteam0008.csv")
bpca <- princomp(~X1B+X2B+X3B+HR+BB+HBP+SF+SB+CS, data=bt0008)
plot(bpca)

## try/not in slides
bpca
summary(bpca)
loadings(bpca)  ## blank entries are small but not zero

## biplot
biplot(bpca, cex=0.5)

## factor analysis
bfact <- factanal(~X1B+X2B+X3B+HR+BB+HBP+SF+SB+CS, data=bt0008,
                  factors=2)

## try/not in slides
bfact
update(bfact, factors=3)
update(bfact, factors=4)
update(bfact, factors=5)

## in slides
loadings(update(bfact, factors=4))

## the normal distribution, etc
par(mfrow=c(1,3), lwd=2, bty="n")
plot(dnorm, -3, 3, main="PDF")
plot(pnorm, -3, 3, main="CDF")
plot(qnorm, 0, 1, main="Quantiles")

## try/not on slides
pnorm(0)
pnorm(1.96)
qnorm(0.975)
pnorm(1.96) - pnorm(-1.96)

## rnorm
hist(rnorm(10000), breaks=50)

## tire data
tires <- read.csv("../data/tires.csv")
hfail <- subset(tires, Tire_Type=="H" &
                Speed_At_Failure_km_h == 160)$Time_To_Failure
hfail
mean(hfail)

## one-sample t-test
t.test(hfail, mu=9)

## tires with same speed rating
efail <- subset(tires, Tire_Type=="E" &
                Speed_At_Failure_km_h == 180)$Time_To_Failure
dfail <- subset(tires, Tire_Type=="D" &
                Speed_At_Failure_km_h == 180)$Time_To_Failure
bfail <- subset(tires, Tire_Type=="B" &
                Speed_At_Failure_km_h == 180)$Time_To_Failure

## comparing E and D
t.test(efail, dfail)

## try/not in slides
t.test(efail, bfail)

## mortality data for ANOVA
mort06 <- read.csv("../data/mort06.csv")
par(mar=c(9,3,1,1), las=2)
boxplot(age~Cause, data=mort06, cex.axis=0.75)

## doing the anova
anova(lm(age~Cause, data=mort06))

## checking for normality: standardize and compare to N(0,1)
fg <- read.csv("../data/fg.csv")
par(mfrow=c(1,2))
hist(fg$yards)
yn <- (fg$yards - mean(fg$yards))/sd(fg$yards)
qqnorm(yn)
abline(0,1)

## Shapiro-Wilk test
shapiro.test(fg$yards)

## ks-test
ks.test(fg$yards, pnorm, mean=mean(fg$yards), sd=sd(fg$yards))

## essential to include mean and sd params
## try/not in slides
ks.test(fg$yards, pnorm)

## field goal
fgtab <- table(fg$play.type, fg$stadium.type)
fgtab

## try/not in slides
fgtab <- t(fgtab[3:4,])
fgtab

## test to see if there is a difference in groups
prop.test(fgtab)

## LM for runs
blm <- lm(R~X1B+X2B+X3B+HR+BB+HBP+SF+SB+CS, data=bt0008)
blm
coef(blm)

## summary lm output
summary(blm)
confint(blm)

## checking by hand: bhat = (X'X)^{-1} X'y, with an intercept column
X <- cbind(1, bt0008[,c("X1B", "X2B", "X3B", "HR", "BB",
                        "HBP", "SF", "SB", "CS")])
X <- as.matrix(X)
y <- bt0008$R
XtXi <- solve(t(X) %*% X)
bhat <- XtXi %*% t(X) %*% y
bhat ## print to screen
df <- length(y) - ncol(X)
s2 <- sum((X %*% bhat - y)^2)/df
sqrt(s2) ## print to screen
SEs <- sqrt(diag(XtXi)*s2)
SEs ## print to screen
tvals <- bhat/SEs
pvals <- 2*pt(abs(tvals), df=df, lower.tail=FALSE)
pvals ## print to screen

## update to the smaller model
blm2 <- update(blm, R~X1B+X2B+X3B+HR+BB+HBP+SF)
summary(blm)$r.squared
summary(blm2)$r.squared

## anova to compare
anova(blm, blm2)

## residual diagnostics via plot
par(mfrow=c(2,2), mar=c(4,4,2,2))
plot(blm2)

## demonstrating prediction at a random two
## training data points
Xf <- bt0008[sample(nrow(bt0008), 2),]
predict(blm2, newdata=Xf, interval="prediction", se.fit=TRUE)

## stepwise regression; BIC via k=log(n)
blm.aic <- step(blm, scope=~.+.^2, direction="both")
blm.bic <- step(blm, scope=~.+.^2, direction="both",
                k=log(nrow(bt0008)))

## try/not in slides
summary(blm.bic)
extractAIC(blm)
extractAIC(blm2)
extractAIC(blm.aic)
extractAIC(blm.bic)

## try/not in slides
## search with lars
library(lars)
blasso <- lars(X[,-1], y)
plot(blasso)
blasso.cv <- cv.lars(X[,-1], y)

## getting the coefficients
coef(blasso, s=0.8, mode="fraction")
library(monomvn)
regress(X[,-1], y, method="lasso")$b[,1][-1]

## PCR the hard way: regress on the first 3 principal components,
## then rotate the coefficients back to the original inputs
Zp <- cbind(1, X[,-1] %*% bpca$loadings[,1:3])
theta <- solve(t(Zp) %*% Zp) %*% t(Zp) %*% y
bpcr <- c(theta[1], bpca$loadings[,1:3] %*% theta[-1])
bpcr

## PCR via pls
library(pls)
bpcr2 <- pcr(y ~ X[,-1], ncomp=3)
as.numeric(coef(bpcr2))

## using monomvn to choose the number of components
bpcrcv <- regress(X[,-1], y, method="pcr")
bpcrcv$b[,1][-1]
bpcrcv$ncomp

## non-linear data
X <- seq(0, 10, length=50)
Ytrue <- (sin(pi*X/5) + 0.2*cos(4*pi*X/5))
Y <- Ytrue + rnorm(length(Ytrue), sd=0.1)
plot(X, Y)

## predictive locations
XX <- seq(0, 10, length=199)
YYtrue <- (sin(pi*XX/5) + 0.2*cos(4*pi*XX/5))
lines(XX, YYtrue, col="gray", type="l")

## smoothing splines with 11 DoFs
## FIX: predict.smooth.spline takes a numeric vector x,
## not a newdata data.frame
fit1 <- smooth.spline(X, Y, df=11)
YY1 <- predict(fit1, XX)$y
lines(XX, YY1, col=5, lty=5, lwd=2)

## try/not in slides: higher fidelity
fit2 <- smooth.spline(X, Y, df=15)
YY2 <- predict(fit2, XX)$y
lines(XX, YY2, col=6, lty=6, lwd=2)

## loess smoothing
fit3 <- loess(Y~X, span=0.5)
YY3 <- as.matrix(predict(fit3, data.frame(X=XX)))
## FIX: was lines(XX, YY2, ...) -- plotted the df=15 spline again
## instead of the loess fit the legend advertises
lines(XX, YY3, col=7, lty=7, lwd=2)
legend("topright", c("spline", "loess"), lty=c(5,7),
       col=c(5,7), lwd=2)

## make binary field goals data for
## logistic regression
fglr <- transform(fg, good=as.factor(
  ifelse(play.type=="FG good", "good", "bad")))

## making a plot
fgt <- table(fglr$good, fglr$yards)
fgt

## try/not in slides: empirical success rate by distance
plot(colnames(fgt), fgt["good",]/(fgt["good",]+fgt["bad",]),
     xlab="Distance (Yards)", ylab="Percent Good")

## fitting the glm
fglogit <- glm(good~yards, data=fglr, family="binomial")
summary(fglogit)

## adding the fit to the plot
xx <- data.frame(yards=seq(15, 65, length=1000))
p <- predict(fglogit, newdata=xx, type="response")
lines(xx[,1], p, lwd=2)

## spam data
spam <- read.csv("../data/spam.csv")
names(spam)

## try/not in slides: hold out 1000 rows for testing
testi <- sample(nrow(spam), 1000)
train <- spam[-testi,]
test <- spam[testi,]

## LDA
library(MASS)  ## FIX: lda()/qda() live in MASS, which was never loaded
spam.lda <- lda(spam~., data=train)
p.lda <- predict(spam.lda, newdata=test)$class
table(actual=test$spam, lda=p.lda)

## QDA
spam.qda <- qda(spam~., data=train)
p.qda <- predict(spam.qda, newdata=test)$class
table(actual=test$spam, qda=p.qda)

## FDA
library(mda)
spam.fda <- fda(spam~., data=train)
p.fda <- predict(spam.fda, newdata=test, type="class")
## FIX: column label was qda=p.fda; these are the FDA predictions
table(actual=test$spam, fda=p.fda)

## rpart
library(rpart)
## otherwise rpart will do regression
train <- transform(train, spam=as.factor(spam))
spam.rp <- rpart(spam~., data=train)

## try/not in slides
plot(spam.rp)
text(spam.rp, all=TRUE, cex=0.7, splits=TRUE, use.n=TRUE, xpd=TRUE)

## in slides
library(maptree)
draw.tree(spam.rp, cex=0.5, nodeinfo=TRUE)

## rpart performance
## try/not in slides
p.rp <- predict(spam.rp, newdata=test, type="class")
table(actual=test$spam, rp=p.rp)