source("sweep.R") X <- read.table("rubin.txt") ## count the number of NA's in each column nas <- rep(0, ncol(X)) for(i in 1:ncol(X)) { nas[i] <- sum(is.na(X[,i])) } ## order the columns of X by their missingness (number of NAs) nao <- order(nas) naso <- nas[nao] miss <- (1:ncol(X))[duplicated(naso) == FALSE] miss <- c(miss, ncol(X)+1) ## ## step 1: find ML estimates for first NAs ## ## columns with the first missingness pattern col1 <- nao[miss[1]:(miss[2]-1)] ## remove the botton naso[miss[1]] as missing X1 <- X[1:(nrow(X)-naso[miss[1]]), col1] if(nrow(X) != nrow(X1)) stop("error X should have at least one column without missing data") ## calculate mean and covar matrix mu <- apply(X1, 2, mean) n1 <- nrow(X1) S <- cov(X1)*(n1-1)/n1 ## ## steps 2 & 3: find ML regression estimates ## ## don't really need to store all Xsub, G, etc. Might be able to get away ## with not storing any of this stuff -- but we keep it for debugging ## purposes col <- list(); Xsub <- list(); G <- list(); F <- list(); beta <- list() colall <- col1 for(j in 2:(length(miss)-1)) { cat(paste("FWD: j = ", j, "\n", sep="")) ## columns of the j-th missingness pattern col[[j]] <- nao[miss[j]:(miss[j+1]-1)] ## get the non-missing values of the first two sets of cols colall <- c(colall, col[[j]]) Xsub[[j]] <- X[1:(nrow(X)-naso[miss[j]]), colall] ## create the second G matrix G[[j]] <- matrix(0, nrow=ncol(Xsub[[j]])+1, ncol=ncol(Xsub[[j]])+1) G[[j]][1,1] <- 1 G[[j]][1,2:ncol(G[[j]])] <- apply(Xsub[[j]], 2, mean) G[[j]][,1] <- G[[j]][1,] G[[j]][2:ncol(G[[j]]),2:ncol(G[[j]])] <- t(Xsub[[j]]) %*% as.matrix(Xsub[[j]]) / nrow(Xsub[[j]]) ## sweep in order to do the regressions tosweep <- 1:(length(colall)-length(col[[j]])+1) swpG <- sweep(tosweep, G[[j]]) ## get residual covariance matrix (F2) and regression coeffs (beta2) from G2 f <- (nrow(G[[j]])-length(col[[j]])+1):nrow(G[[j]]) F[[j]] <- matrix(swpG[f,f], ncol=length(f)) beta[[j]] <- matrix(swpG[f,1:(nrow(G[[j]])-length(col[[j]]))], nrow=length(f)) } ## ## step 4: reconstruct ## ## inner-most sweep A = matrix(0, ncol=length(col1)+1, nrow=length(col1)+1) A[1,1] <- -1 f <- 2:(length(col1)+1) A[1,f] <- A[f,1] <- mu A[f,f] <- S A <- sweep(f, A) ## the rest of them for(j in 2:(length(miss)-1)) { cat(paste("BACK: j = ", j, "\n", sep="")) B <- matrix(0, ncol=ncol(A)+length(col[[j]]), nrow=ncol(A)+length(col[[j]])) B[1:ncol(A), 1:ncol(A)] <- A B[(ncol(A)+1):ncol(B),] <- cbind(beta[[j]], F[[j]]) B[1:ncol(A),(ncol(A)+1):ncol(B)] <- t(beta[[j]]) if(j == length(miss)-1) A <- rsweep(2:(length(colall)), B) else A <- sweep((ncol(A)+1):ncol(B), B) } ## extract the mean and covariance matrix from A mu.hat <- A[1,1+order(nao)] S.hat <- A[1+order(nao),1+order(nao)]