rm(list = ls(all = TRUE))
library(MASS)

##############################
#  R Demo: Ridge Regression  #
##############################

# Simulate data #
set.seed(7)
n = 60
x1 = rnorm(n, 1, 2)
x2 = rnorm(n, 1, 1)
x3 = rnorm(n, 2, 1)
x4 = rnorm(n, 2, 2)
x5 = rnorm(n, 1, 1)
x6 = rnorm(n, 1, 2)
x7 = rnorm(n, 2, 1)
error = rnorm(n, 0, 2)
y = 5 + 0.5*x1 + 0.8*x2 + 1*x3 + error   # True model: only x1, x2, x3 matter
dat = data.frame(cbind(x1, x2, x3, x4, x5, x6, x7, y))
head(dat)

# Divide the data into training and test sets #
set.seed(20)
n.train <- round(nrow(dat)/2)
scramble <- sample(1:nrow(dat))
id.train <- scramble[1:n.train]
train = dat[id.train, ]
test  = dat[-id.train, ]
dim(train)
dim(test)
head(train)

# Fit ridge regression with the lm.ridge function in library(MASS)
# install.packages("MASS")
library(MASS)
lambda.range <- seq(0, 50, by = 1)
fit = lm.ridge(y ~ ., data = train, lambda = lambda.range)
# You can pass unstandardized data to the function: by default it
# standardizes x and centers y for you, and the intercept is not penalized.
plot(fit)                  # coefficient paths as lambda increases
abline(h = 0, col = "grey")

# Prediction for a given lambda value
given.lambda = 10
mod = lm.ridge(y ~ ., data = train, lambda = given.lambda)
names(mod)

# Way 1 - scaled: standardize the test x's with the training means/scales,
#         apply the standardized coefficients, then add back the mean of y
pred = scale(test[, 1:7], center = mod$xm, scale = mod$scales) %*% mod$coef + mod$ym

# Way 2 - unscaled: coef(mod) is already on the original scale,
#         so just add an explicit intercept column
pred2 <- as.matrix(cbind(rep(1, nrow(test)), test[, 1:7])) %*% coef(mod)

# Wrong way: mod$coef is on the standardized scale, so applying it to the
# raw x's (and dropping the intercept) gives nonsense
pred.wrong = as.matrix(test[, 1:7]) %*% mod$coef

data.frame(true.y = test$y, pred = pred, pred.wrong = pred.wrong)

MSE = mean((test$y - pred)^2)
MSE

##### OCV calculations #####
# Ridge solution and hat-matrix diagonal computed by hand on
# standardized x and centered y (column 8 of train is y)
ridge.lm = function(lambda, train.dat) {
  sc.data.x   = scale(train.dat[, -8], center = TRUE, scale = TRUE)
  cent.data.y = scale(train.dat[, 8],  center = TRUE, scale = FALSE)
  beta_ridge = solve(t(sc.data.x) %*% sc.data.x + lambda*diag(7)) %*%
               t(sc.data.x) %*% cent.data.y
  hat = diag(sc.data.x %*% (solve(t(sc.data.x) %*% sc.data.x + lambda*diag(7)) %*% t(sc.data.x)))
  return(list("beta_ridge" = beta_ridge, "hat" = hat, "lambda" = lambda))
}

rr.fit = t(sapply(lambda.range, ridge.lm, train.dat = train))

# OCV score for each lambda: mean of (residual / (1 - leverage))^2
# over the training set
ocv = sapply(1:length(lambda.range), function(i) {
  fit   = lm.ridge(y ~ ., data = train, lambda = lambda.range[i])
  resid = train$y - as.matrix(cbind(1, train[, -8])) %*% coef(fit)
  ocv   = mean((resid / (1 - rr.fit[, "hat"][[i]]))^2)
  return(ocv)
})

plot(lambda.range, ocv, col = "red", type = "l", lwd = 2,
     xlab = expression(lambda), ylab = "OCV/GCV Score",
     main = expression(paste("OCV/GCV Score vs. ", lambda)))
# (A short follow-up sketch at the end of this script picks the lambda that
#  minimizes this OCV curve and compares it with lm.ridge's built-in GCV.)

###################
#  R Demo: Lasso  #
###################

# Simulate data #
set.seed(7)
n = 100
x1 = rnorm(n, 1, 2)
x2 = rnorm(n, 1, 1)
x3 = rnorm(n, 2, 1)
x4 = rnorm(n, 2, 2)
x5 = rnorm(n, 1, 1)
x6 = rnorm(n, 1, 2)
x7 = rnorm(n, 2, 1)
error = rnorm(n, 0, 1)
y = 5 + 0.5*x1 + 0.8*x2 + 1*x3 + error
dat = data.frame(cbind(x1, x2, x3, x4, x5, x6, x7, y))
head(dat)

# install.packages("lars")
library(lars)
mod.lars <- lars(as.matrix(dat[, 1:7]), dat$y)
plot(mod.lars)             # coefficient paths vs. shrinkage fraction
names(mod.lars)

# Handy cross-validation
r <- cv.lars(as.matrix(dat[, 1:7]), dat$y, K = 10)

# The shrinkage fraction of the best model
bestfraction <- r$index[which.min(r$cv)]
bestfraction

# Coefficients of the best model from 10-fold CV
coef.best = predict(mod.lars, as.matrix(dat[, 1:7]), s = bestfraction,
                    type = "coefficients", mode = "fraction")
coef.best

# Prediction (in-sample, on the same data used to fit)
pred.best <- predict(mod.lars, as.matrix(dat[, 1:7]), s = bestfraction,
                     type = "fit", mode = "fraction")$fit
cbind(pred.best, dat$y)
mean((pred.best - dat$y)^2)
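
####################################################################
#  Extra: choosing lambda from the OCV curve (a minimal sketch)    #
####################################################################
# A follow-up sketch for the ridge section above, assuming the objects
# `fit` (the lm.ridge path), `lambda.range`, and `ocv` are still in the
# workspace. lm.ridge() also stores a GCV score per lambda in fit$GCV,
# and MASS provides select() for ridgelm objects, so we can compare the
# lambda minimizing our hand-computed OCV with the GCV-based choice.
# The two criteria need not agree exactly (the hand-computed hat matrix
# uses scale()'s sample-SD standardization, while lm.ridge uses its own
# internal scaling), but they typically suggest a similar amount of shrinkage.

# lambda minimizing the hand-computed OCV
lambda.ocv = lambda.range[which.min(ocv)]
lambda.ocv

# lambda minimizing lm.ridge's built-in GCV
lambda.gcv = lambda.range[which.min(fit$GCV)]
lambda.gcv

# select() prints the HKB and L-W estimates and the smallest-GCV lambda
select(fit)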