rm(list = ls(all = TRUE))
library(MASS)

##############################
#  R Demo: Ridge Regression  #
##############################

# Simulate data #
set.seed(7)
n = 60
x1 = rnorm(n, 1, 2)
x2 = rnorm(n, 1, 1)
x3 = rnorm(n, 2, 1)
x4 = rnorm(n, 2, 2)
x5 = rnorm(n, 1, 1)
x6 = rnorm(n, 1, 2)
x7 = rnorm(n, 2, 1)
error = rnorm(n, 0, 2)
y = 5 + 0.5*x1 + 0.8*x2 + 1*x3 + error   # True model: only x1, x2, x3 matter
dat = data.frame(cbind(x1, x2, x3, x4, x5, x6, x7, y))
head(dat)

# Divide the data into training and test sets #
set.seed(20)
n.train <- round(nrow(dat)/2)
scramble <- sample(1:nrow(dat))
id.train <- scramble[1:n.train]
train = dat[id.train, ]
test  = dat[-id.train, ]
dim(train)
dim(test)
head(train)

# Fit ridge regression with the lm.ridge function in library(MASS)
# install.packages("MASS")
library(MASS)
lambda.range <- seq(0, 50, by = 1)
fit = lm.ridge(y ~ ., data = train, lambda = lambda.range)
# You can pass unstandardized data to the function: by default it
# standardizes x and centers y for you, and the intercept is not penalized.
plot(fit)                  # coefficient paths as lambda increases
abline(h = 0, col = "grey")

# Prediction for a given lambda value
given.lambda = 10
mod = lm.ridge(y ~ ., data = train, lambda = given.lambda)
names(mod)

# Way 1 - scaled: standardize the test x's with the training means/scales,
#         apply the standardized coefficients, then add back the mean of y
pred = scale(test[, 1:7], center = mod$xm, scale = mod$scales) %*% mod$coef + mod$ym

# Way 2 - unscaled: coef(mod) is already on the original scale,
#         so just add an explicit intercept column
pred2 <- as.matrix(cbind(rep(1, nrow(test)), test[, 1:7])) %*% coef(mod)

# Wrong way: mod$coef is on the standardized scale, so applying it to the
# raw x's (and dropping the intercept) gives nonsense
pred.wrong = as.matrix(test[, 1:7]) %*% mod$coef

data.frame(true.y = test$y, pred = pred, pred.wrong = pred.wrong)

MSE = mean((test$y - pred)^2)
MSE

##### OCV calculations #####
# Ridge solution and hat-matrix diagonal computed by hand on
# standardized x and centered y (column 8 of train is y)
ridge.lm = function(lambda, train.dat) {
  sc.data.x   = scale(train.dat[, -8], center = TRUE, scale = TRUE)
  cent.data.y = scale(train.dat[, 8],  center = TRUE, scale = FALSE)
  beta_ridge = solve(t(sc.data.x) %*% sc.data.x + lambda*diag(7)) %*%
               t(sc.data.x) %*% cent.data.y
  hat = diag(sc.data.x %*% (solve(t(sc.data.x) %*% sc.data.x + lambda*diag(7)) %*% t(sc.data.x)))
  return(list("beta_ridge" = beta_ridge, "hat" = hat, "lambda" = lambda))
}

rr.fit = t(sapply(lambda.range, ridge.lm, train.dat = train))

# OCV score for each lambda: mean of (residual / (1 - leverage))^2
# over the training set
ocv = sapply(1:length(lambda.range), function(i) {
  fit   = lm.ridge(y ~ ., data = train, lambda = lambda.range[i])
  resid = train$y - as.matrix(cbind(1, train[, -8])) %*% coef(fit)
  ocv   = mean((resid / (1 - rr.fit[, "hat"][[i]]))^2)
  return(ocv)
})

plot(lambda.range, ocv, col = "red", type = "l", lwd = 2,
     xlab = expression(lambda), ylab = "OCV/GCV Score",
     main = expression(paste("OCV/GCV Score vs. ", lambda)))
# (A short follow-up sketch at the end of this script picks the lambda that
#  minimizes this OCV curve and compares it with lm.ridge's built-in GCV.)

###################
#  R Demo: Lasso  #
###################

# Simulate data #
set.seed(7)
n = 100
x1 = rnorm(n, 1, 2)
x2 = rnorm(n, 1, 1)
x3 = rnorm(n, 2, 1)
x4 = rnorm(n, 2, 2)
x5 = rnorm(n, 1, 1)
x6 = rnorm(n, 1, 2)
x7 = rnorm(n, 2, 1)
error = rnorm(n, 0, 1)
y = 5 + 0.5*x1 + 0.8*x2 + 1*x3 + error
dat = data.frame(cbind(x1, x2, x3, x4, x5, x6, x7, y))
head(dat)

# install.packages("lars")
library(lars)
mod.lars <- lars(as.matrix(dat[, 1:7]), dat$y)
plot(mod.lars)             # coefficient paths vs. shrinkage fraction
names(mod.lars)

# Handy cross-validation
r <- cv.lars(as.matrix(dat[, 1:7]), dat$y, K = 10)

# The shrinkage fraction of the best model
bestfraction <- r$index[which.min(r$cv)]
bestfraction

# Coefficients of the best model from 10-fold CV
coef.best = predict(mod.lars, as.matrix(dat[, 1:7]), s = bestfraction,
                    type = "coefficients", mode = "fraction")
coef.best

# Prediction (in-sample, on the same data used to fit)
pred.best <- predict(mod.lars, as.matrix(dat[, 1:7]), s = bestfraction,
                     type = "fit", mode = "fraction")$fit
cbind(pred.best, dat$y)
mean((pred.best - dat$y)^2)
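
####################################################################
#  Extra: choosing lambda from the OCV curve (a minimal sketch)    #
####################################################################
# A follow-up sketch for the ridge section above, assuming the objects
# `fit` (the lm.ridge path), `lambda.range`, and `ocv` are still in the
# workspace. lm.ridge() also stores a GCV score per lambda in fit$GCV,
# and MASS provides select() for ridgelm objects, so we can compare the
# lambda minimizing our hand-computed OCV with the GCV-based choice.
# The two criteria need not agree exactly (the hand-computed hat matrix
# uses scale()'s sample-SD standardization, while lm.ridge uses its own
# internal scaling), but they typically suggest a similar amount of shrinkage.

# lambda minimizing the hand-computed OCV
lambda.ocv = lambda.range[which.min(ocv)]
lambda.ocv

# lambda minimizing lm.ridge's built-in GCV
lambda.gcv = lambda.range[which.min(fit$GCV)]
lambda.gcv

# select() prints the HKB and L-W estimates and the smallest-GCV lambda
select(fit)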