###################################
# R Demo 1: polynomial regression #
###################################

### Simulate y and x ###
set.seed(527)
x = c(seq(1, 10, 0.2), 12, 15)
ex = 5 + 1*x + 0.3*x^2 - 0.1*x^3       # True model
y = ex + rnorm(length(x), 0, 15)
length(x)
plot(x, y, main="", ylab="y", xlab="x")

# Fit one polynomial model and plot it
plot(x, y, main="", ylab="y", xlab="x")
mod <- lm(y ~ poly(x, degree = 2))
lines(x, mod$fitted, lwd=2)

# Fit several polynomial models and compare
plot(x, y, main="", ylab="y", xlab="x")
degreeRange <- c(1, 3, 7, 9)
for (i in degreeRange){
  mod <- lm(y ~ poly(x, degree = i))
  lines(x, mod$fitted, col=(i+1)/2, lty=i, lwd=2)
}
legend("bottomleft", lty=degreeRange, col=(degreeRange+1)/2, lwd=2,
       legend=paste("degree", degreeRange), bty="n")

# Add the true model
lines(x, ex, lty=1, lwd=3, col="red")

# Residual plots
par(mfrow=c(2,2))
for (i in degreeRange) {
  mod <- lm(y ~ poly(x, degree=i))
  plot(x, mod$residuals, ylab="residuals", main=paste("Degree =", i))
  abline(h=0, col="red", lty=2)
}

# Training and test sets
dat <- data.frame(cbind(y, x))
set.seed(20)
n.train  <- round(nrow(dat)/2)
scramble <- sample(1:nrow(dat))
id.train <- scramble[1:n.train]
train = dat[id.train, ]
test  = dat[-id.train, ]

given.degree = 5
mod <- lm(y ~ poly(x, degree = given.degree), data=train)
MSE.inSample  = mean((predict(mod) - train$y)^2)
pred = predict(mod, newdata=test)
MSE.outSample = mean((pred - test$y)^2)
cbind(pred, test$y)
c(given.degree, MSE.inSample, MSE.outSample)
# Now see how the in-sample and out-of-sample MSEs change if we use a degree other than 5.


##############################
# R Demo 2: Ridge Regression #
##############################

# Simulate data #
set.seed(7)
n = 60
x1 = rnorm(n, 1, 2)
x2 = rnorm(n, 1, 1)
x3 = rnorm(n, 2, 1)
x4 = rnorm(n, 2, 2)
x5 = rnorm(n, 1, 1)
x6 = rnorm(n, 1, 2)
x7 = rnorm(n, 2, 1)
error = rnorm(n, 0, 2)
y = 5 + 0.5*x1 + 0.8*x2 + 1*x3 + error   # True model
dat = data.frame(cbind(x1, x2, x3, x4, x5, x6, x7, y))
head(dat)

# Divide the data into training and test sets #
set.seed(20)
n.train  <- round(nrow(dat)/2)
scramble <- sample(1:nrow(dat))
id.train <- scramble[1:n.train]
train = dat[id.train, ]
test  = dat[-id.train, ]
dim(train)
dim(test)
head(train)

# Fit ridge regression with the lm.ridge function in library(MASS)
# install.packages("MASS")
library(MASS)
lambda.range <- seq(0, 50, by=1)
fit = lm.ridge(y ~ ., data=train, lambda = lambda.range)
# You can pass unstandardized data to the function: by default it normalizes x
# and centers y for you, and the algorithm does not penalize the intercept.
plot(fit)
abline(h=0, col="grey")

# Prediction given a lambda value
given.lambda = 10
mod = lm.ridge(y ~ ., data=train, lambda=given.lambda)
names(mod)
# Way 1: standardize the test x's with the training centers/scales, then add back ym
pred = scale(test[,1:7], center=mod$xm, scale=mod$scales) %*% mod$coef + mod$ym
# Way 2: use coef(mod), which is on the original scale and includes an intercept
pred2 <- as.matrix(cbind(1, test[,1:7])) %*% coef(mod)
# Wrong way: standardized coefficients applied to raw x, with no intercept
pred.wrong = as.matrix(test[,1:7]) %*% mod$coef
data.frame(true.y=test$y, pred=pred, pred.wrong=pred.wrong)

MSE = mean((test$y - pred)^2)
MSE
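# ------------------------------------------------------------------
# (Added sketch, not part of the original demo.) Rather than fixing
# given.lambda = 10, one option is to let generalized cross-validation
# suggest a lambda: lm.ridge() already stores a GCV value for every
# lambda in lambda.range, and select() in MASS prints the HKB, L-W, and
# GCV choices. The code below simply reuses the ridge fit from above.
select(fit)                                         # suggested lambda values
lambda.gcv <- lambda.range[which.min(fit$GCV)]      # lambda minimizing GCV
mod.gcv <- lm.ridge(y ~ ., data=train, lambda=lambda.gcv)
pred.gcv <- scale(test[,1:7], center=mod.gcv$xm, scale=mod.gcv$scales) %*%
  mod.gcv$coef + mod.gcv$ym
mean((test$y - pred.gcv)^2)                         # test MSE at the GCV-chosen lambda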
###################
# R Demo 3: Lasso #
###################

# Simulate data #
set.seed(7)
n = 100
x1 = rnorm(n, 1, 2)
x2 = rnorm(n, 1, 1)
x3 = rnorm(n, 2, 1)
x4 = rnorm(n, 2, 2)
x5 = rnorm(n, 1, 1)
x6 = rnorm(n, 1, 2)
x7 = rnorm(n, 2, 1)
error = rnorm(n, 0, 1)
y = 5 + 0.5*x1 + 0.8*x2 + 1*x3 + error   # True model
dat = data.frame(cbind(x1, x2, x3, x4, x5, x6, x7, y))
head(dat)

# The lasso vs. the shrinkage factor
# (lasso2 has been archived on CRAN; if install.packages() fails, install it from the CRAN archive)
install.packages("lasso2")
library("lasso2")
s.range <- seq(0.01, 1, 0.05)
mod <- l1ce(y ~ ., data=dat, bound=s.range)
?l1ce
coef(mod)
coefmat = coef(mod)[,-1]       # drop the intercept column

# Plot lasso coefficients vs. shrinkage factor
par(mar=c(5,5,4,2))
matplot(s.range, coefmat, type="l", xlim=c(0,1),
        ylab=expression(hat(beta)), xlab="Shrinkage Factor",
        lty=c(1:6,8), col=1:7)
legend("topleft", bty="n", legend=paste("x", 1:7), lty=c(1:6,8), col=1:7)
abline(0, 0, lty=1, col="grey")

# Prediction
mod = l1ce(y ~ ., data=dat, bound=0.5)
pred = predict(mod, newdata=dat)
pred
cbind(pred, dat$y)

# You can also fit the lasso using LARS: it produces the full trajectory of
# the betas versus the shrinkage factor
library(lars)
mod.lars <- lars(as.matrix(dat[,1:7]), dat$y)
plot(mod.lars)
names(mod.lars)

# Handy cross-validation
r <- cv.lars(as.matrix(dat[,1:7]), dat$y, K=10)

# The shrinkage factor of the best model
bestfraction <- r$index[which.min(r$cv)]
bestfraction

# Coefficients of the best model from 10-fold CV
coef.best = predict(mod.lars, as.matrix(dat[,1:7]), s=bestfraction,
                    type="coefficient", mode="fraction")
coef.best

# Prediction
pred.best <- predict(mod.lars, as.matrix(dat[,1:7]), s=bestfraction,
                     type="fit", mode="fraction")$fit
cbind(pred.best, dat$y)
mean((pred.best - dat$y)^2)
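# ------------------------------------------------------------------
# (Added sketch, not part of the original demo.) The MSE above is computed on
# the same data used to fit the model, so it is an in-sample number. Mirroring
# Demos 1 and 2, one way to get an out-of-sample MSE is to refit the LARS model
# on a training half and evaluate it on the held-out half. The seed and the
# 50/50 split below are arbitrary choices.
set.seed(20)
n.train  <- round(nrow(dat)/2)
scramble <- sample(1:nrow(dat))
id.train <- scramble[1:n.train]
train <- dat[id.train, ]
test  <- dat[-id.train, ]

mod.lars.tr <- lars(as.matrix(train[,1:7]), train$y)
cv.tr <- cv.lars(as.matrix(train[,1:7]), train$y, K=10)
best.s <- cv.tr$index[which.min(cv.tr$cv)]          # CV-chosen shrinkage factor

pred.test <- predict(mod.lars.tr, as.matrix(test[,1:7]),
                     s=best.s, type="fit", mode="fraction")$fit
mean((pred.test - test$y)^2)                        # out-of-sample MSE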