#################
# Simulate Data #
#################
set.seed(527)
#x = c(seq(0.1, 10, 0.1))
x = sort(rnorm(100, 5, sd=2.5))
# Truncated cubic terms: above(u) = max(0, u)
above = function(u) pmax(0, u)
EX = 2 + 0.5*x^3 - 1*above(x-3)^3 - 2*above(x-5)^3
y = EX + rnorm(length(x), 0, sd=15)
plot(x, y)
lines(x, EX)

###############################
# R Demo 1: Spline and Kernel #
###############################

##########################################
# (a) Penalized cubic regression splines #
##########################################
library(mgcv)

# By default: k = 10 basis functions, knots placed at quantiles of x
# (bs="cs" is the shrinkage version of the cubic regression spline basis)
fit.a.k10 = gam(y ~ s(x, bs="cs", k=10))
k10 = fit.a.k10$smooth[[1]]$xp   # default is 10 knots
k10
plot(x, y, main="Penalized cubic regression splines (by mgcv)")
lines(x, fit.a.k10$fitted.values, col="red", lwd=2)

## Specify the knots ##
# Number of knots, location of knots #
set.seed(100)
fit.a.k10.rand = gam(y ~ s(x, bs="cr"), knots=list(x=sort(sample(x, 10))))
k10.rand = fit.a.k10.rand$smooth[[1]]$xp
k10.rand

fit.a.k25 = gam(y ~ s(x, bs="cr", k=25))
k25 = fit.a.k25$smooth[[1]]$xp
k25

set.seed(10)
fit.a.k25.rand = gam(y ~ s(x, bs="cr", k=25), knots=list(x=sort(sample(x, 25))))
k25.rand = fit.a.k25.rand$smooth[[1]]$xp
k25.rand

## Fig 1. Plot the penalized cubic regression spline models
## with k = 10 and 25 and different locations of knots
z = seq(min(x), max(x), length.out=200)
plot(x, y, xlab="X", ylab="Y", col="darkgrey",
     main="Penalized Cubic Regression Splines", cex.main=0.9, cex.axis=0.8)
legend("bottomleft",
       legend=c("True model", "k=10 default (quantile) knots", "k=10 random knots",
                "k=25 default (quantile) knots", "k=25 random knots"),
       lty=c(1,1,1,1,1), col=c("black", "green", "red", "yellow", "blue"),
       bty="n", lwd=2)
# True model
lines(x, EX, lwd=2)
# k=10, default knots (quantiles of x)
points(k10, predict(fit.a.k10, newdata=data.frame(x=k10)), col="green", pch=16, cex=1.5)
lines(z, predict(fit.a.k10, newdata=data.frame(x=z)), lty=1, col="green", lwd=2)
# k=10, random knots
points(k10.rand, predict(fit.a.k10.rand, newdata=data.frame(x=k10.rand)), col="red", pch=16, cex=1.5)
lines(z, predict(fit.a.k10.rand, newdata=data.frame(x=z)), lty=1, col="red", lwd=2)
# k=25, default knots (quantiles of x)
points(k25, predict(fit.a.k25, newdata=data.frame(x=k25)), col="yellow", pch=18, cex=1.5)
lines(z, predict(fit.a.k25, newdata=data.frame(x=z)), lty=1, col="yellow", lwd=3)
# k=25, random knots
points(k25.rand, predict(fit.a.k25.rand, newdata=data.frame(x=k25.rand)), col="blue", pch=18, cex=1.5)
lines(z, predict(fit.a.k25.rand, newdata=data.frame(x=z)), lty=1, col="blue", lwd=3)
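## Optional check (a sketch, not part of the original demo): mgcv chooses the
## amount of penalization automatically, so the effective degrees of freedom
## are typically well below k. These are standard mgcv accessors.
fit.a.k10$sp        # estimated smoothing parameter
summary(fit.a.k10)  # edf of the smooth, approximate significance, adjusted R-squared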
# Fig 2. Overall comparison: the fitted curves are all smooth and close to the truth
z = seq(min(x), max(x), length.out=200)
plot(x, y, xlab="X", ylab="Y", col="darkgrey",
     main="Penalized Cubic Regression Splines", cex.main=0.9, cex.axis=0.8)
lines(x, EX, lwd=2)
lines(z, predict(fit.a.k10, newdata=data.frame(x=z)), lty=1, col="green", lwd=2)
lines(z, predict(fit.a.k10.rand, newdata=data.frame(x=z)), lty=1, col="red", lwd=2)
lines(z, predict(fit.a.k25, newdata=data.frame(x=z)), lty=2, col="yellow", lwd=2)
lines(z, predict(fit.a.k25.rand, newdata=data.frame(x=z)), lty=2, col="blue", lwd=2)
legend("bottomleft",
       legend=c("k=10 default (quantile) knots", "k=10 random knots",
                "k=25 default (quantile) knots", "k=25 random knots"),
       lty=c(1,1,2,2), col=c("green", "red", "yellow", "blue"), bty="n", lwd=2)

####################################################
# (b) Fit a Nadaraya-Watson locally constant model #
####################################################
#install.packages("locfit")
library(locfit)

# Fit over a grid of bandwidths and compare them by GCV
gcv.b <- gcvplot(y~x, kern="gauss", deg=0, ev=dat(), alpha=seq(0.01, 0.99, by=.01))
plot(gcv.b)
data.frame(relative.bandwidth=gcv.b$alpha, GCV=round(gcv.b$values), DF=round(gcv.b$df, 1))
alpha.chosen.b = gcv.b$alpha[which.min(gcv.b$values)]
alpha.chosen.b

# Fit with the GCV-chosen bandwidth, plus a very small and a very large one
fit.b0 <- locfit(y~x, kern="gauss", deg=0, ev=dat(), alpha=0.01)
fit.b9 <- locfit(y~x, kern="gauss", deg=0, ev=dat(), alpha=0.99)
fit.b  <- locfit(y~x, kern="gauss", deg=0, ev=dat(), alpha=alpha.chosen.b)

# Fig 3. Plot the fits with different bandwidths
plot(x, y, xlab="X", ylab="Y", col="darkgrey", main="Nadaraya-Watson locally constant model")
legend("bottomleft", lty=1, col=c("black", "red", "blue", "green"),
       legend=c("True model", paste("bandwidth =", c(0.01, 0.99, alpha.chosen.b))),
       bty="n", lwd=1.5)
lines(x, EX, lwd=2)
lines(x, fitted(fit.b0), col="red", lwd=1.5)
lines(x, fitted(fit.b9), col="blue", lwd=1.5)
lines(x, fitted(fit.b), col="green", lwd=2)

#############################################
# (c) Fit a locally linear polynomial model #
#############################################
# Fit over a grid of bandwidths and compare them by GCV
gcv.c <- gcvplot(y~x, kern="gauss", deg=1, ev=dat(), alpha=seq(0.01, 0.99, by=.01))
plot(gcv.c)
data.frame(relative.bandwidth=gcv.c$alpha, GCV=round(gcv.c$values), DF=round(gcv.c$df, 1))
alpha.chosen.c = gcv.c$alpha[which.min(gcv.c$values)]
alpha.chosen.c

# Fit with the GCV-chosen bandwidth, plus a very small and a very large one
fit.c0 <- locfit(y~x, kern="gauss", deg=1, ev=dat(), alpha=0.01)
fit.c9 <- locfit(y~x, kern="gauss", deg=1, ev=dat(), alpha=0.99)
fit.c  <- locfit(y~x, kern="gauss", deg=1, ev=dat(), alpha=alpha.chosen.c)
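## Illustration only (a sketch, not part of the original demo): at a single
## point x0, a locally linear fit is just weighted least squares with Gaussian
## kernel weights centred at x0; the fitted value at x0 is the intercept.
## The fixed bandwidth h is a hypothetical value chosen for illustration;
## locfit's alpha is a nearest-neighbour fraction, so the numbers will not
## match the locfit fits exactly.
loclin.at <- function(x0, x, y, h){
  w   = dnorm((x - x0)/h)               # kernel weights
  fit = lm(y ~ I(x - x0), weights = w)  # weighted LS line centred at x0
  unname(coef(fit)[1])                  # intercept = local fit at x0
}
# Example: evaluate the hand-rolled local linear fit at a few x values
# sapply(c(2, 5, 8), loclin.at, x = x, y = y, h = 0.5)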
# Fig 4. Plot the fits with different bandwidths
plot(x, y, xlab="X", ylab="Y", col="darkgrey", main="Locally linear polynomial model")
legend("bottomleft", lty=1, col=c("black", "red", "blue", "green"),
       legend=c("True model", paste("bandwidth =", c(0.01, 0.99, alpha.chosen.c))),
       bty="n", lwd=1.5)
lines(x, EX, lwd=2)
lines(x, fitted(fit.c0), col="red", lwd=1.5)
lines(x, fitted(fit.c9), col="blue", lwd=1.5)
lines(x, fitted(fit.c), col="green", lwd=2)

########################################
# (d) Plot three models: PCRS, Kernels #
########################################
plot(x, y, xlab="X", ylab="Y", col="darkgrey", main="Spline and kernel models")
legend("bottomleft",
       legend=c("True model", "penalized cubic regression", "Nadaraya-Watson", "locally linear"),
       col=c("black", "blue", "red", "green"), lwd=2, lty=c(1,1,1,1), bty="n")
lines(x, EX, lwd=2)
lines(x, fit.a.k10$fitted.values, col="blue", lty=1, lwd=2)
lines(x, fitted(fit.b), col="red", lty=1, lwd=2)
lines(x, fitted(fit.c), col="green", lty=1, lwd=2)

# Check the degrees of freedom in each model; they should be comparable
sum(fit.a.k10$edf)
fit.b$dp["df2"]
fit.c$dp["df2"]

#########################################################
# R Demo 2: Gaussian Processes: Noise-free observations #
#########################################################

####################
# Simulate dataset #
####################
# D = (x, y) are the training data.
# x gives the locations of the training points.
x = c(-5, -4, -3, -2, -1, 4, 5)
EX = c(1, 2, 2.2, 2.3, 1.5, 2.5, 2)
y = scale(EX)
plot(x, y)

## (a) Draw from the prior for the function f(x), on a fine grid of x values
# Define a fine grid of x locations, call it z
# z plays the role of the test locations
range(x)
z = seq(min(x), max(x), by=0.1)
nZ = length(z)   # 101
nZ

# Calculate the covariance matrix of the prior,
# based on the squared exponential covariance kernel, Equation (2.16)
# Create a function to calculate the covariance kernel K = {k_ij}
# A single element k_ij
my.kernel.ij <- function(xi, xj, sigma_f, l){
  k.ij = sigma_f^2 * exp(-0.5/(l^2)*(xi - xj)^2)
  return(k.ij)
}
# The full matrix K
my.kernel <- function(A, B, sigma_f = 1, l = 1){
  n.A = length(A)
  n.B = length(B)
  K.AB = matrix(NA, n.A, n.B)
  for (i in 1:n.A){
    for (j in 1:n.B){
      K.AB[i, j] = my.kernel.ij(A[i], B[j], sigma_f, l)
    }
  }
  return(K.AB)
}

# Draw 100 samples of [f(z1), f(z2), ..., f(z101)] from the multivariate normal prior
library(MASS)
set.seed(10)
sigma_f = 1
l = 1   # We will try l = 1, 0.5, and 1.5
covMat = my.kernel(z, z, sigma_f, l)
#pdf(paste("l_", l, "_prior.pdf", sep=""), h=6, w=6)
for (i in 1:100){
  f.prior = mvrnorm(n=1, mu=rep(0, nZ), Sigma=covMat)
  if (i == 1) {
    plot(z, f.prior, type="l", ylim=c(-5, 5))
  } else {
    lines(z, f.prior, col=i)
  }
}
points(x, y, cex=1.2, col="red", pch=16)
# Add the prior mean and a 95% band
abline(h=0, lwd=3, col="black")
sd.prior = sqrt(diag(covMat))
lines(z, 0 + 1.96*sd.prior, lty=2, lwd=3)
lines(z, 0 - 1.96*sd.prior, lty=2, lwd=3)
#dev.off()
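## Optional sanity check (a sketch, not part of the original demo): the double
## loop in my.kernel() can be vectorized with outer(); both versions should
## produce the same squared-exponential covariance matrix.
my.kernel.vec <- function(A, B, sigma_f = 1, l = 1){
  D2 = outer(A, B, function(a, b) (a - b)^2)   # matrix of squared distances
  sigma_f^2 * exp(-0.5 * D2 / l^2)
}
# max(abs(my.kernel(z, z) - my.kernel.vec(z, z)))   # should be ~0 (up to rounding)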
## (b) Draw from the posterior of f(x) given the data D, i.e. the conditional
## distribution of f(x) at the fine grid z, given the observed points.
# Calculate the covariance matrix of the posterior,
# based on Equation (2.19); f_* is f(z) here.

# Calculate K.zx, K.xx, K.zz, K.xz
# Note K.xz is the transpose of K.zx
K.zx = my.kernel(z, x, sigma_f, l)
K.xx = my.kernel(x, x, sigma_f, l)
K.zz = my.kernel(z, z, sigma_f, l)
K.xz = t(K.zx)

K.xx.inv = solve(K.xx)
L = K.zx %*% K.xx.inv
f.post.mu  = L %*% y
f.post.cov = K.zz - L %*% K.xz

# Draw 100 samples of [f(z1), f(z2), ..., f(z101)] from the multivariate normal posterior
library(MASS)
#pdf(paste("l_", l, "_post.pdf", sep=""), h=6, w=6)
set.seed(100)
for (i in 1:100){
  f.post = mvrnorm(n=1, mu=f.post.mu, Sigma=f.post.cov)
  if (i == 1) {
    plot(z, f.post, type="l", ylim=c(-5, 5))
  } else {
    lines(z, f.post, col=i)
  }
}
points(x, y, cex=1.2, col="red", pch=16)

# Add the posterior mean and a 95% band
lines(z, f.post.mu, lwd=2, col="black")
sd.post = sqrt(diag(f.post.cov))
lines(z, f.post.mu + 1.96*sd.post, lty=2, lwd=2)
lines(z, f.post.mu - 1.96*sd.post, lty=2, lwd=2)
#dev.off()

####################################################
# R Demo 3: Gaussian Processes: Noisy observations #
####################################################

####################
# Simulate dataset #
####################
# D = (x, y) are the training data.
# x gives the locations of the training points.
x = c(-5, -4, -3, -2, -1, 4, 5)
EX = c(1, 2, 2.2, 2.3, 1.5, 2.5, 2)
sigma.n = 0.3   # noise standard deviation
set.seed(2013)
EX = as.vector(scale(EX))
y = EX + rnorm(length(x), 0, sd=sigma.n)
plot(x, y, pch=16)
points(x, EX, col="red", pch=16)

## (a) Draw from the prior for the function f(x), on a fine grid of x values
# Define a fine grid of x locations, call it z
# z plays the role of the test locations
range(x)
z = seq(min(x), max(x), by=0.1)
nZ = length(z)   # 101
nZ

# Calculate the covariance matrix of the prior,
# based on the squared exponential covariance kernel, Equation (2.16)
# (Same kernel functions as in Demo 2, repeated so this demo runs standalone.)
# A single element k_ij
my.kernel.ij <- function(xi, xj, sigma_f, l){
  k.ij = sigma_f^2 * exp(-0.5/(l^2)*(xi - xj)^2)
  return(k.ij)
}
# The full matrix K
my.kernel <- function(A, B, sigma_f = 1, l = 1){
  n.A = length(A)
  n.B = length(B)
  K.AB = matrix(NA, n.A, n.B)
  for (i in 1:n.A){
    for (j in 1:n.B){
      K.AB[i, j] = my.kernel.ij(A[i], B[j], sigma_f, l)
    }
  }
  return(K.AB)
}

# Draw 100 samples of [f(z1), f(z2), ..., f(z101)] from the multivariate normal prior
library(MASS)
set.seed(10)
sigma_f = 1
l = 1   # We will try l = 1, 0.5, and 1.5
covMat = my.kernel(z, z, sigma_f, l)
for (i in 1:100){
  f.prior = mvrnorm(n=1, mu=rep(0, nZ), Sigma=covMat)
  if (i == 1) {
    plot(z, f.prior, type="l", ylim=c(-5, 5))
  } else {
    lines(z, f.prior, col=i)
  }
}
# Add the observed data points
points(x, y, pch=16)
points(x, EX, col="red", pch=16)
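## Optional alternative (a sketch, not part of the original demo): mvrnorm can
## draw all 100 prior curves in a single call, and matplot draws them at once;
## this is equivalent to the loop above, just vectorized.
f.prior.all = mvrnorm(n=100, mu=rep(0, nZ), Sigma=covMat)   # 100 x nZ matrix of draws
matplot(z, t(f.prior.all), type="l", lty=1, ylim=c(-5, 5),
        xlab="z", ylab="f(z)", main="100 draws from the GP prior")
points(x, y, pch=16)
points(x, EX, col="red", pch=16)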
## (b) Draw from the posterior of f(x) given the noisy data D, i.e. the
## conditional distribution of f(x) at the fine grid z, given the observed points.
# Calculate the covariance matrix of the posterior,
# based on Equations (2.22)-(2.24); f_* is f(z) here.

# Calculate K.zx, K.xx, K.zz, K.xz
# Note K.xz is the transpose of K.zx
K.zx = my.kernel(z, x, sigma_f, l)
K.xx = my.kernel(x, x, sigma_f, l)
K.zz = my.kernel(z, z, sigma_f, l)
K.xz = t(K.zx)

# The difference from the noise-free case happens here:
# the noise VARIANCE sigma.n^2 is added to the diagonal of K.xx
K.xx.inv = solve(K.xx + diag(rep(sigma.n^2, length(x))))
L = K.zx %*% K.xx.inv
f.post.mu  = L %*% y
f.post.cov = K.zz - L %*% K.xz

# Draw 100 samples of [f(z1), f(z2), ..., f(z101)] from the multivariate normal posterior
library(MASS)
set.seed(100)
for (i in 1:100){
  f.post = mvrnorm(n=1, mu=f.post.mu, Sigma=f.post.cov)
  if (i == 1) {
    plot(z, f.post, type="l", ylim=c(-5, 5))
  } else {
    lines(z, f.post, col=i)
  }
}
points(x, y, cex=1.2, col="black", pch=16)
#points(x, EX, cex=1.2, col="red", pch=16)

# Add the posterior mean and a 95% band
lines(z, f.post.mu, lwd=2, col="black")
sd.post = sqrt(diag(f.post.cov))
lines(z, f.post.mu + 1.96*sd.post, lty=2, lwd=2)
lines(z, f.post.mu - 1.96*sd.post, lty=2, lwd=2)
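## Numerically stabler alternative (a sketch, not part of the original demo),
## in the spirit of Algorithm 2.1 of Rasmussen & Williams: use a Cholesky
## factorization instead of explicitly inverting K.xx + sigma.n^2 * I.
## The results should agree with f.post.mu and f.post.cov computed above.
R = chol(K.xx + sigma.n^2 * diag(length(x)))   # upper triangular: t(R) %*% R = K.xx + sigma.n^2 * I
alpha = backsolve(R, forwardsolve(t(R), y))    # (K.xx + sigma.n^2 * I)^{-1} y
f.post.mu.chol = K.zx %*% alpha
v = forwardsolve(t(R), K.xz)                   # t(R)^{-1} K.xz
f.post.cov.chol = K.zz - t(v) %*% v
max(abs(f.post.mu.chol - f.post.mu))           # should be ~0 (up to rounding)
max(abs(f.post.cov.chol - f.post.cov))         # should be ~0 (up to rounding)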