#################
# Simulate Data #
#################
set.seed(527)
#x = c(seq(0.1, 10, 0.1))
x = sort(rnorm(100, 5, sd=2.5))
# Truncated cubic terms: above(u) = max(0, u)
above = function(u) pmax(0, u)
EX = 2 + 0.5*x^3 - 1*above(x-3)^3 - 2*above(x-5)^3
y = EX + rnorm(length(x), 0, sd=15)
plot(x, y)
lines(x, EX)

###############################
# R Demo 1: Spline and Kernel #
###############################

##########################################
# (a) Penalized cubic regression splines #
##########################################
library(mgcv)

# By default: k = 10 basis functions, knots placed at quantiles of x
# (bs="cs" is the shrinkage version of the cubic regression spline basis)
fit.a.k10 = gam(y ~ s(x, bs="cs", k=10))
k10 = fit.a.k10$smooth[[1]]$xp   # default is 10 knots
k10
plot(x, y, main="Penalized cubic regression splines (by mgcv)")
lines(x, fit.a.k10$fitted.values, col="red", lwd=2)

## Specify the knots ##
# Number of knots, location of knots #
set.seed(100)
fit.a.k10.rand = gam(y ~ s(x, bs="cr"), knots=list(x=sort(sample(x, 10))))
k10.rand = fit.a.k10.rand$smooth[[1]]$xp
k10.rand

fit.a.k25 = gam(y ~ s(x, bs="cr", k=25))
k25 = fit.a.k25$smooth[[1]]$xp
k25

set.seed(10)
fit.a.k25.rand = gam(y ~ s(x, bs="cr", k=25), knots=list(x=sort(sample(x, 25))))
k25.rand = fit.a.k25.rand$smooth[[1]]$xp
k25.rand

## Fig 1. Plot the penalized cubic regression spline models
## with k = 10 and 25 and different locations of knots
z = seq(min(x), max(x), length.out=200)
plot(x, y, xlab="X", ylab="Y", col="darkgrey",
     main="Penalized Cubic Regression Splines", cex.main=0.9, cex.axis=0.8)
legend("bottomleft",
       legend=c("True model", "k=10 default (quantile) knots", "k=10 random knots",
                "k=25 default (quantile) knots", "k=25 random knots"),
       lty=c(1,1,1,1,1), col=c("black", "green", "red", "yellow", "blue"),
       bty="n", lwd=2)
# True model
lines(x, EX, lwd=2)
# k=10, default knots (quantiles of x)
points(k10, predict(fit.a.k10, newdata=data.frame(x=k10)), col="green", pch=16, cex=1.5)
lines(z, predict(fit.a.k10, newdata=data.frame(x=z)), lty=1, col="green", lwd=2)
# k=10, random knots
points(k10.rand, predict(fit.a.k10.rand, newdata=data.frame(x=k10.rand)), col="red", pch=16, cex=1.5)
lines(z, predict(fit.a.k10.rand, newdata=data.frame(x=z)), lty=1, col="red", lwd=2)
# k=25, default knots (quantiles of x)
points(k25, predict(fit.a.k25, newdata=data.frame(x=k25)), col="yellow", pch=18, cex=1.5)
lines(z, predict(fit.a.k25, newdata=data.frame(x=z)), lty=1, col="yellow", lwd=3)
# k=25, random knots
points(k25.rand, predict(fit.a.k25.rand, newdata=data.frame(x=k25.rand)), col="blue", pch=18, cex=1.5)
lines(z, predict(fit.a.k25.rand, newdata=data.frame(x=z)), lty=1, col="blue", lwd=3)
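## Optional check (a sketch, not part of the original demo): mgcv chooses the
## amount of penalization automatically, so the effective degrees of freedom
## are typically well below k. These are standard mgcv accessors.
fit.a.k10$sp        # estimated smoothing parameter
summary(fit.a.k10)  # edf of the smooth, approximate significance, adjusted R-squared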
# Fig 2. Overall comparison: the fitted curves are all smooth and close to the truth
z = seq(min(x), max(x), length.out=200)
plot(x, y, xlab="X", ylab="Y", col="darkgrey",
     main="Penalized Cubic Regression Splines", cex.main=0.9, cex.axis=0.8)
lines(x, EX, lwd=2)
lines(z, predict(fit.a.k10, newdata=data.frame(x=z)), lty=1, col="green", lwd=2)
lines(z, predict(fit.a.k10.rand, newdata=data.frame(x=z)), lty=1, col="red", lwd=2)
lines(z, predict(fit.a.k25, newdata=data.frame(x=z)), lty=2, col="yellow", lwd=2)
lines(z, predict(fit.a.k25.rand, newdata=data.frame(x=z)), lty=2, col="blue", lwd=2)
legend("bottomleft",
       legend=c("k=10 default (quantile) knots", "k=10 random knots",
                "k=25 default (quantile) knots", "k=25 random knots"),
       lty=c(1,1,2,2), col=c("green", "red", "yellow", "blue"), bty="n", lwd=2)

####################################################
# (b) Fit a Nadaraya-Watson locally constant model #
####################################################
#install.packages("locfit")
library(locfit)

# Fit over a grid of bandwidths and compare them by GCV
gcv.b <- gcvplot(y~x, kern="gauss", deg=0, ev=dat(), alpha=seq(0.01, 0.99, by=.01))
plot(gcv.b)
data.frame(relative.bandwidth=gcv.b$alpha, GCV=round(gcv.b$values), DF=round(gcv.b$df, 1))
alpha.chosen.b = gcv.b$alpha[which.min(gcv.b$values)]
alpha.chosen.b

# Fit with the GCV-chosen bandwidth, plus a very small and a very large one
fit.b0 <- locfit(y~x, kern="gauss", deg=0, ev=dat(), alpha=0.01)
fit.b9 <- locfit(y~x, kern="gauss", deg=0, ev=dat(), alpha=0.99)
fit.b  <- locfit(y~x, kern="gauss", deg=0, ev=dat(), alpha=alpha.chosen.b)

# Fig 3. Plot the fits with different bandwidths
plot(x, y, xlab="X", ylab="Y", col="darkgrey", main="Nadaraya-Watson locally constant model")
legend("bottomleft", lty=1, col=c("black", "red", "blue", "green"),
       legend=c("True model", paste("bandwidth =", c(0.01, 0.99, alpha.chosen.b))),
       bty="n", lwd=1.5)
lines(x, EX, lwd=2)
lines(x, fitted(fit.b0), col="red", lwd=1.5)
lines(x, fitted(fit.b9), col="blue", lwd=1.5)
lines(x, fitted(fit.b), col="green", lwd=2)

#############################################
# (c) Fit a locally linear polynomial model #
#############################################
# Fit over a grid of bandwidths and compare them by GCV
gcv.c <- gcvplot(y~x, kern="gauss", deg=1, ev=dat(), alpha=seq(0.01, 0.99, by=.01))
plot(gcv.c)
data.frame(relative.bandwidth=gcv.c$alpha, GCV=round(gcv.c$values), DF=round(gcv.c$df, 1))
alpha.chosen.c = gcv.c$alpha[which.min(gcv.c$values)]
alpha.chosen.c

# Fit with the GCV-chosen bandwidth, plus a very small and a very large one
fit.c0 <- locfit(y~x, kern="gauss", deg=1, ev=dat(), alpha=0.01)
fit.c9 <- locfit(y~x, kern="gauss", deg=1, ev=dat(), alpha=0.99)
fit.c  <- locfit(y~x, kern="gauss", deg=1, ev=dat(), alpha=alpha.chosen.c)
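## Illustration only (a sketch, not part of the original demo): at a single
## point x0, a locally linear fit is just weighted least squares with Gaussian
## kernel weights centred at x0; the fitted value at x0 is the intercept.
## The fixed bandwidth h is a hypothetical value chosen for illustration;
## locfit's alpha is a nearest-neighbour fraction, so the numbers will not
## match the locfit fits exactly.
loclin.at <- function(x0, x, y, h){
  w   = dnorm((x - x0)/h)               # kernel weights
  fit = lm(y ~ I(x - x0), weights = w)  # weighted LS line centred at x0
  unname(coef(fit)[1])                  # intercept = local fit at x0
}
# Example: evaluate the hand-rolled local linear fit at a few x values
# sapply(c(2, 5, 8), loclin.at, x = x, y = y, h = 0.5)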
# Fig 4. Plot the fits with different bandwidths
plot(x, y, xlab="X", ylab="Y", col="darkgrey", main="Locally linear polynomial model")
legend("bottomleft", lty=1, col=c("black", "red", "blue", "green"),
       legend=c("True model", paste("bandwidth =", c(0.01, 0.99, alpha.chosen.c))),
       bty="n", lwd=1.5)
lines(x, EX, lwd=2)
lines(x, fitted(fit.c0), col="red", lwd=1.5)
lines(x, fitted(fit.c9), col="blue", lwd=1.5)
lines(x, fitted(fit.c), col="green", lwd=2)

########################################
# (d) Plot three models: PCRS, Kernels #
########################################
plot(x, y, xlab="X", ylab="Y", col="darkgrey", main="Spline and kernel models")
legend("bottomleft",
       legend=c("True model", "penalized cubic regression", "Nadaraya-Watson", "locally linear"),
       col=c("black", "blue", "red", "green"), lwd=2, lty=c(1,1,1,1), bty="n")
lines(x, EX, lwd=2)
lines(x, fit.a.k10$fitted.values, col="blue", lty=1, lwd=2)
lines(x, fitted(fit.b), col="red", lty=1, lwd=2)
lines(x, fitted(fit.c), col="green", lty=1, lwd=2)

# Check the degrees of freedom in each model; they should be comparable
sum(fit.a.k10$edf)
fit.b$dp["df2"]
fit.c$dp["df2"]

#########################################################
# R Demo 2: Gaussian Processes: Noise-free observations #
#########################################################

####################
# Simulate dataset #
####################
# D = (x, y) are the training data.
# x gives the locations of the training points.
x = c(-5, -4, -3, -2, -1, 4, 5)
EX = c(1, 2, 2.2, 2.3, 1.5, 2.5, 2)
y = scale(EX)
plot(x, y)

## (a) Draw from the prior for the function f(x), on a fine grid of x values
# Define a fine grid of x locations, call it z
# z plays the role of the test locations
range(x)
z = seq(min(x), max(x), by=0.1)
nZ = length(z)   # 101
nZ

# Calculate the covariance matrix of the prior,
# based on the squared exponential covariance kernel, Equation (2.16)
# Create a function to calculate the covariance kernel K = {k_ij}
# A single element k_ij
my.kernel.ij <- function(xi, xj, sigma_f, l){
  k.ij = sigma_f^2 * exp(-0.5/(l^2)*(xi - xj)^2)
  return(k.ij)
}
# The full matrix K
my.kernel <- function(A, B, sigma_f = 1, l = 1){
  n.A = length(A)
  n.B = length(B)
  K.AB = matrix(NA, n.A, n.B)
  for (i in 1:n.A){
    for (j in 1:n.B){
      K.AB[i, j] = my.kernel.ij(A[i], B[j], sigma_f, l)
    }
  }
  return(K.AB)
}

# Draw 100 samples of [f(z1), f(z2), ..., f(z101)] from the multivariate normal prior
library(MASS)
set.seed(10)
sigma_f = 1
l = 1   # We will try l = 1, 0.5, and 1.5
covMat = my.kernel(z, z, sigma_f, l)
#pdf(paste("l_", l, "_prior.pdf", sep=""), h=6, w=6)
for (i in 1:100){
  f.prior = mvrnorm(n=1, mu=rep(0, nZ), Sigma=covMat)
  if (i == 1) {
    plot(z, f.prior, type="l", ylim=c(-5, 5))
  } else {
    lines(z, f.prior, col=i)
  }
}
points(x, y, cex=1.2, col="red", pch=16)
# Add the prior mean and a 95% band
abline(h=0, lwd=3, col="black")
sd.prior = sqrt(diag(covMat))
lines(z, 0 + 1.96*sd.prior, lty=2, lwd=3)
lines(z, 0 - 1.96*sd.prior, lty=2, lwd=3)
#dev.off()
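## Optional sanity check (a sketch, not part of the original demo): the double
## loop in my.kernel() can be vectorized with outer(); both versions should
## produce the same squared-exponential covariance matrix.
my.kernel.vec <- function(A, B, sigma_f = 1, l = 1){
  D2 = outer(A, B, function(a, b) (a - b)^2)   # matrix of squared distances
  sigma_f^2 * exp(-0.5 * D2 / l^2)
}
# max(abs(my.kernel(z, z) - my.kernel.vec(z, z)))   # should be ~0 (up to rounding)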
## (b) Draw from the posterior of f(x) given the data D, i.e. the conditional
## distribution of f(x) at the fine grid z, given the observed points.
# Calculate the covariance matrix of the posterior,
# based on Equation (2.19); f_* is f(z) here.

# Calculate K.zx, K.xx, K.zz, K.xz
# Note K.xz is the transpose of K.zx
K.zx = my.kernel(z, x, sigma_f, l)
K.xx = my.kernel(x, x, sigma_f, l)
K.zz = my.kernel(z, z, sigma_f, l)
K.xz = t(K.zx)

K.xx.inv = solve(K.xx)
L = K.zx %*% K.xx.inv
f.post.mu  = L %*% y
f.post.cov = K.zz - L %*% K.xz

# Draw 100 samples of [f(z1), f(z2), ..., f(z101)] from the multivariate normal posterior
library(MASS)
#pdf(paste("l_", l, "_post.pdf", sep=""), h=6, w=6)
set.seed(100)
for (i in 1:100){
  f.post = mvrnorm(n=1, mu=f.post.mu, Sigma=f.post.cov)
  if (i == 1) {
    plot(z, f.post, type="l", ylim=c(-5, 5))
  } else {
    lines(z, f.post, col=i)
  }
}
points(x, y, cex=1.2, col="red", pch=16)

# Add the posterior mean and a 95% band
lines(z, f.post.mu, lwd=2, col="black")
sd.post = sqrt(diag(f.post.cov))
lines(z, f.post.mu + 1.96*sd.post, lty=2, lwd=2)
lines(z, f.post.mu - 1.96*sd.post, lty=2, lwd=2)
#dev.off()

####################################################
# R Demo 3: Gaussian Processes: Noisy observations #
####################################################

####################
# Simulate dataset #
####################
# D = (x, y) are the training data.
# x gives the locations of the training points.
x = c(-5, -4, -3, -2, -1, 4, 5)
EX = c(1, 2, 2.2, 2.3, 1.5, 2.5, 2)
sigma.n = 0.3   # noise standard deviation
set.seed(2013)
EX = as.vector(scale(EX))
y = EX + rnorm(length(x), 0, sd=sigma.n)
plot(x, y, pch=16)
points(x, EX, col="red", pch=16)

## (a) Draw from the prior for the function f(x), on a fine grid of x values
# Define a fine grid of x locations, call it z
# z plays the role of the test locations
range(x)
z = seq(min(x), max(x), by=0.1)
nZ = length(z)   # 101
nZ

# Calculate the covariance matrix of the prior,
# based on the squared exponential covariance kernel, Equation (2.16)
# (Same kernel functions as in Demo 2, repeated so this demo runs standalone.)
# A single element k_ij
my.kernel.ij <- function(xi, xj, sigma_f, l){
  k.ij = sigma_f^2 * exp(-0.5/(l^2)*(xi - xj)^2)
  return(k.ij)
}
# The full matrix K
my.kernel <- function(A, B, sigma_f = 1, l = 1){
  n.A = length(A)
  n.B = length(B)
  K.AB = matrix(NA, n.A, n.B)
  for (i in 1:n.A){
    for (j in 1:n.B){
      K.AB[i, j] = my.kernel.ij(A[i], B[j], sigma_f, l)
    }
  }
  return(K.AB)
}

# Draw 100 samples of [f(z1), f(z2), ..., f(z101)] from the multivariate normal prior
library(MASS)
set.seed(10)
sigma_f = 1
l = 1   # We will try l = 1, 0.5, and 1.5
covMat = my.kernel(z, z, sigma_f, l)
for (i in 1:100){
  f.prior = mvrnorm(n=1, mu=rep(0, nZ), Sigma=covMat)
  if (i == 1) {
    plot(z, f.prior, type="l", ylim=c(-5, 5))
  } else {
    lines(z, f.prior, col=i)
  }
}
# Add the observed data points
points(x, y, pch=16)
points(x, EX, col="red", pch=16)
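## Optional alternative (a sketch, not part of the original demo): mvrnorm can
## draw all 100 prior curves in a single call, and matplot draws them at once;
## this is equivalent to the loop above, just vectorized.
f.prior.all = mvrnorm(n=100, mu=rep(0, nZ), Sigma=covMat)   # 100 x nZ matrix of draws
matplot(z, t(f.prior.all), type="l", lty=1, ylim=c(-5, 5),
        xlab="z", ylab="f(z)", main="100 draws from the GP prior")
points(x, y, pch=16)
points(x, EX, col="red", pch=16)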
## (b) Draw from the posterior of f(x) given the noisy data D, i.e. the
## conditional distribution of f(x) at the fine grid z, given the observed points.
# Calculate the covariance matrix of the posterior,
# based on Equations (2.22)-(2.24); f_* is f(z) here.

# Calculate K.zx, K.xx, K.zz, K.xz
# Note K.xz is the transpose of K.zx
K.zx = my.kernel(z, x, sigma_f, l)
K.xx = my.kernel(x, x, sigma_f, l)
K.zz = my.kernel(z, z, sigma_f, l)
K.xz = t(K.zx)

# The difference from the noise-free case happens here:
# the noise VARIANCE sigma.n^2 is added to the diagonal of K.xx
K.xx.inv = solve(K.xx + diag(rep(sigma.n^2, length(x))))
L = K.zx %*% K.xx.inv
f.post.mu  = L %*% y
f.post.cov = K.zz - L %*% K.xz

# Draw 100 samples of [f(z1), f(z2), ..., f(z101)] from the multivariate normal posterior
library(MASS)
set.seed(100)
for (i in 1:100){
  f.post = mvrnorm(n=1, mu=f.post.mu, Sigma=f.post.cov)
  if (i == 1) {
    plot(z, f.post, type="l", ylim=c(-5, 5))
  } else {
    lines(z, f.post, col=i)
  }
}
points(x, y, cex=1.2, col="black", pch=16)
#points(x, EX, cex=1.2, col="red", pch=16)

# Add the posterior mean and a 95% band
lines(z, f.post.mu, lwd=2, col="black")
sd.post = sqrt(diag(f.post.cov))
lines(z, f.post.mu + 1.96*sd.post, lty=2, lwd=2)
lines(z, f.post.mu - 1.96*sd.post, lty=2, lwd=2)
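## Numerically stabler alternative (a sketch, not part of the original demo),
## in the spirit of Algorithm 2.1 of Rasmussen & Williams: use a Cholesky
## factorization instead of explicitly inverting K.xx + sigma.n^2 * I.
## The results should agree with f.post.mu and f.post.cov computed above.
R = chol(K.xx + sigma.n^2 * diag(length(x)))   # upper triangular: t(R) %*% R = K.xx + sigma.n^2 * I
alpha = backsolve(R, forwardsolve(t(R), y))    # (K.xx + sigma.n^2 * I)^{-1} y
f.post.mu.chol = K.zx %*% alpha
v = forwardsolve(t(R), K.xz)                   # t(R)^{-1} K.xz
f.post.cov.chol = K.zz - t(v) %*% v
max(abs(f.post.mu.chol - f.post.mu))           # should be ~0 (up to rounding)
max(abs(f.post.cov.chol - f.post.cov))         # should be ~0 (up to rounding)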