################################# # Demo 1: Natural Cubic Splines # ################################# # Simulate data set.seed(527) x = seq(0.1, 10, 0.1) above = function(x){ret=pmax(0, x)} EX = 2 + 0.5*x^3 -1*above(x-3)^3 - 2*above(x-5)^3 y = EX + rnorm(length(x), 0, 10) plot(x,y) lines(x, EX) # Save plot as pdf pdf("simu_data.pdf", h=6, w=6) plot(x,y) lines(x, EX) dev.off() ######################################################### # (a) NCS, smoothing parameter chosen by minimizing OCV # ######################################################### # load package: "pspline", natural polynomial smoothing spline of order specified by the user install.packages("pspline") library(pspline) # Fit NCS # # by default sm.spline() fits natural *cubic* splines (norder=2) - and that's what we need for NCS fit.a=sm.spline(x=x, y=y, cv=TRUE) # OCV fit.a # Note the output OCV is consistent with "by hand" OCV here. # However, for the fossile data in homework 3, the output OCV doesn't match the "by hand" OCV. fit.a$cv mean( ((fit.a$ysmth-y)/(1-fit.a$lev))^2 ) ######################################################### # (b) NCS, smoothing parameter chosen by minimizing GCV # ######################################################### fit.b=sm.spline(x=x, y=y, cv=FALSE) # GCV fit.b # Note the output GCV is consistent with "by hand" GCV here. # However, for the fossile data in homework 3, the output GCV doesn't match the "by hand" GCV. fit.b$gcv mean( ((fit.b$ysmth-y)/(1-mean(fit.b$lev)))^2 ) ## Plot of the fitted curves by (a) and (b) ## plot(x, y, main="Natural Cubic Splines") lines(fit.a, lty=1, col="green", lwd=2) lines(fit.a, lty=2, col=2, lwd=2) legend("bottomleft", lty=1:3, lwd=2, col=c("green","red","black"), legend=c("Fit based on the best OCV", "Fit based on the based on GCV","Truth"), bty="n") # Compare to the true curve lines(x, EX, lty=1, lwd=2) ## Extract information about the fitted model ## names(fit.b) ## Prediction at given x ## pred.a=predict(fit.a, c(5, 8)) # no standard errors pred.a pred.b=predict(fit.b, c(5, 8)) # no standard errors pred.b ############################################## # Demo 2: Penalized Cubic Regression Splines # ############################################## ######################################### # (c) Penalized cubic regression spline # # smoothing paramter chosen by min GCV # ######################################### library(mgcv) fit.c=gam(y~s(x, bs="cr", k=10)) summary(fit.c) ## Extract information about the fitted model ## names(fit.c) fit.c$sp # Extract smoothing paramter fit.c$smooth[[1]]$xp # Extract knots # Two ways to extract diagonal elements of the "hat" matrix fit.c$hat influence(fit.c) all(influence(fit.c)==fit.c$hat) ## Specify the knots ## set.seed(100) fit.c.knots=gam(y~s(x, bs="cr"), knots = list(sample(x, 10))) summary(fit.c.knots) ## plot the two Penalized cubic regression spline models ## with two sets of knots z=seq(min(x), max(x), by=0.05) plot(x, y, xlab="Age", ylab="", main="Penalized Cubic Regression Splines (with two different sets of knots)", cex.main=0.9, cex.axis=0.8) lines(z, predict(fit.c, newdata=data.frame(x=z)), lty=1, col="green", lwd=2) lines(z, predict(fit.c.knots, newdata=data.frame(x=z)), lty=2, col="red", lwd=2) legend("bottomleft", legend=c("GCV equally spaced knots", "GCV random knots"), lty=1:2, col=c("green", "red"), bty="n", lwd=2) ######################################### # (d) Penalized cubic regression spline # # smoothing paramter chosen by min OCV # ######################################### # select the optimal smoothing paramter to minimize OCV sp = seq(0, 0.8, by=0.01) # this seems to be a reasonable range # given fit.c$sp is equal to 0.32 m = length(sp) OCV = GCV = DF = numeric(m) n = length(x)[1] for (i in 1:m) { fit = gam( y ~ s(x, bs="cr", k=10), sp=sp[i] ) DF[i] = sum( influence(fit) ) # trace of hat matrix OCV[i] = mean( fit$residuals^2/(1-influence(fit))^2 ) GCV[i] = mean( fit$residuals^2/(1-DF[i]/n)^2 ) # just to double check (compare with fit.c) } # plot the OCV, GCV scores vs. smoothing parameter respectively # plot(sp, OCV, type="l", xlab="Smoothing Parameter", ylab="Cross-Validation Score", main="Smoothing Parameter Selection for Penalized Cubic Regression Splines", cex.main=0.7, cex.axis=0.7) lines(sp, GCV, lty=2, col="red") sp[which.min(OCV)] # 0.23 sp[which.min(GCV)] # 0.32 - same as fit.c$sp points(sp[which.min(OCV)], min(OCV), pch=20, cex=3) points(sp[which.min(GCV)], min(GCV), col="red", pch=20, cex=3) legend("bottomright", legend=c("OCV", "GCV"), lty=1:2, col=c("black","red")) # Fit the model with the optimal smoothing paramter that minimizes OCV sp[which.min(OCV)] fit.d=gam(y~s(x, bs="cr", k=10), sp=0.23) summary(fit.d) DF[which.min(OCV)] # df=9.0 sum(fit.d$edf) # df=9.0 OCV[which.min(OCV)] # Predictions with asymptotic 95% confidence intervals (pointwise) # pred.c=predict(fit.c, newdata=data.frame(x=c(5, 8)), se.fit=TRUE) pred.c pred.d=predict(fit.d, newdata=data.frame(x=c(5, 8)), se.fit=TRUE) pred.d c(pred.c$fit[1] , pred.c$fit[1] - 1.96*pred.c$se.fit[1], pred.c$fit[1] + 1.96*pred.c$se.fit[1]) c(pred.c$fit[2] , pred.c$fit[2] - 1.96*pred.c$se.fit[2], pred.c$fit[2] + 1.96*pred.c$se.fit[2])