###################
# R demo 1: DPMM  #
###################
## Note: DPpackage has been archived on CRAN, so install.packages() may fail;
## it may need to be installed from the CRAN archive.
install.packages("DPpackage")
library("DPpackage")

## The mixture density is estimated below with the DPpackage function DPdensity().

## Simulate data: a two-component normal mixture
## (600 observations from N(0,1), 400 from N(5, 1.5^2))
N1 <- rnorm(1000, 0, 1)
N2 <- rnorm(1000, 5, 1.5)
g  <- sample(c(rep(0, 400), rep(1, 600)))
y  <- g*N1 + (1-g)*N2
hist(y, prob=TRUE)
n <- length(y)

## Most of this is straight from the DPpackage manual page.
##### Set up parameters for the calls to DPdensity() below:
state <- NULL    ## initial state
mcmc1 <- list(nburn=0, nsave=1,    nskip=0, ndisplay=1)
mcmc2 <- list(nburn=0, nsave=10,   nskip=0, ndisplay=1)
mcmc3 <- list(nburn=0, nsave=100,  nskip=0, ndisplay=10)
mcmc4 <- list(nburn=0, nsave=1000, nskip=0, ndisplay=100)
prior0 <- list(alpha=1, m1=0, k0=1, psiinv1=1, nu1=1)

fit1 <- DPdensity(y=y, prior=prior0, mcmc=mcmc1, state=state, status=TRUE)
fit2 <- DPdensity(y=y, prior=prior0, mcmc=mcmc2, state=state, status=TRUE)
fit3 <- DPdensity(y=y, prior=prior0, mcmc=mcmc3, state=state, status=TRUE)
fit4 <- DPdensity(y=y, prior=prior0, mcmc=mcmc4, state=state, status=TRUE)

### Results: density estimates
plot(fit1, ask=FALSE)
plot(fit2, ask=FALSE)
plot(fit3, ask=FALSE)
plot(fit4, ask=FALSE)

## Plot the number of clusters
plot(fit4, ask=FALSE, output="param", param="ncluster", nfigr=1, nfigc=2)
# The number of clusters is sensitive to the prior specification,
# but the resulting density estimate is fine.

#############################################
# R demo 2: Spline with multiple predictors #
#############################################
# Install packages
install.packages("gamair")
install.packages("mgcv")

# Grab the data
library(gamair)
data(chicago)
dim(chicago)
head(chicago)

library(mgcv)

###############################################################
# Model 1: Penalized cubic regression spline of time,         #
# linear trends for the other variables (Poisson count model) #
###############################################################
apo <- gam(death ~ s(time, bs="cr", k=200) + pm10median + so2median +
             o3median + tmpd,
           data=chicago, family="poisson")
# k is the basis dimension; it acts as an upper limit on the effective
# degrees of freedom associated with the smooth.

# Model 1 output
summary(apo)
plot(apo)

#######################################################
# Model 2: Penalized cubic regression splines of time #
# and temperature, respectively                       #
#######################################################
apo2 <- gam(death ~ s(time, bs="cr", k=200) + pm10median + so2median +
              o3median + s(tmpd, bs="cr", k=100),
            data=chicago, family="poisson")

# Model 2 output
summary(apo2)
plot(apo2)

#############################################
# Model 3: Fit Thin Plate Regression Spline #
#############################################
## Note: lasso2 has also been archived on CRAN and may need to be
## installed from the CRAN archive.
library(lasso2)
data(Prostate)
attach(Prostate)
gammod <- gam(lpsa ~ s(lcavol, lweight, bs="tp") + s(x=age, k=7, bs="cr") +
                s(x=lbph, k=7, bs="cr") + s(x=lcp, k=7, bs="cr") +
                svi + gleason + s(x=pgg45, k=7, bs="cr"),
              method="GCV.Cp")
?smooth.terms

# Plot the fitted thin plate regression spline
plot(gammod, select=1, pers=TRUE, xlab="log(can vol)", ylab="log(weight)",
     main="Fitted Thin Plate Regression Spline", theta=35, phi=25)
# Plot all smoothers
plot(gammod, pers=TRUE, xlab="log(can vol)", ylab="log(weight)",
     main="Fitted Thin Plate Regression Spline", theta=35, phi=25)

#####################################
# Model 4: Fit Tensor Product basis #
#####################################
gammod.tensor <- gam(lpsa ~ te(lcavol, lweight, k=c(6,6)) + s(x=age, k=7, bs="cr") +
                       s(x=lbph, k=7, bs="cr") + s(x=lcp, k=7, bs="cr") +
                       svi + gleason + s(x=pgg45, k=7, bs="cr"),
                     method="GCV.Cp")

# Plot the tensor product spline
plot(gammod.tensor, select=1, pers=TRUE, xlab="log(can vol)", ylab="log(weight)",
     main="Fitted Tensor Product Spline", theta=35, phi=25)
# Plot all splines
plot(gammod.tensor, pers=TRUE, xlab="log(can vol)", ylab="log(weight)",
     main="Fitted Tensor Product Spline", theta=35, phi=25)

## Extract the basis? -- see the sketch below ##
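## A minimal sketch of one way to extract the basis, added to answer the question
## above (an assumption about what "extract basis" means; not part of the original
## demo): in mgcv, predict(..., type="lpmatrix") returns the design matrix whose
## columns are the evaluated basis functions of every term in the model.
Xp <- predict(gammod.tensor, type="lpmatrix")        # n x p design matrix
dim(Xp)
# Columns belonging to the tensor product term can be picked out by name
# (column labels look like "te(lcavol,lweight).1", "te(lcavol,lweight).2", ...):
te.cols <- grep("te(lcavol,lweight)", colnames(Xp), fixed=TRUE)
B.te <- Xp[, te.cols]                                # basis for the te() smooth
# Fitted contribution of the te() term = basis %*% its coefficients
head(B.te %*% coef(gammod.tensor)[te.cols])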
vol)",ylab="log(weight)", main="Fitted Tensor Product Spline",theta=35,phi=25) # Plot all splines plot(gammod.tensor, pers=T,xlab="log(can vol)",ylab="log(weight)", main="Fitted Tensor Product Spline",theta=35,phi=25) ## ?? Extract basis ## ######################################### # R Demo 3: Classification and Regression Trees # ######################################### # Load the data library(lasso2) data(Prostate) # Split data into training and test sets n = nrow(Prostate) set.seed(527) trainIdct = sample(1:n, size=round(n*2/3)) traindat = Prostate[trainIdct,] testdat = Prostate[-trainIdct,] library(rpart) # Unpruned tree unpruned <- rpart(lpsa~., data=traindat,method="anova", control=list(minsplit=5,minbucket=3,cp=.001)) # Plot unpruned tree plot(unpruned,margin=.07,uniform=T) text(unpruned,use.n=TRUE,cex=.7) # List all optimal trees given a complexity parameter printcp(unpruned) unpruned$cptable # Option (1) Select the complexity prarameter by minimizing the 10-fold cv error id = which.min(unpruned$cptable[,"xerror"]) optimal.cp = unpruned$cptable[id,"CP"] pruned <- prune(unpruned, cp = optimal.cp) printcp(pruned) # Option (2) Select the complexity prarameter by minimizing the 10-fold cv error with 1 SE Rule # CV plot plotcp(unpruned) optimal.cp = unpruned$cptable[3,"CP"] pruned <- prune(unpruned, cp = optimal.cp) printcp(pruned) ##### # plot Full tree par(mfrow=c(1,2)) plot(unpruned, margin=.07,uniform=T) text(unpruned, use.n=TRUE,cex=0.5) # plot Pruned tree plot(pruned, margin=.07,uniform=T) text(pruned, use.n=TRUE,cex=1) ###### # Prediction by the unpruned tree testdat <- as.data.frame(testdat) pred.unpruned <- predict(unpruned, newdata=testdat) test.error.unpruned <- mean((pred.unpruned-testdat[,9])^2) test.error.unpruned # Prediction by the pruned tree testdat <- as.data.frame(testdat) pred.pruned <- predict(pruned, newdata=testdat) test.error.pruned <- mean((pred.pruned-testdat[,9])^2) test.error.pruned