## ----setup,cache=FALSE,echo=FALSE--------------------------------------------- current.file.basename <- "resampling" ## ----common-setup,child='beamer.Rnw'------------------------------------------ ## ----setupforbeamer,cache=FALSE,echo=FALSE,results='hide',message=FALSE------- set.seed(0) datadir <- paste("data-",current.file.basename,sep="") ## full size by default ## beamer sizes: 5.04x3.78 opts_chunk$set(echo=FALSE,cache.path=paste(".cache-",current.file.basename,"/",sep=""), fig.env='center', fig.width=4.5,fig.height=3.4, fig.path=paste("figures-",current.file.basename,"/",sep="")) options(scipen = 5, OutDec=".") beamerblue <- rgb(0.2,0.2,0.7) beamergreen <- rgb(0,0.5,0) beamerblue.alpha <- rgb(0.2,0.2,0.7,0.5) apiacoa.colors <- list("darkblue"="#051F42", "darkorange"="#E84017", "darkgreen"="#3D6B34", "blue"="#003075", "lightblue"="#99BFBB", "darkrose"="#997568") palette(unlist(apiacoa.colors)) to_alpha <- function(color,alpha=0.5) { dacol <- as.vector(col2rgb(color)) rgb(red=dacol[1]/255,green=dacol[2]/255,blue=dacol[3]/255,alpha=alpha) } ## tool for multiple figure printing hook_plot <- knit_hooks$get('plot') # the default hook hook_plot_multi <- function(x, options) { txt = hook_plot(x, options) if (options$fig.cur <= 0) return(txt) # add \only before \includegraphics gsub('(\\\\includegraphics[^}]+} )', sprintf('\\\\only<%d>{\\1}%%', options$fig.cur), txt) } to_single_fig <- function() { knit_hooks$set(plot = hook_plot) } to_multi_fig <- function() { knit_hooks$set(plot = hook_plot_multi) } ## standard packages always used library(xtable) ## ----post-setup--------------------------------------------------------------- myerror <- function(ypred,y) { sum(ypred!=y)/length(y) } ## ----packages,cache=FALSE,echo=FALSE,results='hide',message=FALSE,warning=FALSE---- library(rpart) library(caret) library(class) ## ----bankmarketingLoad,cache=TRUE--------------------------------------------- bankmarketing.full <- read.table("../data/bank-marketing/bank-additional-full.csv",sep=";",header=TRUE,stringsAsFactors=TRUE) bankmarketing.full.y <- which(bankmarketing.full$y=="yes") bankmarketing.subindex <- c(sample(which(bankmarketing.full$y=="no"),length(bankmarketing.full.y)),bankmarketing.full.y) bmbl <- bankmarketing.full[bankmarketing.subindex,] ## ----bankmarketingsplit,cache=TRUE-------------------------------------------- bmbl.train <- createDataPartition(bmbl$y,p=0.7,list=FALSE)[,1] ## ----bankmarketingRpart,cache=TRUE-------------------------------------------- bmbl.ft <- rpart(y~.,data=bmbl,subset=bmbl.train,control=rpart.control(cp=0,minsplit=5)) bmbl.ft.refperf <- max(table(bmbl$y[bmbl.train]))/length(bmbl.train) bmbl.ft.pred <- predict(bmbl.ft,newdata=bmbl,type="class") ## ----bankmarketingRpartperf,fig.height=3-------------------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(bmbl.ft$cptable[,2]+1,bmbl.ft$cptable[,3]*bmbl.ft.refperf,type="l",log="x",xlab="Tree size",ylab="Empirical risk") ## ----bankmarketingRpartTest,cache=TRUE---------------------------------------- bmbl.ft.pred.prune.risk <- matrix(NA,ncol=2,nrow=nrow(bmbl.ft$cptable)) for(i in 1:nrow(bmbl.ft$cptable)) { bmbl.ft.pruned <- prune(bmbl.ft,cp=bmbl.ft$cptable[i,1]) bmbl.ft.pruned.pred <- predict(bmbl.ft.pruned,newdata=bmbl,type="class") bmbl.ft.pred.prune.risk[i,] <- c(myerror(bmbl.ft.pruned.pred[bmbl.train],bmbl$y[bmbl.train]), myerror(bmbl.ft.pruned.pred[-bmbl.train],bmbl$y[-bmbl.train])) } ## ----bankmarketingRpartperfTest,fig.height=3---------------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(bmbl.ft$cptable[,2]+1,bmbl.ft$cptable[,3]*bmbl.ft.refperf,type="l",log="x",xlab="Tree size",ylab="Empirical risk") lines(bmbl.ft$cptable[,2]+1,bmbl.ft.pred.prune.risk[,2],col=2) legend("topright",legend=c("Learning set","Validation/Test set"),col=c(1,2),lwd=1) ## ----bankmarketingRpartperfTest,fig.height=3---------------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(bmbl.ft$cptable[,2]+1,bmbl.ft$cptable[,3]*bmbl.ft.refperf,type="l",log="x",xlab="Tree size",ylab="Empirical risk") lines(bmbl.ft$cptable[,2]+1,bmbl.ft.pred.prune.risk[,2],col=2) legend("topright",legend=c("Learning set","Validation/Test set"),col=c(1,2),lwd=1) ## ----bankmarketingRpartperfTestcv,fig.height=3-------------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(bmbl.ft$cptable[,2]+1,bmbl.ft$cptable[,3]*bmbl.ft.refperf,type="l",log="x",xlab="Tree size",ylab="Empirical risk") lines(bmbl.ft$cptable[,2]+1,bmbl.ft.pred.prune.risk[,2],col=2) lines(bmbl.ft$cptable[,2]+1,bmbl.ft$cptable[,4]*bmbl.ft.refperf,col=3) legend("topright",legend=c("Learning set","Validation/Test set","10 fold cv"),col=c(1,2,3),lwd=1) ## ----bankmarketingRpartManyCV,cache=TRUE-------------------------------------- bmbl.ft.mcv <- matrix(NA,nrow=nrow(bmbl.ft$cptable),ncol=20) for(k in 1:ncol(bmbl.ft.mcv)) { bmbl.ft.tmp <- rpart(y~.,data=bmbl,subset=bmbl.train,control=rpart.control(cp=0,minsplit=5)) bmbl.ft.mcv[,k] <- bmbl.ft.tmp$cptable[,4]*bmbl.ft.refperf } ## ----bankmarketingRpartperfTestcv,fig.height=3-------------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(bmbl.ft$cptable[,2]+1,bmbl.ft$cptable[,3]*bmbl.ft.refperf,type="l",log="x",xlab="Tree size",ylab="Empirical risk") lines(bmbl.ft$cptable[,2]+1,bmbl.ft.pred.prune.risk[,2],col=2) lines(bmbl.ft$cptable[,2]+1,bmbl.ft$cptable[,4]*bmbl.ft.refperf,col=3) legend("topright",legend=c("Learning set","Validation/Test set","10 fold cv"),col=c(1,2,3),lwd=1) ## ----bankmarketingRpartperfTestcvMulti,fig.height=3--------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(bmbl.ft$cptable[,2]+1,bmbl.ft$cptable[,3]*bmbl.ft.refperf,type="l",log="x",xlab="Tree size",ylab="Empirical risk") lines(bmbl.ft$cptable[,2]+1,bmbl.ft.pred.prune.risk[,2],col=2) for(k in 1:ncol(bmbl.ft.mcv)) { lines(bmbl.ft$cptable[,2]+1,bmbl.ft.mcv[,k],col=3) } legend("topright",legend=c("Learning set","Validation/Test set","20 x 10 fold cv"),col=c(1,2,3),lwd=1) ## ----bankmarketingRpartperfTestcvEval,fig.height=3---------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(bmbl.ft$cptable[,2]+1,bmbl.ft$cptable[,3]*bmbl.ft.refperf,type="l",log="x",xlab="Tree size",ylab="Empirical risk") lines(bmbl.ft$cptable[,2]+1,bmbl.ft.pred.prune.risk[,2],col=2) lines(bmbl.ft$cptable[,2]+1,bmbl.ft$cptable[,4]*bmbl.ft.refperf,col=3) lines(bmbl.ft$cptable[,2]+1,(bmbl.ft$cptable[,4]+2*bmbl.ft$cptable[,5])*bmbl.ft.refperf,col=3,lty=2) lines(bmbl.ft$cptable[,2]+1,(bmbl.ft$cptable[,4]-2*bmbl.ft$cptable[,5])*bmbl.ft.refperf,col=3,lty=2) legend("topright",legend=c("Learning set","Validation/Test set","10 fold cv"),col=c(1,2,3),lwd=1)