## ----setup,cache=FALSE,echo=FALSE,results='hide',message=FALSE---------------- opts_chunk$set(echo=FALSE,fig.pos='htbp',fig.align='center',fig.env='center', message=FALSE, fig.width=4.5,fig.height=3.4, cache.path=".cache-desequilibre/", fig.path="figures-desequilibre/") set.seed(0) opts_chunk$set(echo=FALSE) beamerblue <- rgb(0.2,0.2,0.7) beamerblue.alpha <- rgb(0.2,0.2,0.7,0.5) beamergreen <- rgb(0,0.5,0) hook_plot <- knit_hooks$get('plot') # the default hook hook_plot_multi <- function(x, options) { txt = hook_plot(x, options) if (options$fig.cur <= 0) return(txt) # add \only before \includegraphics gsub('(\\\\includegraphics[^}]+} )', sprintf('\\\\only<%d>{\\1}%%', options$fig.cur), txt) } to_single_fig <- function() { knit_hooks$set(plot = hook_plot) } to_multi_fig <- function() { knit_hooks$set(plot = hook_plot_multi) } options(scipen = 5, OutDec=".") beamerblue <- rgb(0.2,0.2,0.7) beamerblue.alpha <- rgb(0.2,0.2,0.7,0.5) apiacoa.colors <- list("darkblue"="#051F42", "darkorange"="#E84017", "darkgreen"="#3D6B34", "blue"="#003075", "lightblue"="#99BFBB", "darkrose"="#997568") palette(unlist(apiacoa.colors)) alpha.palette <- function(col,nb) { basecol <- col2rgb(col)/256 rgb(basecol[1],basecol[2],basecol[3],alpha=seq(0,1,length.out=nb)) } ## ----library,cache=FALSE,message=FALSE---------------------------------------- library(proxy) library(foreach) library(doMC) registerDoMC() library(caret) library(pROC) my.auc <- function(auc) { as.numeric(auc$auc) } library(xtable) library(DMwR) library(ROSE) library(e1071) library(randomForest) library(xgboost) library(boot) do.full.glm <- function(data.learn,data.test,thresholds=c(0.5),weights=NULL) { if(is.null(weights)) { model <- glm(class~.,data=data.learn,family=binomial()) } else { model <- glm(class~.,data=data.learn,family=binomial(),weights=weights) } pred.learn <- predict(model,data.learn,type="response") pred.test <- predict(model,data.test,type="response") roc.learn <- roc(data.learn$class,pred.learn) roc.test <- roc(data.test$class,pred.test) results.learn <- matrix(data.learn$class,nrow=nrow(data.learn),ncol=length(thresholds)) results.test <- matrix(data.test$class,nrow=nrow(data.test),ncol=length(thresholds)) for(t in seq_along(thresholds)) { results.learn[,t] <- levels(data.learn$class)[as.integer(pred.learn>=thresholds[t])+1] results.test[,t] <- levels(data.test$class)[as.integer(pred.test>=thresholds[t])+1] } list(model=model,thresholds=thresholds, learn=list(data=data.learn,pred=pred.learn,roc=roc.learn,results=results.learn), test=list(data=data.test,pred=pred.test,roc=roc.test,results=results.test)) } confusion <- function(full.result,tindex=1) { list(learn=table(full.result$learn$results[,tindex],full.result$learn$data$class), test=table(full.result$test$results[,tindex],full.result$test$data$class)) } do.full.svm <- function(data.learn,data.test,baseweights=NULL) { cost <- 10^seq(-3,1,length.out=21) if(!is.null(baseweights)) { class.weights <- list("0"=baseweights[1],"1"=baseweights[2]) } full.result <- foreach(c=cost) %dopar% { if(is.null(baseweights)) { model <- svm(class~.,data=data.learn,kernel="linear",cost=c) } else { model <- svm(class~.,data=data.learn,kernel="linear",cost=c,class.weights=class.weights) } list(learn=model$fitted,test=predict(model,data.test)) } if(is.null(baseweights)) { baseweights <- c(1,1) } tp.learn <- rep(NA,length(cost)) tp.test <- rep(NA,length(cost)) tn.learn <- rep(NA,length(cost)) tn.test <- rep(NA,length(cost)) c.learn <- table(data.learn$class) c.test <- table(data.test$class) perf.learn <- rep(NA,length(cost)) perf.test <- rep(NA,length(cost)) for(ci in seq_along(cost)) { tp.learn[ci] <- sum(full.result[[ci]]$learn=="1" & data.learn$class=="1")/c.learn[2] tp.test[ci] <- sum(full.result[[ci]]$test=="1" & data.test$class=="1")/c.test[2] tn.learn[ci] <- sum(full.result[[ci]]$learn=="0" & data.learn$class=="0")/c.learn[1] tn.test[ci] <- sum(full.result[[ci]]$test=="0" & data.test$class=="0")/c.test[1] perf.learn[ci] <- (sum(full.result[[ci]]$learn=="1" & data.learn$class=="0")*baseweights[1]+sum(full.result[[ci]]$learn=="0" & data.learn$class=="1")*baseweights[2])/nrow(data.learn) perf.test[ci] <- (sum(full.result[[ci]]$test=="1" & data.test$class=="0")*baseweights[1]+sum(full.result[[ci]]$test=="0" & data.test$class=="1")*baseweights[2])/nrow(data.test) } list(full.result=full.result,learn=list(tp=tp.learn,tn=tn.learn,perf=perf.learn),test=list(tp=tp.test,tn=tn.test,perf=perf.test)) } do.svm.full.cv <- function(data.learn,data.test,folds,baseweights=NULL,balencing=NULL) { K <- max(folds) cost <- 10^seq(-3,1,length.out=21) class.weights <- NULL if(!is.null(baseweights)) { class.weights <- list("0"=baseweights[1],"1"=baseweights[2]) } if(is.null(baseweights)) { baseweights <- c(1,1) } cv <- foreach(c=cost,.combine=cbind) %:% foreach(k=1:K,.combine=c) %dopar% { if(!is.null(balencing)) { if(balencing=="rose") { data.blearn <- ROSE(class~.,data=data.learn[folds!=k,])$data } else { data.blearn <- SMOTE(class~.,data=data.learn[folds!=k,]) } model <- svm(class~.,data=data.blearn,kernel="linear",cost=c) } else { if(is.null(class.weights)) { model <- svm(class~.,data=data.learn[folds!=k,],kernel="linear",cost=c) } else { model <- svm(class~.,data=data.learn[folds!=k,],kernel="linear",cost=c,class.weights=class.weights) } } data.class.fold <- data.learn$class[folds==k] fold.pred <- predict(model,data.learn[folds==k,]) sum(fold.pred=="1" & data.class.fold=="0")*baseweights[1]+sum(fold.pred=="0" & data.class.fold=="1")*baseweights[2] } cv.scores <- apply(cv,2,sum)/nrow(data.learn) best.cost <- which.min(cv.scores) if(!is.null(balencing)) { if(balencing=="rose") { data.blearn <- ROSE(class~.,data=data.learn)$data } else { data.blearn <- SMOTE(class~.,data=data.learn) } model <- svm(class~.,data=data.blearn,kernel="linear",cost=cost[best.cost]) } else { if(is.null(class.weights)) { model <- svm(class~.,data=data.learn,kernel="linear",cost=cost[best.cost]) } else { model <- svm(class~.,data=data.learn,kernel="linear",cost=cost[best.cost],class.weights=class.weights) } } learn.pred <- predict(model,data.learn) c.learn <- table(data.learn$class) c.test <- table(data.test$class) model.test <- predict(model,data.test) tp.learn <- sum(learn.pred=="1" & data.learn$class=="1")/c.learn[2] tp.test <- sum(model.test=="1" & data.test$class=="1")/c.test[2] tn.learn <- sum(learn.pred=="0" & data.learn$class=="0")/c.learn[1] tn.test <- sum(model.test=="0" & data.test$class=="0")/c.test[1] perf.learn <- (sum(learn.pred=="1" & data.learn$class=="0")*baseweights[1]+sum(learn.pred=="0" & data.learn$class=="1")*baseweights[2])/nrow(data.learn) perf.test <- (sum(model.test=="1" & data.test$class=="0")*baseweights[1]+sum(model.test=="0" & data.test$class=="1")*baseweights[2])/nrow(data.test) list(model=model,cv.scores=cv.scores,best.cost=best.cost,learn=list(tp=tp.learn,tn=tn.learn,perf=perf.learn),test=list(tp=tp.test,tn=tn.test,perf=perf.test)) } ## ----coilDataLoad,cache=TRUE-------------------------------------------------- coil.data <- read.table("../data/coil2000/ticdata2000.txt",header=FALSE,sep="\t") coil.data.test <- read.table("../data/coil2000/ticeval2000.txt",header=FALSE,sep="\t") coil.data.test$class <- read.table("../data/coil2000/tictgts2000.txt",header=FALSE,sep="\t")[[1]] for(v in c(1,5,86)) { coil.data[[v]] <- as.factor(coil.data[[v]]) coil.data.test[[v]] <- as.factor(coil.data.test[[v]]) } names(coil.data)[86] <- "class" unbal <- sum(coil.data$class==1)/nrow(coil.data) ## ----coilDataLogistic,cache=TRUE,warning=FALSE-------------------------------- coilRatio <- sum(coil.data$class==0)/sum(coil.data$class==1) coil.data.glm <- do.full.glm(coil.data,coil.data.test,c(0.5,1/coilRatio)) coil.data.glm.conf <- confusion(coil.data.glm) coil.data.glm.conf.t <- confusion(coil.data.glm,2) ## ----RocStats----------------------------------------------------------------- myrate <- function(target,pred) { c(sum(target==0 & pred==0)/sum(target==0),sum(target==1 & pred==1)/sum(target==1)) } rate.base <- myrate(coil.data$class,coil.data.glm$learn$results[,1]) rate.pw <- myrate(coil.data$class,coil.data.glm$learn$results[,2]) rate.pw.test <- myrate(coil.data.test$class,coil.data.glm$test$results[,2]) ## ----coilDataLogisticWeighted,cache=TRUE,warning=FALSE------------------------ base.weights <- nrow(coil.data)/table(coil.data$class) coil.data.wglm <- do.full.glm(coil.data,coil.data.test,weights=base.weights[as.integer(coil.data$class)]) coil.data.wglm.conf <- confusion(coil.data.wglm) ## ----RocStatsW---------------------------------------------------------------- rate.w <- myrate(coil.data$class,coil.data.wglm$learn$results[,1]) rate.w.test <- myrate(coil.data.test$class,coil.data.wglm$test$results[,1]) ## ----results='asis'----------------------------------------------------------- print(xtable(coil.data.glm.conf[["learn"]]),floating=FALSE) ## ----results='asis'----------------------------------------------------------- print(xtable(coil.data.glm.conf[["test"]]),floating=FALSE) ## ----------------------------------------------------------------------------- allrates <- function(confM) { n <- sum(confM) pre.results <- list(accuracy=sum(diag(confM))/n, sensitivity = confM[2,2]/(confM[1,2]+confM[2,2]), specificity = confM[1,1]/(confM[1,1]+confM[2,1]), precision = confM[2,2]/(confM[2,1]+confM[2,2]), npv = confM[1,1]/(confM[1,1]+confM[1,2])) pre.results$f1score <- 2*(pre.results$precision*pre.results$sensitivity)/(pre.results$precision+pre.results$sensitivity) pre.results$ba <- 0.5*(pre.results$sensitivity+pre.results$specificity) pre.results } stupid.classifier <- function(learn, test) { dominant <- levels(learn)[which.max(table(learn))] fake.pred <- factor(rep(dominant,length(learn)),levels=levels(learn)) fake.test.pred <- factor(rep(dominant,length(test)),levels=levels(learn)) list(learn=table(fake.pred,learn),test=table(fake.test.pred,test)) } coil.data.stupid <- stupid.classifier(coil.data$class,coil.data.test$class) ## ----CoilRates---------------------------------------------------------------- coil.glm.learn.rate <- allrates(coil.data.glm.conf$learn) coil.glm.test.rate <- allrates(coil.data.glm.conf$test) coil.wglm.learn.rate <- allrates(coil.data.wglm.conf$learn) coil.wglm.test.rate <- allrates(coil.data.wglm.conf$test) coil.stupid.learn.rate <- allrates(coil.data.stupid$learn) coil.stupid.test.rate <- allrates(coil.data.stupid$test) ## ----results='asis'----------------------------------------------------------- print(xtable(coil.data.glm.conf[["learn"]]),floating=FALSE) ## ----results='asis'----------------------------------------------------------- print(xtable(coil.data.glm.conf[["test"]]),floating=FALSE) ## ----results='asis'----------------------------------------------------------- print(xtable(coil.data.stupid[["learn"]]),floating=FALSE) ## ----results='asis'----------------------------------------------------------- print(xtable(coil.data.stupid[["test"]]),floating=FALSE) ## ----results='asis'----------------------------------------------------------- print(xtable(coil.data.wglm.conf[["learn"]]),floating=FALSE) ## ----results='asis'----------------------------------------------------------- print(xtable(coil.data.wglm.conf[["test"]]),floating=FALSE) ## ----results='asis'----------------------------------------------------------- print(xtable(coil.data.wglm.conf[["learn"]]),floating=FALSE) ## ----results='asis'----------------------------------------------------------- print(xtable(coil.data.wglm.conf[["test"]]),floating=FALSE) ## ----results='asis'----------------------------------------------------------- print(xtable(coil.data.glm.conf.t[["learn"]]),floating=FALSE) ## ----results='asis'----------------------------------------------------------- print(xtable(coil.data.glm.conf.t[["test"]]),floating=FALSE) ## ----results='asis'----------------------------------------------------------- print(xtable(coil.data.wglm.conf[["learn"]]),floating=FALSE) ## ----results='asis'----------------------------------------------------------- print(xtable(coil.data.wglm.conf[["test"]]),floating=FALSE) ## ----simulatedBasic,cache=TRUE------------------------------------------------ base.data.c0 <- sweep(matrix(rnorm(400),ncol=2),2,c(0,1),"+") base.data.c1 <- sweep(matrix(rnorm(400),ncol=2),2,c(0,1),"-") base.data.simulated <- data.frame(X=rbind(base.data.c0,base.data.c1),Y=c(rep(0,nrow(base.data.c0)),rep(1,nrow(base.data.c1)))) base.data.simulated$Y <- as.factor(base.data.simulated$Y) bds.glm <- glm(Y~.,data=base.data.simulated[1:220,],family=binomial()) bds.glm$intercept <- -coef(bds.glm)[1]/coef(bds.glm)[3] bds.glm$slope <- -coef(bds.glm)[2]/coef(bds.glm)[3] tmp <- predict(bds.glm,type="response") bds.glm$support <- tmp*(1-tmp) ## ----simulatedBasicFig,fig.width=2,fig.height=2.8----------------------------- par(mar=c(2,2,0.5,0.5),cex=0.4) plot(base.data.simulated[,1:2],col=as.integer(base.data.simulated$Y),pch=20,asp=1) ## ----simulatedBasicFigTrue,fig.width=2,fig.height=2.8------------------------- par(mar=c(2,2,0.5,0.5),cex=0.4) plot(base.data.simulated[1:220,1:2],col=as.integer(base.data.simulated$Y[1:220]),pch=20, ylim=range(base.data.simulated[,2]),xlim=range(base.data.simulated[,1]),asp=1) ## ----simulatedBasicWeighted,cache=TRUE---------------------------------------- bds.weighted.glm <- glm(Y~.,data=base.data.simulated[1:220,],family=binomial(),weights=c(rep(1,200),rep(10,20))) bds.weighted.glm$intercept <- -coef(bds.weighted.glm)[1]/coef(bds.weighted.glm)[3] bds.weighted.glm$slope <- -coef(bds.weighted.glm)[2]/coef(bds.weighted.glm)[3] tmp <- predict(bds.weighted.glm,type="response") bds.weighted.glm$support <- tmp*(1-tmp) ## ----simulatedBasicFigTrueBaseDecision,fig.width=4,fig.height=3--------------- par(mar=c(2,2,2.5,0.5),cex=0.4) plot(base.data.simulated[1:220,1:2],col=as.integer(base.data.simulated$Y[1:220]),pch=20, ylim=range(base.data.simulated[,2]),xlim=range(base.data.simulated[,1]),asp=1,main="Seuil") abline(a=bds.glm$intercept,b=bds.glm$slope,col=1) legend("topright",c("seuil=0.5"),lwd=1,col=1) ## ----simulatedBasicFigTrueAllDecisions,fig.width=4,fig.height=3--------------- par(mar=c(2,2,2.5,0.5),cex=0.4) plot(base.data.simulated[1:220,1:2],col=as.integer(base.data.simulated$Y[1:220]),pch=20, ylim=range(base.data.simulated[,2]),xlim=range(base.data.simulated[,1]),asp=1,main="Seuil") abline(a=bds.glm$intercept,b=bds.glm$slope,col=1) abline(a=bds.glm$intercept+logit(1/10)/coef(bds.glm)[3],b=bds.glm$slope,col=2) legend("topright",c("seuil=0.5","seuil=0.1"),lwd=1,col=1:2) ## ----simulatedBasicFigTrueAllDecisionsFull,fig.width=4,fig.height=3----------- par(mar=c(2,2,2.5,0.5),cex=0.4) plot(base.data.simulated[1:220,1:2],col=as.integer(base.data.simulated$Y[1:220]),pch=20, ylim=range(base.data.simulated[,2]),xlim=range(base.data.simulated[,1]),asp=1,main="Seuil") abline(a=bds.glm$intercept,b=bds.glm$slope,col=1) abline(a=bds.glm$intercept+logit(1/10)/coef(bds.glm)[3],b=bds.glm$slope,col=2) abline(a=0,b=0,col=3) legend("topright",c("seuil=0.5","seuil=0.1","théorique"),lwd=1,col=1:3) ## ----simulatedBasicFigTrueAllDecisionsFullWeight,fig.width=4,fig.height=3----- par(mar=c(2,2,2.5,0.5),cex=0.4) plot(base.data.simulated[1:220,1:2],col=as.integer(base.data.simulated$Y[1:220]),pch=20, ylim=range(base.data.simulated[,2]),xlim=range(base.data.simulated[,1]),asp=1,main="Pondération") abline(a=bds.glm$intercept,b=bds.glm$slope,col=1) abline(a=bds.glm$intercept+logit(1/10)/coef(bds.glm)[3],b=bds.glm$slope,col=2) abline(a=0,b=0,col=3) abline(a=bds.weighted.glm$intercept,b=bds.weighted.glm$slope,col=4) legend("topright",c("seuil=0.5","seuil=0.1","théorique","0.5 poids"),lwd=1,col=1:4) ## ----------------------------------------------------------------------------- myonecols <- alpha.palette(palette()[1],100) mytwocols <- alpha.palette(palette()[2],100) ## ----simulatedBasicConfusion,fig.width=4,fig.height=3------------------------- par(mar=c(2,2,2.5,0.5),cex=0.4) the.vals <- cut(bds.glm$support,breaks=seq(0,.25,length.out=100),include.lowest=TRUE,labels=FALSE) plot(base.data.simulated[1:220,1:2],col=c(myonecols[the.vals[1:200]],mytwocols[the.vals[201:220]]),pch=20, ylim=range(base.data.simulated[,2]),xlim=range(base.data.simulated[,1]),asp=1,main="Importance") abline(a=bds.glm$intercept,b=bds.glm$slope,col=1) abline(a=bds.glm$intercept+logit(1/10)/coef(bds.glm)[3],b=bds.glm$slope,col=3) ## ----simulatedBasicConfusionWeighted,fig.width=4,fig.height=3----------------- par(mar=c(2,2,2.5,0.5),cex=0.4) the.vals <- cut(bds.weighted.glm$support,breaks=seq(0,.25,length.out=100),include.lowest=TRUE,labels=FALSE) plot(base.data.simulated[1:220,1:2],col=c(myonecols[the.vals[1:200]],mytwocols[the.vals[201:220]]),pch=20, ylim=range(base.data.simulated[,2]),xlim=range(base.data.simulated[,1]),asp=1,main="Importance") abline(a=bds.weighted.glm$intercept,b=bds.weighted.glm$slope,col=3) ## ----coilDataglmRoc,fig.width=3.25,fig.height=3.25---------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(coil.data.glm$learn$roc,lwd=1) lines(coil.data.glm$test$roc,col=2,lwd=1) points(rbind(rate.base,rate.pw,rate.pw.test),col=c(1,1,2),pch=20) text(rbind(rate.base,rate.pw,rate.pw.test),labels=c("0.5",round(1/coilRatio,4),round(1/coilRatio,4)),col=c(1,1,2),pos=4) legend("bottomright",legend=c("apprentissage","test"),cex=0.9,col=c(1,2),lwd=1) ## ----coilDatawglmRoc,fig.width=3.25,fig.height=3.25--------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(coil.data.wglm$learn$roc,lwd=1) lines(coil.data.wglm$test$roc,col=2,lwd=1) lines(coil.data.glm$learn$roc,col=1,lty=2,lwd=1) lines(coil.data.glm$test$roc,col=2,lty=2,lwd=1) points(rbind(rate.pw,rate.w,rate.pw.test,rate.w.test),col=c(1,1,2,2),pch=20) legend("bottomright",legend=c("apprentissage","test"),cex=0.9,col=c(1,2),lwd=1) ## ----coilDataLogisticSub,cache=TRUE,warning=FALSE----------------------------- positive <- which(coil.data$class==1) negative <- sample((1:nrow(coil.data))[-positive],length(positive)) coil.data.subglm <- do.full.glm(coil.data[c(positive,negative),-1],coil.data.test) coil.data.subglm.confusion <- confusion(coil.data.subglm) ## ----results='asis'----------------------------------------------------------- print(xtable(coil.data.subglm.confusion$learn),floating=FALSE) ## ----results='asis'----------------------------------------------------------- print(xtable(coil.data.subglm.confusion$test),floating=FALSE) ## ----coilDataLogisticNoOne,cache=TRUE,warning=FALSE--------------------------- coil.data.glmn1 <- do.full.glm(coil.data[,-1],coil.data.test) coil.data.wglmn1 <- do.full.glm(coil.data[,-1],coil.data.test,weights=base.weights[as.integer(coil.data$class)]) ## ----------------------------------------------------------------------------- to_multi_fig() ## ----rocsub,fig.width=4.5,fig.height=3.25,fig.align='center',fig.show='hold', fig.keep='all',warning=FALSE,cache=TRUE---- positive <- which(coil.data$class==1) for(i in 1:10) { negative <- sample((1:nrow(coil.data))[-positive],i*length(positive)) coil.data.subglmtmp <- glm(class~.,data=coil.data[c(positive,negative),-1],family=binomial()) coil.data.subglmtmp.pred <- data.frame(class=coil.data$class,prob=predict(coil.data.subglmtmp,coil.data,type="response")) coil.data.test.subglmtmp.pred <- data.frame(class=coil.data.test$class,prob=predict(coil.data.subglmtmp,coil.data.test,type="response")) coil.data.subglmtmp.pred.roc <- roc(class~prob,data=coil.data.subglmtmp.pred) coil.data.test.subglmtmp.pred.roc <- roc(class~prob,data=coil.data.test.subglmtmp.pred) layout(mat=matrix(1:2,ncol=2),widths=c(7,3)) par(mar=c(4,4,0.1,0.1),cex=0.5) plot(coil.data.subglmtmp.pred.roc,lwd=1,main=paste("k=",i,sep="")) lines(coil.data.test.subglmtmp.pred.roc,col=2,lwd=1) lines(coil.data.glmn1$learn$roc,col=1,lty=2,lwd=1) lines(coil.data.glmn1$test$roc,col=2,lty=2,lwd=1) legend("bottomright",legend=c("apprentissage","test"),cex=0.9,col=c(1,2),lwd=1) par(mar=rep(0,4),cex=0.5) plot(NA,type="n",axes=FALSE,xlim=c(0,1),ylim=c(0,15),xlab="",ylab="") text(0,11,"AUC référence",pos=4) text(0,10,paste("Apprentissage :",round(my.auc(coil.data.glmn1$learn$roc),4)),pos=4) text(0,9,paste("Test :",round(my.auc(coil.data.glmn1$test$roc),4)),pos=4,col=2) text(0,7,"AUC échantillonnage",pos=4) text(0,6,paste("Apprentissage :",round(my.auc(coil.data.subglmtmp.pred.roc),4)),pos=4) text(0,5,paste("Test :",round(my.auc(coil.data.test.subglmtmp.pred.roc),4)),pos=4,col=2) } ## ----rocsubweight,fig.width=4.5,fig.height=3.25,fig.align='center',fig.show='hold', fig.keep='all',warning=FALSE,cache=TRUE---- positive <- which(coil.data$class==1) for(i in 1:10) { negative <- sample((1:nrow(coil.data))[-positive],i*length(positive)) coil.data.subglmtmp <- glm(class~.,data=coil.data[c(positive,negative),-1],family=binomial(),weights=c(rep(i,length(positive)),rep(1,length(negative)))) coil.data.subglmtmp.pred <- data.frame(class=coil.data$class,prob=predict(coil.data.subglmtmp,coil.data,type="response")) coil.data.test.subglmtmp.pred <- data.frame(class=coil.data.test$class,prob=predict(coil.data.subglmtmp,coil.data.test,type="response")) coil.data.subglmtmp.pred.roc <- roc(class~prob,data=coil.data.subglmtmp.pred) coil.data.test.subglmtmp.pred.roc <- roc(class~prob,data=coil.data.test.subglmtmp.pred) layout(mat=matrix(1:2,ncol=2),widths=c(7,3)) par(mar=c(4,4,0.1,0.1),cex=0.5) plot(coil.data.subglmtmp.pred.roc,lwd=1,main=paste("k=",i,sep="")) lines(coil.data.test.subglmtmp.pred.roc,col=2,lwd=1) lines(coil.data.wglmn1$learn$roc,col=1,lty=2,lwd=1) lines(coil.data.wglmn1$test$roc,col=2,lty=2,lwd=1) legend("bottomright",legend=c("apprentissage","test"),cex=0.9,col=c(1,2),lwd=1) par(mar=rep(0,4),cex=0.5) plot(NA,type="n",axes=FALSE,xlim=c(0,1),ylim=c(0,15),xlab="",ylab="") text(0,11,"AUC référence",pos=4) text(0,10,paste("Apprentissage :",round(my.auc(coil.data.wglmn1$learn$roc),4)),pos=4) text(0,9,paste("Test :",round(my.auc(coil.data.wglmn1$test$roc),4)),pos=4,col=2) text(0,7,"AUC échantillonnage",pos=4) text(0,6,paste("Apprentissage :",round(my.auc(coil.data.subglmtmp.pred.roc),4)),pos=4) text(0,5,paste("Test :",round(my.auc(coil.data.test.subglmtmp.pred.roc),4)),pos=4,col=2) } ## ----------------------------------------------------------------------------- to_single_fig() ## ----simulatedSMOTE,cache=TRUE,warning=FALSE---------------------------------- base.data.simulated.smote <- SMOTE(Y~.,data=base.data.simulated[1:220,]) bds.smote.glm <- glm(Y~.,data=base.data.simulated.smote,family=binomial()) bds.smote.glm$intercept <- -coef(bds.smote.glm)[1]/coef(bds.smote.glm)[3] bds.smote.glm$slope <- -coef(bds.smote.glm)[2]/coef(bds.smote.glm)[3] bds.smote.weights <- 1/table(base.data.simulated.smote$Y) bds.smote.proba <- (1/bds.smote.weights[2])/sum(1/bds.smote.weights) bds.smote.weighted.glm <- glm(Y~.,data=base.data.simulated.smote,family=binomial(),weights=bds.smote.weights[as.integer(base.data.simulated.smote$Y)]) bds.smote.weighted.glm$intercept <- -coef(bds.smote.weighted.glm)[1]/coef(bds.smote.weighted.glm)[3] bds.smote.weighted.glm$slope <- -coef(bds.smote.weighted.glm)[2]/coef(bds.smote.weighted.glm)[3] ## ----CoilSmote,cache=TRUE,warning=FALSE--------------------------------------- coil.data.smoted <- SMOTE(class~.,data=coil.data) coil.data.smoted.glm <- do.full.glm(coil.data.smoted[,-1],coil.data.test[,-1]) ## ----simulatedBasicFigOrig,fig.width=4,fig.height=3--------------------------- par(mar=c(2,2,2.5,0.5),cex=0.4) plot(base.data.simulated[1:220,1:2],col=as.integer(base.data.simulated$Y[1:220]),pch=20, ylim=range(base.data.simulated[,2]),xlim=range(base.data.simulated[,1]),asp=1,main="Données d'origine") ## ----simulatedBasicFigSmoted,fig.width=4,fig.height=3------------------------- par(mar=c(2,2,2.5,0.5),cex=0.4) plot(base.data.simulated.smote[,1:2],col=as.integer(base.data.simulated.smote$Y),pch=20, ylim=range(base.data.simulated[,2]),xlim=range(base.data.simulated[,1]),asp=1,main="Données SMOTE") ## ----simulatedBasicFigSmoteTrueBaseDecision,fig.width=4,fig.height=3---------- par(mar=c(2,2,2.5,0.5),cex=0.4) plot(base.data.simulated[1:220,1:2],col=as.integer(base.data.simulated$Y[1:220]),pch=20, ylim=range(base.data.simulated[,2]),xlim=range(base.data.simulated[,1]),asp=1,main="SMOTE + seuil") abline(a=bds.smote.glm$intercept,b=bds.smote.glm$slope,col=1) legend("topleft",c("seuil=0.5"),lwd=1,col=1) ## ----simulatedBasicFigSmoteTrueAllDecisions,fig.width=4,fig.height=3---------- par(mar=c(2,2,2.5,0.5),cex=0.4) plot(base.data.simulated[1:220,1:2],col=as.integer(base.data.simulated$Y[1:220]),pch=20, ylim=range(base.data.simulated[,2]),xlim=range(base.data.simulated[,1]),asp=1,main="SMOTE + seuil") abline(a=bds.smote.glm$intercept,b=bds.smote.glm$slope,col=1) abline(a=bds.smote.glm$intercept+logit(bds.smote.proba)/coef(bds.smote.glm)[3],b=bds.smote.glm$slope,col=2) legend("topleft",c("seuil=0.5","seuil optimal"),lwd=1,col=1:2) ## ----simulatedBasicFigSmoteTrueAllDecisionsFull,fig.width=4,fig.height=3------ par(mar=c(2,2,2.5,0.5),cex=0.4) plot(base.data.simulated[1:220,1:2],col=as.integer(base.data.simulated$Y[1:220]),pch=20, ylim=range(base.data.simulated[,2]),xlim=range(base.data.simulated[,1]),asp=1,main="SMOTE + seuil") abline(a=bds.smote.glm$intercept,b=bds.smote.glm$slope,col=1) abline(a=bds.smote.glm$intercept+logit(bds.smote.proba)/coef(bds.smote.glm)[3],b=bds.smote.glm$slope,col=2) abline(a=0,b=0,col=3) legend("topleft",c("seuil=0.5","seuil optimal","théorique"),lwd=1,col=1:3) ## ----simulatedBasicFigSmoteTrueAllDecisionsFullWeight,fig.width=4,fig.height=3---- par(mar=c(2,2,2.5,0.5),cex=0.4) plot(base.data.simulated[1:220,1:2],col=as.integer(base.data.simulated$Y[1:220]),pch=20, ylim=range(base.data.simulated[,2]),xlim=range(base.data.simulated[,1]),asp=1,main="SMOTE + pondération") abline(a=bds.smote.glm$intercept,b=bds.smote.glm$slope,col=1) abline(a=bds.smote.glm$intercept+logit(bds.smote.proba)/coef(bds.smote.glm)[3],b=bds.smote.glm$slope,col=2) abline(a=0,b=0,col=3) abline(a=bds.smote.weighted.glm$intercept,b=bds.smote.weighted.glm$slope,col=4) legend("topleft",c("seuil=0.5","seuil optimal","théorique","poids optimaux"),lwd=1,col=1:4) ## ----simulatedBasicFigSmoteTrueAllDecisionsFullWeightComp,fig.width=4,fig.height=3---- par(mar=c(2,2,2.5,0.5),cex=0.4) plot(base.data.simulated[1:220,1:2],col=as.integer(base.data.simulated$Y[1:220]),pch=20, ylim=range(base.data.simulated[,2]),xlim=range(base.data.simulated[,1]),asp=1,main="SMOTE + pondération") abline(a=bds.smote.glm$intercept,b=bds.smote.glm$slope,col=1) abline(a=bds.smote.glm$intercept+logit(bds.smote.proba)/coef(bds.smote.glm)[3],b=bds.smote.glm$slope,col=2) abline(a=0,b=0,col=3) abline(a=bds.smote.weighted.glm$intercept,b=bds.smote.weighted.glm$slope,col=4) abline(a=bds.weighted.glm$intercept,b=bds.weighted.glm$slope,col=5) legend("topleft",c("seuil=0.5","seuil optimal","théorique","poids optimaux","poids sans SMOTE"),lwd=1,col=1:5) ## ----coilDataSmoteRoc,fig.width=3.25,fig.height=3.25-------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(coil.data.smoted.glm$learn$roc,lwd=1) lines(coil.data.smoted.glm$test$roc,col=2,lwd=1) lines(coil.data.wglmn1$learn$roc,col=1,lty=2,lwd=1) lines(coil.data.wglmn1$test$roc,col=2,lty=2,lwd=1) legend("bottomright",legend=c("apprentissage","test"),cex=0.9,col=c(1,2),lwd=1) ## ----CoilROSE,cache=TRUE,warning=FALSE---------------------------------------- coil.data.ROSE <- ROSE(class~.,data=coil.data) coil.data.rose.glm <- do.full.glm(coil.data.ROSE$data,coil.data.test) ## ----simulatedROSE,cache=TRUE,warning=FALSE----------------------------------- base.data.simulated.rose <- ROSE(Y~.,data=base.data.simulated[1:220,])$data bds.rose.glm <- glm(Y~.,data=base.data.simulated.rose,family=binomial()) bds.rose.glm$intercept <- -coef(bds.rose.glm)[1]/coef(bds.rose.glm)[3] bds.rose.glm$slope <- -coef(bds.rose.glm)[2]/coef(bds.rose.glm)[3] bds.rose.weights <- 1/table(base.data.simulated.rose$Y) bds.rose.proba <- (1/bds.rose.weights[2])/sum(1/bds.rose.weights) bds.rose.weighted.glm <- glm(Y~.,data=base.data.simulated.rose,family=binomial(),weights=bds.rose.weights[as.integer(base.data.simulated.rose$Y)]) bds.rose.weighted.glm$intercept <- -coef(bds.rose.weighted.glm)[1]/coef(bds.rose.weighted.glm)[3] bds.rose.weighted.glm$slope <- -coef(bds.rose.weighted.glm)[2]/coef(bds.rose.weighted.glm)[3] ## ----simulatedBasicFigOrig,fig.width=4,fig.height=3--------------------------- par(mar=c(2,2,2.5,0.5),cex=0.4) plot(base.data.simulated[1:220,1:2],col=as.integer(base.data.simulated$Y[1:220]),pch=20, ylim=range(base.data.simulated[,2]),xlim=range(base.data.simulated[,1]),asp=1,main="Données d'origine") ## ----simulatedBasicFigRosed,fig.width=4,fig.height=3-------------------------- par(mar=c(2,2,2.5,0.5),cex=0.4) plot(base.data.simulated.rose[,1:2],col=as.integer(base.data.simulated.rose$Y),pch=20, ylim=range(base.data.simulated[,2]),xlim=range(base.data.simulated[,1]),asp=1,main="Données ROSE") ## ----simulatedBasicFigRoseTrueBaseDecision,fig.width=4,fig.height=3----------- par(mar=c(2,2,2.5,0.5),cex=0.4) plot(base.data.simulated[1:220,1:2],col=as.integer(base.data.simulated$Y[1:220]),pch=20, ylim=range(base.data.simulated[,2]),xlim=range(base.data.simulated[,1]),asp=1,main="ROSE + seuil") abline(a=bds.rose.glm$intercept,b=bds.rose.glm$slope,col=1) legend("topleft",c("seuil=0.5"),lwd=1,col=1) ## ----simulatedBasicFigRoseTrueAllDecisions,fig.width=4,fig.height=3----------- par(mar=c(2,2,2.5,0.5),cex=0.4) plot(base.data.simulated[1:220,1:2],col=as.integer(base.data.simulated$Y[1:220]),pch=20, ylim=range(base.data.simulated[,2]),xlim=range(base.data.simulated[,1]),asp=1,main="ROSE + seuil") abline(a=bds.rose.glm$intercept,b=bds.rose.glm$slope,col=1) abline(a=bds.rose.glm$intercept+logit(bds.rose.proba)/coef(bds.rose.glm)[3],b=bds.rose.glm$slope,col=2) legend("topleft",c("seuil=0.5","seuil optimal"),lwd=1,col=1:2) ## ----simulatedBasicFigRoseTrueAllDecisionsFull,fig.width=4,fig.height=3------- par(mar=c(2,2,2.5,0.5),cex=0.4) plot(base.data.simulated[1:220,1:2],col=as.integer(base.data.simulated$Y[1:220]),pch=20, ylim=range(base.data.simulated[,2]),xlim=range(base.data.simulated[,1]),asp=1,main="ROSE + seuil") abline(a=bds.rose.glm$intercept,b=bds.rose.glm$slope,col=1) abline(a=bds.rose.glm$intercept+logit(bds.rose.proba)/coef(bds.rose.glm)[3],b=bds.rose.glm$slope,col=2) abline(a=0,b=0,col=3) legend("topleft",c("seuil=0.5","seuil optimal","théorique"),lwd=1,col=1:3) ## ----simulatedBasicFigRoseTrueAllDecisionsFullWeight,fig.width=4,fig.height=3---- par(mar=c(2,2,2.5,0.5),cex=0.4) plot(base.data.simulated[1:220,1:2],col=as.integer(base.data.simulated$Y[1:220]),pch=20, ylim=range(base.data.simulated[,2]),xlim=range(base.data.simulated[,1]),asp=1,main="ROSE + pondération") abline(a=bds.rose.glm$intercept,b=bds.rose.glm$slope,col=1) abline(a=bds.rose.glm$intercept+logit(bds.rose.proba)/coef(bds.rose.glm)[3],b=bds.rose.glm$slope,col=2) abline(a=0,b=0,col=3) abline(a=bds.rose.weighted.glm$intercept,b=bds.rose.weighted.glm$slope,col=4) legend("topleft",c("seuil=0.5","seuil optimal","théorique","poids optimaux"),lwd=1,col=1:4) ## ----simulatedBasicFigRoseTrueAllDecisionsFullWeightComp,fig.width=4,fig.height=3---- par(mar=c(2,2,2.5,0.5),cex=0.4) plot(base.data.simulated[1:220,1:2],col=as.integer(base.data.simulated$Y[1:220]),pch=20, ylim=range(base.data.simulated[,2]),xlim=range(base.data.simulated[,1]),asp=1,main="ROSE + pondération") abline(a=bds.rose.glm$intercept,b=bds.rose.glm$slope,col=1) abline(a=bds.rose.glm$intercept+logit(bds.rose.proba)/coef(bds.rose.glm)[3],b=bds.rose.glm$slope,col=2) abline(a=0,b=0,col=3) abline(a=bds.rose.weighted.glm$intercept,b=bds.rose.weighted.glm$slope,col=4) abline(a=bds.weighted.glm$intercept,b=bds.weighted.glm$slope,col=5) legend("topleft",c("seuil=0.5","seuil optimal","théorique","poids optimaux","poids sans ROSE"),lwd=1,col=1:5) ## ----coilDataRoseRoc,fig.width=3.25,fig.height=3.25--------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(coil.data.rose.glm$learn$roc,lwd=1) lines(coil.data.rose.glm$test$roc,col=2,lwd=1) lines(coil.data.wglm$learn$roc,col=1,lty=2,lwd=1) lines(coil.data.wglm$test$roc,col=2,lty=2,lwd=1) legend("bottomright",legend=c("apprentissage","test"),cex=0.9,col=c(1,2),lwd=1) ## ----wineDataLoad,cache=TRUE-------------------------------------------------- wine.data <- read.csv2("../data/wine-quality/winequality-red.csv") for(i in 1:11) { wine.data[[i]] <- as.numeric(as.character(wine.data[[i]])) } wine.data$class <- as.factor(as.integer(wine.data$quality>=7)) wine.data.split <- createDataPartition(wine.data$quality,p=2/3,list=FALSE) wine.data.quality <- wine.data$quality wine.data$quality <- NULL wine.data.learn <- wine.data[wine.data.split,] wine.data.test <- wine.data[-wine.data.split,] wine.data.ratio <- sum(wine.data$class==1)/nrow(wine.data) wine.data.ratio2 <- sum(wine.data.quality==8)/nrow(wine.data) wine.data2.learn <- wine.data.learn wine.data2.learn$class <- as.factor(as.integer(wine.data.quality[wine.data.split]>=8)) wine.data2.test <- wine.data.test wine.data2.test$class <- as.factor(as.integer(wine.data.quality[-wine.data.split]>=8)) ## ----wineGLM,cache=TRUE,warning=FALSE----------------------------------------- wine.data.glm <- do.full.glm(wine.data.learn,wine.data.test) wine.weights <- nrow(wine.data.learn)/table(wine.data.learn$class) wine.data.wglm <- do.full.glm(wine.data.learn,wine.data.test,weights=wine.weights[as.integer(wine.data.learn$class)]) ## ----wineGLM2,cache=TRUE,warning=FALSE---------------------------------------- wine.data2.glm <- do.full.glm(wine.data2.learn,wine.data.test) wine.weights2 <- nrow(wine.data2.learn)/table(wine.data2.learn$class) wine.data2.wglm <- do.full.glm(wine.data2.learn,wine.data2.test,weights=wine.weights2[as.integer(wine.data2.learn$class)]) ## ----WineDataRoc,fig.width=3.25,fig.height=3.25------------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(wine.data.glm$learn$roc,lwd=1,lty=2) lines(wine.data.glm$test$roc,lwd=1) lines(wine.data.wglm$learn$roc,col=2,lty=2,lwd=1) lines(wine.data.wglm$test$roc,col=2,lwd=1) legend("bottomright",legend=c("direct","weighted"),cex=0.9,col=c(1,2),lwd=1) ## ----WineDataRoc2,fig.width=3.25,fig.height=3.25------------------------------ par(mar=c(4,4,0.1,0.1),cex=0.5) plot(wine.data2.glm$learn$roc,lwd=1,lty=2) lines(wine.data2.glm$test$roc,lwd=1) lines(wine.data2.wglm$learn$roc,col=2,lty=2,lwd=1) lines(wine.data2.wglm$test$roc,col=2,lwd=1) legend("bottomright",legend=c("direct","weighted"),cex=0.9,col=c(1,2),lwd=1) ## ----wineDataSmoteRose,cache=TRUE,warning=FALSE------------------------------- wine.data.smoted <- SMOTE(class~.,data=wine.data.learn) wine.data.smoted.glm <- do.full.glm(wine.data.smoted,wine.data.test) wine.data.ROSE <- ROSE(class~.,data=wine.data.learn) wine.data.rose.glm <- do.full.glm(wine.data.ROSE$data,wine.data.test) ## ----WineDataRocSR,fig.width=3.25,fig.height=3.25----------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(wine.data.smoted.glm$learn$roc,lwd=1,lty=2) lines(wine.data.smoted.glm$test$roc,lwd=1) lines(wine.data.rose.glm$learn$roc,lwd=1,lty=2,col=4) lines(wine.data.rose.glm$test$roc,lwd=1,col=4) lines(wine.data.wglm$learn$roc,col=2,lty=2,lwd=1) lines(wine.data.wglm$test$roc,col=2,lwd=1) legend("bottomright",legend=c("smote","rose","weighted"),cex=0.9,col=c(1,4,2),lwd=1) ## ----wineDataSmoteRose2,cache=TRUE,warning=FALSE------------------------------ wine.data2.smoted <- SMOTE(class~.,data=wine.data2.learn) wine.data2.smoted.glm <- do.full.glm(wine.data2.smoted,wine.data2.test) wine.data2.ROSE <- ROSE(class~.,data=wine.data2.learn) wine.data2.rose.glm <- do.full.glm(wine.data2.ROSE$data,wine.data2.test) ## ----WineDataRocSR2,fig.width=3.25,fig.height=3.25---------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(wine.data2.smoted.glm$learn$roc,lwd=1,lty=2) lines(wine.data2.smoted.glm$test$roc,lwd=1) lines(wine.data2.rose.glm$learn$roc,lwd=1,lty=2,col=4) lines(wine.data2.rose.glm$test$roc,lwd=1,col=4) lines(wine.data2.wglm$learn$roc,col=2,lty=2,lwd=1) lines(wine.data2.wglm$test$roc,col=2,lwd=1) legend("bottomright",legend=c("smote","rose","weighted"),cex=0.9,col=c(1,4,2),lwd=1) ## ----svmWine,cache=TRUE------------------------------------------------------- svm.weights <- seq(1,50,length.out=101) svm.roc.learn <- data.frame(Specificity=rep(NA,length(svm.weights)),Sensitivity=rep(NA,length(svm.weights))) svm.roc.test <- data.frame(Specificity=rep(NA,length(svm.weights)),Sensitivity=rep(NA,length(svm.weights))) for(i in seq_along(svm.weights)) { baseweights <- c(1,svm.weights[i]) svm <- do.full.svm(wine.data.learn,wine.data.test,baseweights) best.cheating <- which.min(svm$test$perf) svm.roc.learn$Specificity[i] <- svm$learn$tn[best.cheating] svm.roc.learn$Sensitivity[i] <- svm$learn$tp[best.cheating] svm.roc.test$Specificity[i] <- svm$test$tn[best.cheating] svm.roc.test$Sensitivity[i] <- svm$test$tp[best.cheating] } ## ----svmWineResamp,cache=TRUE------------------------------------------------- svm <- do.full.svm(wine.data.smoted,wine.data.test) best.cheating <- which.min(svm$test$perf) svm.smoted.roc.point.learn <- c(svm$learn$tn[best.cheating],svm$learn$tp[best.cheating]) svm.smoted.roc.point.test <- c(svm$test$tn[best.cheating],svm$test$tp[best.cheating]) svm <- do.full.svm(wine.data.ROSE$data,wine.data.test) best.cheating <- which.min(svm$test$perf) svm.rose.roc.point.learn <- c(svm$learn$tn[best.cheating],svm$learn$tp[best.cheating]) svm.rose.roc.point.test <- c(svm$test$tn[best.cheating],svm$test$tp[best.cheating]) ## ----WineDataSVMRoc,fig.width=3.25,fig.height=3.25---------------------------- par(mar=c(4,4,2,2)+0.1,cex=0.5) plot(svm.roc.learn,xlim=c(1,0),ylim=c(0,1),pch=20,asp=1) abline(a=1,b=-1,col="grey") points(svm.roc.test,col=2,pch=20) lines(wine.data.wglm$learn$roc,lty=2,lwd=1) lines(wine.data.wglm$test$roc,lty=2,lwd=1,col=2) legend("bottomright",legend=c("learning","test","smote","rose"),cex=0.9,col=c(1,2,1,1),pch=c(20,20,22,23)) ## ----WineDataSVMRocWithRS,fig.width=3.25,fig.height=3.25---------------------- par(mar=c(4,4,2,2)+0.1,cex=0.5) plot(svm.roc.learn,xlim=c(1,0),ylim=c(0,1),pch=20,asp=1) abline(a=1,b=-1,col="grey") points(svm.roc.test,col=2,pch=20) lines(wine.data.wglm$learn$roc,lty=2,lwd=1) lines(wine.data.wglm$test$roc,lty=2,lwd=1,col=2) points(svm.smoted.roc.point.learn[1],svm.smoted.roc.point.learn[2],pch=22,cex=2,bg=1) points(svm.smoted.roc.point.test[1],svm.smoted.roc.point.test[2],pch=22,cex=2,bg=2) points(svm.rose.roc.point.learn[1],svm.rose.roc.point.learn[2],pch=23,cex=2,bg=1) points(svm.rose.roc.point.test[1],svm.rose.roc.point.test[2],pch=23,cex=2,bg=2) legend("bottomright",legend=c("learning","test","smote","rose"),cex=0.9,col=c(1,2,1,1),pch=c(20,20,22,23)) ## ----svmWine2,cache=TRUE------------------------------------------------------ svm2.weights <- seq(1,100,length.out=100) svm2.roc.learn <- data.frame(Specificity=rep(NA,length(svm2.weights)),Sensitivity=rep(NA,length(svm2.weights))) svm2.roc.test <- data.frame(Specificity=rep(NA,length(svm2.weights)),Sensitivity=rep(NA,length(svm2.weights))) for(i in seq_along(svm2.weights)) { baseweights <- c(1,svm2.weights[i]) svm <- do.full.svm(wine.data2.learn,wine.data2.test,baseweights) best.cheating <- which.min(svm$test$perf) svm2.roc.learn$Specificity[i] <- svm$learn$tn[best.cheating] svm2.roc.learn$Sensitivity[i] <- svm$learn$tp[best.cheating] svm2.roc.test$Specificity[i] <- svm$test$tn[best.cheating] svm2.roc.test$Sensitivity[i] <- svm$test$tp[best.cheating] } ## ----svmWineResamp2,cache=TRUE------------------------------------------------ svm <- do.full.svm(wine.data2.smoted,wine.data.test) best.cheating <- which.min(svm$test$perf) svm2.smoted.roc.point.learn <- c(svm$learn$tn[best.cheating],svm$learn$tp[best.cheating]) svm2.smoted.roc.point.test <- c(svm$test$tn[best.cheating],svm$test$tp[best.cheating]) svm <- do.full.svm(wine.data2.ROSE$data,wine.data.test) best.cheating <- which.min(svm$test$perf) svm2.rose.roc.point.learn <- c(svm$learn$tn[best.cheating],svm$learn$tp[best.cheating]) svm2.rose.roc.point.test <- c(svm$test$tn[best.cheating],svm$test$tp[best.cheating]) ## ----WineData2SVMRoc,fig.width=3.25,fig.height=3.25--------------------------- par(mar=c(4,4,2,2)+0.1,cex=0.5) plot(svm2.roc.learn,xlim=c(1,0),ylim=c(0,1),pch=20,asp=1) abline(a=1,b=-1,col="grey") points(svm2.roc.test,col=2,pch=20) lines(wine.data2.wglm$learn$roc,lty=2,lwd=1) lines(wine.data2.wglm$test$roc,lty=2,lwd=1,col=2) legend("bottomright",legend=c("learning","test","smote","rose"),cex=0.9,col=c(1,2,1,1),pch=c(20,20,22,23)) ## ----WineData2SVMRocWithRS,fig.width=3.25,fig.height=3.25--------------------- par(mar=c(4,4,2,2)+0.1,cex=0.5) plot(svm2.roc.learn,xlim=c(1,0),ylim=c(0,1),pch=20,asp=1) abline(a=1,b=-1,col="grey") points(svm2.roc.test,col=2,pch=20) lines(wine.data2.wglm$learn$roc,lty=2,lwd=1) lines(wine.data2.wglm$test$roc,lty=2,lwd=1,col=2) points(svm2.smoted.roc.point.learn[1],svm2.smoted.roc.point.learn[2],pch=22,cex=2,bg=1) points(svm2.smoted.roc.point.test[1],svm2.smoted.roc.point.test[2],pch=22,cex=2,bg=2) points(svm2.rose.roc.point.learn[1],svm2.rose.roc.point.learn[2],pch=23,cex=2,bg=1) points(svm2.rose.roc.point.test[1],svm2.rose.roc.point.test[2],pch=23,cex=2,bg=2) legend("bottomright",legend=c("learning","test","smote","rose"),cex=0.9,col=c(1,2,1,1),pch=c(20,20,22,23)) ## ----svmWineCV,cache=TRUE----------------------------------------------------- svmfolds <- createFolds(wine.data.learn$class,k=5,list=FALSE) svm.cv.weights <- seq(1,50,length.out=51) svm.cv.roc.learn <- data.frame(Specificity=rep(NA,length(svm.cv.weights)),Sensitivity=rep(NA,length(svm.cv.weights))) svm.cv.roc.test <- data.frame(Specificity=rep(NA,length(svm.cv.weights)),Sensitivity=rep(NA,length(svm.cv.weights))) for(i in seq_along(svm.cv.weights)) { baseweights <- c(1,svm.cv.weights[i]) svm.cv <- do.svm.full.cv(wine.data.learn,wine.data.test,svmfolds,baseweights) svm.cv.roc.learn$Specificity[i] <- svm.cv$learn$tn svm.cv.roc.learn$Sensitivity[i] <- svm.cv$learn$tp svm.cv.roc.test$Specificity[i] <- svm.cv$test$tn svm.cv.roc.test$Sensitivity[i] <- svm.cv$test$tp } ## ----svmWineMethodsBefore,cache=TRUE------------------------------------------ svmfolds.smoted <- createFolds(wine.data.smoted$class,k=5,list=FALSE) svm.smoted.cv <- do.svm.full.cv(wine.data.smoted,wine.data.test,svmfolds.smoted) svmfolds.rose <- createFolds(wine.data.ROSE$data$class,k=5,list=FALSE) svm.rose.cv <- do.svm.full.cv(wine.data.ROSE$data,wine.data.test,svmfolds.rose) ## ----svmWineMethodsInside,cache=TRUE------------------------------------------ svm.smoted.cv.in <- do.svm.full.cv(wine.data.learn,wine.data.test,svmfolds,balencing="smote") svm.rose.cv.in <- do.svm.full.cv(wine.data.learn,wine.data.test,svmfolds,balencing="rose") ## ----WineDataSVMRocWithRS,fig.width=3.25,fig.height=3.25---------------------- par(mar=c(4,4,2,2)+0.1,cex=0.5) plot(svm.roc.learn,xlim=c(1,0),ylim=c(0,1),pch=20,asp=1) abline(a=1,b=-1,col="grey") points(svm.roc.test,col=2,pch=20) lines(wine.data.wglm$learn$roc,lty=2,lwd=1) lines(wine.data.wglm$test$roc,lty=2,lwd=1,col=2) points(svm.smoted.roc.point.learn[1],svm.smoted.roc.point.learn[2],pch=22,cex=2,bg=1) points(svm.smoted.roc.point.test[1],svm.smoted.roc.point.test[2],pch=22,cex=2,bg=2) points(svm.rose.roc.point.learn[1],svm.rose.roc.point.learn[2],pch=23,cex=2,bg=1) points(svm.rose.roc.point.test[1],svm.rose.roc.point.test[2],pch=23,cex=2,bg=2) legend("bottomright",legend=c("learning","test","smote","rose"),cex=0.9,col=c(1,2,1,1),pch=c(20,20,22,23)) ## ----WineDataSVMCVRoc,fig.width=3.25,fig.height=3.25-------------------------- par(mar=c(4,4,2,2)+0.1,cex=0.5) plot(svm.cv.roc.learn,xlim=c(1,0),ylim=c(0,1),pch=20,asp=1) abline(a=1,b=-1,col="grey") points(svm.cv.roc.test,col=2,pch=20) lines(wine.data.wglm$learn$roc,lty=2,lwd=1) lines(wine.data.wglm$test$roc,lty=2,lwd=1,col=2) legend("bottomright",legend=c("learning","test","smote","rose"),cex=0.9,col=c(1,2,1,1),pch=c(20,20,22,23)) ## ----WineDataSVMCVOutRoc,fig.width=3.25,fig.height=3.25----------------------- par(mar=c(4,4,2,2)+0.1,cex=0.5) plot(svm.cv.roc.learn,xlim=c(1,0),ylim=c(0,1),pch=20,asp=1) abline(a=1,b=-1,col="grey") points(svm.cv.roc.test,col=2,pch=20) lines(wine.data.wglm$learn$roc,lty=2,lwd=1) lines(wine.data.wglm$test$roc,lty=2,lwd=1,col=2) points(svm.smoted.cv$learn$tn,svm.smoted.cv$learn$tp,pch=22,cex=2,bg=1) points(svm.smoted.cv$test$tn,svm.smoted.cv$test$tp,pch=22,cex=2,bg=2) points(svm.rose.cv$learn$tn,svm.rose.cv$learn$tp,pch=23,cex=2,bg=1) points(svm.rose.cv$test$tn,svm.rose.cv$test$tp,pch=23,cex=2,bg=2) legend("bottomright",legend=c("learning","test","smote","rose"),cex=0.9,col=c(1,2,1,1),pch=c(20,20,22,23)) ## ----WineDataSVMCVInRoc,fig.width=3.25,fig.height=3.25------------------------ par(mar=c(4,4,2,2)+0.1,cex=0.5) plot(svm.cv.roc.learn,xlim=c(1,0),ylim=c(0,1),pch=20,asp=1) abline(a=1,b=-1,col="grey") points(svm.cv.roc.test,col=2,pch=20) lines(wine.data.wglm$learn$roc,lty=2,lwd=1) lines(wine.data.wglm$test$roc,lty=2,lwd=1,col=2) points(svm.smoted.cv.in$learn$tn,svm.smoted.cv.in$learn$tp,pch=22,cex=2,bg=1) points(svm.smoted.cv.in$test$tn,svm.smoted.cv.in$test$tp,pch=22,cex=2,bg=2) points(svm.rose.cv.in$learn$tn,svm.rose.cv.in$learn$tp,pch=23,cex=2,bg=1) points(svm.rose.cv.in$test$tn,svm.rose.cv.in$test$tp,pch=23,cex=2,bg=2) legend("bottomright",legend=c("learning","test","smote","rose"),cex=0.9,col=c(1,2,1,1),pch=c(20,20,22,23)) ## ----svmWine2CV,cache=TRUE---------------------------------------------------- svm2folds <- createFolds(wine.data2.learn$class,k=5,list=FALSE) svm2.cv.weights <- seq(1,100,length.out=100) svm2.cv.roc.learn <- data.frame(Specificity=rep(NA,length(svm2.cv.weights)),Sensitivity=rep(NA,length(svm2.cv.weights))) svm2.cv.roc.test <- data.frame(Specificity=rep(NA,length(svm2.cv.weights)),Sensitivity=rep(NA,length(svm2.cv.weights))) for(i in seq_along(svm2.cv.weights)) { baseweights <- c(1,svm2.cv.weights[i]) svm2.cv <- do.svm.full.cv(wine.data2.learn,wine.data2.test,svm2folds,baseweights) svm2.cv.roc.learn$Specificity[i] <- svm2.cv$learn$tn svm2.cv.roc.learn$Sensitivity[i] <- svm2.cv$learn$tp svm2.cv.roc.test$Specificity[i] <- svm2.cv$test$tn svm2.cv.roc.test$Sensitivity[i] <- svm2.cv$test$tp } ## ----svm2WineMethodsBefore,cache=TRUE----------------------------------------- svm2folds.smoted <- createFolds(wine.data2.smoted$class,k=5,list=FALSE) svm2.smoted.cv <- do.svm.full.cv(wine.data2.smoted,wine.data2.test,svm2folds.smoted) svm2folds.rose <- createFolds(wine.data2.ROSE$data$class,k=5,list=FALSE) svm2.rose.cv <- do.svm.full.cv(wine.data2.ROSE$data,wine.data2.test,svm2folds.rose) ## ----svm2WineMethodsInside,cache=TRUE----------------------------------------- svm2.smoted.cv.in <- do.svm.full.cv(wine.data2.learn,wine.data2.test,svm2folds,balencing="smote") svm2.rose.cv.in <- do.svm.full.cv(wine.data2.learn,wine.data2.test,svm2folds,balencing="rose") ## ----WineData2SVMRocWithRS,fig.width=3.25,fig.height=3.25--------------------- par(mar=c(4,4,2,2)+0.1,cex=0.5) plot(svm2.roc.learn,xlim=c(1,0),ylim=c(0,1),pch=20,asp=1) abline(a=1,b=-1,col="grey") points(svm2.roc.test,col=2,pch=20) lines(wine.data2.wglm$learn$roc,lty=2,lwd=1) lines(wine.data2.wglm$test$roc,lty=2,lwd=1,col=2) points(svm2.smoted.roc.point.learn[1],svm2.smoted.roc.point.learn[2],pch=22,cex=2,bg=1) points(svm2.smoted.roc.point.test[1],svm2.smoted.roc.point.test[2],pch=22,cex=2,bg=2) points(svm2.rose.roc.point.learn[1],svm2.rose.roc.point.learn[2],pch=23,cex=2,bg=1) points(svm2.rose.roc.point.test[1],svm2.rose.roc.point.test[2],pch=23,cex=2,bg=2) legend("bottomright",legend=c("learning","test","smote","rose"),cex=0.9,col=c(1,2,1,1),pch=c(20,20,22,23)) ## ----WineData2SVMCVRoc,fig.width=3.25,fig.height=3.25------------------------- par(mar=c(4,4,2,2)+0.1,cex=0.5) plot(svm2.cv.roc.learn,xlim=c(1,0),ylim=c(0,1),pch=20,asp=1) abline(a=1,b=-1,col="grey") points(svm2.cv.roc.test,col=2,pch=20) lines(wine.data2.wglm$learn$roc,lty=2,lwd=1) lines(wine.data2.wglm$test$roc,lty=2,lwd=1,col=2) ## ----WineData2SVMCVRocFull,fig.width=3.25,fig.height=3.25--------------------- par(mar=c(4,4,2,2)+0.1,cex=0.5) plot(svm2.cv.roc.learn,xlim=c(1,0),ylim=c(0,1),pch=20,asp=1) abline(a=1,b=-1,col="grey") points(svm2.cv.roc.test,col=2,pch=20) lines(wine.data2.wglm$learn$roc,lty=2,lwd=1) lines(wine.data2.wglm$test$roc,lty=2,lwd=1,col=2) points(svm2.smoted.cv$learn$tn,svm2.smoted.cv$learn$tp,pch=22,cex=2,bg=1) points(svm2.smoted.cv$test$tn,svm2.smoted.cv$test$tp,pch=22,cex=2,bg=2) points(svm2.rose.cv$learn$tn,svm2.rose.cv$learn$tp,pch=23,cex=2,bg=1) points(svm2.rose.cv$test$tn,svm2.rose.cv$test$tp,pch=23,cex=2,bg=2) legend("bottomright",legend=c("learning","test","smote","rose"),cex=0.9,col=c(1,2,1,1),pch=c(20,20,22,23)) ## ----WineData2SVMCVOutRocFull,fig.width=3.25,fig.height=3.25------------------ par(mar=c(4,4,2,2)+0.1,cex=0.5) plot(svm2.cv.roc.learn,xlim=c(1,0),ylim=c(0,1),pch=20,asp=1) abline(a=1,b=-1,col="grey") points(svm2.cv.roc.test,col=2,pch=20) lines(wine.data2.wglm$learn$roc,lty=2,lwd=1) lines(wine.data2.wglm$test$roc,lty=2,lwd=1,col=2) points(svm2.smoted.cv.in$learn$tn,svm2.smoted.cv.in$learn$tp,pch=22,cex=2,bg=1) points(svm2.smoted.cv.in$test$tn,svm2.smoted.cv.in$test$tp,pch=22,cex=2,bg=2) points(svm2.rose.cv.in$learn$tn,svm2.rose.cv.in$learn$tp,pch=23,cex=2,bg=1) points(svm2.rose.cv.in$test$tn,svm2.rose.cv.in$test$tp,pch=23,cex=2,bg=2) legend("bottomright",legend=c("learning","test","smote","rose"),cex=0.9,col=c(1,2,1,1),pch=c(20,20,22,23)) ## ----wineRf,cache=TRUE-------------------------------------------------------- wine.rf <- randomForest(class~.,data=wine.data.learn,ntree=2000) wine.rf.roc.learn <- roc(wine.data.learn$class,predict(wine.rf,type="prob")[,2]) wine.rf.roc.test <- roc(wine.data.test$class,predict(wine.rf,wine.data.test,type="prob")[,2]) wine.rf.strat <- randomForest(class~.,data=wine.data.learn,strat=wine.data.learn$class,sampsize=table(wine.data.learn$class),ntree=2000) wine.rf.strat.roc.learn <- roc(wine.data.learn$class,predict(wine.rf.strat,type="prob")[,2]) wine.rf.strat.roc.test <- roc(wine.data.test$class,predict(wine.rf.strat,wine.data.test,type="prob")[,2]) wine.rf.wstrat <- randomForest(class~.,data=wine.data.learn,strat=wine.data.learn$class,sampsize=table(wine.data.learn$class),ntree=2000,classwt=1/table(wine.data.learn$class)) wine.rf.wstrat.roc.learn <- roc(wine.data.learn$class,predict(wine.rf.wstrat,type="prob")[,2]) wine.rf.wstrat.roc.test <- roc(wine.data.test$class,predict(wine.rf.wstrat,wine.data.test,type="prob")[,2]) ## ----WineRFRoc,fig.width=3.25,fig.height=3.25--------------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(wine.rf.roc.learn,lwd=1) lines(wine.rf.roc.test,lwd=1,col="red") legend("bottomright",legend=c("learning","test"),cex=0.9,col=c(1,2),lwd=1) ## ----WineRFRocStrat,fig.width=3.25,fig.height=3.25---------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(wine.rf.strat.roc.learn,lwd=1) lines(wine.rf.strat.roc.test,lwd=1,col="red") legend("bottomright",legend=c("learning","test"),cex=0.9,col=c(1,2),lwd=1) ## ----WineRFRocWStrat,fig.width=3.25,fig.height=3.25--------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(wine.rf.wstrat.roc.learn,lwd=1) lines(wine.rf.wstrat.roc.test,lwd=1,col="red") legend("bottomright",legend=c("learning","test"),cex=0.9,col=c(1,2),lwd=1) ## ----wineRf2,cache=TRUE------------------------------------------------------- wine.rf2 <- randomForest(class~.,data=wine.data2.learn,ntree=2000) wine.rf2.roc.learn <- roc(wine.data2.learn$class,predict(wine.rf2,type="prob")[,2]) wine.rf2.roc.test <- roc(wine.data2.test$class,predict(wine.rf2,wine.data2.test,type="prob")[,2]) wine.rf2.strat <- randomForest(class~.,data=wine.data2.learn,strat=wine.data2.learn$class,sampsize=table(wine.data2.learn$class),ntree=2000) wine.rf2.strat.roc.learn <- roc(wine.data2.learn$class,predict(wine.rf2.strat,type="prob")[,2]) wine.rf2.strat.roc.test <- roc(wine.data2.test$class,predict(wine.rf2.strat,wine.data2.test,type="prob")[,2]) wine.rf2.wstrat <- randomForest(class~.,data=wine.data2.learn,strat=wine.data2.learn$class,sampsize=table(wine.data2.learn$class),ntree=2000,classwt=1/table(wine.data2.learn$class)) wine.rf2.wstrat.roc.learn <- roc(wine.data2.learn$class,predict(wine.rf2.wstrat,type="prob")[,2]) wine.rf2.wstrat.roc.test <- roc(wine.data2.test$class,predict(wine.rf2.wstrat,wine.data2.test,type="prob")[,2]) ## ----WineRF2Roc,fig.width=3.25,fig.height=3.25-------------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(wine.rf2.roc.learn,lwd=1) lines(wine.rf2.roc.test,lwd=1,col="red") legend("bottomright",legend=c("learning","test"),cex=0.9,col=c(1,2),lwd=1) ## ----WineRF2RocStrat,fig.width=3.25,fig.height=3.25--------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(wine.rf2.strat.roc.learn,lwd=1) lines(wine.rf2.strat.roc.test,lwd=1,col="red") legend("bottomright",legend=c("learning","test"),cex=0.9,col=c(1,2),lwd=1) ## ----WineRF2RocWStrat,fig.width=3.25,fig.height=3.25-------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(wine.rf2.wstrat.roc.learn,lwd=1) lines(wine.rf2.wstrat.roc.test,lwd=1,col="red") legend("bottomright",legend=c("learning","test"),cex=0.9,col=c(1,2),lwd=1) ## ----wineRfBalanced,cache=TRUE------------------------------------------------ wine.brf <- randomForest(class~.,data=wine.data.learn,ntree=2000,strat=wine.data.learn$class,sampsize=rep(min(table(wine.data.learn$class)),2)) wine.brf.roc.learn <- roc(wine.data.learn$class,predict(wine.brf,type="prob")[,2]) wine.brf.roc.test <- roc(wine.data.test$class,predict(wine.brf,wine.data.test,type="prob")[,2]) wine.brf2 <- randomForest(class~.,data=wine.data2.learn,ntree=2000,strat=wine.data2.learn$class,sampsize=rep(min(table(wine.data2.learn$class)),2)) wine.brf2.roc.learn <- roc(wine.data2.learn$class,predict(wine.brf2,type="prob")[,2]) wine.brf2.roc.test <- roc(wine.data2.test$class,predict(wine.brf2,wine.data2.test,type="prob")[,2]) ## ----WineBRFRoc,fig.width=3.25,fig.height=3.25-------------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(wine.brf.roc.learn,lwd=1) lines(wine.brf.roc.test,lwd=1,col="red") legend("bottomright",legend=c("learning","test"),cex=0.9,col=c(1,2),lwd=1) ## ----WineBRF2Roc,fig.width=3.25,fig.height=3.25------------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(wine.brf2.roc.learn,lwd=1) lines(wine.brf2.roc.test,lwd=1,col="red") legend("bottomright",legend=c("learning","test"),cex=0.9,col=c(1,2),lwd=1) ## ----rfRose,cache=TRUE-------------------------------------------------------- wine.rf.rose <- randomForest(class~.,data=wine.data.ROSE$data,ntree=2000) wine.rf.rose.roc.learn <- roc(wine.data.learn$class,predict(wine.rf.rose,wine.data.learn,type="prob")[,2]) wine.rf.rose.roc.test <- roc(wine.data.test$class,predict(wine.rf.rose,wine.data.test,type="prob")[,2]) wine.rf2.rose <- randomForest(class~.,data=wine.data2.ROSE$data,ntree=2000) wine.rf2.rose.roc.learn <- roc(wine.data2.learn$class,predict(wine.rf2.rose,wine.data2.learn,type="prob")[,2]) wine.rf2.rose.roc.test <- roc(wine.data2.test$class,predict(wine.rf2.rose,wine.data2.test,type="prob")[,2]) ## ----WineRoseRFRoc,fig.width=3.25,fig.height=3.25----------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(wine.rf.rose.roc.learn,lwd=1) lines(wine.rf.rose.roc.test,lwd=1,col="red") legend("bottomright",legend=c("learning","test"),cex=0.9,col=c(1,2),lwd=1) ## ----WineRoseRF2Roc,fig.width=3.25,fig.height=3.25---------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(wine.rf2.rose.roc.learn,lwd=1) lines(wine.rf2.rose.roc.test,lwd=1,col="red") legend("bottomright",legend=c("learning","test"),cex=0.9,col=c(1,2),lwd=1) ## ----wineboost,cache=TRUE----------------------------------------------------- winexg.train <- xgb.DMatrix(data = data.matrix(wine.data.learn[,1:10]), label = as.integer(as.character(wine.data.learn$class))) winexg.w.train <- xgb.DMatrix(data = data.matrix(wine.data.learn[,1:10]), label = as.integer(as.character(wine.data.learn$class)), weight=wine.weights[as.integer(wine.data.learn$class)]) winexg.test <- xgb.DMatrix(data = data.matrix(wine.data.test[,1:10]), label = as.integer(as.character(wine.data.test$class))) nbfolds <- max(svmfolds) xbg.folds <- vector(mode="list",length=nbfolds) for(f in 1:nbfolds) { xbg.folds[[f]] <- which(svmfolds==f) } winexg.cv <- xgb.cv(params = list(objective = "binary:logistic"),data=winexg.train,folds=xbg.folds,nrounds=50,verbose=FALSE) winexg.model <- xgboost(data=winexg.train,params = list(objective = "binary:logistic"),nrounds=which.min(winexg.cv$evaluation_log$test_error_mean),verbose=FALSE) winexg.model.roc.learn <- roc(wine.data.learn$class,predict(winexg.model,winexg.train)) winexg.model.roc.test <- roc(wine.data.test$class,predict(winexg.model,winexg.test)) winexg.w.cv <- xgb.cv(params = list(objective = "binary:logistic"),data=winexg.w.train,folds=xbg.folds,nrounds=50,verbose=FALSE) winexg.w.model <- xgboost(data=winexg.w.train,params = list(objective = "binary:logistic"),nrounds=which.min(winexg.w.cv$evaluation_log$test_error_mean),verbose=FALSE) winexg.w.model.roc.learn <- roc(wine.data.learn$class,predict(winexg.w.model,winexg.train)) winexg.w.model.roc.test <- roc(wine.data.test$class,predict(winexg.w.model,winexg.test)) ## ----WineXGboost,fig.width=3.25,fig.height=3.25------------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(winexg.model.roc.learn,lwd=1) lines(winexg.model.roc.test,lwd=1,col="red") legend("bottomright",legend=c("learning","test"),cex=0.9,col=c(1,2),lwd=1) ## ----WineWXGboost,fig.width=3.25,fig.height=3.25------------------------------ par(mar=c(4,4,0.1,0.1),cex=0.5) plot(winexg.w.model.roc.learn,lwd=1) lines(winexg.w.model.roc.test,lwd=1,col="red") legend("bottomright",legend=c("learning","test"),cex=0.9,col=c(1,2),lwd=1) ## ----wineboost2,cache=TRUE---------------------------------------------------- winexg2.train <- xgb.DMatrix(data = data.matrix(wine.data2.learn[,1:10]), label = as.integer(as.character(wine.data2.learn$class))) winexg2.w.train <- xgb.DMatrix(data = data.matrix(wine.data2.learn[,1:10]), label = as.integer(as.character(wine.data2.learn$class)), weight=wine.weights2[as.integer(wine.data2.learn$class)]) winexg2.test <- xgb.DMatrix(data = data.matrix(wine.data2.test[,1:10]), label = as.integer(as.character(wine.data2.test$class))) nbfolds <- max(svmfolds) xbg.folds <- vector(mode="list",length=nbfolds) for(f in 1:nbfolds) { xbg.folds[[f]] <- which(svmfolds==f) } winexg2.cv <- xgb.cv(params = list(objective = "binary:logistic"),data=winexg2.train,folds=xbg.folds,nrounds=50,verbose=FALSE) winexg2.model <- xgboost(data=winexg2.train,params = list(objective = "binary:logistic"),nrounds=which.min(winexg2.cv$evaluation_log$test_error_mean),verbose=FALSE) winexg2.model.roc.learn <- roc(wine.data.learn$class,predict(winexg2.model,winexg2.train)) winexg2.model.roc.test <- roc(wine.data.test$class,predict(winexg2.model,winexg2.test)) winexg2.w.cv <- xgb.cv(params = list(objective = "binary:logistic"),data=winexg2.w.train,folds=xbg.folds,nrounds=50,verbose=FALSE) winexg2.w.model <- xgboost(data=winexg2.w.train,params = list(objective = "binary:logistic"),nrounds=which.min(winexg2.w.cv$evaluation_log$test_error_mean),verbose=FALSE) winexg2.w.model.roc.learn <- roc(wine.data.learn$class,predict(winexg2.w.model,winexg2.train)) winexg2.w.model.roc.test <- roc(wine.data.test$class,predict(winexg2.w.model,winexg2.test)) ## ----Winexg2boost,fig.width=3.25,fig.height=3.25------------------------------ par(mar=c(4,4,0.1,0.1),cex=0.5) plot(winexg2.model.roc.learn,lwd=1) lines(winexg2.model.roc.test,lwd=1,col="red") legend("bottomright",legend=c("learning","test"),cex=0.9,col=c(1,2),lwd=1) ## ----WineWXG2boost,fig.width=3.25,fig.height=3.25----------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(winexg2.w.model.roc.learn,lwd=1) lines(winexg2.w.model.roc.test,lwd=1,col="red") legend("bottomright",legend=c("learning","test"),cex=0.9,col=c(1,2),lwd=1)