## ----setup,cache=FALSE,echo=FALSE--------------------------------------------- current.file.basename <- "outliers" ## ----common-setup,child='beamer.Rnw'------------------------------------------ ## ----setupforbeamer,cache=FALSE,echo=FALSE,results='hide',message=FALSE------- set.seed(0) datadir <- paste("data-",current.file.basename,sep="") ## full size by default ## beamer sizes: 5.04x3.78 opts_chunk$set(echo=FALSE,cache.path=paste(".cache-",current.file.basename,"/",sep=""), fig.env='center', fig.width=4.5,fig.height=3.4, fig.path=paste("figures-",current.file.basename,"/",sep="")) options(scipen = 5, OutDec=".") beamerblue <- rgb(0.2,0.2,0.7) beamergreen <- rgb(0,0.5,0) beamerblue.alpha <- rgb(0.2,0.2,0.7,0.5) apiacoa.colors <- list("darkblue"="#051F42", "darkorange"="#E84017", "darkgreen"="#3D6B34", "blue"="#003075", "lightblue"="#99BFBB", "darkrose"="#997568") palette(unlist(apiacoa.colors)) to_alpha <- function(color,alpha=0.5) { dacol <- as.vector(col2rgb(color)) rgb(red=dacol[1]/255,green=dacol[2]/255,blue=dacol[3]/255,alpha=alpha) } ## tool for multiple figure printing hook_plot <- knit_hooks$get('plot') # the default hook hook_plot_multi <- function(x, options) { txt = hook_plot(x, options) if (options$fig.cur <= 0) return(txt) # add \only before \includegraphics gsub('(\\\\includegraphics[^}]+} )', sprintf('\\\\only<%d>{\\1}%%', options$fig.cur), txt) } to_single_fig <- function() { knit_hooks$set(plot = hook_plot) } to_multi_fig <- function() { knit_hooks$set(plot = hook_plot_multi) } ## standard packages always used library(xtable) ## ----ggplot-setup,child='beamer-ggplot.Rnw'----------------------------------- ## ----ggplotbeamer,cache=FALSE,echo=FALSE,results='hide',message=FALSE--------- library(ggplot2) theme_set(theme_bw(base_size=7)+theme( line=element_line(size=0.1), rect=element_rect(fill=grey(0.98)), legend.key.size=unit(0.5,"lines"), legend.key=element_rect(fill=grey(0.98)), panel.grid.major = element_line(size = 0.3), panel.grid.minor = element_line(size = 0.1) ) ) ## ----local-setup-------------------------------------------------------------- opts_chunk$set(cache=TRUE,autodep = TRUE) ## ----packages,cache=FALSE,echo=FALSE,results='hide',message=FALSE,warning=FALSE---- library(dbscan) library(MASS) library(tidyverse) library(EnvStats) # gamma distribution estimation library(lubridate) library(ellipse) library(DDoutlier) library(e1071) library(isotree) ## ----loadWine,cache=TRUE------------------------------------------------------ wine <- read.csv("../data/wine.data",header=FALSE) names(wine) <- c("Class","Alcohol","Malic acid","Ash","Alcalinity of ash", "Magnesium", "Total phenols","Flavanoids","Nonflavanoid phenols","Proanthocyanins", "Color intensity","Hue","OD280/OD315","Proline") wine$Class <- as.factor(wine$Class) ## ----wineDbScanEx,cache=TRUE-------------------------------------------------- wine.sub.dbscan <- dbscan(wine[c("Flavanoids","Alcohol")],eps=0.55,minPts=4) ## ----wineFlavAlco,fig.width=2.5,fig.height=2---------------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(wine[c("Flavanoids","Alcohol")],asp=1,pch=20) ## ----wineFlavAlcoOut,fig.width=2.5,fig.height=2------------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(wine[c("Flavanoids","Alcohol")],asp=1,pch=20,col=ifelse(wine[["Flavanoids"]]<5,1,2)) ## ----wineFlavAlcoDbs,fig.width=2.5,fig.height=2------------------------------- par(mar=c(4,4,0.1,0.1),cex=0.5) plot(wine[c("Flavanoids","Alcohol")],asp=1,pch=20,col=ifelse(wine.sub.dbscan$cluster!=0 ,1,2)) ## ----loadWineSpectra,cache=TRUE----------------------------------------------- spwine.spectra <- read.table("../data/wine-spectra/Wine_x_learning.txt") spwine.alcohol <- read.table("../data/wine-spectra/Wine_y_learning.txt") names(spwine.alcohol) <- "Alcohol" spwine.alcohol.m <- mean(spwine.alcohol[["Alcohol"]]) spwine.alcohol.sd <- sd(spwine.alcohol[["Alcohol"]]) spwine.alcohol[["Cumul"]] <- pnorm(spwine.alcohol[["Alcohol"]],mean=spwine.alcohol.m,sd=spwine.alcohol.sd) spwine.alcohol[["Outlier"]] <- pmin(spwine.alcohol[["Cumul"]],1-spwine.alcohol[["Cumul"]])<1e-3 spwine.spectra.m <- apply(spwine.spectra,2,mean) spwine.spectra.sd <- apply(spwine.spectra,2,sd) spwine.spectra.mp2d <- spwine.spectra.m+2*spwine.spectra.sd spwine.spectra.mm2d <- spwine.spectra.m-2*spwine.spectra.sd spwine.spectra.fr <- seq(1/4000,1/400,length.out=ncol(spwine.spectra)) spwine.spectra.carac <- data.frame(mean=apply(spwine.spectra,1,mean),sd=apply(spwine.spectra,1,sd)) spwine.outliers <- c(34, 35, 84) ## ----fig.width=2.2,fig.height=1.8--------------------------------------------- ggplot(spwine.alcohol,aes(Alcohol)) + geom_density() + geom_rug(alpha=0.5,size=0.3) ## ----fig.width=2.2,fig.height=1.8--------------------------------------------- ggplot(spwine.alcohol,aes(Alcohol)) + geom_density() + geom_rug(alpha=0.5,size=0.3,aes(color=!Outlier),show.legend=FALSE) ## ----cleanWineSpectra--------------------------------------------------------- spwine.spectra.c1 <- spwine.spectra[-84,] spwine.spectra.c1.m <- apply(spwine.spectra.c1,2,mean) spwine.spectra.c1.sd <- apply(spwine.spectra.c1,2,sd) spwine.spectra.c1.mp2d <- spwine.spectra.c1.m+2*spwine.spectra.c1.sd spwine.spectra.c1.mm2d <- spwine.spectra.c1.m-2*spwine.spectra.c1.sd ## ----fig.width=2.8,fig.height=2.7--------------------------------------------- par(cex=0.5) plot(spwine.spectra.fr,spwine.spectra.m,ylim=range(spwine.spectra.mp2d,spwine.spectra.mm2d), col=2,xlab="Frequency",ylab="",type="l",main="Mean spectrum and deviation") lines(spwine.spectra.fr,spwine.spectra.mp2d,lwd=0.5) lines(spwine.spectra.fr,spwine.spectra.mm2d,lwd=0.5) ## ----fig.width=2.8,fig.height=2.7--------------------------------------------- par(cex=0.5) plot(spwine.spectra.fr,spwine.spectra[84,],ylim=range(spwine.spectra.mp2d,spwine.spectra.mm2d), col=1,xlab="Frequency",ylab="",type="l",main="Faulty spectrum") ## ----fig.width=2.8,fig.height=2.7--------------------------------------------- par(cex=0.5) plot(spwine.spectra.fr,spwine.spectra.c1.m,ylim=range(spwine.spectra.c1.mp2d,spwine.spectra.c1.mm2d), col=2,xlab="Frequency",ylab="",type="l",main="Mean spectrum and deviation (clean)") lines(spwine.spectra.fr,spwine.spectra.c1.mp2d,lwd=0.5) lines(spwine.spectra.fr,spwine.spectra.c1.mm2d,lwd=0.5) ## ----fig.height=2.9----------------------------------------------------------- par(cex=0.5) plot(spwine.spectra.fr,spwine.spectra[1,], col=1,xlab="Frequency",ylab="",type="l",main="A spectrum") ## ----fig.height=2.9----------------------------------------------------------- par(cex=0.5) plot(spwine.spectra.fr,spwine.spectra[34,], col=1,xlab="Frequency",ylab="",type="l",main="Another spectrum") ## ----fig.height=2.9----------------------------------------------------------- par(cex=0.5) plot(spwine.spectra.fr,spwine.spectra.c1.m,ylim=range(spwine.spectra.c1.mp2d,spwine.spectra.c1.mm2d), col=1,xlab="Frequency",ylab="",type="l",main="Mean spectrum and deviation (clean)",lwd=0.5) lines(spwine.spectra.fr,spwine.spectra.c1.mp2d,lwd=0.5) lines(spwine.spectra.fr,spwine.spectra.c1.mm2d,lwd=0.5) lines(spwine.spectra.fr,spwine.spectra[1,],col=2) ## ----fig.height=2.9----------------------------------------------------------- par(cex=0.5) plot(spwine.spectra.fr,spwine.spectra.c1.m,ylim=range(spwine.spectra.c1.mp2d,spwine.spectra.c1.mm2d), col=1,xlab="Frequency",ylab="",type="l",main="Mean spectrum and deviation (clean)",lwd=0.5) lines(spwine.spectra.fr,spwine.spectra.c1.mp2d,lwd=0.5) lines(spwine.spectra.fr,spwine.spectra.c1.mm2d,lwd=0.5) lines(spwine.spectra.fr,spwine.spectra[34,],col=2) ## ----loadPowerCurve,cache=TRUE,message=FALSE---------------------------------- powercurves <- read_delim("../data/power/power-curves.txt"," ") hours <- seq(0,24,length.out=145)[1:144] curves <- as.matrix(powercurves[,2:145]) yrange <- range(curves) mean.power <- data.frame(date=powercurves$date,power=rowMeans(curves)) daysofyear <- 1 + as.POSIXlt(powercurves$date)$yday ## ----fig.width=2.2,fig.height=2----------------------------------------------- par(cex=0.5,mar=c(4,4,0.1,0.1)) plot(hours,curves[9,],type="l",ylab="kW",xlab="time",ylim=yrange) lines(hours,curves[10,],col=2) lines(hours,curves[11,],col=3) ## ----powerAsGamma,cache=TRUE-------------------------------------------------- mean.power.gamma <- egamma(mean.power$power) mean.power[["Cumul"]] <- pgamma(mean.power$power,shape= mean.power.gamma$parameters[1],scale= mean.power.gamma$parameters[2]) mean.power[["Oulier"]] <- pmin(mean.power[["Cumul"]],1-mean.power[["Cumul"]])<1e-3 ## ----fig.width=2.2,fig.height=2----------------------------------------------- ggplot(mean.power,aes(x=power)) + geom_density() + geom_rug(alpha=0.5,size=0.3) + ggtitle("Mean daily power") ## ----fig.width=2.3,fig.height=2----------------------------------------------- ggplot(mean.power,aes(x=date,y=power)) + geom_point(size=0.5) + ggtitle("Mean daily power") ## ----------------------------------------------------------------------------- collective.outlier <- 53:58 ## ----fig.height=2.9----------------------------------------------------------- ggplot(mean.power,aes(x=date,y=power)) + geom_line(size=0.25) + ggtitle("Mean daily power") ## ----fig.height=2.9----------------------------------------------------------- ggplot(mean.power,aes(x=date,y=power)) + geom_line(size=0.25) + ggtitle("Mean daily power") + annotate(geom="line",x=mean.power$date[collective.outlier], y=mean.power$power[collective.outlier],size=1,col=2) ## ----fig.width=2.8,fig.height=2.7--------------------------------------------- par(cex=0.5) plot(spwine.spectra.fr,spwine.spectra.sd, xlab="Frequency",ylab="",type="l",main="Standard deviation") ## ----fig.width=2.8,fig.height=2.7--------------------------------------------- par(cex=0.5) plot(embed(spwine.spectra.sd,2), xlab="Value at frequency i",ylab="Value at frequency i+1",pch=20,main="Standard deviation") ## ----------------------------------------------------------------------------- mean.power.week <- mean.power %>% group_by(week=week(date)) %>% summarise(power=mean(power)) mean.power.week.out <- order(mean.power.week$power)[c(1:3,length(mean.power.week$power))] mean.power.week[["Outlier"]] <- mean.power.week$week %in% mean.power.week.out mean.power[["WeekOutlier"]] <- week(mean.power$date) %in% mean.power.week.out mean.power.wo <- which(mean.power[["WeekOutlier"]]) mean.power.jumps <- c(which(diff(which(mean.power[["WeekOutlier"]]))>1),length(mean.power.wo)) ## ----fig.width=2.8,fig.height=2.7--------------------------------------------- ggplot(mean.power.week,aes(x=power)) + geom_density() + geom_rug(alpha=0.5,size=0.3) ## ----fig.width=2.8,fig.height=2.7--------------------------------------------- ggplot(mean.power.week,aes(x=power)) + geom_density() + geom_rug(alpha=0.5,size=0.3,aes(colour=!Outlier),show.legend=FALSE) ## ----fig.height=2.9----------------------------------------------------------- base <- ggplot(mean.power,aes(x=date,y=power)) + geom_line(size=0.25) + ggtitle("Mean daily power") basek <- 1 for(k in mean.power.jumps) { subset <- mean.power.wo[basek:k] base <- base + annotate(geom="line",x=mean.power$date[subset], y=mean.power$power[subset],size=1,col=2) basek <- k+1 } print(base) ## ----------------------------------------------------------------------------- wineAM <- wine %>% dplyr::select(Ash,Magnesium) wineAM2 <- wine %>% dplyr::select(Ash,Magnesium) wineAM.mu <-wineAM %>% colMeans() wineAM.S <- wineAM %>% cov() wineAM$distance <- sqrt(mahalanobis(wineAM, wineAM.mu,wineAM.S)) wineAM$chisq <- pchisq(wineAM$distance^2,df=2) wineAM[["Outlier"]] <- wineAM$chisq>=1-1e-3 wineAM$Tsq <- pf(wineAM$distance^2*nrow(wineAM)/(nrow(wineAM)-1)*(nrow(wineAM)-ncol(wineAM))/(ncol(wineAM)*(nrow(wineAM)-1)),ncol(wineAM),nrow(wineAM)-ncol(wineAM)) ## ----fig.height=2.9----------------------------------------------------------- ggplot(wineAM,aes(Ash,Magnesium)) + geom_point(size=0.5) ## ----fig.width=2.1,fig.height=2----------------------------------------------- ggplot(wineAM,aes(Ash,Magnesium)) + geom_point(size=0.5) ## ----fig.width=2.1,fig.height=2----------------------------------------------- ggplot(wineAM,aes(distance)) + geom_density() + geom_rug(alpha=0.5,size=0.3) ## ----fig.width=2.1,fig.height=2----------------------------------------------- ggplot(wineAM,aes(Ash,Magnesium)) + geom_point(aes(color=!Outlier),size=0.5,show.legend=FALSE) ## ----fig.width=2.1,fig.height=2----------------------------------------------- ggplot(wineAM,aes(distance)) + geom_density() + geom_rug(alpha=0.5,size=0.3,aes(color=!Outlier),show.legend=FALSE) ## ----fig.width=2.1,fig.height=2----------------------------------------------- ggplot(wineAM,aes(Ash,Magnesium)) + geom_point(aes(color=chisq<1-1e-2),size=0.5,show.legend=FALSE) ## ----fig.width=2.1,fig.height=2----------------------------------------------- ggplot(wineAM,aes(distance)) + geom_density() + geom_rug(alpha=0.5,size=0.3,aes(color=chisq<1-1e-2),show.legend=FALSE) ## ----fig.width=2.1,fig.height=2----------------------------------------------- ggplot(wineAM,aes(Ash,Magnesium)) + geom_point(aes(color=chisq<1-1e-3),size=0.5,show.legend=FALSE) + ggtitle("Chi squared (0.001)") ## ----fig.width=2.1,fig.height=2----------------------------------------------- ggplot(wineAM,aes(Ash,Magnesium)) + geom_point(aes(color=Tsq<1-1e-3),size=0.5,show.legend=FALSE) + ggtitle("T squared (0.001)") ## ----fig.width=2.1,fig.height=2----------------------------------------------- ggplot(wineAM,aes(Ash,Magnesium)) + geom_point(aes(color=chisq<1-1e-2),size=0.5,show.legend=FALSE) + ggtitle("Chi squared (0.01)") ## ----fig.width=2.1,fig.height=2----------------------------------------------- ggplot(wineAM,aes(Ash,Magnesium)) + geom_point(aes(color=Tsq<1-1e-2),size=0.5,show.legend=FALSE) + ggtitle("T squared (0.01)") ## ----fig.height=2.9----------------------------------------------------------- ggplot(wine,aes(Ash,Magnesium)) + geom_point(size=0.5) + stat_ellipse(type="norm",level=0.99) + stat_ellipse(type="norm",level=0.999,linetype=2)+ ggtitle("Ggplot tool") ## ----avgconv,cache=TRUE------------------------------------------------------- cumvar <- function(x) { base <- (cummean(x^2)-cummean(x)^2)[-1] base*(2:length(x))/(1:(length(x)-1)) } simust_df <- 4 simu_size <- 500 simuconv <- data.frame(gaussian=rnorm(simu_size,sd=sqrt(simust_df/(simust_df-2))), student=rt(simu_size,simust_df)) simumean <- data.frame(n=1:simu_size,gaussian=cummean(simuconv$gaussian), student=cummean(simuconv$student)) simuvar <- data.frame(n=2:simu_size,gaussian=cumvar(simuconv$gaussian), student=cumvar(simuconv$student)) simutheo <- data.frame(x=seq(-4*sqrt(simust_df/(simust_df-2)),4*sqrt(simust_df/(simust_df-2)),length.out=501)) simutheo <- simutheo %>% mutate(gaussian=dnorm(x,sd=sqrt(simust_df/(simust_df-2))),student=dt(x,simust_df)) simutheo <- simutheo %>% gather(gaussian,student,key="distribution",value="density") simuconv.g <- simuconv %>% gather(gaussian,student,key="distribution",value="value") simumean.g <- simumean %>% gather(gaussian,student,key="distribution",value="value") simuvar.g <- simuvar %>% gather(gaussian,student,key="distribution",value="value") ## ----fig.height=2.9----------------------------------------------------------- ggplot(simutheo,aes(x=x,y=density,group=distribution,color=distribution)) + geom_line() + labs(title=paste("Variance =",simust_df/(simust_df-2))) ## ----fig.height=2.6----------------------------------------------------------- ggplot(simuconv.g,aes(x=value)) + geom_density() + facet_wrap(.~distribution) + geom_rug(alpha=0.5,size=0.3) ## ----fig.height=2.9----------------------------------------------------------- ggplot(simumean.g ,aes(x=n,y=value,group=distribution,color=distribution)) + geom_line() +labs(title="Mean estimation") + annotate("line",x=c(1,500),y=0,size=0.5,alpha=0.25) ## ----fig.height=2.9----------------------------------------------------------- ggplot(simumean.g %>% filter(n>50),aes(x=n,y=value,group=distribution,color=distribution)) + geom_line() +labs(title="Mean estimation")+ annotate("line",x=c(51,500),y=0,size=0.5,alpha=0.25) ## ----fig.height=2.9----------------------------------------------------------- ggplot(simuvar.g ,aes(x=n,y=value,group=distribution,color=distribution)) + geom_line() +labs(title="Variance estimation") + annotate("line",x=c(1,500),y=simust_df/(simust_df-2),size=0.5,alpha=0.25) ## ----fig.height=2.9----------------------------------------------------------- ggplot(simuvar.g %>% filter(n>50),aes(x=n,y=value,group=distribution,color=distribution)) + geom_line() +labs(title="Variance estimation") + annotate("line",x=c(51,500),y=simust_df/(simust_df-2),size=0.5,alpha=0.25) ## ----------------------------------------------------------------------------- wineAM_MLE <- cov(wineAM2) wineAM_MLE_el95 <- data.frame(ellipse(wineAM_MLE,centre=colMeans(wineAM),level=0.95)) wineAM_MVE <- cov.rob(wineAM2,method="mve") wineAM_MVE_el95 <- data.frame(ellipse(wineAM_MVE$cov,centre=wineAM_MVE$center,level=0.95)) wineAM_MCD <- cov.rob(wineAM2,method="mcd") wineAM_MCD_el95 <- data.frame(ellipse(wineAM_MCD$cov,centre=wineAM_MCD$center,level=0.95)) ## ----fig.height=2.9----------------------------------------------------------- ggplot(wineAM,aes(Ash,Magnesium)) + geom_point(size=0.5) + geom_path(data=wineAM_MLE_el95,aes(Ash,Magnesium))+ggtitle("MLE, 95%")+scale_y_continuous(limits=c(55,165)) ## ----fig.height=2.9----------------------------------------------------------- ggplot(wineAM,aes(Ash,Magnesium)) + geom_point(size=0.5) + geom_path(data=wineAM_MVE_el95,aes(Ash,Magnesium))+ggtitle("MVE, 95%")+scale_y_continuous(limits=c(55,165)) ## ----fig.height=2.9----------------------------------------------------------- ggplot(wineAM,aes(Ash,Magnesium)) + geom_point(size=0.5) + geom_path(data=wineAM_MCD_el95,aes(Ash,Magnesium))+ggtitle("MCD, 95%")+scale_y_continuous(limits=c(55,165)) ## ----------------------------------------------------------------------------- wineAM_MLE_el99 <- data.frame(ellipse(wineAM_MLE,centre=colMeans(wineAM),level=0.99)) wineAM_MVE_el99 <- data.frame(ellipse(wineAM_MVE$cov,centre=wineAM_MVE$center,level=0.99)) wineAM_MCD_el99 <- data.frame(ellipse(wineAM_MCD$cov,centre=wineAM_MCD$center,level=0.99)) ## ----fig.height=2.9----------------------------------------------------------- ggplot(wineAM,aes(Ash,Magnesium)) + geom_point(size=0.5) + geom_path(data=wineAM_MLE_el99,aes(Ash,Magnesium))+ggtitle("MLE, 99%")+scale_y_continuous(limits=c(55,165)) ## ----fig.height=2.9----------------------------------------------------------- ggplot(wineAM,aes(Ash,Magnesium)) + geom_point(size=0.5) + geom_path(data=wineAM_MVE_el99,aes(Ash,Magnesium))+ggtitle("MVE, 99%")+scale_y_continuous(limits=c(55,165)) ## ----fig.height=2.9----------------------------------------------------------- ggplot(wineAM,aes(Ash,Magnesium)) + geom_point(size=0.5) + geom_path(data=wineAM_MCD_el99,aes(Ash,Magnesium))+ggtitle("MCD, 99%")+scale_y_continuous(limits=c(55,165)) ## ----------------------------------------------------------------------------- wineAM2$distance <- sqrt(mahalanobis(wineAM2,wineAM_MCD$center,wineAM_MCD$cov)) wineAM2$chisq <- pchisq(wineAM2$distance^2,df=2) wineAM2[["Outlier"]] <- wineAM2$chisq>=1-1e-3 wineAM2$Tsq <- pf(wineAM2$distance^2*nrow(wineAM2)/(nrow(wineAM2)-1)*(nrow(wineAM2)-ncol(wineAM2))/(ncol(wineAM2)*(nrow(wineAM2)-1)),ncol(wineAM2),nrow(wineAM2)-ncol(wineAM2)) ## ----fig.width=2.1,fig.height=2----------------------------------------------- ggplot(wineAM2,aes(Ash,Magnesium)) + geom_point(size=0.5) + ggtitle("MCD") ## ----fig.width=2.1,fig.height=2----------------------------------------------- ggplot(wineAM2,aes(distance)) + geom_density() + geom_rug(alpha=0.5,size=0.3)+ ggtitle("MCD") ## ----fig.width=2.1,fig.height=2----------------------------------------------- ggplot(wineAM2,aes(Ash,Magnesium)) + geom_point(aes(color=!Outlier),size=0.5,show.legend=FALSE) + ggtitle("MCD, 99.9%") ## ----fig.width=2.1,fig.height=2----------------------------------------------- ggplot(wineAM2,aes(distance)) + geom_density() + geom_rug(alpha=0.5,size=0.3,aes(color=!Outlier),show.legend=FALSE)+ ggtitle("MCD, 99.9%") ## ----fig.width=2.1,fig.height=2----------------------------------------------- ggplot(wineAM2,aes(Ash,Magnesium)) + geom_point(aes(color=chisq<1-1e-2),size=0.5,show.legend=FALSE)+ ggtitle("MCD, 99%") ## ----fig.width=2.1,fig.height=2----------------------------------------------- ggplot(wineAM2,aes(distance)) + geom_density() + geom_rug(alpha=0.5,size=0.3,aes(color=chisq<1-1e-2),show.legend=FALSE)+ ggtitle("MCD, 99%") ## ----fig.width=2.1,fig.height=1.5--------------------------------------------- ggplot(wineAM,aes(Ash,Magnesium)) + geom_point(aes(color=!Outlier),size=0.5,show.legend=FALSE)+ ggtitle("MLE, 99.9%") ## ----fig.width=2.1,fig.height=1.5--------------------------------------------- ggplot(wineAM,aes(distance)) + geom_density() + geom_rug(alpha=0.5,size=0.3,aes(color=!Outlier),show.legend=FALSE)+ ggtitle("MLE, 99.9%") ## ----fig.width=2.1,fig.height=1.5--------------------------------------------- ggplot(wineAM2,aes(Ash,Magnesium)) + geom_point(aes(color=!Outlier),size=0.5,show.legend=FALSE)+ ggtitle("MCD, 99.9%") ## ----fig.width=2.1,fig.height=1.5--------------------------------------------- ggplot(wineAM2,aes(distance)) + geom_density() + geom_rug(alpha=0.5,size=0.3,aes(color=!Outlier),show.legend=FALSE)+ ggtitle("MCD, 99.9%") ## ----powerCov,cache=TRUE------------------------------------------------------ power_MCD <- cov.rob(curves,method="mcd") power_MLE <- list(cov=cov(curves),center=colMeans(curves)) ## ----powerPCA,cache=TRUE------------------------------------------------------ power_PCA <- prcomp(curves) power_PCA_exp <- cumsum(power_PCA$sdev^2) power_PCA_exp <- power_PCA_exp/power_PCA_exp[length(power_PCA_exp)] ## ----powerDist---------------------------------------------------------------- powerDist <- list() for(dims in c(2,4,10,20,50,144)) { powerDist[[as.character(dims)]] <- apply(power_PCA$x[,1:dims],1,function(x) sqrt(sum(x^2))) } powerDist <- as_tibble(powerDist) powerDist <- powerDist %>% gather(key="dimension",value="distance") powerDist <- powerDist %>% mutate(dimension=factor(dimension,levels=as.character(c(2,4,10,20,50,144)))) ## ----fig.height=2,fig.width=2.2----------------------------------------------- par(mar=c(4,4,1.1,0.1),cex=0.5) barplot(power_PCA_exp[1:40],xlab="Components",ylab="% of explained variance",ylim=c(0,1)) ## ----fig.height=2.9----------------------------------------------------------- ggplot(powerDist,aes(distance,group=dimension,color=dimension)) + geom_density() + facet_wrap(.~dimension) + geom_rug(show.legend=FALSE,alpha=0.5,size=0.3) ## ----powerMaha---------------------------------------------------------------- power_MCD_M <- sqrt(mahalanobis(curves,power_MCD$center,power_MCD$cov)) power_MLE_M <- sqrt(mahalanobis(curves,power_MLE$center,power_MLE$cov)) power_mah <- tibble(MCD=power_MCD_M,MLE=power_MLE_M) power_mah_g <- power_mah %>% gather(key="method",value="distance") ## ----fig.height=2.9----------------------------------------------------------- ggplot(power_mah_g,aes(distance,group=method,color=method)) + geom_density() + geom_rug(show.legend=FALSE,alpha=0.5,size=0.3) ## ----fig.height=2.9----------------------------------------------------------- ggplot(power_mah,aes(MCD,MLE)) + geom_point(size=0.5)+ggtitle("") ## ----------------------------------------------------------------------------- bounds.99 <- list(min=sqrt(qchisq(1-0.995,df=ncol(curves))),max=sqrt(qchisq(0.995,df=ncol(curves)))) bounds.999 <- list(min=sqrt(qchisq(1-0.9995,df=ncol(curves))),max=sqrt(qchisq(0.9995,df=ncol(curves)))) bounds.99999 <- list(min=sqrt(qchisq(1-0.999995,df=ncol(curves))),max=sqrt(qchisq(0.999995,df=ncol(curves)))) ## ----fig.height=2.9----------------------------------------------------------- ggplot(power_mah,aes(MCD,MLE)) + geom_point(size=0.5) +annotate("rect",xmin=bounds.99$min,xmax=bounds.99$max,ymin=bounds.99$min,ymax=bounds.99$max,alpha=0.1,fill=2)+ggtitle("99% region") ## ----fig.height=2.9----------------------------------------------------------- ggplot(power_mah,aes(MCD,MLE)) + geom_point(size=0.5) +annotate("rect",xmin=bounds.999$min,xmax=bounds.999$max,ymin=bounds.999$min,ymax=bounds.999$max,alpha=0.1,fill=2)+ggtitle("99.9% region") ## ----fig.height=2.9----------------------------------------------------------- ggplot(power_mah,aes(MCD,MLE)) + geom_point(size=0.5) +annotate("rect",xmin=bounds.99999$min,xmax=bounds.99999$max,ymin=bounds.99999$min,ymax=bounds.99999$max,alpha=0.1,fill=2)+ggtitle("99.999% region") ## ----------------------------------------------------------------------------- nb.comp <- which.min(power_PCA_exp<=0.95) power_PCA_C <- list(cov=cov(power_PCA$x[,1:nb.comp]),center=colMeans(power_PCA$x[,1:nb.comp])) power_PCA_D <- sqrt(mahalanobis(power_PCA$x[,1:nb.comp],power_PCA_C$center,power_PCA_C$cov)) bounds.PCA.999 <- list(min=sqrt(qchisq(1-0.9995,df=nb.comp)),max=sqrt(qchisq(0.9995,df=nb.comp))) ## ----wineLOF------------------------------------------------------------------ wineAM$LOF <- LOF(scale(as.matrix(wine %>% dplyr::select(Ash,Magnesium))),k=5) ## ----fig.height=2.9----------------------------------------------------------- ggplot(wineAM,aes(Ash,Magnesium,size=LOF))+geom_point(alpha=0.2,shape=1)+geom_point(size=0.1)+ggtitle("LOF (k=5)") ## ----wineINFLO---------------------------------------------------------------- wineAM$INFLO <- INFLO(scale(as.matrix(wine %>% dplyr::select(Ash,Magnesium))),k=5) ## ----fig.height=2.9----------------------------------------------------------- ggplot(wineAM,aes(Ash,Magnesium,size=INFLO))+geom_point(alpha=0.2,shape=1)+geom_point(size=0.1)+ggtitle("INFLO (k=5)") ## ----OneClassSVM-------------------------------------------------------------- wineOC <- as.matrix(wine %>% dplyr::select(Ash,Magnesium)) gammas <- seq(0.01,0.75,length.out=15) nus <- seq(0.005,0.2,length.out=15) params <- expand.grid(nu=nus,gamma=gammas) OCWine <- list() for(p in 1:nrow(params)) { OCWine[[p]] <- svm(wineOC,scale=T,type="one-classification",nu=params[p,]$nu,gamma=params[p,]$gamma) } results <- params results$outliers <- sapply(OCWine,function(x) sum(!x$fitted)) ## ----fig.height=2.9----------------------------------------------------------- ggplot(wineAM,aes(Ash,Magnesium))+geom_point(size=0.5) ## ----fig.height=2.9----------------------------------------------------------- ggplot(results,aes(nu,gamma,z=outliers)) + scale_x_continuous(expand=expansion()) +scale_y_continuous(expand=expansion())+ geom_contour_filled(breaks=c(0:10,seq(15,60,by=5)))+ggtitle("Number of outliers") ## ----------------------------------------------------------------------------- myexpanded <- function(arange,nb,mult=0.05) { min <- arange[1] max <- arange[2] nmin <- min-(max-min)*mult nmax <- max+(max-min)*mult seq(nmin,nmax,length.out=nb) } ## ----OneClassSVMEval---------------------------------------------------------- places <- expand.grid(Ash=myexpanded(range(wine$Ash),101), Magnesium=myexpanded(range(wine$Magnesium),101)) predictions <- list() for(p in 1:nrow(params)) { the.model <- OCWine[[p]] predictions[[p]] <- predict(the.model,places) } ## ----------------------------------------------------------------------------- resultsOCSVM <- places resultsOCSVM$level <- rep(0,nrow(resultsOCSVM)) tokeep <- which(params$nu==nus[2]) for(tk in tokeep) { resultsOCSVM$level <- resultsOCSVM$level + as.integer(predictions[[tk]]) } ## ----fig.height=2.9----------------------------------------------------------- ggplot(resultsOCSVM,aes(Ash,Magnesium,z=level))+ geom_contour_filled(breaks=seq(-1,length(tokeep)+1),alpha=0.75)+ scale_x_continuous(expand=expansion()) +scale_y_continuous(expand=expansion())+ annotate("point",x=wineAM$Ash,y=wineAM$Magnesium,size=0.5)+ggtitle(paste("Fixed nu:",round(nus[2],4))) ## ----------------------------------------------------------------------------- resultsOCSVM <- places resultsOCSVM$level <- rep(0,nrow(resultsOCSVM)) tokeep <- which(params$gamma==gammas[3]) for(tk in tokeep) { resultsOCSVM$level <- resultsOCSVM$level + as.integer(predictions[[tk]]) } ## ----fig.height=2.9----------------------------------------------------------- ggplot(resultsOCSVM,aes(Ash,Magnesium,z=level))+ geom_contour_filled(breaks=seq(-1,length(tokeep)+1),alpha=0.75)+ scale_x_continuous(expand=expansion()) +scale_y_continuous(expand=expansion())+ annotate("point",x=wineAM$Ash,y=wineAM$Magnesium,size=0.5)+ggtitle(paste("Fixed gamma",round(gammas[3],4))) ## ----------------------------------------------------------------------------- resultsOCSVM <- places resultsOCSVM$level <- rep(0,nrow(resultsOCSVM)) tokeep <- which(params$gamma==gammas[5]) for(tk in tokeep) { resultsOCSVM$level <- resultsOCSVM$level + as.integer(predictions[[tk]]) } ## ----fig.height=2.9----------------------------------------------------------- ggplot(resultsOCSVM,aes(Ash,Magnesium,z=level))+ geom_contour_filled(breaks=seq(-1,length(tokeep)+1),alpha=0.75)+ scale_x_continuous(expand=expansion()) +scale_y_continuous(expand=expansion())+ annotate("point",x=wineAM$Ash,y=wineAM$Magnesium,size=0.5)+ggtitle(paste("Fixed gamma",round(gammas[5],5))) ## ----wineIsoF----------------------------------------------------------------- wineIsoF <- isolation.forest(wine %>% dplyr::select(Ash,Magnesium)) resultIsoF <- places resultIsoF$score <- predict(wineIsoF,resultIsoF) ## ----fig.height=2.9----------------------------------------------------------- ggplot(resultIsoF,aes(Ash,Magnesium,z=score))+ geom_contour_filled(alpha=0.6,breaks=c(0,seq(0.5,1,length.out=11)))+ scale_x_continuous(expand=expansion()) +scale_y_continuous(expand=expansion())+ annotate("point",x=wineAM$Ash,y=wineAM$Magnesium,size=0.5) ## ----saveAll------------------------------------------------------------------ saveRDS(power_MCD,file=file.path(paste(".cache-",current.file.basename,sep=""),"power_MCD.Rds"))