This vignette aims at presenting the features of the Channel Binary Entropy Triangle (CBET) for the evaluation of supervised classification in an incremental manner.
library(tidyverse) # That (in)famous Mr. Wickham!
library(caret) # To build the classifiers.
library(mlbench) # Many databases for ML tasks
library(vcd) # Categorical benchmarks
library(candisc) # Wine dataset
library(entropies) # Processing and visualizing joint entropies
library(compositions)# Statistics work differently on compositional data
Some top level switches and options gathered in one place.
debugLevel <- 0 # Debug level 0-non-existent, 1-minimal, the greater the more verbose.
fancy <- TRUE # set this for nicer on-screen visualization.
#fancy <- FALSE # Set this for either printing matter or more austere plots.
getPlot <- TRUE # Flag to obtain plots for publication.
getPlot <- FALSE #Comment to get .jpeg files rather than plots of ets.
knitr::opts_chunk$set(comment=NA, fig.width=6, fig.height=4)
if (getPlot)
knitr::opts_chunk$set(dev = 'pdf') # plots in pdf, better for publication
Some choices for visualization parameters, and primitives.
splitShapesForTypes <- c("X"=4, "Y"=1, "XY"=20) #To draw split diagrams
# Naive transformation from factors to numbers in 0 to num.factors - 1
factor.as.numeric <- function(f){
nums <- as.numeric(f)
return(nums - min(nums))
}
datasets <- getDatasets()
datasets
# A tibble: 7 x 7
name className packName idNumber K n m
<chr> <chr> <chr> <dbl> <int> <int> <int>
1 Ionosphere Class mlbench NaN 2 34 351
2 iris Species datasets NaN 3 4 150
3 Glass Type mlbench NaN 7 9 214
4 Arthritis Improved vcd 1 3 3 84
5 BreastCancer Class mlbench 1 2 9 699
6 Sonar Class mlbench NaN 2 60 208
7 Wine Cultivar candisc NaN 3 13 178
if (getPlot){# For papers, it helps to have the table in latex.
library(xtable)
print.xtable(xtable(dplyr::select(datasets, name, K, n, m)))
}
# Uncomment the name of the database to be analyzed
# dsName <- "Ionosphere"
# dsName <- "iris"
# dsName <- "BreastCancer"#Supply the name of the database to be analyzed#FACTORS
# dsName <- "Wine"
# dsName <- "Glass"#Cannot take logarithms for PCA: zeros returns -Inf
dsName <- "Arthritis"#It has non-numeric factors.
# dsName <- "Sonar"
dsRecord <- filter(datasets, name == dsName)
ds <- evalDataset(dsName)
We’ll use the Arthritis data throughout this vignette.
#id columns, if existent in dsRecord$idNumber
# log transform of everything but the class and any id if existant.
if (!is.na(dsRecord$idNumber)){
ds <- ds[,-dsRecord$idNumber]
}
#class column
ds.classNum <- which(names(ds)==dsRecord$className)
#take away the class, but keep it just in case.
class.ds <- ds[, ds.classNum]#saving the class. Warning A FACTOR!
ds <- ds[,-ds.classNum]
ds <- ds %>%
#transform factors to number
mutate_if(is.factor,factor.as.numeric) %>%
# Dispose of columns with NaN
select_if(function(v) !any(is.na(v))) %>%
# Dispose of constant columns: they carry no information
select_if(function(v)(var(v) > 0))
ncols <- ncol(ds)#Mnemonic shortcut: num of columns
dsDiscretized <- infotheo::discretize(ds, disc="equalwidth")
if (dsName != "Ionosphere"){
log.ds <- log(ds)#this has to be made conditional on the database
log.dsDiscretized <- infotheo::discretize(log.ds)
#TODO: try to get rid of annoying warnings each time entropy is called.
}
X <- as.matrix(ds)
Y <- class.ds
classes <- unique(Y)
numC <- length(classes)
print(sprintf("%s has %d classes with distribution: ", dsName, numC))
[1] "Arthritis has 3 classes with distribution: "
summary(Y)
None Some Marked
42 14 28
Throughout this vignette we use a k-nearest neigbour classifier.
In this initial evaluation, we first carry out a basic random partitioning of the data.
set.seed(2117)
inTrain <- createDataPartition(y=Y,
p=0.80, # Tries to do stratified sampling
list=FALSE)
trainX <- X[inTrain,]; trainY <- Y[inTrain]
testX <- X[-inTrain,]; testY <- Y[-inTrain]
#Basic model fitting
fit <- train(x=trainX, y=trainY,
method="knn",
tuneLength = 15,
preProcess = c("center", "scale"))
Evaluation proceeds by obtaining the confusion matrices for the train and test sets…
## obtain a training caret::confusion matrix
trCM <- confusionMatrix(predict(fit,trainX), trainY)
#trEntropies <-
trCoords <- jentropies(t(trCM$table))
## prediction and the test confusion matrix
predicted <- predict(fit, testX)
teCM <- confusionMatrix(predicted,testY)
#teEntropies <-
teCoords <- jentropies(t(teCM$table))
And then printing the results in the Entropy Triangle for a single classification experiment:
experiments <- rbind(cbind(trCoords, Phase="train", method="knn"),
cbind(teCoords, Phase="test", method="knn")
)
experiments <- cbind(dSet=dsName, experiments)
# The basic plot for the entropy triangle training and testX in different colours and glyphs
gp <- ggmetern(data=experiments %>% filter(type=="XY"), fancy) +
geom_point(aes(colour=Phase, shape=dSet), size=1) +
labs(shape="Dataset") +
scale_colour_brewer(palette="Set1")
gp
Note that, at least for Arthritis (and iris), there is a suspicious behaviour in the plot in that the classifier achieves a better information transfer (correlated with accuracy) in test than in training.
print(sprintf("Training accuracy= %f vs. Testing accuracy=%f ", trCM$overall[1], teCM$overall[1]))
[1] "Training accuracy= 0.579710 vs. Testing accuracy=0.666667 "
This is part of the “evaluation paradox” for classifications: since the test must have a higher variance, there will be instances of train-test partitions where the performance on the testing set will be higher that on the training set. This is partially solved with n-fold validation.
To confirm this intuition and get all the value for our coin in the entropy triangle, in the following, we use n-fold validation to visualize several experiments and their mean performance.
First we create the folds: the number of folds is a parameter of this script.
numFolds <- 5
set.seed(1717) # For reproducibility
folds <- createFolds(Y, numFolds)
print("Check that the sampling was stratified...")
[1] "Check that the sampling was stratified..."
for(i in 1:numFolds){
print(summary(Y[folds[[i]]]))
}
None Some Marked
8 3 6
None Some Marked
8 3 5
None Some Marked
9 3 5
None Some Marked
8 2 6
None Some Marked
9 3 6
summary(Y)
None Some Marked
42 14 28
Run the experiments
models <- c("knn") #c("knn", "logreg")
results <- data.frame()
for (i in 1:numFolds){
for (m in models){
# 1. select training and testX data and classes
trainObs <- unlist(folds[-i])
testObs <- folds[[i]]
trainX <- X[trainObs, ]; trainY <- Y[trainObs]
testX <- X[testObs, ]; testY <- Y[testObs]
# 2. Fit the model with the
model <- train(x=trainX, y=trainY,
method=m,
tuneLength = 15,
preProcess = c("center", "scale"))
# 3. Estimate the labels for the train set: confusion matrix, entropies, etc.
trainYhat <- predict(model, trainX)
trainCM <- confusionMatrix(trainYhat, trainY)
print(trainCM$table)
# 4. Estimate the labels for the test set
testYhat <- predict(model, testX)
testCM <- confusionMatrix(testYhat, testY)
print(testCM$table)
# 5. Gather results for
# CAVEAT: our framework supposes that in confusion matrices, rows are indexed by
# the reference, hence the transposition below
results <- rbind(results,
evaluate(t(trainCM$table)) %>% mutate(Fold=i,method=m, Phase="train",
Acc=trainCM$overall[1]),
evaluate(t(testCM$table)) %>% mutate(Fold=i,method=m, Phase="test",
Acc=testCM$overall[1])
)
print(sprintf("Fold %d, method %s Train accuracy = %f\t Test accuracy= %f",
i, m, trainCM$overall[1],testCM$overall[1])
)
}
}
Reference
Prediction None Some Marked
None 27 7 10
Some 0 0 0
Marked 7 4 12
Reference
Prediction None Some Marked
None 7 2 1
Some 0 0 0
Marked 1 1 5
[1] "Fold 1, method knn Train accuracy = 0.582090\t Test accuracy= 0.705882"
Reference
Prediction None Some Marked
None 30 7 9
Some 1 0 1
Marked 3 4 13
Reference
Prediction None Some Marked
None 7 2 2
Some 0 0 0
Marked 1 1 3
[1] "Fold 2, method knn Train accuracy = 0.632353\t Test accuracy= 0.625000"
Reference
Prediction None Some Marked
None 28 5 8
Some 2 3 2
Marked 3 3 13
Reference
Prediction None Some Marked
None 5 1 2
Some 2 0 0
Marked 2 2 3
[1] "Fold 3, method knn Train accuracy = 0.656716\t Test accuracy= 0.470588"
Reference
Prediction None Some Marked
None 30 8 9
Some 0 0 0
Marked 4 4 13
Reference
Prediction None Some Marked
None 7 1 3
Some 0 0 0
Marked 1 1 3
[1] "Fold 4, method knn Train accuracy = 0.632353\t Test accuracy= 0.625000"
Reference
Prediction None Some Marked
None 29 6 6
Some 0 0 0
Marked 4 5 16
Reference
Prediction None Some Marked
None 8 3 4
Some 0 0 0
Marked 1 0 2
[1] "Fold 5, method knn Train accuracy = 0.681818\t Test accuracy= 0.555556"
results <- cbind(dSet=dsName,results)#Watch it! This is only possible at last!
We show the plot for the result on a per-plot basis.
eT <- ggmetern(data=results %>% filter(type=="XY"), fancy) +
geom_point(aes(colour=Phase, shape=dSet), size=2) +
labs(shape="Dataset") +
scale_colour_manual(values=c("blue","red")) # Don't trust the training, that is the red
eT
Clearly, some test results are better than training results. What about centrality and dispersion measures?
results %>% filter(type=="XY") %>% select(-one_of(c("method"))) %>%
group_by(type, Phase) %>%
summarise(meanAcc=mean(Acc), sdAcc=sd(Acc), meanEMA = mean(EMA), sdEMA = sd(EMA))
# A tibble: 2 x 6
# Groups: type [?]
type Phase meanAcc sdAcc meanEMA sdEMA
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 XY test 0.596 0.0882 0.242 0.0392
2 XY train 0.637 0.0369 0.229 0.0242
This agrees with the theory that insists on the variance of the testing instances being higher.
Note that the Entropy-Modified Accuracy (EMA) is an alternative, and more pessimistic, measure than the Accuracy.
To use the split triangle to advantage we have to use an unbalanced dataset, e.g. Glass or Arthritis.
First we look at the entropies of the different folds.
eTsplit <- ggmetern(data=results %>% filter(type!="XY"), fancy) +
geom_point(aes(colour=Phase, shape=type), size=2) +
labs(shape="Split Entropies") +
scale_shape_manual(values=splitShapesForTypes) +
scale_colour_manual(values=c("train"="blue","test"="red"))
eTsplit + geom_text(data=results %>% filter(type == "XY"), aes(label=Fold,color=Phase))
#color="blue") size=4, vjust=2, hjust=1)
The number of the fold appears in the place where the aggregate entropies appeared in the previous CBET.
We can see that the input entropies \(H_{P_X}\) are much more concentrated. This is because the sampling was stratified. While in theory it should obtain similar reference class distributions for all folds, e.g. the “x” above should lie in a line parallel to the left side of the triangle, in practice the stratification is imperfect.
Also the predicted class distributions are much less entropic than the reference class distributions: their uncertainty has decreased, contrary to the data processing inequality. The reason for this is the classifier concentrating on majority classes, i.e. specializing, which increases the Accuracy.
To better view this phenomenon, we obtain the (compositional) means of the entropy values.
meanCompositions <- data.frame()
for(dsNam in unique(results$dSet)){
for(ph in unique(results$Phase)){
for(eType in unique(results$type)){
meanCompositions <-
rbind(meanCompositions,
cbind(dSet=dsNam, Phase=ph, type=eType,
data.frame(as.list(
mean(acomp(results %>%
filter(dSet==dsNam, Phase==ph, type==eType) %>%
select(one_of(c("DeltaH_P", "M_P", "VI_P"))))
)))
)
)
}
}
}
meanCompositions %>% filter(type=="XY")
dSet Phase type DeltaH_P M_P VI_P
1 Arthritis train XY 0.2144597 0.1034439 0.6820963
2 Arthritis test XY 0.2274859 0.1050174 0.6674967
And now we add it with a different glyph but the same colors.
eTsplit + geom_point(data=meanCompositions, aes(colour=Phase, shape=type), size=4)
This shows that there is very little mutual information transfer. It was made worse by the classifier specializing, that is, increasing \(\Delta H_{P_{XY}}\) at the expense of mutual information.
All in all, knn is a very bad classifier for Arthritis.
sessionInfo()
R version 3.5.0 (2018-04-23)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.5
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] bindrcpp_0.2.2 compositions_1.40-2 bayesm_3.1-0.1
[4] energy_1.7-4 robustbase_0.93-1 tensorA_0.36
[7] entropies_0.9.1 candisc_0.8-0 heplots_1.3-5
[10] car_3.0-0 carData_3.0-1 vcd_1.4-4
[13] mlbench_2.1-1 caret_6.0-80 lattice_0.20-35
[16] forcats_0.3.0 stringr_1.3.1 dplyr_0.7.6
[19] purrr_0.2.5 readr_1.1.1 tidyr_0.8.1
[22] tibble_1.4.2 ggplot2_2.2.1 tidyverse_1.2.1
loaded via a namespace (and not attached):
[1] colorspace_1.3-2 class_7.3-14 rio_0.5.10
[4] rprojroot_1.3-2 pls_2.6-0 rstudioapi_0.7
[7] DRR_0.0.3 prodlim_2018.04.18 lubridate_1.7.4
[10] xml2_1.2.0 codetools_0.2-15 splines_3.5.0
[13] mnormt_1.5-5 knitr_1.20 RcppRoll_0.3.0
[16] entropy_1.2.1 jsonlite_1.5 ggtern_2.2.1
[19] broom_0.4.5 ddalpha_1.3.4 kernlab_0.9-26
[22] sfsmisc_1.1-2 latex2exp_0.4.0 compiler_3.5.0
[25] httr_1.3.1 backports_1.1.2 assertthat_0.2.0
[28] Matrix_1.2-14 lazyeval_0.2.1 cli_1.0.0
[31] htmltools_0.3.6 tools_3.5.0 gtable_0.2.0
[34] glue_1.2.0 reshape2_1.4.3 Rcpp_0.12.17
[37] cellranger_1.1.0 nlme_3.1-137 iterators_1.0.9
[40] psych_1.8.4 lmtest_0.9-36 timeDate_3043.102
[43] gower_0.1.2 proto_1.0.0 openxlsx_4.1.0
[46] rvest_0.3.2 DEoptimR_1.0-8 MASS_7.3-49
[49] zoo_1.8-2 scales_0.5.0 ipred_0.9-6
[52] hms_0.4.2 parallel_3.5.0 RColorBrewer_1.1-2
[55] yaml_2.1.19 curl_3.2 gridExtra_2.3
[58] rpart_4.1-13 stringi_1.2.3 foreach_1.4.4
[61] e1071_1.6-8 boot_1.3-20 zip_1.0.0
[64] lava_1.6.2 geometry_0.3-6 rlang_0.2.1
[67] pkgconfig_2.0.1 evaluate_0.10.1 bindr_0.1.1
[70] labeling_0.3 recipes_0.1.3 CVST_0.2-2
[73] tidyselect_0.2.4 plyr_1.8.4 magrittr_1.5
[76] R6_2.2.2 dimRed_0.1.0 pillar_1.2.3
[79] haven_1.1.2 foreign_0.8-70 withr_2.1.2
[82] survival_2.41-3 abind_1.4-5 nnet_7.3-12
[85] modelr_0.1.2 crayon_1.3.4 utf8_1.1.4
[88] rmarkdown_1.10 readxl_1.1.0 data.table_1.11.4
[91] infotheo_1.2.0 ModelMetrics_1.1.0 digest_0.6.15
[94] stats4_3.5.0 munsell_0.5.0 magic_1.5-8