This vignette presents the features of the Channel Binary Entropy Triangle (CBET) for the evaluation of supervised classification, building up the analysis incrementally.
library(tidyverse) # That (in)famous Mr. Wickham!
library(caret) # To build the classifiers.
library(mlbench) # Many databases for ML tasks
library(vcd) # Categorical benchmarks
library(candisc) # Wine dataset
library(entropies) # Processing and visualizing joint entropies
library(compositions)# Statistics work differently on compositional data
Some top level switches and options gathered in one place.
debugLevel <- 0 # Debug level: 0 = none, 1 = minimal; higher values are more verbose.
fancy <- TRUE # Set this for nicer on-screen visualization.
#fancy <- FALSE # Set this for printed matter or more austere plots.
getPlot <- TRUE # Flag to obtain plots for publication.
getPlot <- FALSE # Comment this line out to obtain figure files for publication rather than on-screen plots.
knitr::opts_chunk$set(comment=NA, fig.width=6, fig.height=4)
if (getPlot)
knitr::opts_chunk$set(dev = 'pdf') # plots in pdf, better for publication
Some choices for visualization parameters, and primitives.
splitShapesForTypes <- c("X"=4, "Y"=1, "XY"=20) #To draw split diagrams
# Naive transformation from a factor to numbers in the range 0 to (number of levels - 1)
factor.as.numeric <- function(f){
nums <- as.numeric(f)
return(nums - min(nums))
}
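As a quick, hypothetical illustration (not part of the vignette's pipeline), the helper maps the observed factor levels onto consecutive integers starting at 0:
f <- factor(c("low", "high", "mid", "low"), levels=c("low", "mid", "high"))
factor.as.numeric(f) # yields 0 2 1 0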
datasets <- getDatasets()
datasets
# A tibble: 7 x 7
name className packName idNumber K n m
<chr> <chr> <chr> <dbl> <int> <int> <int>
1 Ionosphere Class mlbench NaN 2 34 351
2 iris Species datasets NaN 3 4 150
3 Glass Type mlbench NaN 7 9 214
4 Arthritis Improved vcd 1 3 3 84
5 BreastCancer Class mlbench 1 2 9 699
6 Sonar Class mlbench NaN 2 60 208
7 Wine Cultivar candisc NaN 3 13 178
if (getPlot){# For papers, it helps to have the table in latex.
library(xtable)
print.xtable(xtable(dplyr::select(datasets, name, K, n, m)))
}
# Uncomment the name of the database to be analyzed
# dsName <- "Ionosphere"
# dsName <- "iris"
# dsName <- "BreastCancer" # Has factor-coded features
# dsName <- "Wine"
# dsName <- "Glass" # Cannot take logarithms for PCA: zeros return -Inf
dsName <- "Arthritis" # It has non-numeric factors.
# dsName <- "Sonar"
dsRecord <- filter(datasets, name == dsName)
ds <- evalDataset(dsName)
We’ll use the Arthritis data throughout this vignette.
# Remove the id column, if one is recorded in dsRecord$idNumber
if (!is.na(dsRecord$idNumber)){
  ds <- ds[,-dsRecord$idNumber]
}
# Locate the class column
ds.classNum <- which(names(ds)==dsRecord$className)
# Take the class away from the features, but keep it for later use.
class.ds <- ds[, ds.classNum] # Saving the class. Warning: it is a FACTOR!
ds <- ds[,-ds.classNum]
ds <- ds %>%
  # Transform factors to numbers
  mutate_if(is.factor, factor.as.numeric) %>%
  # Dispose of columns with NAs
  select_if(function(v) !any(is.na(v))) %>%
  # Dispose of constant columns: they carry no information
  select_if(function(v) (var(v) > 0))
ncols <- ncol(ds) # Mnemonic shortcut: number of columns
dsDiscretized <- infotheo::discretize(ds, disc="equalwidth")
if (dsName != "Ionosphere"){
  # Log transform of the features (not the class or id); this has to be made conditional on the dataset
  log.ds <- log(ds)
  log.dsDiscretized <- infotheo::discretize(log.ds)
  #TODO: try to get rid of annoying warnings each time entropy is called.
}
X <- as.matrix(ds)
Y <- class.ds
classes <- unique(Y)
numC <- length(classes)
print(sprintf("%s has %d classes with distribution: ", dsName, numC))
[1] "Arthritis has 3 classes with distribution: "
summary(Y)
None Some Marked
42 14 28
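Since the CBET is grounded in entropies, it is worth noting how unbalanced this prior is. Here is a minimal sketch in base R (assuming only the Y defined above; the names p and H.Y are ours):
p <- table(Y) / length(Y) # Empirical class prior
H.Y <- -sum(p * log2(p)) # Entropy of the class variable: about 1.46 bits
log2(length(p)) # Versus log2(3), about 1.58 bits, for a balanced prior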
Throughout this vignette we use a k-nearest neighbour (k-NN) classifier.
In this initial evaluation, we first carry out a basic random partitioning of the data.
set.seed(2117)
inTrain <- createDataPartition(y=Y,
p=0.80, # Tries to do stratified sampling
list=FALSE)
trainX <- X[inTrain,]; trainY <- Y[inTrain]
testX <- X[-inTrain,]; testY <- Y[-inTrain]
#Basic model fitting
fit <- train(x=trainX, y=trainY,
method="knn",
tuneLength = 15,
preProcess = c("center", "scale"))
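caret selects the number of neighbours by internal resampling; the winning value can be inspected on the fitted object (a quick check using caret's standard accessors):
fit$bestTune # The k chosen by caret's internal resampling
# plot(fit) # Uncomment to see accuracy as a function of k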
Evaluation proceeds by obtaining the confusion matrices for the train and test sets…
## Obtain the caret::confusionMatrix on the training set
trCM <- confusionMatrix(predict(fit, trainX), trainY)
trCoords <- jentropies(t(trCM$table))
## Prediction and the confusion matrix on the test set
predicted <- predict(fit, testX)
teCM <- confusionMatrix(predicted, testY)
teCoords <- jentropies(t(teCM$table))
And then printing the results in the Entropy Triangle for a single classification experiment:
experiments <- rbind(cbind(trCoords, Phase="train", method="knn"),
cbind(teCoords, Phase="test", method="knn")
)
experiments <- cbind(dSet=dsName, experiments)
# The basic plot for the entropy triangle: train and test phases in different colours and glyphs
gp <- ggmetern(data=experiments %>% filter(type=="XY"), fancy) +
geom_point(aes(colour=Phase, shape=dSet), size=1) +
labs(shape="Dataset") +
scale_colour_brewer(palette="Set1")
gp
Note that, at least for Arthritis (and iris), the plot shows a suspicious behaviour: the classifier achieves a better information transfer (which correlates with accuracy) in test than in training.
print(sprintf("Training accuracy= %f vs. Testing accuracy=%f ", trCM$overall[1], teCM$overall[1]))
[1] "Training accuracy= 0.579710 vs. Testing accuracy=0.666667 "
This is part of the “evaluation paradox” of classification: since the estimate on the test set has a higher variance, there will be instances of train-test partitions where the performance on the test set is higher than on the training set. This is partially mitigated by n-fold validation.
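A quick way to see this variance at work is to repeat the random split several times and look at the spread of test accuracies. The sketch below reuses X, Y and the caret calls from above; testAccs is our own name, and the chunk is an illustration rather than part of the original experiment:
testAccs <- sapply(1:10, function(r){
  set.seed(r)
  inTr <- createDataPartition(y=Y, p=0.80, list=FALSE)
  f <- train(x=X[inTr,], y=Y[inTr], method="knn",
             tuneLength=15, preProcess=c("center", "scale"))
  # Accuracy on the held-out 20%
  confusionMatrix(predict(f, X[-inTr,]), Y[-inTr])$overall[1]
})
summary(testAccs) # The spread explains occasional test > train results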
To confirm this intuition and get our money's worth out of the entropy triangle, in the following we use n-fold validation to visualize several experiments and their mean performance.
First we create the folds: the number of folds is a parameter of this script.
numFolds <- 5
set.seed(1717) # For reproducibility
folds <- createFolds(Y, numFolds)
print("Check that the sampling was stratified...")
[1] "Check that the sampling was stratified..."
for(i in 1:numFolds){
print(summary(Y[folds[[i]]]))
}
None Some Marked
8 3 6
None Some Marked
8 3 5
None Some Marked
9 3 5
None Some Marked
8 2 6
None Some Marked
9 3 6
summary(Y)
None Some Marked
42 14 28
Run the experiments
models <- c("knn") #c("knn", "logreg")
results <- data.frame()
for (i in 1:numFolds){
for (m in models){
# 1. Select training and test data and classes
trainObs <- unlist(folds[-i])
testObs <- folds[[i]]
trainX <- X[trainObs, ]; trainY <- Y[trainObs]
testX <- X[testObs, ]; testY <- Y[testObs]
# 2. Fit the model with the training data
model <- train(x=trainX, y=trainY,
method=m,
tuneLength = 15,
preProcess = c("center", "scale"))
# 3. Estimate the labels for the train set: confusion matrix, entropies, etc.
trainYhat <- predict(model, trainX)
trainCM <- confusionMatrix(trainYhat, trainY)
print(trainCM$table)
# 4. Estimate the labels for the test set
testYhat <- predict(model, testX)
testCM <- confusionMatrix(testYhat, testY)
print(testCM$table)
# 5. Gather the results for this fold and method
# CAVEAT: our framework assumes that in confusion matrices, rows are indexed
# by the reference, hence the transposition below (see the toy check after the loop's output)
results <- rbind(results,
evaluate(t(trainCM$table)) %>% mutate(Fold=i,method=m, Phase="train",
Acc=trainCM$overall[1]),
evaluate(t(testCM$table)) %>% mutate(Fold=i,method=m, Phase="test",
Acc=testCM$overall[1])
)
print(sprintf("Fold %d, method %s Train accuracy = %f\t Test accuracy= %f",
i, m, trainCM$overall[1],testCM$overall[1])
)
}
}
Reference
Prediction None Some Marked
None 27 7 10
Some 0 0 0
Marked 7 4 12
Reference
Prediction None Some Marked
None 7 2 1
Some 0 0 0
Marked 1 1 5
[1] "Fold 1, method knn Train accuracy = 0.582090\t Test accuracy= 0.705882"
Reference
Prediction None Some Marked
None 30 7 9
Some 1 0 1
Marked 3 4 13
Reference
Prediction None Some Marked
None 7 2 2
Some 0 0 0
Marked 1 1 3
[1] "Fold 2, method knn Train accuracy = 0.632353\t Test accuracy= 0.625000"
Reference
Prediction None Some Marked
None 28 5 8
Some 2 3 2
Marked 3 3 13
Reference
Prediction None Some Marked
None 5 1 2
Some 2 0 0
Marked 2 2 3
[1] "Fold 3, method knn Train accuracy = 0.656716\t Test accuracy= 0.470588"
Reference
Prediction None Some Marked
None 30 8 9
Some 0 0 0
Marked 4 4 13
Reference
Prediction None Some Marked
None 7 1 3
Some 0 0 0
Marked 1 1 3
[1] "Fold 4, method knn Train accuracy = 0.632353\t Test accuracy= 0.625000"
Reference
Prediction None Some Marked
None 29 6 6
Some 0 0 0
Marked 4 5 16
Reference
Prediction None Some Marked
None 8 3 4
Some 0 0 0
Marked 1 0 2
[1] "Fold 5, method knn Train accuracy = 0.681818\t Test accuracy= 0.555556"
results <- cbind(dSet=dsName, results) # Careful! This is only possible once all folds have run!
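To make the transposition caveat in the loop above concrete, here is a toy check (a hypothetical 2x2 matrix, not data from the experiment): caret's confusionMatrix tables are indexed by Prediction in rows and Reference in columns, whereas our framework expects the Reference in rows:
cm <- matrix(c(5, 1, 2, 4), nrow=2,
             dimnames=list(Prediction=c("a","b"), Reference=c("a","b")))
t(cm) # Reference now indexes the rows, as evaluate() and jentropies() expect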
We now plot the results of all folds in the entropy triangle.
eT <- ggmetern(data=results %>% filter(type=="XY"), fancy) +
geom_point(aes(colour=Phase, shape=dSet), size=2) +
labs(shape="Dataset") +
scale_colour_manual(values=c("blue","red")) # Don't trust the training results: those are in red
eT
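As a numerical complement to the triangle, we can summarise the mean performance per phase across the folds. This is a sketch assuming the results data frame built above (the XY filter keeps one row per fold and phase, mirroring the plotting code; meanAcc and sdAcc are our own column names):
results %>%
  filter(type == "XY") %>%
  group_by(Phase) %>%
  summarise(meanAcc = mean(Acc), sdAcc = sd(Acc))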