This vignette aims at presenting the features of the Channel Binary Entropy Triangle (CBET) for the evaluation of supervised classification in an incremental manner.

Environment construction

library(tidyverse) # That (in)famous Mr. Wickham!
library(caret)    # To build the classifiers.
library(mlbench)  # Many databases for ML tasks
library(vcd)       # Categorical benchmarks
library(candisc)   # Wine dataset
library(entropies) # Processing and visualizing joint entropies
library(compositions)# Statistics work differently on compositional data

Some top level switches and options gathered in one place.

debugLevel <- 0 # Debug level 0-non-existent, 1-minimal, the greater the more verbose.
fancy <- TRUE  # set this for nicer on-screen visualization.
#fancy <- FALSE # Set this for either printing matter or more austere plots.
getPlot <- TRUE # Flag to obtain plots for publication. 
getPlot <- FALSE #Comment to get .jpeg files rather than plots of ets.
knitr::opts_chunk$set(comment=NA, fig.width=6, fig.height=4)
if (getPlot)
    knitr::opts_chunk$set(dev = 'pdf') # plots in pdf, better for publication

Some choices for visualization parameters, and primitives.

splitShapesForTypes <- c("X"=4, "Y"=1, "XY"=20) #To draw split diagrams
# Naive transformation from factors to numbers in 0 to num.factors - 1
factor.as.numeric <- function(f){
    nums <- as.numeric(f)
    return(nums - min(nums))
}

Datasets available for entropy analysis in this package

datasets <- getDatasets()
datasets
# A tibble: 7 x 7
  name         className packName idNumber     K     n     m
  <chr>        <chr>     <chr>       <dbl> <int> <int> <int>
1 Ionosphere   Class     mlbench       NaN     2    34   351
2 iris         Species   datasets      NaN     3     4   150
3 Glass        Type      mlbench       NaN     7     9   214
4 Arthritis    Improved  vcd             1     3     3    84
5 BreastCancer Class     mlbench         1     2     9   699
6 Sonar        Class     mlbench       NaN     2    60   208
7 Wine         Cultivar  candisc       NaN     3    13   178
if (getPlot){# For papers, it helps to have the table in latex.
    library(xtable)
    print.xtable(xtable(dplyr::select(datasets, name, K, n, m)))
}
# Uncomment the name of the database to be analyzed
# dsName <- "Ionosphere"
# dsName <- "iris"
# dsName <- "BreastCancer"#Supply the name of the database to be analyzed#FACTORS
# dsName <- "Wine"
# dsName <- "Glass"#Cannot take logarithms for PCA: zeros returns -Inf
dsName <- "Arthritis"#It has non-numeric factors.
# dsName <- "Sonar"
dsRecord <-  filter(datasets, name == dsName)
ds <- evalDataset(dsName) 

We’ll use the Arthritis data throughout this vignette.

Classifier design

Basic data from the set for classification

#id columns, if existent in dsRecord$idNumber
# log transform of everything but the class and any id if existant. 
if (!is.na(dsRecord$idNumber)){
    ds <- ds[,-dsRecord$idNumber]
}
#class column
ds.classNum <- which(names(ds)==dsRecord$className)
#take away the class, but keep it just in case.
class.ds <- ds[, ds.classNum]#saving the class. Warning A FACTOR!
ds <- ds[,-ds.classNum]
ds <- ds %>%     
    #transform factors to number
    mutate_if(is.factor,factor.as.numeric) %>%
    # Dispose of columns with NaN
    select_if(function(v) !any(is.na(v))) %>% 
    # Dispose of constant columns: they carry no information
    select_if(function(v)(var(v) > 0))
ncols <- ncol(ds)#Mnemonic shortcut: num of columns
dsDiscretized <- infotheo::discretize(ds, disc="equalwidth")
if (dsName != "Ionosphere"){
    log.ds <- log(ds)#this has to be made conditional on the database
    log.dsDiscretized <- infotheo::discretize(log.ds)
    #TODO: try to get rid of annoying warnings each time entropy is called. 
}
X <- as.matrix(ds)
Y <- class.ds
classes <- unique(Y)
numC <- length(classes)
print(sprintf("%s has %d classes with distribution: ", dsName, numC))
[1] "Arthritis has 3 classes with distribution: "
summary(Y)
  None   Some Marked 
    42     14     28 

Design a simple classifier

Throughout this vignette we use a k-nearest neigbour classifier.

In this initial evaluation, we first carry out a basic random partitioning of the data.

set.seed(2117)
inTrain <- createDataPartition(y=Y,
                               p=0.80, # Tries to do stratified sampling
                               list=FALSE)
trainX <- X[inTrain,]; trainY <- Y[inTrain]
testX <- X[-inTrain,]; testY <- Y[-inTrain]
#Basic model fitting
fit <- train(x=trainX, y=trainY, 
              method="knn",
              tuneLength = 15,
              preProcess = c("center", "scale"))

Evaluation using the CBET

Evaluation proceeds by obtaining the confusion matrices for the train and test sets…

## obtain a training caret::confusion matrix
trCM <- confusionMatrix(predict(fit,trainX), trainY)
#trEntropies <- 
trCoords <- jentropies(t(trCM$table))

## prediction and the test confusion matrix
predicted <- predict(fit, testX)
teCM <- confusionMatrix(predicted,testY)
#teEntropies <- 
teCoords <- jentropies(t(teCM$table))

And then printing the results in the Entropy Triangle for a single classification experiment:

experiments <- rbind(cbind(trCoords, Phase="train", method="knn"),
                     cbind(teCoords, Phase="test", method="knn")
                     )
experiments <- cbind(dSet=dsName, experiments)
# The basic plot for the entropy triangle training and testX in different colours and glyphs
gp <- ggmetern(data=experiments %>% filter(type=="XY"), fancy) +
    geom_point(aes(colour=Phase, shape=dSet), size=1)  +
    labs(shape="Dataset") + 
    scale_colour_brewer(palette="Set1")
gp

Note that, at least for Arthritis (and iris), there is a suspicious behaviour in the plot in that the classifier achieves a better information transfer (correlated with accuracy) in test than in training.

print(sprintf("Training accuracy= %f vs. Testing accuracy=%f ", trCM$overall[1], teCM$overall[1]))
[1] "Training accuracy= 0.579710 vs. Testing accuracy=0.666667 "

This is part of the “evaluation paradox” for classifications: since the test must have a higher variance, there will be instances of train-test partitions where the performance on the testing set will be higher that on the training set. This is partially solved with n-fold validation.

A better picture with n-fold validation

To confirm this intuition and get all the value for our coin in the entropy triangle, in the following, we use n-fold validation to visualize several experiments and their mean performance.

First we create the folds: the number of folds is a parameter of this script.

numFolds <- 5
set.seed(1717) # For reproducibility
folds <- createFolds(Y, numFolds)
print("Check that the sampling was stratified...")
[1] "Check that the sampling was stratified..."
for(i in 1:numFolds){
    print(summary(Y[folds[[i]]]))
}
  None   Some Marked 
     8      3      6 
  None   Some Marked 
     8      3      5 
  None   Some Marked 
     9      3      5 
  None   Some Marked 
     8      2      6 
  None   Some Marked 
     9      3      6 
summary(Y)
  None   Some Marked 
    42     14     28 

Run the experiments

models <- c("knn") #c("knn", "logreg") 
results <- data.frame()
for (i in 1:numFolds){
    for (m in models){
        # 1. select training and testX data and classes
        trainObs <- unlist(folds[-i])
        testObs <- folds[[i]]
        trainX <- X[trainObs, ]; trainY <- Y[trainObs]
        testX <- X[testObs, ]; testY <- Y[testObs]
        # 2. Fit the model with the 
        model <- train(x=trainX, y=trainY, 
                       method=m,
                       tuneLength = 15,
                       preProcess = c("center", "scale"))
        # 3. Estimate the labels for the train set: confusion matrix, entropies, etc.
        trainYhat <- predict(model, trainX)
        trainCM <- confusionMatrix(trainYhat, trainY)
        print(trainCM$table)
        # 4. Estimate the labels for the test set
        testYhat <- predict(model, testX)
        testCM <- confusionMatrix(testYhat, testY)
        print(testCM$table)
        # 5. Gather results for 
        # CAVEAT: our framework supposes that in confusion matrices, rows are indexed by 
        # the reference, hence the transposition below
        results <- rbind(results, 
                         evaluate(t(trainCM$table)) %>% mutate(Fold=i,method=m, Phase="train",
                               Acc=trainCM$overall[1]),
                         evaluate(t(testCM$table)) %>% mutate(Fold=i,method=m, Phase="test",
                               Acc=testCM$overall[1])
        )
        print(sprintf("Fold %d, method %s Train accuracy = %f\t Test accuracy= %f", 
                      i, m, trainCM$overall[1],testCM$overall[1])
        )
    }
}
          Reference
Prediction None Some Marked
    None     27    7     10
    Some      0    0      0
    Marked    7    4     12
          Reference
Prediction None Some Marked
    None      7    2      1
    Some      0    0      0
    Marked    1    1      5
[1] "Fold 1, method knn Train accuracy = 0.582090\t Test accuracy= 0.705882"
          Reference
Prediction None Some Marked
    None     30    7      9
    Some      1    0      1
    Marked    3    4     13
          Reference
Prediction None Some Marked
    None      7    2      2
    Some      0    0      0
    Marked    1    1      3
[1] "Fold 2, method knn Train accuracy = 0.632353\t Test accuracy= 0.625000"
          Reference
Prediction None Some Marked
    None     28    5      8
    Some      2    3      2
    Marked    3    3     13
          Reference
Prediction None Some Marked
    None      5    1      2
    Some      2    0      0
    Marked    2    2      3
[1] "Fold 3, method knn Train accuracy = 0.656716\t Test accuracy= 0.470588"
          Reference
Prediction None Some Marked
    None     30    8      9
    Some      0    0      0
    Marked    4    4     13
          Reference
Prediction None Some Marked
    None      7    1      3
    Some      0    0      0
    Marked    1    1      3
[1] "Fold 4, method knn Train accuracy = 0.632353\t Test accuracy= 0.625000"
          Reference
Prediction None Some Marked
    None     29    6      6
    Some      0    0      0
    Marked    4    5     16
          Reference
Prediction None Some Marked
    None      8    3      4
    Some      0    0      0
    Marked    1    0      2
[1] "Fold 5, method knn Train accuracy = 0.681818\t Test accuracy= 0.555556"
results <- cbind(dSet=dsName,results)#Watch it! This is only possible at last!

We show the plot for the result on a per-plot basis.

eT <- ggmetern(data=results %>% filter(type=="XY"), fancy) + 
    geom_point(aes(colour=Phase, shape=dSet), size=2)  +
    labs(shape="Dataset") + 
    scale_colour_manual(values=c("blue","red")) # Don't trust the training, that is the red
eT