This vignette presents, in an incremental manner, the features of the Channel Binary Entropy Triangle (CBET) for the evaluation of supervised classification.

Environment construction

library(tidyverse)    # That (in)famous Mr. Wickham!
library(caret)        # To build the classifiers
library(mlbench)      # Many databases for ML tasks
library(vcd)          # Categorical benchmarks
library(candisc)      # Wine dataset
library(entropies)    # Processing and visualizing joint entropies
library(compositions) # Statistics work differently on compositional data

Some top level switches and options gathered in one place.

debugLevel <- 0 # Debug level: 0 - none, 1 - minimal; the greater, the more verbose.
fancy <- TRUE  # Set this for nicer on-screen visualization.
#fancy <- FALSE # Set this for printed matter or more austere plots.
getPlot <- TRUE # Flag to obtain plots for publication.
getPlot <- FALSE # Comment this line out to get PDF files for publication rather than on-screen plots.
knitr::opts_chunk$set(comment=NA, fig.width=6, fig.height=4)
if (getPlot)
    knitr::opts_chunk$set(dev = 'pdf') # plots in pdf, better for publication

Some choices for visualization parameters, and primitives.

splitShapesForTypes <- c("X"=4, "Y"=1, "XY"=20) #To draw split diagrams
# Naive transformation from factors to numbers in the range 0 to (number of levels - 1)
factor.as.numeric <- function(f){
    nums <- as.numeric(f)
    return(nums - min(nums))
}
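
For instance, on a hypothetical toy factor (nothing like it appears in the datasets below), levels are coded in their alphabetical sort order, so the first level maps to 0:

factor.as.numeric(factor(c("low", "high", "low"))) # "high" -> 0, "low" -> 1, hence 1 0 1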

Datasets available for entropy analysis in this package

datasets <- getDatasets()
datasets
# A tibble: 7 x 7
  name         className packName idNumber     K     n     m
  <chr>        <chr>     <chr>       <dbl> <int> <int> <int>
1 Ionosphere   Class     mlbench       NaN     2    34   351
2 iris         Species   datasets      NaN     3     4   150
3 Glass        Type      mlbench       NaN     7     9   214
4 Arthritis    Improved  vcd             1     3     3    84
5 BreastCancer Class     mlbench         1     2     9   699
6 Sonar        Class     mlbench       NaN     2    60   208
7 Wine         Cultivar  candisc       NaN     3    13   178
if (getPlot){# For papers, it helps to have the table in latex.
    library(xtable)
    print.xtable(xtable(dplyr::select(datasets, name, K, n, m)))
}
# Uncomment the name of the database to be analyzed
# dsName <- "Ionosphere"
# dsName <- "iris"
# dsName <- "BreastCancer"#Supply the name of the database to be analyzed#FACTORS
# dsName <- "Wine"
# dsName <- "Glass"#Cannot take logarithms for PCA: zeros returns -Inf
dsName <- "Arthritis"#It has non-numeric factors.
# dsName <- "Sonar"
dsRecord <-  filter(datasets, name == dsName)
ds <- evalDataset(dsName) 

We’ll use the Arthritis data throughout this vignette.
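
Before doing anything else, a quick peek at the raw data may help orientation; the columns shown depend on the dataset chosen above.

str(ds)     # Column names, types and a few values of the raw dataset
summary(ds) # Per-column summaries, including the class factor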

Classifier design

Basic data from the set for classification

# Remove id columns, if present (indicated in dsRecord$idNumber).
# Later, log-transform everything but the class and any id column.
if (!is.na(dsRecord$idNumber)){
    ds <- ds[,-dsRecord$idNumber]
}
#class column
ds.classNum <- which(names(ds)==dsRecord$className)
# Take the class away, but keep a copy just in case.
class.ds <- ds[, ds.classNum] # Save the class. Warning: it is a FACTOR!
ds <- ds[,-ds.classNum]
ds <- ds %>%     
    # Transform factors to numbers
    mutate_if(is.factor, factor.as.numeric) %>%
    # Dispose of columns with NAs
    select_if(function(v) !any(is.na(v))) %>% 
    # Dispose of constant columns: they carry no information
    select_if(function(v) (var(v) > 0))
ncols <- ncol(ds)#Mnemonic shortcut: num of columns
dsDiscretized <- infotheo::discretize(ds, disc="equalwidth")
if (dsName != "Ionosphere"){
    log.ds <- log(ds)#this has to be made conditional on the database
    log.dsDiscretized <- infotheo::discretize(log.ds)
    #TODO: try to get rid of annoying warnings each time entropy is called. 
}
X <- as.matrix(ds)
Y <- class.ds
classes <- unique(Y)
numC <- length(classes)
print(sprintf("%s has %d classes with distribution: ", dsName, numC))
[1] "Arthritis has 3 classes with distribution: "
summary(Y)
  None   Some Marked 
    42     14     28 
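
As a sanity check, independent of the package, the Shannon entropy of this class distribution can be computed by hand; its gap to the maximum of log2(numC) bits quantifies the class imbalance:

classProbs <- as.vector(table(Y)) / length(Y)  # empirical class probabilities
H.class <- -sum(classProbs * log2(classProbs)) # Shannon entropy in bits
sprintf("H(Y) = %.3f bits out of a maximum of %.3f bits", H.class, log2(numC))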

Design a simple classifier

Throughout this vignette we use a k-nearest neighbour (knn) classifier.

In this initial evaluation, we first carry out a basic random partitioning of the data.

set.seed(2117)
inTrain <- createDataPartition(y=Y,
                               p=0.80, # Tries to do stratified sampling
                               list=FALSE)
trainX <- X[inTrain,]; trainY <- Y[inTrain]
testX <- X[-inTrain,]; testY <- Y[-inTrain]
#Basic model fitting
fit <- train(x=trainX, y=trainY, 
              method="knn",
              tuneLength = 15,
              preProcess = c("center", "scale"))
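
As an optional check, not needed for the entropy analysis, we can inspect which number of neighbours caret's internal resampling selected; the exact value depends on the seed.

fit$bestTune # the value of k finally chosen by resampling
plot(fit)    # resampled accuracy as a function of k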

Evaluation using the CBET

Evaluation proceeds by obtaining the confusion matrices for the train and test sets…

## Obtain the training confusion matrix via caret::confusionMatrix
trCM <- confusionMatrix(predict(fit,trainX), trainY)
#trEntropies <- 
trCoords <- jentropies(t(trCM$table))

## prediction and the test confusion matrix
predicted <- predict(fit, testX)
teCM <- confusionMatrix(predicted,testY)
#teEntropies <- 
teCoords <- jentropies(t(teCM$table))
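
Each call to jentropies returns one row per type of entropic coordinates: the split "X" and "Y" rows used further below, and the aggregate "XY" row that we plot next. It is worth inspecting one of them (the exact set of columns depends on the version of the entropies package):

trCoords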

And then we plot the results in the Entropy Triangle for this single classification experiment:

experiments <- rbind(cbind(trCoords, Phase="train", method="knn"),
                     cbind(teCoords, Phase="test", method="knn")
                     )
experiments <- cbind(dSet=dsName, experiments)
# The basic CBET plot, with training and test phases in different colours and glyphs
gp <- ggmetern(data=experiments %>% filter(type=="XY"), fancy) +
    geom_point(aes(colour=Phase, shape=dSet), size=1)  +
    labs(shape="Dataset") + 
    scale_colour_brewer(palette="Set1")
gp

Note that, at least for Arthritis (and iris), the plot shows a suspicious behaviour: the classifier achieves a better information transfer (which correlates with accuracy) in test than in training.

print(sprintf("Training accuracy= %f vs. Testing accuracy=%f ", trCM$overall[1], teCM$overall[1]))
[1] "Training accuracy= 0.579710 vs. Testing accuracy=0.666667 "

This is part of the “evaluation paradox” for classifiers: since the estimate on the (smaller) test set has a higher variance, there will be train-test partitions where the performance on the test set is higher than on the training set, as the binomial sketch below illustrates. This is partially solved with n-fold validation.
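
A minimal sketch of why this happens, using nothing more than a binomial model of the accuracy estimate (the assumed true accuracy of 0.6 is arbitrary): the estimate on the small test partition fluctuates much more than the one on the large training partition.

set.seed(42)
nTrain <- round(0.8 * length(Y)); nTest <- length(Y) - nTrain
accTrain <- rbinom(1000, nTrain, 0.6) / nTrain # simulated training accuracy estimates
accTest  <- rbinom(1000, nTest,  0.6) / nTest  # simulated test accuracy estimates
c(sdTrain = sd(accTrain), sdTest = sd(accTest)) # the test estimate has the larger spread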

A better picture with n-fold validation

To confirm this intuition and get our money's worth out of the entropy triangle, in the following we use n-fold validation to visualize several experiments and their mean performance.

First we create the folds: the number of folds is a parameter of this script.

numFolds <- 5
set.seed(1717) # For reproducibility
folds <- createFolds(Y, numFolds)
print("Check that the sampling was stratified...")
[1] "Check that the sampling was stratified..."
for(i in 1:numFolds){
    print(summary(Y[folds[[i]]]))
}
  None   Some Marked 
     8      3      6 
  None   Some Marked 
     8      3      5 
  None   Some Marked 
     9      3      5 
  None   Some Marked 
     8      2      6 
  None   Some Marked 
     9      3      6 
summary(Y)
  None   Some Marked 
    42     14     28 

Run the experiments

models <- c("knn") #c("knn", "logreg") 
results <- data.frame()
for (i in 1:numFolds){
    for (m in models){
        # 1. Select training and test data and classes
        trainObs <- unlist(folds[-i])
        testObs <- folds[[i]]
        trainX <- X[trainObs, ]; trainY <- Y[trainObs]
        testX <- X[testObs, ]; testY <- Y[testObs]
        # 2. Fit the model with the training data
        model <- train(x=trainX, y=trainY, 
                       method=m,
                       tuneLength = 15,
                       preProcess = c("center", "scale"))
        # 3. Estimate the labels for the train set: confusion matrix, entropies, etc.
        trainYhat <- predict(model, trainX)
        trainCM <- confusionMatrix(trainYhat, trainY)
        print(trainCM$table)
        # 4. Estimate the labels for the test set
        testYhat <- predict(model, testX)
        testCM <- confusionMatrix(testYhat, testY)
        print(testCM$table)
        # 5. Gather results for both phases
        # CAVEAT: our framework supposes that in confusion matrices, rows are indexed by 
        # the reference, hence the transposition below
        results <- rbind(results, 
                         evaluate(t(trainCM$table)) %>% mutate(Fold=i,method=m, Phase="train",
                               Acc=trainCM$overall[1]),
                         evaluate(t(testCM$table)) %>% mutate(Fold=i,method=m, Phase="test",
                               Acc=testCM$overall[1])
        )
        print(sprintf("Fold %d, method %s Train accuracy = %f\t Test accuracy= %f", 
                      i, m, trainCM$overall[1],testCM$overall[1])
        )
    }
}
          Reference
Prediction None Some Marked
    None     27    7     10
    Some      0    0      0
    Marked    7    4     12
          Reference
Prediction None Some Marked
    None      7    2      1
    Some      0    0      0
    Marked    1    1      5
[1] "Fold 1, method knn Train accuracy = 0.582090\t Test accuracy= 0.705882"
          Reference
Prediction None Some Marked
    None     30    7      9
    Some      1    0      1
    Marked    3    4     13
          Reference
Prediction None Some Marked
    None      7    2      2
    Some      0    0      0
    Marked    1    1      3
[1] "Fold 2, method knn Train accuracy = 0.632353\t Test accuracy= 0.625000"
          Reference
Prediction None Some Marked
    None     28    5      8
    Some      2    3      2
    Marked    3    3     13
          Reference
Prediction None Some Marked
    None      5    1      2
    Some      2    0      0
    Marked    2    2      3
[1] "Fold 3, method knn Train accuracy = 0.656716\t Test accuracy= 0.470588"
          Reference
Prediction None Some Marked
    None     30    8      9
    Some      0    0      0
    Marked    4    4     13
          Reference
Prediction None Some Marked
    None      7    1      3
    Some      0    0      0
    Marked    1    1      3
[1] "Fold 4, method knn Train accuracy = 0.632353\t Test accuracy= 0.625000"
          Reference
Prediction None Some Marked
    None     29    6      6
    Some      0    0      0
    Marked    4    5     16
          Reference
Prediction None Some Marked
    None      8    3      4
    Some      0    0      0
    Marked    1    0      2
[1] "Fold 5, method knn Train accuracy = 0.681818\t Test accuracy= 0.555556"
results <- cbind(dSet=dsName, results) # Watch it! This can only be done after the loop!

We show the results in the CBET on a per-fold basis.

eT <- ggmetern(data=results %>% filter(type=="XY"), fancy) + 
    geom_point(aes(colour=Phase, shape=dSet), size=2)  +
    labs(shape="Dataset") + 
    scale_colour_manual(values=c("blue","red")) # Don't trust the training results: those are the red ones
eT

Clearly, some test results are better than training results. What about centrality and dispersion measures?

results %>% filter(type=="XY") %>% select(-one_of(c("method"))) %>% 
    group_by(type, Phase) %>%
    summarise(meanAcc=mean(Acc), sdAcc=sd(Acc), meanEMA = mean(EMA), sdEMA = sd(EMA))
# A tibble: 2 x 6
# Groups:   type [?]
  type  Phase meanAcc  sdAcc meanEMA  sdEMA
  <chr> <chr>   <dbl>  <dbl>   <dbl>  <dbl>
1 XY    test    0.596 0.0882   0.242 0.0392
2 XY    train   0.637 0.0369   0.229 0.0242

This agrees with the expectation that the variance of the test results is higher.

Note that the Entropy-Modified Accuracy (EMA) is an alternative, more pessimistic measure than the raw Accuracy.
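
A quick per-fold comparison, not part of the original analysis, makes that pessimism visible: the EMA falls well below the raw Accuracy here.

results %>% filter(type == "XY") %>%
    ggplot(aes(x = Acc, y = EMA, colour = Phase)) +
    geom_point() +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed") + # points below the line have EMA < Acc
    labs(x = "Accuracy", y = "Entropy-Modified Accuracy (EMA)")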

Visualization with the split triangle

To use the split triangle to advantage we have to use an unbalanced dataset, e.g. Glass or Arthritis.

First we look at the entropies of the different folds.

eTsplit <- ggmetern(data=results %>% filter(type!="XY"), fancy) + 
    geom_point(aes(colour=Phase, shape=type), size=2)  +
    labs(shape="Split Entropies") + 
    scale_shape_manual(values=splitShapesForTypes) +
    scale_colour_manual(values=c("train"="blue","test"="red"))

eTsplit + geom_text(data=results %>% filter(type == "XY"), aes(label=Fold,color=Phase))


The number of each fold appears where the aggregate entropy coordinates appeared in the previous CBET.

We can see that the input entropies \(H_{P_X}\) are much more concentrated. This is because the sampling was stratified: in theory it should produce similar reference class distributions for all folds, i.e. the “x” glyphs above should lie on a line parallel to the left side of the triangle, but in practice the stratification is imperfect.

Also, the predicted class distributions are much less entropic than the reference class distributions: their uncertainty has decreased, contrary to the data processing inequality. The reason for this is that the classifier concentrates on the majority classes, i.e. it specializes, which increases the Accuracy.
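
This can be checked directly on a single fold, for instance on the last test confusion matrix left over from the loop above. The small helper below (not part of the package) computes the entropy of the reference (column) and predicted (row) marginals; the predicted one is indeed the smaller.

H2 <- function(counts){ p <- counts[counts > 0] / sum(counts); -sum(p * log2(p)) }
c(reference = H2(colSums(testCM$table)), # entropy of the true class marginal
  predicted = H2(rowSums(testCM$table))) # entropy of the predicted class marginal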

To better view this phenomenon, we obtain the (compositional) means of the entropy values.

meanCompositions <- data.frame()
for(dsNam in unique(results$dSet)){
    for(ph in unique(results$Phase)){
        for(eType in unique(results$type)){
            meanCompositions <- 
                rbind(meanCompositions,
                      cbind(dSet=dsNam, Phase=ph, type=eType,
                            data.frame(as.list(
                                mean(acomp(results %>% 
                                               filter(dSet==dsNam, Phase==ph, type==eType) %>%
                                               select(one_of(c("DeltaH_P", "M_P", "VI_P"))))
                                     )))
                            )
                      )
        }
    }
}
meanCompositions %>% filter(type=="XY")
       dSet Phase type  DeltaH_P       M_P      VI_P
1 Arthritis train   XY 0.2144597 0.1034439 0.6820963
2 Arthritis  test   XY 0.2274859 0.1050174 0.6674967

And now we add these compositional means with larger glyphs of the same shapes and the same colours.

eTsplit + geom_point(data=meanCompositions, aes(colour=Phase, shape=type), size=4)

This shows that there is very little mutual information transfer. It was made worse by the classifier specializing, that is, increasing \(\Delta H_{P_{XY}}\) at the expense of mutual information.

All in all, knn is a very bad classifier for Arthritis.

Session information

sessionInfo()
R version 3.5.0 (2018-04-23)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.5

Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] bindrcpp_0.2.2      compositions_1.40-2 bayesm_3.1-0.1     
 [4] energy_1.7-4        robustbase_0.93-1   tensorA_0.36       
 [7] entropies_0.9.1     candisc_0.8-0       heplots_1.3-5      
[10] car_3.0-0           carData_3.0-1       vcd_1.4-4          
[13] mlbench_2.1-1       caret_6.0-80        lattice_0.20-35    
[16] forcats_0.3.0       stringr_1.3.1       dplyr_0.7.6        
[19] purrr_0.2.5         readr_1.1.1         tidyr_0.8.1        
[22] tibble_1.4.2        ggplot2_2.2.1       tidyverse_1.2.1    

loaded via a namespace (and not attached):
 [1] colorspace_1.3-2   class_7.3-14       rio_0.5.10        
 [4] rprojroot_1.3-2    pls_2.6-0          rstudioapi_0.7    
 [7] DRR_0.0.3          prodlim_2018.04.18 lubridate_1.7.4   
[10] xml2_1.2.0         codetools_0.2-15   splines_3.5.0     
[13] mnormt_1.5-5       knitr_1.20         RcppRoll_0.3.0    
[16] entropy_1.2.1      jsonlite_1.5       ggtern_2.2.1      
[19] broom_0.4.5        ddalpha_1.3.4      kernlab_0.9-26    
[22] sfsmisc_1.1-2      latex2exp_0.4.0    compiler_3.5.0    
[25] httr_1.3.1         backports_1.1.2    assertthat_0.2.0  
[28] Matrix_1.2-14      lazyeval_0.2.1     cli_1.0.0         
[31] htmltools_0.3.6    tools_3.5.0        gtable_0.2.0      
[34] glue_1.2.0         reshape2_1.4.3     Rcpp_0.12.17      
[37] cellranger_1.1.0   nlme_3.1-137       iterators_1.0.9   
[40] psych_1.8.4        lmtest_0.9-36      timeDate_3043.102 
[43] gower_0.1.2        proto_1.0.0        openxlsx_4.1.0    
[46] rvest_0.3.2        DEoptimR_1.0-8     MASS_7.3-49       
[49] zoo_1.8-2          scales_0.5.0       ipred_0.9-6       
[52] hms_0.4.2          parallel_3.5.0     RColorBrewer_1.1-2
[55] yaml_2.1.19        curl_3.2           gridExtra_2.3     
[58] rpart_4.1-13       stringi_1.2.3      foreach_1.4.4     
[61] e1071_1.6-8        boot_1.3-20        zip_1.0.0         
[64] lava_1.6.2         geometry_0.3-6     rlang_0.2.1       
[67] pkgconfig_2.0.1    evaluate_0.10.1    bindr_0.1.1       
[70] labeling_0.3       recipes_0.1.3      CVST_0.2-2        
[73] tidyselect_0.2.4   plyr_1.8.4         magrittr_1.5      
[76] R6_2.2.2           dimRed_0.1.0       pillar_1.2.3      
[79] haven_1.1.2        foreign_0.8-70     withr_2.1.2       
[82] survival_2.41-3    abind_1.4-5        nnet_7.3-12       
[85] modelr_0.1.2       crayon_1.3.4       utf8_1.1.4        
[88] rmarkdown_1.10     readxl_1.1.0       data.table_1.11.4 
[91] infotheo_1.2.0     ModelMetrics_1.1.0 digest_0.6.15     
[94] stats4_3.5.0       munsell_0.5.0      magic_1.5-8