In this tutorial, we will go through basic evaluation strategies. The agenda is to cover:


1.   Download dataset - train and test dataset.
2.   Train a basic Classifier on the training dataset.
3.   Predict the labels on the test dataset.
4.   Evaluate the predicted labels with gold labels.
  1.   Accuracy
  2.   Precision
  3.   Recall
  4.   F-1 score
5.   Cross Validation
6.   Paired t-test


In [0]:
import numpy as np

from sklearn import linear_model
from sklearn import tree
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Download dataset

In [0]:
# Load the breast cancer dataset that ships with scikit-learn and copy its
# feature matrix (X) and label vector (Y) into NumPy arrays.
breast_cancer = datasets.load_breast_cancer()
X, Y = (np.array(part) for part in (breast_cancer.data, breast_cancer.target))

# Hold out 20% of the samples as the test set. No random_state is given, so
# the split (and every number printed later) can change from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

# Show the shape of each split, one per line.
print(*(split.shape for split in (X_train, Y_train, X_test, Y_test)), sep="\n")

# Train a SGD/Decision Tree Classifier on the training dataset

In [0]:
def SGD_fit(X_train, Y_train):
  """Train a logistic-regression classifier with stochastic gradient descent.

  Args:
    X_train: Training feature matrix handed to `fit`.
    Y_train: Training labels aligned with the rows of `X_train`.

  Returns:
    The fitted `SGDClassifier` (L2 penalty, at most 1000 iterations).
  """
  # 'log_loss' is the current name of the logistic loss. The former alias
  # 'log' was deprecated in scikit-learn 1.1 and removed in 1.3, where it
  # raises an error at fit time. Requires scikit-learn >= 1.1.
  clf = linear_model.SGDClassifier(loss='log_loss', penalty='l2', max_iter=1000)
  clf.fit(X_train, Y_train)
  return clf

# Fit the SGD classifier on the training split.
sgd_clf = SGD_fit(X_train, Y_train)

In [0]:
def DT_fit(X_train, Y_train):
  """Train a decision tree classifier on the given data.

  The tree splits on entropy and has no depth limit.

  Args:
    X_train: Training feature matrix handed to `fit`.
    Y_train: Training labels aligned with the rows of `X_train`.

  Returns:
    The fitted `DecisionTreeClassifier`.
  """
  model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=None)
  # scikit-learn estimators return themselves from fit().
  return model.fit(X_train, Y_train)

# Fit the decision tree classifier on the same training split.
dt_clf = DT_fit(X_train, Y_train)

# Predict on the test dataset

In [0]:
def predict(clf, X_test):
  """Return the labels that `clf` predicts for `X_test`.

  Args:
    clf: Any object exposing a `predict(X)` method (e.g. a fitted classifier).
    X_test: Feature rows to label.

  Returns:
    Whatever `clf.predict(X_test)` returns.
  """
  return clf.predict(X_test)

# Label the test set with the SGD classifier, then print the gold labels
# followed by the predictions so they can be compared by eye.
Y_pred = predict(sgd_clf, X_test)
print(Y_test)
print(Y_pred)

# Evaluation Measures

## Accuracy

In [0]:
def get_accuracy(Y_pred, Y_gold):
  """Return the fraction of predictions that equal the gold labels.

  Args:
    Y_pred: Predicted labels (array-like).
    Y_gold: Gold labels (array-like) with the same shape as `Y_pred`.

  Returns:
    Accuracy as a float in [0, 1]; 0.0 when there are no predictions, in line
    with how the precision/recall helpers treat an empty denominator.

  Raises:
    ValueError: If `Y_pred` and `Y_gold` do not have the same shape.
  """
  pred = np.asarray(Y_pred)
  gold = np.asarray(Y_gold)
  # Refuse mismatched inputs instead of letting NumPy broadcast them silently.
  if pred.shape != gold.shape:
    raise ValueError(
        "Y_pred and Y_gold must have the same shape, got {} and {}".format(
            pred.shape, gold.shape))
  # Avoid 0/0 (nan plus a RuntimeWarning) on empty input.
  if pred.size == 0:
    return 0.0
  return float(np.mean(pred == gold))

# Accuracy of the predictions in Y_pred against the gold test labels.
accuracy = get_accuracy(Y_pred, Y_test)
print(accuracy)

## Some Terminology

1. True Positives: Instances where people actually had breast cancer and system said they do
2. True Negatives: Instances where people didn't have breast cancer and system said they don't
3. False Positives: Instances where people didn't have breast cancer but system said they do
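4. False Negatives: Instances where people actually had breast cancer but system said they don't

Note: in scikit-learn's breast cancer dataset the label 1 means *benign* and the label 0 means *malignant*. The code below treats label 1 as the positive class, so with this dataset "positive" actually corresponds to a benign tumour; swap the labels if you want the metrics to describe detection of malignant cases.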

In [0]:
def get_stats(Y_pred, Y_gold):
  """Count true positives, false positives and false negatives.

  Label 1 is treated as the positive class and label 0 as the negative class;
  pairs containing any other label are ignored.

  Args:
    Y_pred: Iterable of predicted labels.
    Y_gold: Iterable of gold labels, paired element-wise with `Y_pred`.

  Returns:
    A tuple `(TP, FP, FN)` of plain Python ints.
  """
  TP = FP = FN = 0
  # Single pass: cheaper than three scans and also correct for one-shot
  # iterators, which the repeated zip() calls would exhaust after the first.
  for pred, gold in zip(Y_pred, Y_gold):
    if pred == 1:
      if gold == 1:
        TP += 1
      elif gold == 0:
        FP += 1
    elif pred == 0 and gold == 1:
      FN += 1
  return TP, FP, FN

## Precision

In [0]:
def get_precision(Y_pred, Y_gold):
  """Return precision: TP / (TP + FP), based on the counts from `get_stats`.

  Args:
    Y_pred: Predicted labels.
    Y_gold: Gold labels paired with `Y_pred`.

  Returns:
    Precision as a float; 0.0 when nothing was predicted positive.
  """
  true_pos, false_pos, _ = get_stats(Y_pred, Y_gold)
  predicted_pos = true_pos + false_pos
  # Counts are non-negative, so a zero sum means both TP and FP are zero.
  return true_pos / float(predicted_pos) if predicted_pos else 0.0

# Precision of the predictions in Y_pred against the gold test labels.
precision = get_precision(Y_pred, Y_test)
print(precision)

## Recall

In [0]:
def get_recall(Y_pred, Y_gold):
  """Return recall: TP / (TP + FN), based on the counts from `get_stats`.

  Args:
    Y_pred: Predicted labels.
    Y_gold: Gold labels paired with `Y_pred`.

  Returns:
    Recall as a float; 0.0 when there are no gold positives.
  """
  true_pos, _, false_neg = get_stats(Y_pred, Y_gold)
  gold_pos = true_pos + false_neg
  # Counts are non-negative, so a zero sum means both TP and FN are zero.
  return true_pos / float(gold_pos) if gold_pos else 0.0

# Recall of the predictions in Y_pred against the gold test labels.
recall = get_recall(Y_pred, Y_test)
print(recall)

## F1-Score

In [0]:
def get_f1(Y_pred, Y_gold):
  """Return the F1 score, the harmonic mean of precision and recall.

  Args:
    Y_pred: Predicted labels.
    Y_gold: Gold labels paired with `Y_pred`.

  Returns:
    F1 as a float; 0.0 when precision and recall are both zero.
  """
  rec = get_recall(Y_pred, Y_gold)
  prec = get_precision(Y_pred, Y_gold)
  # Guard the 0/0 case: both components are zero, so F1 is defined as 0.0.
  if not (prec or rec):
    return 0.0
  return 2 * prec * rec / (prec + rec)

# F1 score of the predictions in Y_pred against the gold test labels.
f1 = get_f1(Y_pred, Y_test)
print(f1)

# Cross Validation

In [0]:
# 5-fold cross-validation over the full dataset. The folds are shuffled and
# materialised once so both classifiers are scored on identical splits.
kf = KFold(n_splits=5, shuffle=True)
split_data = kf.split(X)
folds_list = list(split_data)

# Per-fold F1 scores, kept in fold order so they can be compared pairwise.
sgd_f1_list = []
dt_f1_list = []

for fold_cnt, (train_idx, test_idx) in enumerate(folds_list, start=1):
  # Slice the original data into this fold's train and held-out parts.
  X_fold_train, Y_fold_train = X[train_idx], Y[train_idx]
  X_fold_test, Y_fold_test = X[test_idx], Y[test_idx]

  # Train both classifiers on the fold's training part (SGD first, then DT).
  sgd_clf = SGD_fit(X_fold_train, Y_fold_train)
  dt_clf = DT_fit(X_fold_train, Y_fold_train)

  # Label the held-out part with each classifier.
  Y_fold_pred_SGD = predict(sgd_clf, X_fold_test)
  Y_fold_pred_DT = predict(dt_clf, X_fold_test)

  # Score both sets of predictions and record the results.
  sgd_f1 = get_f1(Y_fold_pred_SGD, Y_fold_test)
  dt_f1 = get_f1(Y_fold_pred_DT, Y_fold_test)
  sgd_f1_list.append(sgd_f1)
  dt_f1_list.append(dt_f1)

  print("Fold {fold_cnt} F1 Score DT: {dt_f1}, SGD: {sgd_f1}".format(
      fold_cnt=fold_cnt, dt_f1=dt_f1, sgd_f1=sgd_f1))

# Paired t-test

In [0]:
def get_t_value(post_list, pre_list):
  """Return the paired t-test statistic for two matched lists of scores.

  The statistic is mean(d) / (s_d / sqrt(n)), where d = post - pre taken
  pair by pair, s_d is the *sample* standard deviation of d (n - 1 in the
  denominator) and n is the number of pairs.

  Args:
    post_list: Scores of the second system/condition, one per pair.
    pre_list: Scores of the first system/condition, in the same order.

  Returns:
    The t statistic as a float. When every difference is identical the
    standard error is zero: the result is then +/-inf (sign of the mean
    difference), or nan if the differences are all exactly zero.

  Raises:
    ValueError: If the lists differ in length or hold fewer than two pairs.
  """
  n = len(post_list)
  if n != len(pre_list):
    raise ValueError("post_list and pre_list must have the same length")
  if n < 2:
    raise ValueError("a paired t-test needs at least two pairs")

  difference = (np.asarray(post_list, dtype=float)
                - np.asarray(pre_list, dtype=float))
  mean = difference.mean()
  # ddof=1 gives the sample standard deviation required by the t-test;
  # NumPy's default (ddof=0) underestimates it and inflates the statistic.
  std_err = difference.std(ddof=1) / np.sqrt(n)

  if std_err == 0:
    # Zero variance: avoid a division warning and return the limit value.
    if mean == 0:
      return float('nan')
    return float(np.copysign(np.inf, mean))
  return float(mean / std_err)

# Paired t statistic over the per-fold F1 scores, computed on DT minus SGD
# differences, so a positive value means the decision tree scored higher.
t_value = get_t_value(dt_f1_list, sgd_f1_list)
print(t_value)