Classification is one of the major tasks in data science, and can be performed through the sklearn package and its multiple subpackages. The image below summarizes the major classification techniques and the corresponding implementation packages in sklearn.
The classification task occurs in three steps:
Whenever we are in the presence of a classification problem, the first thing to do is to identify the target or class, which is the variable to predict. The type of the target variable determines the kind of operation to perform: targets with just a few values allow for a classification task, while real-valued targets require a prediction one.
In the presence of a classification task, identifying how balanced the target is becomes mandatory, in order to choose the most adequate balancing strategy (see Data balancing) and to select the best metrics to evaluate the results achieved.
After applying balancing techniques, if required, the next step is to choose the best training strategy to apply. This strategy concerns the way to obtain the train and test datasets, and is chosen in accordance with the dataset size:
k-fold cross validation (StratifiedKFold): used in the presence of small datasets, with at most a few thousand records;
hold-out (train_test_split): used in the presence of several thousands of records;
sample hold-out: used in the presence of very large datasets, with many thousands of records.
Remark: in each one of these strategies it is important to note that the split can't be completely random, but should keep the original distribution of the target variable. Moreover, the distribution of every variable should be kept in each data subset, which is usually achieved through a stratify parameter, as sketched below for the k-fold case.
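A minimal sketch of the k-fold strategy, assuming the same diabetes dataset used later in this module: StratifiedKFold produces index-based splits that preserve the target distribution in every fold.
from pandas import read_csv
from sklearn.model_selection import StratifiedKFold

data = read_csv('data/diabetes.csv')     # same file used throughout this module
y = data.pop('class').values             # target variable
X = data.values                          # remaining records

skf = StratifiedKFold(n_splits=5, shuffle=True)   # 5 folds, each keeping the target distribution
for trn_idx, tst_idx in skf.split(X, y):
    trnX, tstX = X[trn_idx], X[tst_idx]
    trnY, tstY = y[trn_idx], y[tst_idx]
    # train and evaluate a model on each fold here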
As noted above, the training of classification models is achieved through the sklearn package. Since it is built on top of the numpy package, we need to provide numpy arrays (ndarray) as parameters to the different methods, like train_test_split.
In mathematical terms, classification aims to map the data X to values in the domain of the target variable, call it y.
After loading the data into the data dataframe, we need to separate the target variable from the rest of the data, since it plays a different role in the training procedure. Through the pop method, we get the class variable and simultaneously remove it from the dataframe. So, y will keep the ndarray with the target value for each record, and X the ndarray containing the records themselves.
import numpy as np
from pandas import read_csv, concat, unique, DataFrame
import matplotlib.pyplot as plt
import ds_charts as ds
from sklearn.model_selection import train_test_split
file_tag = 'diabetes'
data: DataFrame = read_csv('data/diabetes.csv')
target = 'class'
positive = 'P'
negative = 'N'
values = {'Original': [len(data[data[target] == positive]), len(data[data[target] == negative])]}  # class counts in the original data
y: np.ndarray = data.pop(target).values   # target values, removed from the dataframe
X: np.ndarray = data.values               # remaining records
labels: np.ndarray = unique(y)            # distinct class labels
From the chart plotted, we realize that the number of people with diabetes (P records) is around half of the number of people without diabetes (N records), making the dataset slightly unbalanced. We can also see that this distribution was kept unchanged in our train and test datasets. In order to make training easier, we should balance the data.
trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y)
train = concat([DataFrame(trnX, columns=data.columns), DataFrame(trnY,columns=[target])], axis=1)
train.to_csv(f'data/{file_tag}_train.csv', index=False)
test = concat([DataFrame(tstX, columns=data.columns), DataFrame(tstY,columns=[target])], axis=1)
test.to_csv(f'data/{file_tag}_test.csv', index=False)
values['Train'] = [len(trnY[trnY == positive]), len(trnY[trnY == negative])]
values['Test'] = [len(tstY[tstY == positive]), len(tstY[tstY == negative])]
plt.figure(figsize=(12,4))
ds.multiple_bar_chart([positive, negative], values, title='Data distribution per dataset')
plt.show()
The code above applies a hold-out split through train_test_split, which receives both X and y as the data to split, and returns each of them split in two: trnX will contain train_size (70%) of X and tstX the remaining 30%, and the same for y.
The evaluation of the results of each learnt model, in the classification paradigm, is objective and straightforward. We just need to assess if the predicted labels are correct, which is done by measuring the number of records where the predicted label is equal to the known ones.
The simplest measure is accuracy, which reports the percentage of correct predictions; it is just the complement of the error. In sklearn, accuracy is reported through the score method of each classifier, after its training, measured over a particular dataset and its known labels.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(trnX, trnY)
clf.score(tstX, tstY)
0.7878787878787878
In our example, naive Bayes presents a not so good result, with an accuracy just below 79%. But by itself, this number doesn't allow us to understand which errors are being made - where naive Bayes is struggling.
Indeed, we first need to distinguish among the errors. In the presence of binary classification problems (where there are only two possible values for the target variable), this distinction is easy. Usually, we call the most common target value negative and the other one positive.
From this, we have:
True Positives (TP): positive records correctly classified as positive;
True Negatives (TN): negative records correctly classified as negative;
False Positives (FP): negative records incorrectly classified as positive;
False Negatives (FN): positive records incorrectly classified as negative.
The confusion matrix is the standard way to present these numbers, and is computed through the confusion_matrix function in the sklearn.metrics package.
import numpy as np
import sklearn.metrics as metrics
labels: np.ndarray = unique(y)
prdY: np.ndarray = clf.predict(tstX)
cnf_mtx: np.ndarray = metrics.confusion_matrix(tstY, prdY, labels=labels)
cnf_mtx
array([[ 56,  25],
       [ 24, 126]])
Unfortunately, we are not able to see the correspondence between the matrix elements and the notions enumerated above, since there is no universal standard for assigning the rows and columns of the matrix. So the best way is to plot it in a specialized chart, like the one below.
import itertools
import matplotlib.pyplot as plt
CMAP = plt.cm.Blues
def plot_confusion_matrix(cnf_matrix: np.ndarray, classes_names: np.ndarray, ax: plt.Axes = None,
                          normalize: bool = False):
    if ax is None:
        ax = plt.gca()
    if normalize:
        total = cnf_matrix.sum(axis=1)[:, np.newaxis]
        cm = cnf_matrix.astype('float') / total
        title = "Normalized confusion matrix"
    else:
        cm = cnf_matrix
        title = 'Confusion matrix'
    np.set_printoptions(precision=2)
    tick_marks = np.arange(0, len(classes_names), 1)
    ax.set_title(title)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    ax.set_xticks(tick_marks)
    ax.set_yticks(tick_marks)
    ax.set_xticklabels(classes_names)
    ax.set_yticklabels(classes_names)
    ax.imshow(cm, interpolation='nearest', cmap=CMAP)
    fmt = '.2f' if normalize else 'd'
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        ax.text(j, i, format(cm[i, j], fmt), color='w', horizontalalignment="center")
fig, axs = plt.subplots(1, 2, figsize=(8, 4), squeeze=False)
plot_confusion_matrix(cnf_mtx, labels, ax=axs[0,0])
plot_confusion_matrix(metrics.confusion_matrix(tstY, prdY, labels=labels), labels, ax=axs[0,1], normalize=True)
plt.tight_layout()
plt.show()
Whenever we are in the presence of a non-binary classification problem, we adapt those notions to each possible combination of classes. For example, in the iris dataset we have 3 different classes: iris-setosa, iris-versicolor and iris-virginica.
data = read_csv('data/iris.csv')
y = data.pop('class').values
X = data.values
labels = unique(y)
trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y)
clf = GaussianNB()
clf.fit(trnX, trnY)
prdY = clf.predict(tstX)
fig, axs = plt.subplots(1, 2, figsize=(8, 4), squeeze=False)
plot_confusion_matrix(metrics.confusion_matrix(tstY, prdY, labels=labels), labels, ax=axs[0,0])
plot_confusion_matrix(metrics.confusion_matrix(tstY, prdY, labels=labels), labels, ax=axs[0,1], normalize=True)
plt.tight_layout()
plt.show()
Besides the confusion matrix, there are several other measures that try to reflect the quality of the model, also available in the sklearn.metrics package. Next, we summarize the most used ones.
Classification metrics

Metric | Description
---|---
recall_score | also called sensitivity and TP rate, reveals the model's ability to recognize the positive records, and is given by TP / (TP + FN)
precision_score | reveals the model's ability to not misclassify negative records as positive, and is given by TP / (TP + FP)
f1_score | combines precision and recall through their harmonic mean, 2 × (precision × recall) / (precision + recall)
balanced_accuracy_score | reveals the average of the recall scores for all the classes; receives the known labels in tstY and the predicted ones in prdY
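A minimal sketch of how these metrics can be computed, assuming a binary problem such as the diabetes split presented earlier, with tstY holding the known labels, prdY the predicted ones and 'P' the positive class (for the multiclass iris example, an average parameter such as average='macro' would be needed instead of pos_label):
from sklearn.metrics import recall_score, precision_score, f1_score, balanced_accuracy_score

recall = recall_score(tstY, prdY, pos_label='P')         # TP / (TP + FN)
precision = precision_score(tstY, prdY, pos_label='P')   # TP / (TP + FP)
f1 = f1_score(tstY, prdY, pos_label='P')                 # harmonic mean of precision and recall
bal_acc = balanced_accuracy_score(tstY, prdY)            # average recall over all classes
print(f'recall={recall:.2f} precision={precision:.2f} f1={f1:.2f} balanced accuracy={bal_acc:.2f}')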
ROC charts are another means to understand a model's performance, in particular in the presence of binary unbalanced datasets. They present the balance between the True Positive rate (recall) and the False Positive rate in a graphical way, and are available through the roc_curve method in sklearn.metrics.
def plot_roc_chart(models: dict, tstX: np.ndarray, tstY: np.ndarray, ax: plt.Axes = None, target: str = 'class'):
    if ax is None:
        ax = plt.gca()
    ax.set_xlim(0.0, 1.0)
    ax.set_ylim(0.0, 1.0)
    ax.set_xlabel('FP rate')
    ax.set_ylabel('TP rate')
    ax.set_title('ROC chart for %s' % target)
    # diagonal corresponding to a random classifier
    ax.plot([0, 1], [0, 1], color='navy', label='random', linewidth=1, linestyle='--', marker='')
    for clf in models.keys():
        # note: plot_roc_curve was removed in sklearn 1.2; there, use metrics.RocCurveDisplay.from_estimator instead
        metrics.plot_roc_curve(models[clf], tstX, tstY, ax=ax, marker='', linewidth=1)
    ax.legend(loc="lower right")
data = read_csv('data/diabetes.csv')
y = data.pop('class').values
X = data.values
trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y)
model = GaussianNB().fit(trnX, trnY)
plt.figure()
plot_roc_chart({'GaussianNB': model}, tstX, tstY, target='class')
plt.show()
However, ROC charts require two parameters: tstY and scores. While the first one is just the labels for the test set, the second one reflects a kind of probability of each record in the test set being positive. These scores are not trivial to get from some classification techniques, but can be computed through the predict_proba method of their estimators. It works like the predict method, but instead of returning the predictions themselves, it returns those scores.
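A minimal sketch of how to obtain those scores for the GaussianNB model trained above on the diabetes data, and feed them to roc_curve (predict_proba returns one probability column per class, ordered as in the estimator's classes_ attribute):
prd_proba = model.predict_proba(tstX)              # one probability column per class
pos_index = list(model.classes_).index(positive)   # column corresponding to the positive class 'P'
scores = prd_proba[:, pos_index]

fpr, tpr, thresholds = metrics.roc_curve(tstY, scores, pos_label=positive)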
In addition to the chart, the area under the ROC curve, AUC for short, is another important measure, mostly for unbalanced datasets. It is available as roc_auc_score, in the sklearn.metrics package, and receives the known labels as its first parameter and the previously computed scores as the second one, just like the roc_curve method.
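Continuing the previous sketch, the AUC for the same model can be obtained from the known labels and the computed scores:
# roc_auc_score expects the scores of the class with the greater label ('P' here)
auc = metrics.roc_auc_score(tstY, scores)
print(f'AUC = {auc:.2f}')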
With the data split, we proceed to create the prediction model. However, there is a plethora of techniques and extensions, with an infinite number of different parametrisations, and the choice of the best one to apply can only be made by comparing their results on our data. Additionally, each technique works better for data with some specific characteristics, which demands the application of some data preparation transformations.
In sklearn, an estimator is an object of an extension of the BaseEstimator class, which implements the fit and predict methods. Besides these, it also implements the score method. Estimator parametrisation is done by passing the different choices as parameters to the constructor method.
Note that in sklearn there is no class to represent the learnt models, but their effects are reachable through the estimator object. Indeed, an estimator is the result of parametrising a learning technique and training it over a particular dataset, creating a classification model.
Estimator methods

Method | Description
---|---
fit(trnX, trnY) | trains the classifier over the data trnX, labeled according to trnY, creating an internal model
predict(trnX) | applies the learnt model to the data in trnX and returns the predicted labels
score(tstX, tstY) | applies the model to tstX and compares the predicted labels to the known labels in tstY, computing the model's mean accuracy on the given data
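A minimal illustration of this protocol, using KNeighborsClassifier (one of the techniques listed next) with an illustrative parametrisation over the previous diabetes split:
from sklearn.neighbors import KNeighborsClassifier

estimator = KNeighborsClassifier(n_neighbors=5)   # parametrisation goes in the constructor (5 is just an illustrative choice)
estimator.fit(trnX, trnY)                         # learns the internal model
prd = estimator.predict(tstX)                     # predicted labels for the given records
accuracy = estimator.score(tstX, tstY)            # mean accuracy over the given data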
Among the techniques that we are going to use are: GaussianNB, KNeighborsClassifier, DecisionTreeClassifier, RandomForestClassifier and GradientBoostingClassifier.
The rest of this module is organized in a similar way for each one of the classification techniques: it first succinctly describes the technique and its main parameters; then we train different models through different parametrisations of the technique, using a 70% train - 30% test split strategy, and evaluate the accuracy of each model as explained, comparing the different results.
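As a sketch of that workflow, assuming the KNeighborsClassifier technique and a few illustrative values for its n_neighbors parameter, the comparison amounts to training one model per parametrisation over the same split and recording the accuracy of each:
from sklearn.neighbors import KNeighborsClassifier

n_values = [1, 3, 5, 9]   # illustrative parametrisations to compare
accuracies = {}
for n in n_values:
    clf = KNeighborsClassifier(n_neighbors=n)
    clf.fit(trnX, trnY)                     # train over the 70% train split
    accuracies[n] = clf.score(tstX, tstY)   # accuracy over the 30% test split
print(accuracies)   # the best parametrisation is the one with the highest accuracy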