In [None]:
import numpy as np

## Dataset loading

### Default datasets

In [None]:
from sklearn.datasets import load_breast_cancer

# Load the bundled breast-cancer dataset and print its description.
breast = load_breast_cancer()
print(breast.DESCR)

# Matrix of examples (X) and the associated labels (y)
X = breast.data
y = breast.target

In [None]:
from sklearn.datasets import load_digits
# Load the bundled digits dataset; its `images` attribute is plotted in the next cell.
digits = load_digits()
# Print the description text shipped with the dataset.
print(digits.DESCR)

In [None]:
import matplotlib.pyplot as plt
# Switch pyplot to the gray colormap before drawing the image.
plt.gray()
# Draw the image stored at index 2 of the digits dataset as a matrix plot.
plt.matshow(digits.images[2])
plt.show()

### From an svmlight file

In [None]:
# Load a dataset in svmlight format from a URL.
# `urllib.request` must be imported explicitly: a bare `import urllib` does not
# guarantee that the `request` submodule is loaded.
import urllib.request

#SVMLIGHT
from sklearn.datasets import load_svmlight_file

# The context manager closes the HTTP response as soon as parsing is finished.
with urllib.request.urlopen("https://www.math.unipd.it/~aiolli/teaching/ml2122/tic-tac-toe.svmlight") as raw_data:
    X, y = load_svmlight_file(raw_data)  # or "filename" if loaded from a local file

# The loader returns a sparse matrix; convert it to a dense array (alternative: todense)
X = X.toarray()

## Training and test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the examples as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Preprocessing

### Standard scaler

Standardizes features by removing the mean and scaling to unit variance (each feature ends up with mean 0 and variance 1).

### Minmax scaler
By default, it scales each feature to the range [0, 1] according to the following transformation, where `(min, max)` is the target feature range:

```
X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
X_scaled = X_std * (max - min) + min
```


In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Fit the scaler on the training split only, then apply the same fitted
# transformation to both splits.
# Alternative: replace StandardScaler() with MinMaxScaler().
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Evaluation (classification)

$\textit{accuracy} = \frac{\text{TP}+\text{TN}}{\text{TP}+\text{TN}+\text{FP}+\text{FN}}$

$\textit{precision} = \frac{\text{TP}}{\text{TP}+\text{FP}}$

$\textit{recall} = \frac{\text{TP}}{\text{TP}+\text{FN}}$

*AUC* = Area Under the ROC Curve

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

def evaluate(y_test, y_pred):
    """Print accuracy, precision and recall of a set of predictions.

    Args:
        y_test: ground-truth labels.
        y_pred: predicted labels, compared against `y_test` by each metric.

    Returns:
        None. The three scores are printed, followed by a blank separator line.
    """
    print("accuracy:", accuracy_score(y_test, y_pred))
    print("precision:", precision_score(y_test, y_pred))
    print("recall:", recall_score(y_test, y_pred))
    # AUC is currently disabled: print("AUC:", roc_auc_score(y_test, y_pred))
    # A bare `print` is a no-op expression in Python 3; it must be called to
    # actually emit the blank line that separates successive reports.
    print()

# Methods for classification

## Decision trees

In [None]:
from sklearn import tree

# Decision tree with default hyper-parameters (main knobs: criterion, max_depth,
# min_samples_split, min_samples_leaf, max_features, class_weight).
# The seed is fixed so that the fitted tree and the scores printed below are
# reproducible across runs, consistently with the seeded train/test split.
clf_tree = tree.DecisionTreeClassifier(random_state=42)

clf_tree = clf_tree.fit(X_train, y_train)
y_pred = clf_tree.predict(X_test)

print("DT")
evaluate(y_test, y_pred)

## Multi Layer Perceptron

In [None]:
from sklearn.neural_network import MLPClassifier

# One hidden layer of 100 units. The seed fixes the random weight
# initialisation and batch sampling, so the scores printed below are
# reproducible across runs.
mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=500,
    alpha=1,
    tol=1e-8,
    learning_rate_init=.01,
    random_state=42,
)

mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

print("MLP")
evaluate(y_test, y_pred)

## Support Vector Machine

In [None]:
#SVM
from sklearn import svm

# The kernel is optional and defaults to rbf; the other choices are
# linear, poly and sigmoid.
svc = svm.SVC(C=100.0, gamma=0.0001)
svc.fit(X_train, y_train)

y_pred = svc.predict(X_test)

print("SVM")
evaluate(y_test, y_pred)

# Model selection

## k-fold cross validation and grid search

In [None]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn import metrics

Dictionary containing the parameter grids for SVM (RBF and polynomial kernels, plus a precomputed/custom kernel) and NN

In [None]:
# Candidate values of the SVM regularisation parameter C: 2^-5 ... 2^4
C_values = [2 ** exponent for exponent in range(-5, 5)]

# Parameter grids, keyed by model family:
#   "svm"         -> two sub-grids, one for the RBF kernel and one for the polynomial kernel
#   "precomputed" -> SVM fed with a precomputed kernel matrix (only C is tuned)
#   "nn"          -> multi-layer perceptron
p_grid = {
    "svm": [
        dict(C=C_values, kernel=["rbf"], gamma=[10 ** exponent for exponent in range(-4, 4)]),
        dict(C=C_values, kernel=["poly"], degree=list(range(2, 6))),
    ],
    "precomputed": dict(C=C_values, kernel=["precomputed"]),
    "nn": dict(
        alpha=[10 ** exponent for exponent in range(-5, 1)],
        hidden_layer_sizes=[(10,), (50,), (100,), (200,)],
    ),
}

In [None]:
# This cell relies on names defined in earlier cells: `np`, `svm`,
# `GridSearchCV`, `KFold`, `metrics`, `accuracy_score`, `p_grid`, `X` and `y`.
# (`MLPClassifier` is needed only if the NN line below is enabled.)

### Optionally hide sklearn deprecation warnings
#import warnings
#warnings.filterwarnings("ignore", category=DeprecationWarning)

kernel = "precomputed" #rbf/linear/poly

#Example of linear kernel
Klin = np.dot(X, X.T)

# Pick the parameter grid that matches the kernel setting: the "precomputed"
# grid is only valid when a Gram matrix is fed to the SVM; for any other
# setting the regular SVM grid (which searches the RBF and polynomial
# kernels) is used.
svm_grid = p_grid["precomputed"] if kernel == "precomputed" else p_grid["svm"]

### Outer 5-fold cross-validation; every fold runs its own grid search
skf = KFold(n_splits=5, shuffle=True, random_state=42)

accs = []
for fold, (train, test) in enumerate(skf.split(X, y), start=1):
    print("FOLD:", fold)
    X_train, X_test = X[train], X[test]

    ### With a precomputed kernel the estimator is given slices of the kernel
    ### matrix instead of raw features: rows are the examples to fit/predict,
    ### columns are always the training examples of the fold.
    if kernel == "precomputed":
        X_train, X_test = Klin[train][:, train], Klin[test][:, train]

    ### Possible Preprocessing!!!

    ### Validation of parameters with Grid Search (5-fold CV on the training part)
    # The kernel must be passed by keyword: SVC's first parameter is C, and
    # current scikit-learn versions reject positional constructor arguments.
    clf = GridSearchCV(svm.SVC(kernel=kernel), param_grid=svm_grid, cv=5, scoring='accuracy') #SVM
    #clf = GridSearchCV(MLPClassifier(), param_grid=p_grid["nn"], cv=5, scoring='accuracy') #NN

    ### Training
    clf.fit(X_train, y[train])

    ### Info about validation
    #print("CV info:", clf.cv_results_.keys())
    print("VALIDATION score:", clf.best_score_)
    print("BEST parameters:", clf.best_params_)
    #clf.best_estimator_

    ### score (not the prediction!) Useful to compute AUC (ranking metric)
    #y_pred = clf.decision_function(X_test)

    y_pred = clf.predict(X_test)
    y_true = y[test]

    ### Classification report and confusion matrix on the held-out test fold
    print(metrics.classification_report(y_true, y_pred))
    print(metrics.confusion_matrix(y_true, y_pred))

    #auc = roc_auc_score(y_true, y_pred) # AUC
    acc = accuracy_score(y_true, y_pred) # Accuracy
    print("TEST score:{}\n".format(acc))

    accs.append(acc)

# Mean and standard deviation of the test accuracy over the outer folds
print("AVG ACCURACY:", np.mean(accs), "+-", np.std(accs))