In [1]:
from PIL import Image, ImageOps
import numpy, os
from sklearn.cross_validation import cross_val_score
import numpy as np
import pandas as pd



In [2]:
path="dataset/"
Xlist=[]
Ylist=[]
size = 100, 100

In [3]:
print("reading dataset images files")
for directory in os.listdir(path):
    for file in os.listdir(path+directory):
        #print(path+directory+"/"+file)
        img=Image.open(path+directory+"/"+file)
        #resize
        thumb = ImageOps.fit(img, size, Image.ANTIALIAS)
        image_data = np.array(thumb).flatten()[:100]
        #image_data=numpy.array(img).flatten()[:50] #in my case the images dont have the same dimensions, so [:50] only takes the first 50 values
        Xlist.append(image_data)
        Ylist.append(directory)

reading dataset images files


In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Xlist, Ylist, test_size=0.2)

### AdaBoostClassifier

In [5]:
from sklearn.ensemble import AdaBoostClassifier
clf=AdaBoostClassifier(n_estimators=100)
scores = cross_val_score(clf, X_train, y_train, cv=3)
print(scores.mean())

0.762399355878


### GaussianNB

In [6]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
scores = cross_val_score(clf, Xlist, Ylist)
print(scores.mean())

0.721908939014


### KNeighborsClassifier

In [7]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=10)
scores = cross_val_score(clf, Xlist, Ylist)
print(scores.mean())

0.751357560568


### LinearSVC

In [8]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
scores = cross_val_score(clf, Xlist, Ylist)
print(scores.mean())

0.66238512949


### SVC

In [9]:
from sklearn.svm import SVC
clf = SVC()
scores = cross_val_score(clf, Xlist, Ylist)
print(scores.mean())

0.668650793651


### GaussianProcessClassifier

In [10]:
from sklearn.gaussian_process import GaussianProcessClassifier
clf = GaussianProcessClassifier()
scores = cross_val_score(clf, Xlist, Ylist)
print(scores.mean())

0.491228070175


### RandomForestClassifier

In [11]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
scores = cross_val_score(clf, Xlist, Ylist)
print(scores.mean())

0.775793650794


# Hyperparameters Tuning using sklearn pipeline and gridsearch

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [13]:
pipe = [Pipeline([
        ('clf', RandomForestClassifier()),
    ]),
    Pipeline([
        ('clf', KNeighborsClassifier()),
    ]),
    Pipeline([
        ('clf', GaussianProcessClassifier()),
    ]),
    Pipeline([
        ('clf', AdaBoostClassifier()),
    ]),
    Pipeline([
        ('clf', SVC()),
    ]),
]

In [14]:
param_grid = [dict(clf__n_estimators=[3, 10, 100]),
              dict(clf__n_neighbors=[3,10]),
              dict(clf__n_restarts_optimizer=[0,1]),
              dict(clf__n_estimators=[3, 10, 100]),
              dict(clf__C=[3, 10, 100]),
             ]

In [15]:
grid_search = GridSearchCV(pipe, param_grid=param_grid, n_jobs=-1, verbose=1, cv=3)

In [16]:
# Utility function to report best scores
def report(results, n_top=10):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [17]:
from time import time

for i in range(len(pipe)):
        start = time()
        print("-----")
        print("classifier:")
        print(pipe[i].named_steps['clf'])
        grid_search = GridSearchCV(pipe[i], param_grid[i], n_jobs=-1, verbose=1, cv=3)
        grid_search.fit(X_train, y_train)
        print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
              % (time() - start, len(grid_search.cv_results_['params'])))
        print("finished GridSearch")
        report(grid_search.cv_results_)

-----
classifier:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.8s finished


GridSearchCV took 1.45 seconds for 3 candidate parameter settings.
finished GridSearch
Model with rank: 1
Mean validation score: 0.800 (std: 0.085)
Parameters: {'clf__n_estimators': 100}

Model with rank: 2
Mean validation score: 0.778 (std: 0.035)
Parameters: {'clf__n_estimators': 3}

Model with rank: 3
Mean validation score: 0.741 (std: 0.046)
Parameters: {'clf__n_estimators': 10}

-----
classifier:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
Fitting 3 folds for each of 2 candidates, totalling 6 fits
GridSearchCV took 0.35 seconds for 2 candidate parameter settings.
finished GridSearch
Model with rank: 1
Mean validation score: 0.756 (std: 0.056)
Parameters: {'clf__n_neighbors': 10}

Model with rank: 2
Mean validation score: 0.748 (std: 0.111)
Parameters: {'clf__n_neighbors': 3}

-----
classifier:
GaussianProcessClassifier(copy_X_train=True, kernel=None,
            

[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.1s finished


GridSearchCV took 0.47 seconds for 2 candidate parameter settings.
finished GridSearch
Model with rank: 1
Mean validation score: 0.496 (std: 0.005)
Parameters: {'clf__n_restarts_optimizer': 0}

Model with rank: 1
Mean validation score: 0.496 (std: 0.005)
Parameters: {'clf__n_restarts_optimizer': 1}

-----
classifier:
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    0.3s finished


GridSearchCV took 1.06 seconds for 3 candidate parameter settings.
finished GridSearch
Model with rank: 1
Mean validation score: 0.793 (std: 0.088)
Parameters: {'clf__n_estimators': 3}

Model with rank: 2
Mean validation score: 0.785 (std: 0.084)
Parameters: {'clf__n_estimators': 10}

Model with rank: 3
Mean validation score: 0.763 (std: 0.048)
Parameters: {'clf__n_estimators': 100}

-----
classifier:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.9s finished


GridSearchCV took 0.36 seconds for 3 candidate parameter settings.
finished GridSearch
Model with rank: 1
Mean validation score: 0.689 (std: 0.067)
Parameters: {'clf__C': 3}

Model with rank: 1
Mean validation score: 0.689 (std: 0.067)
Parameters: {'clf__C': 10}

Model with rank: 1
Mean validation score: 0.689 (std: 0.067)
Parameters: {'clf__C': 100}



[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.1s finished
