Andreas Mueller
# Load the digits dataset and split it into features X and labels y.
from sklearn.datasets import load_digits
digits = load_digits()
X, y = digits.data, digits.target
# Print the array shapes (the recorded output shows (1797, 64) and (1797,)).
print(X.shape)
print(y.shape)
# Display the first sample: its 64 feature values reshaped into an 8x8 grid.
# NOTE(review): `plt` is not imported in the visible source -- presumably it is
# provided by the notebook environment; confirm.
plt.matshow(X[0, :].reshape(8, 8))
(1797, 64) (1797,)
<matplotlib.image.AxesImage at 0x7fd5da21cd50>
# Split the digits data into a training part and a held-out test part.
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
# Fit a random forest with 50 trees on the training part and print its
# score on the held-out part.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50)
rf.fit(X_train, y_train)
print(rf.score(X_test, y_test))
0.973333333333
# Grid search over the forest's max_depth with 5-fold cross-validation on the
# training data, then print the score of the fitted search on the test data.
from sklearn.grid_search import GridSearchCV
search = GridSearchCV(rf, param_grid={'max_depth': [1, 3, 5, 10]}, cv=5)
search.fit(X_train, y_train)
print(search.score(X_test, y_test))
0.975555555556
# Load a second training set from CSV. This rebinds X_train / y_train, which
# previously held the digits split.
import pandas as pd
df_train = pd.read_csv("../../biology_datasets_stuff/AID687red_train.csv")
# Class counts of the Outcome column (the recorded output shows 26378
# 'Inactive' rows versus 76 'Active' rows).
print(df_train.Outcome.value_counts())
# Every column except the last is used as a feature; the last column is the label.
# NOTE(review): assumes Outcome is the last column of the CSV -- confirm.
X_train, y_train = df_train.values[:, :-1], df_train.values[:, -1]
# Convert the string labels into a boolean target: True where the label is 'Active'.
y_train = y_train == 'Active'
print(X_train.shape)
print(y_train.shape)
Inactive 26378 Active 76 dtype: int64 (26454, 153) (26454,)
# Same max_depth grid on the new data, 3 folds, no `scoring` argument given;
# verbose=10 prints per-fold progress. In the recorded output every fold scores
# about 0.997 regardless of max_depth.
# NOTE(review): with the class counts printed earlier, that value is presumably
# just the majority-class rate -- confirm before reading it as model quality.
search = GridSearchCV(rf, param_grid={'max_depth': [1, 3, 5, 10]}, cv=3, verbose=10)
search.fit(X_train, y_train)
Fitting 3 folds for each of 4 candidates, totalling 12 fits [CV] max_depth=1 ..................................................... [CV] ............................ max_depth=1, score=0.997052 - 0.6s [CV] max_depth=1 ..................................................... [CV] ............................ max_depth=1, score=0.997165 - 0.6s
[Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 0.6s [Parallel(n_jobs=1)]: Done 2 jobs | elapsed: 1.3s
[CV] max_depth=1 ..................................................... [CV] ............................ max_depth=1, score=0.997165 - 0.6s [CV] max_depth=3 ..................................................... [CV] ............................ max_depth=3, score=0.997052 - 1.1s [CV] max_depth=3 ..................................................... [CV] ............................ max_depth=3, score=0.997165 - 1.1s [CV] max_depth=3 ..................................................... [CV] ............................ max_depth=3, score=0.997165 - 1.1s [CV] max_depth=5 ..................................................... [CV] ............................ max_depth=5, score=0.997052 - 1.6s [CV] max_depth=5 ..................................................... [CV] ............................ max_depth=5, score=0.997165 - 1.8s
[Parallel(n_jobs=1)]: Done 5 jobs | elapsed: 4.1s [Parallel(n_jobs=1)]: Done 8 jobs | elapsed: 8.6s
[CV] max_depth=5 ..................................................... [CV] ............................ max_depth=5, score=0.997165 - 2.0s [CV] max_depth=10 .................................................... [CV] ........................... max_depth=10, score=0.997052 - 2.7s [CV] max_depth=10 .................................................... [CV] ........................... max_depth=10, score=0.997165 - 2.6s [CV] max_depth=10 .................................................... [CV] ........................... max_depth=10, score=0.997165 - 2.0s
[Parallel(n_jobs=1)]: Done 12 out of 12 | elapsed: 17.9s finished
GridSearchCV(cv=3, estimator=RandomForestClassifier(bootstrap=True, compute_importances=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_density=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50, n_jobs=1, oob_score=False, random_state=None, verbose=0), fit_params={}, iid=True, loss_func=None, n_jobs=1, param_grid={'max_depth': [1, 3, 5, 10]}, pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None, verbose=10)
# Repeat the search but select the metric by name with scoring='roc_auc'.
# In the recorded output the fold scores now vary (roughly 0.46 to 0.72).
search = GridSearchCV(rf, param_grid={'max_depth': [1, 3, 5, 10]}, scoring='roc_auc', cv=3, verbose=10)
search.fit(X_train, y_train)
Fitting 3 folds for each of 4 candidates, totalling 12 fits [CV] max_depth=1 ..................................................... [CV] ............................ max_depth=1, score=0.599804 - 0.6s [CV] max_depth=1 ..................................................... [CV] ............................ max_depth=1, score=0.640751 - 0.7s
[Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 0.6s [Parallel(n_jobs=1)]: Done 2 jobs | elapsed: 1.3s
[CV] max_depth=1 ..................................................... [CV] ............................ max_depth=1, score=0.487825 - 0.7s [CV] max_depth=3 ..................................................... [CV] ............................ max_depth=3, score=0.632505 - 1.2s [CV] max_depth=3 ..................................................... [CV] ............................ max_depth=3, score=0.715722 - 1.2s [CV] max_depth=3 ..................................................... [CV] ............................ max_depth=3, score=0.491206 - 1.2s [CV] max_depth=5 ..................................................... [CV] ............................ max_depth=5, score=0.633345 - 1.7s [CV] max_depth=5 ..................................................... [CV] ............................ max_depth=5, score=0.705129 - 1.6s
[Parallel(n_jobs=1)]: Done 5 jobs | elapsed: 4.4s [Parallel(n_jobs=1)]: Done 8 jobs | elapsed: 8.9s
[CV] max_depth=5 ..................................................... [CV] ............................ max_depth=5, score=0.527236 - 1.5s [CV] max_depth=10 .................................................... [CV] ........................... max_depth=10, score=0.716901 - 2.4s [CV] max_depth=10 .................................................... [CV] ........................... max_depth=10, score=0.694252 - 2.5s [CV] max_depth=10 .................................................... [CV] ........................... max_depth=10, score=0.456285 - 2.0s
[Parallel(n_jobs=1)]: Done 12 out of 12 | elapsed: 17.4s finished
GridSearchCV(cv=3, estimator=RandomForestClassifier(bootstrap=True, compute_importances=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_density=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50, n_jobs=1, oob_score=False, random_state=None, verbose=0), fit_params={}, iid=True, loss_func=None, n_jobs=1, param_grid={'max_depth': [1, 3, 5, 10]}, pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring='roc_auc', verbose=10)
# The metric used by a search is chosen by passing a scorer name as `scoring`.
# These three searches are only constructed, not fitted.
param_grid = {'max_depth': [1, 3, 5, 10]}
GridSearchCV(rf, param_grid, scoring='accuracy')
GridSearchCV(rf, param_grid, scoring='roc_auc')
GridSearchCV(rf, param_grid, scoring='average_precision');
from sklearn.metrics import SCORERS
# Print the name of every predefined scorer registered in SCORERS.
for scoring in SCORERS:
    print(scoring)
adjusted_rand_score f1 mean_absolute_error r2 recall precision log_loss mean_squared_error roc_auc average_precision accuracy
from sklearn.metrics import make_scorer
def my_freaky_loss(y_true, probability):
    """Count the samples whose true class received a probability above 0.4.

    Parameters
    ----------
    y_true : integer array of length n_samples
        Class indices; each one is used as a column index into
        ``probability``.
    probability : 2-D array with one row per sample
        Per-class probabilities; row ``i`` is indexed at column ``y_true[i]``.

    Returns
    -------
    The number of samples ``i`` with ``probability[i, y_true[i]] > .4``.
    """
    # Pair every row index with that row's true-class column to pick out the
    # probability assigned to the correct class, then count the confident ones.
    return np.sum(probability[np.arange(len(y_true)), y_true] > .4)
# Wrap the metric as a scorer. needs_proba=True makes the scorer hand the
# estimator's predict_proba output to my_freaky_loss. The earlier keyword
# `needs_probability` is not a make_scorer option: it would be forwarded to
# my_freaky_loss as an extra keyword argument and fail when the scorer runs.
my_freaky_scorer = make_scorer(my_freaky_loss, needs_proba=True)
# The scorer object is passed as `scoring`; the search is only constructed here.
GridSearchCV(rf, param_grid, scoring=my_freaky_scorer)
def freaky_regularized_scorer(estimator, X, y_true):
    """Score a fitted estimator and add a term based on its coefficient norm.

    Uses the ``(estimator, X, y)`` callable signature, so it can be passed
    directly as ``scoring``.

    Parameters
    ----------
    estimator : fitted object exposing ``score(X, y)`` and ``coef_``
    X : data forwarded unchanged to ``estimator.score``
    y_true : targets forwarded unchanged to ``estimator.score``

    Returns
    -------
    ``estimator.score(X, y_true)`` plus 0.1 times the norm of
    ``estimator.coef_``.
    """
    # NOTE(review): the norm term is *added*, so larger coefficients raise the
    # score -- confirm this is intended for a "regularized" scorer.
    return estimator.score(X, y_true) + .1 * np.linalg.norm(estimator.coef_)
# A plain callable can be passed as `scoring`; the search is only constructed
# here, not fitted.
# NOTE(review): freaky_regularized_scorer reads estimator.coef_ -- confirm that
# `rf` exposes that attribute before actually fitting this search.
GridSearchCV(rf, param_grid, scoring=freaky_regularized_scorer);
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.svm import SVC
from sklearn.grid_search import RandomizedSearchCV
from scipy.stats import expon
# Randomized search over a feature-selection + SVM pipeline on the digits data.
X, y = digits.data, digits.target
pipe = Pipeline([("feature_selection", SelectKBest()), ("classifier", SVC())])
# Keys follow the <step name>__<parameter> pattern of the pipeline above.
# k ranges over 1 .. X.shape[1] - 1, C and gamma come from expon(0, 1),
# kernel and degree are picked from the given lists.
param_distribution = {'feature_selection__k': np.arange(1, X.shape[1]), 'classifier__C': expon(0, 1),
'classifier__gamma': expon(0, 1), 'classifier__kernel': ['linear', 'rbf', 'poly'],
'classifier__degree': [1, 2, 3, 4]}
# verbose=10 prints per-fold progress (the recorded output shows 10 candidates
# evaluated on 3 folds each); afterwards print the best parameter setting found.
search = RandomizedSearchCV(pipe, param_distribution, verbose=10)
search.fit(X, y)
print(search.best_params_)
Fitting 3 folds for each of 10 candidates, totalling 30 fits [CV] classifier__gamma=0.300178061032, feature_selection__k=12, classifier__kernel=linear, classifier__C=2.25680169839, classifier__degree=1 [CV] classifier__gamma=0.300178061032, feature_selection__k=12, classifier__kernel=linear, classifier__C=2.25680169839, classifier__degree=1, score=0.792359 - 0.3s [CV] classifier__gamma=0.300178061032, feature_selection__k=12, classifier__kernel=linear, classifier__C=2.25680169839, classifier__degree=1 [CV] classifier__gamma=0.300178061032, feature_selection__k=12, classifier__kernel=linear, classifier__C=2.25680169839, classifier__degree=1, score=0.821369 - 0.2s
[Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 0.3s [Parallel(n_jobs=1)]: Done 2 jobs | elapsed: 0.5s
[CV] classifier__gamma=0.300178061032, feature_selection__k=12, classifier__kernel=linear, classifier__C=2.25680169839, classifier__degree=1 [CV] classifier__gamma=0.300178061032, feature_selection__k=12, classifier__kernel=linear, classifier__C=2.25680169839, classifier__degree=1, score=0.862416 - 0.1s [CV] classifier__gamma=0.502230132663, feature_selection__k=24, classifier__kernel=linear, classifier__C=0.0224484874016, classifier__degree=3 [CV] classifier__gamma=0.502230132663, feature_selection__k=24, classifier__kernel=linear, classifier__C=0.0224484874016, classifier__degree=3, score=0.900332 - 0.0s [CV] classifier__gamma=0.502230132663, feature_selection__k=24, classifier__kernel=linear, classifier__C=0.0224484874016, classifier__degree=3 [CV] classifier__gamma=0.502230132663, feature_selection__k=24, classifier__kernel=linear, classifier__C=0.0224484874016, classifier__degree=3, score=0.923205 - 0.0s [CV] classifier__gamma=0.502230132663, feature_selection__k=24, classifier__kernel=linear, classifier__C=0.0224484874016, classifier__degree=3 [CV] classifier__gamma=0.502230132663, feature_selection__k=24, classifier__kernel=linear, classifier__C=0.0224484874016, classifier__degree=3, score=0.899329 - 0.0s [CV] classifier__gamma=0.524948417571, feature_selection__k=10, classifier__kernel=rbf, classifier__C=0.494783753598, classifier__degree=4 [CV] classifier__gamma=0.524948417571, feature_selection__k=10, classifier__kernel=rbf, classifier__C=0.494783753598, classifier__degree=4, score=0.104651 - 0.2s [CV] classifier__gamma=0.524948417571, feature_selection__k=10, classifier__kernel=rbf, classifier__C=0.494783753598, classifier__degree=4 [CV] classifier__gamma=0.524948417571, feature_selection__k=10, classifier__kernel=rbf, classifier__C=0.494783753598, classifier__degree=4, score=0.126878 - 0.2s
[Parallel(n_jobs=1)]: Done 5 jobs | elapsed: 0.7s [Parallel(n_jobs=1)]: Done 8 jobs | elapsed: 1.2s
[CV] classifier__gamma=0.524948417571, feature_selection__k=10, classifier__kernel=rbf, classifier__C=0.494783753598, classifier__degree=4 [CV] classifier__gamma=0.524948417571, feature_selection__k=10, classifier__kernel=rbf, classifier__C=0.494783753598, classifier__degree=4, score=0.105705 - 0.3s [CV] classifier__gamma=0.224656234883, feature_selection__k=21, classifier__kernel=rbf, classifier__C=1.90826331415, classifier__degree=4 [CV] classifier__gamma=0.224656234883, feature_selection__k=21, classifier__kernel=rbf, classifier__C=1.90826331415, classifier__degree=4, score=0.101329 - 0.4s [CV] classifier__gamma=0.224656234883, feature_selection__k=21, classifier__kernel=rbf, classifier__C=1.90826331415, classifier__degree=4 [CV] classifier__gamma=0.224656234883, feature_selection__k=21, classifier__kernel=rbf, classifier__C=1.90826331415, classifier__degree=4, score=0.101836 - 0.4s [CV] classifier__gamma=0.224656234883, feature_selection__k=21, classifier__kernel=rbf, classifier__C=1.90826331415, classifier__degree=4 [CV] classifier__gamma=0.224656234883, feature_selection__k=21, classifier__kernel=rbf, classifier__C=1.90826331415, classifier__degree=4, score=0.112416 - 0.4s [CV] classifier__gamma=1.09954547482, feature_selection__k=44, classifier__kernel=rbf, classifier__C=0.401147417793, classifier__degree=3 [CV] classifier__gamma=1.09954547482, feature_selection__k=44, classifier__kernel=rbf, classifier__C=0.401147417793, classifier__degree=3, score=0.101329 - 0.4s [CV] classifier__gamma=1.09954547482, feature_selection__k=44, classifier__kernel=rbf, classifier__C=0.401147417793, classifier__degree=3 [CV] classifier__gamma=1.09954547482, feature_selection__k=44, classifier__kernel=rbf, classifier__C=0.401147417793, classifier__degree=3, score=0.101836 - 0.3s [CV] classifier__gamma=1.09954547482, feature_selection__k=44, classifier__kernel=rbf, classifier__C=0.401147417793, classifier__degree=3 [CV] classifier__gamma=1.09954547482, feature_selection__k=44, 
classifier__kernel=rbf, classifier__C=0.401147417793, classifier__degree=3, score=0.288591 - 0.3s [CV] classifier__gamma=0.439926483976, feature_selection__k=48, classifier__kernel=poly, classifier__C=2.09571163157, classifier__degree=2 [CV] classifier__gamma=0.439926483976, feature_selection__k=48, classifier__kernel=poly, classifier__C=2.09571163157, classifier__degree=2, score=0.951827 - 0.1s [CV] classifier__gamma=0.439926483976, feature_selection__k=48, classifier__kernel=poly, classifier__C=2.09571163157, classifier__degree=2 [CV] classifier__gamma=0.439926483976, feature_selection__k=48, classifier__kernel=poly, classifier__C=2.09571163157, classifier__degree=2, score=0.963272 - 0.1s [CV] classifier__gamma=0.439926483976, feature_selection__k=48, classifier__kernel=poly, classifier__C=2.09571163157, classifier__degree=2 [CV] classifier__gamma=0.439926483976, feature_selection__k=48, classifier__kernel=poly, classifier__C=2.09571163157, classifier__degree=2, score=0.942953 - 0.1s
[Parallel(n_jobs=1)]: Done 13 jobs | elapsed: 3.0s [Parallel(n_jobs=1)]: Done 18 jobs | elapsed: 3.9s
[CV] classifier__gamma=0.047018954104, feature_selection__k=61, classifier__kernel=rbf, classifier__C=0.99233273121, classifier__degree=1 [CV] classifier__gamma=0.047018954104, feature_selection__k=61, classifier__kernel=rbf, classifier__C=0.99233273121, classifier__degree=1, score=0.101329 - 0.5s [CV] classifier__gamma=0.047018954104, feature_selection__k=61, classifier__kernel=rbf, classifier__C=0.99233273121, classifier__degree=1 [CV] classifier__gamma=0.047018954104, feature_selection__k=61, classifier__kernel=rbf, classifier__C=0.99233273121, classifier__degree=1, score=0.105175 - 0.5s [CV] classifier__gamma=0.047018954104, feature_selection__k=61, classifier__kernel=rbf, classifier__C=0.99233273121, classifier__degree=1 [CV] classifier__gamma=0.047018954104, feature_selection__k=61, classifier__kernel=rbf, classifier__C=0.99233273121, classifier__degree=1, score=0.110738 - 0.6s [CV] classifier__gamma=0.414464483331, feature_selection__k=13, classifier__kernel=rbf, classifier__C=0.0828898390697, classifier__degree=1 [CV] classifier__gamma=0.414464483331, feature_selection__k=13, classifier__kernel=rbf, classifier__C=0.0828898390697, classifier__degree=1, score=0.101329 - 0.3s [CV] classifier__gamma=0.414464483331, feature_selection__k=13, classifier__kernel=rbf, classifier__C=0.0828898390697, classifier__degree=1 [CV] classifier__gamma=0.414464483331, feature_selection__k=13, classifier__kernel=rbf, classifier__C=0.0828898390697, classifier__degree=1, score=0.101836 - 0.2s [CV] classifier__gamma=0.414464483331, feature_selection__k=13, classifier__kernel=rbf, classifier__C=0.0828898390697, classifier__degree=1 [CV] classifier__gamma=0.414464483331, feature_selection__k=13, classifier__kernel=rbf, classifier__C=0.0828898390697, classifier__degree=1, score=0.104027 - 0.3s [CV] classifier__gamma=0.956751973624, feature_selection__k=26, classifier__kernel=rbf, classifier__C=0.623129733918, classifier__degree=2 [CV] classifier__gamma=0.956751973624, 
feature_selection__k=26, classifier__kernel=rbf, classifier__C=0.623129733918, classifier__degree=2, score=0.101329 - 0.3s [CV] classifier__gamma=0.956751973624, feature_selection__k=26, classifier__kernel=rbf, classifier__C=0.623129733918, classifier__degree=2 [CV] classifier__gamma=0.956751973624, feature_selection__k=26, classifier__kernel=rbf, classifier__C=0.623129733918, classifier__degree=2, score=0.101836 - 0.3s [CV] classifier__gamma=0.956751973624, feature_selection__k=26, classifier__kernel=rbf, classifier__C=0.623129733918, classifier__degree=2 [CV] classifier__gamma=0.956751973624, feature_selection__k=26, classifier__kernel=rbf, classifier__C=0.623129733918, classifier__degree=2, score=0.197987 - 0.3s [CV] classifier__gamma=1.17760610429, feature_selection__k=54, classifier__kernel=poly, classifier__C=1.4499113676, classifier__degree=4 [CV] classifier__gamma=1.17760610429, feature_selection__k=54, classifier__kernel=poly, classifier__C=1.4499113676, classifier__degree=4, score=0.953488 - 0.1s [CV] classifier__gamma=1.17760610429, feature_selection__k=54, classifier__kernel=poly, classifier__C=1.4499113676, classifier__degree=4 [CV] classifier__gamma=1.17760610429, feature_selection__k=54, classifier__kernel=poly, classifier__C=1.4499113676, classifier__degree=4, score=0.978297 - 0.1s [CV] classifier__gamma=1.17760610429, feature_selection__k=54, classifier__kernel=poly, classifier__C=1.4499113676, classifier__degree=4 [CV] classifier__gamma=1.17760610429, feature_selection__k=54, classifier__kernel=poly, classifier__C=1.4499113676, classifier__degree=4, score=0.947987 - 0.1s
[Parallel(n_jobs=1)]: Done 25 jobs | elapsed: 6.6s [Parallel(n_jobs=1)]: Done 30 out of 30 | elapsed: 7.5s finished
{'classifier__gamma': 1.1776061042881776, 'feature_selection__k': 54, 'classifier__kernel': 'poly', 'classifier__C': 1.4499113675965216, 'classifier__degree': 4}
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
# Train incrementally: load the ten saved chunks one at a time and update the
# model with each. The full label set (0..9) is passed on every call so the
# model knows all classes even if a chunk does not contain each of them.
for i in range(10):
    y_train_partial = np.load("y_train_%d.npy" % i)
    X_train_partial = np.load("X_train_%d.npy" % i)
    nb.partial_fit(X_train_partial, y_train_partial, classes=np.arange(10))
# Evaluate the incrementally trained model on the saved test set.
X_test, y_test = np.load("X_test.npy"), np.load("y_test.npy")
nb.score(X_test, y_test)
0.8357
Aka: Trees in Random Forests share
# Excerpt: dispatch _parallel_apply(tree, X) for every tree in self.estimators_
# through Parallel using the "threading" backend.
# NOTE(review): `self`, `Parallel`, `delayed` and `_parallel_apply` are not
# defined in the visible source; run standalone this raises NameError, as the
# recorded traceback shows.
results = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, backend="threading")(
delayed(_parallel_apply)(tree, X) for tree in self.estimators_)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-12-09c0604b37b8> in <module>() ----> 1 results = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, backend="threading")( 2 delayed(_parallel_apply)(tree, X) for tree in self.estimators_) NameError: name 'Parallel' is not defined
(maybe)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
# before: every pipeline step and every union member is given an explicit name.
from sklearn.pipeline import Pipeline, FeatureUnion
pipe = Pipeline([("features", FeatureUnion([("word_ngrams", CountVectorizer()), ("char_ngrams", CountVectorizer(analyzer="char"))])),
("svm", LinearSVC())])
print(pipe)
# now: make_pipeline / make_union build the same structure without explicit
# names (the recorded repr shows generated names such as 'featureunion' and
# 'countvectorizer-1').
from sklearn.pipeline import make_pipeline, make_union
pipe = make_pipeline(make_union(CountVectorizer(), CountVectorizer(analyzer="char")),
LinearSVC())
print(pipe)
Pipeline(steps=[('features', FeatureUnion(n_jobs=1, transformer_list=[('word_ngrams', CountVectorizer(analyzer=u'word', binary=False, charset=None, charset_error=None, decode_error=u'strict', dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content', lowercase=True, max_d...ling=1, loss='l2', multi_class='ovr', penalty='l2', random_state=None, tol=0.0001, verbose=0))]) Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1, transformer_list=[('countvectorizer-1', CountVectorizer(analyzer=u'word', binary=False, charset=None, charset_error=None, decode_error=u'strict', dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content', lowercase=T...ling=1, loss='l2', multi_class='ovr', penalty='l2', random_state=None, tol=0.0001, verbose=0))])
Thanks to
@ogrisel | @GaelVaroquaux |
@larsmans | @glouppe |
@pprett | @vene |
@mblondel | @agramfort |
@arjoly | @jaquesgrobler |
@jakevdp | @robertlayton |
@NelleV | @ndawe |
@jnothmann |
and many more....
t3kcit@gmail.com | |
@t3kcit | |
@amueller | |
peekaboo-vision.blogspot.com |