import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
plt.rcParams["savefig.dpi"] = 300
plt.rcParams["savefig.bbox"] = "tight"
np.set_printoptions(precision=3, suppress=True)
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale, StandardScaler
from sklearn.datasets import make_blobs, make_circles, make_moons
# Three 2-D toy datasets; each one is standardised with `scale` after it is
# generated.
rng = np.random.RandomState(6)

# Blobs, linearly transformed by a random 2x2 matrix drawn from `rng`.
X_blobs, y_blobs = make_blobs(n_samples=200, random_state=1)
blob_transform = rng.normal(0, 4, size=(2, 2))
X_blobs = scale(X_blobs @ blob_transform)

# Noisy circles.
X_circles, y_circles = make_circles(
    n_samples=400, noise=.09, factor=.4, random_state=0)
X_circles = scale(X_circles)

# Noisy moons.
X_moons, y_moons = make_moons(n_samples=200, noise=.09, random_state=0)
X_moons = scale(X_moons)
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph
from sklearn.metrics import adjusted_rand_score, silhouette_score, normalized_mutual_info_score, adjusted_mutual_info_score
# One panel per toy dataset, points coloured by their ground-truth label.
toy_datasets = [(X_blobs, y_blobs), (X_circles, y_circles), (X_moons, y_moons)]
fig, axes = plt.subplots(
    1, 3, figsize=(10, 3), subplot_kw={'xticks': (), 'yticks': ()})
# NOTE: the loop rebinds the module-level names X and y; once it finishes they
# refer to the moons data, which a later cell reads.
for ax, (X, y) in zip(axes, toy_datasets):
    ax.scatter(X[:, 0], X[:, 1], c=y, s=10)
../_images/17-cluster-evaluation_4_0.png
# DBSCAN on the circles data for several values of eps; every panel title
# reports the agreement with the true labels (ARI, NMI, AMI).
fig, axes = plt.subplots(
    2, 2, figsize=(10, 10), subplot_kw={'xticks': (), 'yticks': ()})
title_template = "eps: {:.2f} ARI: {:.2f} NMI: {:.2f} AMI: {:.2f}"
for eps, ax in zip([.15, .2, .3, .4], axes.ravel()):
    dbs = DBSCAN(eps=eps)
    dbs.fit(X_circles)
    ari = adjusted_rand_score(y_circles, dbs.labels_)
    nmi = normalized_mutual_info_score(y_circles, dbs.labels_)
    ami = adjusted_mutual_info_score(y_circles, dbs.labels_)
    ax.scatter(X_circles[:, 0], X_circles[:, 1], c=dbs.labels_, s=20)
    ax.set_title(title_template.format(eps, ari, nmi, ami))
../_images/17-cluster-evaluation_5_0.png
# K-means on the circles data for several values of n_clusters; every panel
# title reports the agreement with the true labels (ARI, NMI, AMI).
# Six settings are tried, so a 2x3 grid is used and no panel stays empty.
fig, axes = plt.subplots(2, 3, subplot_kw={'xticks': (), 'yticks':()}, figsize=(15, 10))
for ax, n_clusters in zip(axes.ravel(), [5, 6, 8, 10, 11, 12]):
    km = KMeans(n_clusters=n_clusters).fit(X_circles)
    ari = adjusted_rand_score(y_circles, km.labels_)
    nmi = normalized_mutual_info_score(y_circles, km.labels_)
    ami = adjusted_mutual_info_score(y_circles, km.labels_)
    # The varied parameter is the number of clusters, not DBSCAN's eps.
    ax.set_title("n_clusters: {} ARI: {:.2f} NMI: {:.2f} AMI: {:.2f}".format(n_clusters, ari, nmi, ami))
    ax.scatter(X_circles[:, 0], X_circles[:, 1], c=km.labels_, s=20)
../_images/17-cluster-evaluation_6_0.png
from sklearn.datasets import load_digits
# Score k-means against the true digit labels for n_clusters = 1..29.
digits = load_digits()
aris, nmis, amis = [], [], []
scorers = [
    (adjusted_rand_score, aris),
    (normalized_mutual_info_score, nmis),
    (adjusted_mutual_info_score, amis),
]
for i in range(1, 30):
    km = KMeans(n_clusters=i).fit(digits.data)
    for scorer, curve in scorers:
        curve.append(scorer(digits.target, km.labels_))

# Only y-values are passed, so the x-axis is the list index, not n_clusters.
for curve, label in [(aris, "ARI"), (amis, "AMI"), (nmis, "NMI")]:
    plt.plot(curve, label=label)
plt.legend()
<matplotlib.legend.Legend at 0x7f712f805390>
../_images/17-cluster-evaluation_7_1.png
# Same three curves, this time plotted against the n_clusters values used.
n_clusters_tried = range(1, 30)
for curve, label in [(aris, "ARI"), (amis, "AMI"), (nmis, "NMI")]:
    plt.plot(n_clusters_tried, curve, label=label)
plt.legend()
plt.xlabel("n_clusters")
Text(0.5,0,'n_clusters')
../_images/17-cluster-evaluation_8_1.png
# Compare four clusterings of one dataset: each panel shows the cluster
# assignment and its ARI (supervised) and silhouette (unsupervised) scores.
# The data is bound explicitly here; previously this cell silently used
# whatever X and y an earlier plotting loop had left behind (the moons data).
X, y = X_moons, y_moons
fig, axes = plt.subplots(2, 2, subplot_kw={'xticks': (), 'yticks':()}, figsize=(7, 7))
# The connectivity callable builds a 10-nearest-neighbour graph of the data
# that is being fitted.
agg = AgglomerativeClustering(n_clusters=3, connectivity=lambda x: kneighbors_graph(x, 10))

for ax, alg in zip(axes.ravel(), [KMeans(n_clusters=5), KMeans(n_clusters=3), agg, DBSCAN(eps=.3)]):
    alg.fit(X)
    ax.scatter(X[:, 0], X[:, 1], c=alg.labels_, s=10)
    #ax.set_aspect("equal")
    ari = adjusted_rand_score(y, alg.labels_)
    silhouette = silhouette_score(X, alg.labels_)
    ax.set_title("ARI: {:.2f} Silhouette: {:.2f}".format(ari, silhouette))
../_images/17-cluster-evaluation_9_0.png
# Grid of results: one row per clustering algorithm, one column per toy
# dataset. Each title shows the ARI and silhouette score of that fit.
fig, axes = plt.subplots(
    3, 3, figsize=(10, 10), subplot_kw={'xticks': (), 'yticks': ()})
agg = AgglomerativeClustering(
    n_clusters=3, connectivity=lambda x: kneighbors_graph(x, 10))
algorithms = [KMeans(n_clusters=3), DBSCAN(eps=.3), agg]
toy_datasets = [(X_blobs, y_blobs), (X_circles, y_circles), (X_moons, y_moons)]
for axes2, alg in zip(axes, algorithms):
    # The same estimator object is refitted on each dataset in turn.
    for ax, (X, y) in zip(axes2, toy_datasets):
        alg.fit(X)
        ari = adjusted_rand_score(y, alg.labels_)
        silhouette = silhouette_score(X, alg.labels_)
        ax.scatter(X[:, 0], X[:, 1], c=alg.labels_, s=10)
        ax.set_title("ARI: {:.2f} Silhouette: {:.2f}".format(ari, silhouette))
../_images/17-cluster-evaluation_10_0.png
from sklearn.metrics import silhouette_samples

def silhouette_plot(X, cluster_labels, ax=None):
    """Draw a silhouette plot: one band of sorted per-sample scores per cluster.

    Parameters
    ----------
    X : array
        Data handed to ``silhouette_samples``.
    cluster_labels : numpy array of int
        Cluster label of every sample. Samples labelled ``-1`` are used when
        the silhouette values are computed but are left out of the drawing.
    ax : matplotlib axes, optional
        Axes to draw on; ``plt.gca()`` is used when omitted.
    """
    silhouette_scores = silhouette_samples(X, cluster_labels)
    if ax is None:
        ax = plt.gca()

    # Drop the samples labelled -1 from the picture.
    inliers = cluster_labels != -1
    cluster_labels = cluster_labels[inliers]
    silhouette_scores = silhouette_scores[inliers]
    labels = np.unique(cluster_labels)

    # "Vega10"/"Vega20" are no longer available in current matplotlib;
    # "tab10"/"tab20" are the names those palettes were renamed to.
    cm = plt.cm.tab10 if len(labels) <= 10 else plt.cm.tab20

    y_lower = 10
    for i in labels:
        # Sorted silhouette values of the samples assigned to cluster i
        # (boolean indexing copies, so the full score array is not reordered).
        ith_cluster_silhouette_values = np.sort(
            silhouette_scores[cluster_labels == i])

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm(i)
        ax.fill_betweenx(np.arange(y_lower, y_upper),
                         0, ith_cluster_silhouette_values,
                         facecolor=color, edgecolor=color, alpha=0.7)

        # Label the band with its cluster number at mid-height
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Leave a gap of 10 rows before the next band
        y_lower = y_upper + 10


# Data for the following cells. This statement used to sit (indented) inside
# silhouette_plot, where it only rebound local names on every call and left the
# module-level X and y unchanged.
X, y = make_blobs(n_samples=1000, centers=4, random_state=14)
plt.scatter(X[:, 0], X[:, 1], s=10)
<matplotlib.collections.PathCollection at 0x7f712d0e3208>
../_images/17-cluster-evaluation_12_1.png
# For several values of n_clusters: k-means assignment (top row, titled with
# the mean silhouette score) and the matching silhouette plot (bottom row).
fig, axes = plt.subplots(2, 5, subplot_kw={'xticks': (), 'yticks':()}, figsize=(10, 5))
for ax, n_clusters in zip(axes.T, [2, 3, 4, 5, 8]):
    km = KMeans(n_clusters=n_clusters)
    km.fit(X)
    # "tab10" is the current name of the palette formerly exposed as "Vega10",
    # which is no longer available in current matplotlib.
    ax[0].scatter(X[:, 0], X[:, 1], c=plt.cm.tab10(km.labels_), s=10)
    silhouette_plot(X, km.labels_, ax=ax[1])
    ax[0].set_title("score: {:.2f}".format(silhouette_score(X, km.labels_)))
../_images/17-cluster-evaluation_13_0.png
# Ten blobs whose individual spreads are drawn from `rng` before the same
# generator is handed to make_blobs.
rng = np.random.RandomState(1)
cluster_std = [rng.gamma(2) for i in range(10)]
X, y = make_blobs(
    n_samples=500, centers=10, cluster_std=cluster_std, random_state=rng)

plt.scatter(X[:, 0], X[:, 1], s=5, alpha=.6)
current_axes = plt.gca()
current_axes.set_aspect("equal")
xlim = plt.xlim()
../_images/17-cluster-evaluation_14_0.png
# K-means for n_clusters = 3..12: cluster assignment (titled with the mean
# silhouette score) with the silhouette plot directly underneath.
fig, axes = plt.subplots(4, 5, subplot_kw={'xticks': (), 'yticks':()}, figsize=(10, 10))
# Regroup the 4x5 grid into ten (upper panel, lower panel) pairs: rows 0/1
# form the first five pairs, rows 2/3 the last five.
axes = np.hstack([axes[:2], axes[2:]]).T

for ax, n_clusters in zip(axes, range(3, 13)):
    km = KMeans(n_clusters=n_clusters)
    km.fit(X)
    # A 10-colour palette maps every label >= 10 to its last colour, so use the
    # 20-colour palette once more than ten clusters are requested ("tab10" /
    # "tab20" are the current names of the former "Vega" palettes).
    cmap = plt.cm.tab10 if n_clusters <= 10 else plt.cm.tab20
    ax[0].scatter(X[:, 0], X[:, 1], c=cmap(km.labels_), s=10)
    silhouette_plot(X, km.labels_, ax=ax[1])
    ax[0].set_title("score: {:.2f}".format(silhouette_score(X, km.labels_)))
../_images/17-cluster-evaluation_15_0.png
# DBSCAN for ten eps values between 0.1 and 2: cluster assignment (noise in
# grey, titled with the mean silhouette score) with the silhouette plot
# directly underneath.
fig, axes = plt.subplots(4, 5, subplot_kw={'xticks': (), 'yticks':()}, figsize=(10, 10))
# Regroup the 4x5 grid into ten (upper panel, lower panel) pairs.
axes = np.hstack([axes[:2], axes[2:]]).T

for ax, eps in zip(axes, np.linspace(0.1, 2, 10)):
    db = DBSCAN(eps=eps)
    db.fit(X)
    n_found = len(np.unique(db.labels_[db.labels_ != -1]))
    # "tab10"/"tab20" are the current names of the former "Vega" palettes.
    # Wrapping the labels keeps clusters beyond the palette size from all
    # collapsing onto the last colour.
    cmap = plt.cm.tab10 if n_found <= 10 else plt.cm.tab20
    colors = cmap(db.labels_ % cmap.N)
    colors[db.labels_ == -1] = [.5, .5, .5, 1]
    ax[0].scatter(X[:, 0], X[:, 1], c=colors, s=10)
    if len(np.unique(db.labels_)) < 2:
        # The silhouette is undefined for a single label; skip instead of
        # letting silhouette_score raise.
        ax[0].set_title("score: n/a")
        continue
    silhouette_plot(X, db.labels_, ax=ax[1])
    ax[0].set_title("score: {:.2f}".format(silhouette_score(X, db.labels_)))
../_images/17-cluster-evaluation_16_0.png
# Silhouette plots of k-means on the digits data for a few cluster counts.
fig, axes = plt.subplots(
    1, 5, figsize=(10, 5), subplot_kw={'xticks': (), 'yticks': ()})
for n_clusters, ax in zip([2, 5, 10, 15, 20], axes):
    km = KMeans(n_clusters=n_clusters)
    km.fit(digits.data)
    silhouette_plot(digits.data, km.labels_, ax=ax)
    mean_silhouette = silhouette_score(digits.data, km.labels_)
    ax.set_title("n_clusters={}\n score: {:.2f}".format(n_clusters, mean_silhouette))
../_images/17-cluster-evaluation_17_0.png

Cluster Stability

from sklearn.base import clone
from sklearn.utils import check_random_state

def cluster_stability(X, est, n_iter=20, random_state=None):
    """Estimate how stable a clustering is under bootstrap resampling.

    The estimator is cloned and fitted on ``n_iter`` bootstrap samples of
    ``X``. Every pair of fits is then compared with the adjusted Rand index,
    restricted to the samples drawn in both bootstrap rounds, and the mean
    over all pairs is returned.

    Parameters
    ----------
    X : array, indexable by an integer index array along the first axis
        Data to cluster.
    est : estimator
        Clustering estimator that exposes ``labels_`` after ``fit``. It is
        cloned for every round; the object passed in is never fitted.
    n_iter : int, default 20
        Number of bootstrap rounds.
    random_state : None, int or RandomState, default None
        Source of randomness for the bootstrap draws and for the seeds given
        to the cloned estimators. Previously this argument was ignored and a
        module-level ``rng`` was used instead.

    Returns
    -------
    float
        Mean adjusted Rand index over all ordered pairs of rounds, including
        each round paired with itself.
    """
    rng = check_random_state(random_state)
    n_samples = X.shape[0]
    labels = []
    indices = []
    for _ in range(n_iter):
        # draw bootstrap samples, store indices
        sample_indices = rng.randint(0, n_samples, n_samples)
        indices.append(sample_indices)
        est = clone(est)
        if hasattr(est, "random_state"):
            # randomize estimator if possible
            est.random_state = rng.randint(100000)
        est.fit(X[sample_indices])
        # store clustering outcome using original indices; rows that were not
        # drawn in this round keep the placeholder -1
        # (the removed alias np.int was the builtin int)
        relabel = -np.ones(n_samples, dtype=int)
        relabel[sample_indices] = est.labels_
        labels.append(relabel)
    scores = []
    for l, i in zip(labels, indices):
        for k, j in zip(labels, indices):
            # we also compute the diagonal which is a bit silly
            in_both = np.intersect1d(i, j)
            scores.append(adjusted_rand_score(l[in_both], k[in_both]))
    return np.mean(scores)
# Four blobs: k-means assignment for several values of k, each panel titled
# with the bootstrap stability of a k-means with the same k.
X, y = make_blobs(n_samples=200, centers=4, random_state=14)

fig, axes = plt.subplots(2, 3, subplot_kw={'xticks': (), 'yticks':()}, figsize=(10, 7))
for ax, n_clusters in zip(axes.ravel(), [2, 3, 4, 5, 8, 10]):
    km = KMeans(n_clusters=n_clusters)
    km.fit(X)
    # "tab10" is the current name of the palette formerly exposed as "Vega10",
    # which is no longer available in current matplotlib.
    ax.scatter(X[:, 0], X[:, 1], c=plt.cm.tab10(km.labels_), s=10)
    ax.set_title("KM(k={}) stability: {:.2f}".format(n_clusters, cluster_stability(X, KMeans(n_clusters=n_clusters, n_init=10), n_iter=20)))
../_images/17-cluster-evaluation_20_0.png
# Ten blobs with randomly drawn spreads: k-means assignment for several
# values of k, each panel titled with the bootstrap stability for that k.
rng = np.random.RandomState(1)
X, y = make_blobs(n_samples=500, centers=10, random_state=rng, cluster_std=[rng.gamma(2) for i in range(10)])

fig, axes = plt.subplots(2, 3, subplot_kw={'xticks': (), 'yticks':()}, figsize=(10, 7))
for ax, n_clusters in zip(axes.ravel(), [2, 4, 6, 8, 10, 12]):
    km = KMeans(n_clusters=n_clusters)
    km.fit(X)
    # A 10-colour palette maps every label >= 10 to its last colour, so switch
    # to the 20-colour palette for k > 10 ("tab10"/"tab20" are the current
    # names of the former "Vega" palettes).
    cmap = plt.cm.tab10 if n_clusters <= 10 else plt.cm.tab20
    ax.scatter(X[:, 0], X[:, 1], c=cmap(km.labels_), s=10)
    ax.set_title("KM(k={}) stability: {:.2f}".format(n_clusters, cluster_stability(X, KMeans(n_clusters=n_clusters, n_init=10))))
../_images/17-cluster-evaluation_21_0.png
# Scan n_clusters on the digits data and record three scores per setting:
# bootstrap stability, mean silhouette, and ARI against the true digits.
stability, silhouette, ari = [], [], []
cluster_range = range(2, 30, 2)
for n_clusters in cluster_range:
    km = KMeans(n_clusters=n_clusters)
    # Stability is measured first (on clones); km itself is fitted afterwards.
    stability.append(cluster_stability(digits.data, km))
    km.fit(digits.data)
    silhouette.append(silhouette_score(digits.data, km.labels_))
    ari.append(adjusted_rand_score(digits.target, km.labels_))
# Overlay the three score curves, each on its own (tick-less) y-axis, so only
# their shapes are compared.
stability_ax = plt.gca()
st, = stability_ax.plot(cluster_range, stability, label="stability", c='k')
stability_ax.set_yticks(())

silhouette_ax = plt.twinx()
si, = silhouette_ax.plot(cluster_range, silhouette, label="silhouette")
silhouette_ax.set_yticks(())

ari_ax = plt.twinx()
ar, = ari_ax.plot(cluster_range, ari, label="ari", c='r')
ari_ax.set_yticks(())

# Legend, x-label and title go on the axes created last.
ari_ax.legend([st, si, ar], ["Stability", "Silhouette", "ARI"])
ari_ax.set_xlabel("n_clusters")
ari_ax.set_title("Scanning n_clusters with different scores")
Text(0.5,1,'Scanning n_clusters with different scores')
../_images/17-cluster-evaluation_23_1.png
# Same three curves with visible tick values: stability on the left y-axis,
# silhouette and ARI together on a second y-axis on the right.
st, = plt.plot(cluster_range, stability, label="stability", c='k')
plt.ylabel("stability")
#plt.yticks(())

# twinx() gives silhouette/ARI their own y-axis; without it the second
# plt.ylabel call replaced the "stability" label of the only axis.
si, = plt.twinx().plot(cluster_range, silhouette, label="silhouette")
plt.ylabel("ARI, Silhouette")
#plt.yticks(())

ar, = plt.plot(cluster_range, ari, label="ari", c='r')
#plt.yticks(())
plt.legend([st, si, ar], ["Stability", "Silhouette", "ARI"])
plt.xlabel("n_clusters")
plt.title("Scanning n_clusters with different scores")
Text(0.5,1,'Scanning n_clusters with different scores')
../_images/17-cluster-evaluation_24_1.png
# stability for different clustering algorithms on digits
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA

# Bootstrap stability of k-means and agglomerative clustering on the
# PCA-transformed digits (pixel values divided by 16 first).
km_stability, agg_stability = [], []

dbscan_stability = []
X = PCA(n_components=.9).fit_transform(digits.data / 16.)

cluster_range = range(2, 26, 2)
for n_clusters in cluster_range:
    # progress output
    print(n_clusters)
    km = KMeans(n_clusters=n_clusters, n_init=10, init="random")
    km_stability.append(cluster_stability(X, km))
    agg_est = AgglomerativeClustering(n_clusters=n_clusters)
    agg_stability.append(cluster_stability(X, agg_est))
2
4
6
8
10
12
14
16
18
20
22
24
# Bootstrap stability of DBSCAN for 20 eps values, together with the number
# of clusters DBSCAN finds on the full data for the same eps.
db_stability = []
n_clusters_db = []
for eps in np.linspace(.2, 2, 20):
    # progress output
    print(eps)
    db_stability.append(cluster_stability(X, DBSCAN(eps=eps)))
    found_labels = np.unique(DBSCAN(eps=eps).fit(X).labels_)
    # DBSCAN marks noise with the label -1; it is not a cluster and must not
    # be counted as one.
    n_clusters_db.append(int(np.sum(found_labels != -1)))
0.2
0.294736842105
0.389473684211
0.484210526316
0.578947368421
0.673684210526
0.768421052632
0.863157894737
0.957894736842
1.05263157895
1.14736842105
1.24210526316
1.33684210526
1.43157894737
1.52631578947
1.62105263158
1.71578947368
1.81052631579
1.90526315789
2.0
# Stability versus number of clusters for the three algorithms; every DBSCAN
# point is annotated with the eps that produced it.
cluster_range = range(2, 26, 2)

plt.plot(cluster_range, km_stability, label="k-means")
plt.plot(cluster_range, agg_stability, label="agglomerative")
plt.plot(n_clusters_db, db_stability, label="DBSCAN")
# The loop variables have their own names so that the module-level
# `stability` list and `n_clusters` are not overwritten by the annotation loop.
for eps, n_found, score in zip(np.linspace(.2, 2, 20), n_clusters_db, db_stability):
    plt.text(n_found, score, "{:.2f}".format(eps))
plt.legend()
plt.xlabel("n_clusters")
plt.ylabel("stability")
Text(0,0.5,'stability')
../_images/17-cluster-evaluation_27_1.png
from sklearn.manifold import TSNE
# Cluster the digits (X holds their PCA components) with three algorithms and
# show each result twice: on the first two PCA components (top row) and on a
# t-SNE embedding (bottom row).
X_tsne = TSNE().fit_transform(digits.data / 16.)


fig, axes = plt.subplots(2, 3, subplot_kw={'xticks': (), 'yticks': ()}, figsize=(10, 5))
for ax, est in zip(axes.T, [DBSCAN(eps=1.2), AgglomerativeClustering(n_clusters=10), KMeans(n_clusters=10)]):
    est.fit(X)
    # X is PCA components
    # "tab10" is the current name of the former "Vega10" palette. Labels are
    # wrapped so clusters beyond the tenth do not all share the last colour,
    # and DBSCAN noise (-1) is drawn grey instead of in cluster 0's colour.
    colors = plt.cm.tab10(est.labels_ % 10)
    colors[est.labels_ == -1] = [.5, .5, .5, 1]
    ax[0].set_title(est.__class__.__name__)
    ax[0].scatter(X[:, 0], X[:, 1], c=colors, s=5, alpha=.7)
    ax[1].scatter(X_tsne[:, 0], X_tsne[:, 1], c=colors, s=5, alpha=.7)

axes[0, 0].set_ylabel("PCA")
axes[1, 0].set_ylabel("TSNE")
Text(0,0.5,'TSNE')
../_images/17-cluster-evaluation_29_1.png
# DBSCAN with a smaller eps, shown on the t-SNE embedding; the last line
# reports the size of the noise group followed by the size of every cluster.
dbscan = DBSCAN(eps=1.0, min_samples=5).fit(X)
# "tab20" is the current name of the former "Vega20" palette. More than 20
# clusters can be found, so labels are wrapped instead of all collapsing onto
# the last colour, and noise (-1) is drawn grey.
colors = plt.cm.tab20(dbscan.labels_ % 20)
colors[dbscan.labels_ == -1] = [.5, .5, .5, 1]
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=colors, s=5, alpha=.7)
np.bincount(dbscan.labels_ + 1)
array([552, 163, 114, 161,  27, 144,  42, 121,  86, 122,  11,  76,   4,
        56,   7,   7,   7,  10,  24,   7,   4,   8,  13,  12,   3,  11,   5])
../_images/17-cluster-evaluation_30_1.png
# Show the ten k-means cluster centres of the digits data as 8x8 images.
km = KMeans(n_clusters=10)
km.fit(digits.data)
fig, axes = plt.subplots(
    2, 5, figsize=(10, 5), subplot_kw={'xticks': (), 'yticks': ()})
for center, ax in zip(km.cluster_centers_, axes.ravel()):
    center_image = center.reshape(8, 8)
    ax.imshow(center_image, cmap='gray_r')
fig.suptitle("K-Means cluster centers")
Text(0.5,0.98,'K-Means cluster centers')
../_images/17-cluster-evaluation_31_1.png
# Agglomerative clustering has no cluster centres to display, so show up to
# five member images per cluster instead: one column per cluster.
agg = AgglomerativeClustering(n_clusters=10)
agg.fit(digits.data)
fig, axes = plt.subplots(
    5, 10, figsize=(10, 5), subplot_kw={'xticks': (), 'yticks': ()})
for center, ax in zip(range(10), axes.T):
    cluster_mask = agg.labels_ == center
    members = digits.data[cluster_mask]
    for a, im in zip(ax, members):
        a.imshow(im.reshape(8, 8), cmap='gray_r')
../_images/17-cluster-evaluation_33_0.png
# PCA is not really necessary but we know a good eps value
# when using PCA.
X = PCA(n_components=.9).fit_transform(digits.data / 16.)

dbscan = DBSCAN(eps=1.2)
dbscan.fit(X)
clusters = np.unique(dbscan.labels_)
n_clusters = len(clusters)
fig, axes = plt.subplots(
    5, n_clusters, figsize=(10, 5), subplot_kw={'xticks': (), 'yticks': ()})
# One column per distinct label, titled with the number of members and
# showing up to five of them.
for center, ax in zip(clusters, axes.T):
    cluster_mask = dbscan.labels_ == center
    ax[0].set_title(np.sum(cluster_mask))
    members = digits.data[cluster_mask]
    for a, im in zip(ax, members):
        a.imshow(im.reshape(8, 8), cmap='gray_r')
../_images/17-cluster-evaluation_35_0.png

adult


import os
# adult.data carries no header row, hence header=None together with an
# explicit list of column names.
adult_columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
adult_path = os.path.join("adult.data")
data = pd.read_csv(
    adult_path, names=adult_columns, header=None, index_col=False)
# The fnlwgt column is not used below.
data = data.drop("fnlwgt", axis=1)
data.head()
age workclass education education-num marital-status occupation relationship race gender capital-gain capital-loss hours-per-week native-country income
0 39 State-gov Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
# One-hot encode the non-integer columns; the income column is left out.
# NOTE(review): only the non-int64 columns are handed to get_dummies, so the
# integer-typed columns do not appear in data_dummies -- confirm that is
# intended.
data_ = data.drop("income", axis=1)
is_non_integer = data_.dtypes != "int64"
categorical = data_.columns[is_non_integer]
data_dummies = pd.get_dummies(data_[categorical], columns=categorical)
data_dummies.shape
(32561, 102)
# Standardise every dummy column.
# Original note: "subsample the data. Still takes a while. Maybe
# MiniBatchKMeans would be more appropriate".
# NOTE(review): nothing is actually subsampled here -- all rows are kept.
X = scale(data_dummies.values)
/home/andy/checkout/scikit-learn/sklearn/utils/validation.py:558: DataConversionWarning: Data with input dtype uint8 was converted to float64 by the scale function.
  warnings.warn(msg, DataConversionWarning)
# 2-D PCA projection (for plotting) and a 3-cluster k-means of the full
# feature matrix. The PCA fit is kept ahead of the k-means fit, as before.
pca_2d = PCA(n_components=2)
adult_pca = pca_2d.fit_transform(X)
km = KMeans(n_clusters=3)
km.fit(X)
km.labels_
array([1, 2, 1, ..., 1, 1, 1], dtype=int32)
# k-means assignment shown on the two PCA components.
plt.scatter(*adult_pca[:, :2].T, c=km.labels_, alpha=.7)
<matplotlib.collections.PathCollection at 0x7f71253fc0b8>
../_images/17-cluster-evaluation_44_1.png
# Scan n_clusters on the adult data and record bootstrap stability, mean
# silhouette and ARI against the income column.
stability = []
silhouette = []
ari = []
cluster_range = range(2, 15, 2)
for n_clusters in cluster_range:
    # progress output
    print(n_clusters)
    km = KMeans(n_clusters=n_clusters)
    stability.append(cluster_stability(X, km, n_iter=10))
    km.fit(X)
    # The exact silhouette needs the pairwise distances between all rows,
    # which did not finish on this dataset; estimate it on a fixed-seed random
    # subset instead.
    silhouette.append(silhouette_score(X, km.labels_, sample_size=2000, random_state=0))
    ari.append(adjusted_rand_score(data.income, km.labels_))
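FIXME: crashes? The recorded run below did not finish: it was interrupted (KeyboardInterrupt) at n_clusters=4 while silhouette_score was computing the pairwise distances.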
2
4
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-46-ae3590749ddf> in <module>()
      8     stability.append(cluster_stability(X, km, n_iter=10))
      9     km.fit(X)
---> 10     silhouette.append(silhouette_score(X, km.labels_))
     11     ari.append(adjusted_rand_score(data.income, km.labels_))

~/checkout/scikit-learn/sklearn/metrics/cluster/unsupervised.py in silhouette_score(X, labels, metric, sample_size, random_state, **kwds)
     99         else:
    100             X, labels = X[indices], labels[indices]
--> 101     return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))
    102 
    103 

~/checkout/scikit-learn/sklearn/metrics/cluster/unsupervised.py in silhouette_samples(X, labels, metric, **kwds)
    167     check_number_of_labels(len(le.classes_), X.shape[0])
    168 
--> 169     distances = pairwise_distances(X, metric=metric, **kwds)
    170     unique_labels = le.classes_
    171     n_samples_per_label = np.bincount(labels, minlength=len(unique_labels))

~/checkout/scikit-learn/sklearn/metrics/pairwise.py in pairwise_distances(X, Y, metric, n_jobs, **kwds)
   1249         func = partial(distance.cdist, metric=metric, **kwds)
   1250 
-> 1251     return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
   1252 
   1253 

~/checkout/scikit-learn/sklearn/metrics/pairwise.py in _parallel_pairwise(X, Y, func, n_jobs, **kwds)
   1088     if n_jobs == 1:
   1089         # Special case to avoid picklability checks in delayed
-> 1090         return func(X, Y, **kwds)
   1091 
   1092     # TODO: in some cases, backend='threading' may be appropriate

~/checkout/scikit-learn/sklearn/metrics/pairwise.py in euclidean_distances(X, Y, Y_norm_squared, squared, X_norm_squared)
    244         YY = row_norms(Y, squared=True)[np.newaxis, :]
    245 
--> 246     distances = safe_sparse_dot(X, Y.T, dense_output=True)
    247     distances *= -2
    248     distances += XX

~/checkout/scikit-learn/sklearn/utils/extmath.py in safe_sparse_dot(a, b, dense_output)
    138         return ret
    139     else:
--> 140         return np.dot(a, b)
    141 
    142 

KeyboardInterrupt: 
# Overlay the three score curves, each on its own (tick-less) y-axis, so only
# their shapes are compared.
stability_ax = plt.gca()
st, = stability_ax.plot(cluster_range, stability, label="stability", c='k')
stability_ax.set_yticks(())

silhouette_ax = plt.twinx()
si, = silhouette_ax.plot(cluster_range, silhouette, label="silhouette")
silhouette_ax.set_yticks(())

ari_ax = plt.twinx()
ar, = ari_ax.plot(cluster_range, ari, label="ari", c='r')
ari_ax.set_yticks(())

# Legend, x-label and title go on the axes created last.
ari_ax.legend([st, si, ar], ["Stability", "Silhouette", "ARI"])
ari_ax.set_xlabel("n_clusters")
ari_ax.set_title("Scanning n_clusters with different scores")
<matplotlib.text.Text at 0x7f848fcc4908>
../_images/17-cluster-evaluation_46_1.png

Breast Cancer

from sklearn.datasets import load_breast_cancer
# Scan n_clusters on the breast cancer data and record bootstrap stability,
# mean silhouette and ARI against the target.
data = load_breast_cancer()
X = data.data
stability, silhouette, ari = [], [], []
cluster_range = range(2, 30, 2)
for n_clusters in cluster_range:
    km = KMeans(init="random", n_init=10, n_clusters=n_clusters)
    # Stability is measured first (on clones); km itself is fitted afterwards.
    stability.append(cluster_stability(X, km, n_iter=10))
    km.fit(X)
    silhouette.append(silhouette_score(X, km.labels_))
    ari.append(adjusted_rand_score(data.target, km.labels_))
/home/andy/checkout/scikit-learn/sklearn/cluster/k_means_.py:909: ConvergenceWarning: Number of distinct clusters (19) found smaller than n_clusters (20). Possibly due to duplicate points in X.
  return_n_iter=True)
/home/andy/checkout/scikit-learn/sklearn/cluster/k_means_.py:909: ConvergenceWarning: Number of distinct clusters (21) found smaller than n_clusters (22). Possibly due to duplicate points in X.
  return_n_iter=True)
/home/andy/checkout/scikit-learn/sklearn/cluster/k_means_.py:909: ConvergenceWarning: Number of distinct clusters (21) found smaller than n_clusters (22). Possibly due to duplicate points in X.
  return_n_iter=True)
/home/andy/checkout/scikit-learn/sklearn/cluster/k_means_.py:909: ConvergenceWarning: Number of distinct clusters (25) found smaller than n_clusters (26). Possibly due to duplicate points in X.
  return_n_iter=True)
/home/andy/checkout/scikit-learn/sklearn/cluster/k_means_.py:909: ConvergenceWarning: Number of distinct clusters (27) found smaller than n_clusters (28). Possibly due to duplicate points in X.
  return_n_iter=True)
/home/andy/checkout/scikit-learn/sklearn/cluster/k_means_.py:909: ConvergenceWarning: Number of distinct clusters (27) found smaller than n_clusters (28). Possibly due to duplicate points in X.
  return_n_iter=True)
/home/andy/checkout/scikit-learn/sklearn/cluster/k_means_.py:909: ConvergenceWarning: Number of distinct clusters (27) found smaller than n_clusters (28). Possibly due to duplicate points in X.
  return_n_iter=True)
/home/andy/checkout/scikit-learn/sklearn/cluster/k_means_.py:909: ConvergenceWarning: Number of distinct clusters (27) found smaller than n_clusters (28). Possibly due to duplicate points in X.
  return_n_iter=True)
# Overlay the three score curves, each on its own (tick-less) y-axis, so only
# their shapes are compared.
stability_ax = plt.gca()
st, = stability_ax.plot(cluster_range, stability, label="stability", c='k')
stability_ax.set_yticks(())

silhouette_ax = plt.twinx()
si, = silhouette_ax.plot(cluster_range, silhouette, label="silhouette")
silhouette_ax.set_yticks(())

ari_ax = plt.twinx()
ar, = ari_ax.plot(cluster_range, ari, label="ari", c='r')
ari_ax.set_yticks(())

# Legend, x-label and title go on the axes created last.
ari_ax.legend([st, si, ar], ["Stability", "Silhouette", "ARI"])
ari_ax.set_xlabel("n_clusters")
ari_ax.set_title("Scanning n_clusters with different scores")
Text(0.5,1,'Scanning n_clusters with different scores')
../_images/17-cluster-evaluation_50_1.png
from sklearn.datasets import fetch_lfw_people
# Load the LFW faces (people with at least 20 images) and preview the first
# ten images, each titled with the person's name.
people = fetch_lfw_people(min_faces_per_person=20, resize=0.7)
image_shape = people.images[0].shape
fix, axes = plt.subplots(
    2, 5, figsize=(15, 8), subplot_kw={'xticks': (), 'yticks': ()})
for ax, image, target in zip(axes.ravel(), people.images, people.target):
    person_name = people.target_names[target]
    ax.imshow(image, cmap='gray')
    ax.set_title(person_name)
../_images/17-cluster-evaluation_51_0.png
# have at most 50 images per person - otherwise too much bush
# (np.bool has been removed from NumPy; the builtin bool is the same dtype)
mask = np.zeros(people.target.shape, dtype=bool)
for target in np.unique(people.target):
    # mark the first 50 occurrences of this person
    mask[np.where(people.target == target)[0][:50]] = True

X_people = people.data[mask]
y_people = people.target[mask]
# scale the grayscale values to be between 0 and 1
# instead of 0 and 255 for better numeric stability
X_people = X_people / 255.
from sklearn.decomposition import PCA
# Scan n_clusters on the whitened 100-component PCA representation of the
# faces and record bootstrap stability, mean silhouette and ARI against the
# person labels.
face_pca = PCA(n_components=100, whiten=True)
X = face_pca.fit_transform(X_people)
stability, silhouette, ari = [], [], []
cluster_range = range(2, 50, 4)
for n_clusters in cluster_range:
    # progress output
    print(n_clusters)
    km = KMeans(init="random", n_init=5, n_clusters=n_clusters)
    stability.append(cluster_stability(X, km, n_iter=10))
    km.fit(X)
    silhouette.append(silhouette_score(X, km.labels_))
    ari.append(adjusted_rand_score(y_people, km.labels_))
2
6
10
14
18
22
26
30
34
38
42
46
# Stability on the left y-axis; silhouette and ARI share a second y-axis on
# the right.
left_ax = plt.gca()
st, = left_ax.plot(cluster_range, stability, label="stability", c='k')
left_ax.set_ylabel("stability")

right_ax = plt.twinx()
si, = right_ax.plot(cluster_range, silhouette, label="silhouette")
right_ax.set_ylabel("ARI, Silhouette")
ar, = right_ax.plot(cluster_range, ari, label="ari", c='r')

# Legend, x-label and title go on the right-hand axes.
right_ax.legend([st, si, ar], ["Stability", "Silhouette", "ARI"])
right_ax.set_xlabel("n_clusters")
right_ax.set_title("Scanning n_clusters with different scores")
Text(0.5,1,'Scanning n_clusters with different scores')
../_images/17-cluster-evaluation_54_1.png
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Use k-means as a feature-extraction step in front of a logistic regression
# and grid-search the number of clusters with 5-fold cross-validation.
km = KMeans(n_init=1, init="random")
pipe = make_pipeline(km, LogisticRegression())

param_grid = {'kmeans__n_clusters': [10, 50, 100, 200, 500]}
# The results table read afterwards uses 'mean_train_score', which current
# scikit-learn only computes when return_train_score=True is requested.
grid = GridSearchCV(pipe, param_grid, cv=5, verbose=True, return_train_score=True)
grid.fit(X, y_people)
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  7.0min finished
GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('kmeans', KMeans(algorithm='auto', copy_x=True, init='random', max_iter=300,
    n_clusters=8, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)), ('logisticregression', LogisticRegression(C=1.0, class_weight=N...y='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'kmeans__n_clusters': [10, 50, 100, 200, 500, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=True)
# Mean train/test score per n_clusters setting, plotted as two curves.
results = pd.DataFrame(grid.cv_results_)
score_columns = ['mean_test_score', 'mean_train_score']
res_pivot = results.pivot_table(
    index="param_kmeans__n_clusters", values=score_columns)
res_pivot.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7f1ca3e8d390>
../_images/17-cluster-evaluation_56_1.png
# Banknote authentication data: column 4 is split off and used only to colour
# the scatter matrix of the remaining columns.
bank_notes_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt"
bank_notes = pd.read_csv(bank_notes_url, header=None)
bank_data = bank_notes.drop(4, axis=1)
pd.plotting.scatter_matrix(bank_data, c=bank_notes[4], alpha=.6)
/home/andy/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:1: FutureWarning: 'pandas.tools.plotting.scatter_matrix' is deprecated, import 'pandas.plotting.scatter_matrix' instead.
  if __name__ == '__main__':
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f71255e7f60>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f7125261630>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f712f765668>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f7126c7f588>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f712f4e8978>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f712f4e89b0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f712cfea978>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f712c2750b8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f712cea55f8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f712c1c4748>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f712c2119e8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f712cf709e8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f712c2b8f28>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f712c2b3470>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f712cbac4a8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f7126f303c8>]], dtype=object)
../_images/17-cluster-evaluation_59_2.png
# First two principal components of the standardised data, coloured by
# column 4.
bank_scaled = scale(bank_data)
bla = PCA().fit_transform(bank_scaled)
plt.scatter(bla[:, 0], bla[:, 1], c=bank_notes[4])
<matplotlib.collections.PathCollection at 0x7f712cbd3240>
../_images/17-cluster-evaluation_61_1.png
# t-SNE embedding (perplexity 400) of the standardised data, coloured by
# column 4.
bank_scaled = scale(bank_data)
bank_tsne = TSNE(perplexity=400).fit_transform(bank_scaled)
plt.scatter(bank_tsne[:, 0], bank_tsne[:, 1], c=bank_notes[4], alpha=.5)
<matplotlib.collections.PathCollection at 0x7f712ccbe7f0>
../_images/17-cluster-evaluation_63_1.png
# Scan n_clusters on the standardised banknote data and record bootstrap
# stability, mean silhouette and ARI against column 4.
X = scale(bank_data)
y = bank_notes[4]
stability, silhouette, ari = [], [], []
cluster_range = range(2, 30, 2)
for n_clusters in cluster_range:
    km = KMeans(init="random", n_init=10, n_clusters=n_clusters)
    # Stability is measured first (on clones); km itself is fitted afterwards.
    stability.append(cluster_stability(X, km))
    km.fit(X)
    silhouette.append(silhouette_score(X, km.labels_))
    ari.append(adjusted_rand_score(y, km.labels_))
# Stability on the left y-axis; silhouette and ARI share a second y-axis on
# the right.
left_ax = plt.gca()
st, = left_ax.plot(cluster_range, stability, label="stability", c='k')
left_ax.set_ylabel("stability")

right_ax = plt.twinx()
si, = right_ax.plot(cluster_range, silhouette, label="silhouette")
right_ax.set_ylabel("ARI, Silhouette")
ar, = right_ax.plot(cluster_range, ari, label="ari", c='r')

# Legend, x-label and title go on the right-hand axes.
right_ax.legend([st, si, ar], ["Stability", "Silhouette", "ARI"])
right_ax.set_xlabel("n_clusters")
right_ax.set_title("Scanning n_clusters with different scores")
Text(0.5,1,'Scanning n_clusters with different scores')
../_images/17-cluster-evaluation_65_1.png