import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["savefig.dpi"] = 300
plt.rcParams["savefig.bbox"] = "tight"
np.set_printoptions(precision=3, suppress=True)
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale, StandardScaler
from sklearn.datasets import make_blobs, make_circles, make_moons
rng = np.random.RandomState(6)
X_blobs, y_blobs = make_blobs(n_samples=200, random_state=1)
X_blobs = scale(X_blobs @ rng.normal(0, 4, size=(2, 2)))
X_circles, y_circles = make_circles(n_samples=400, random_state=0, noise=.09, factor=.4)
X_circles = scale(X_circles)
X_moons, y_moons = make_moons(n_samples=200, random_state=0, noise=.09)
X_moons = scale(X_moons)
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph
from sklearn.metrics import adjusted_rand_score, silhouette_score, normalized_mutual_info_score, adjusted_mutual_info_score
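ARI, NMI and AMI all compare a clustering against reference labels; they are invariant to how the clusters are numbered, and the adjusted variants (ARI, AMI) are additionally corrected for chance, so random labelings score near zero. A quick sanity check (added here for illustration, not part of the original analysis):

# identical partitions with permuted label names still score 1.0
print(adjusted_rand_score([0, 0, 1, 1], [1, 1, 0, 0]))
# a random labeling scores close to 0 for the chance-adjusted measures
rng_demo = np.random.RandomState(0)
print(adjusted_rand_score(np.arange(100) % 2, rng_demo.randint(0, 2, 100)))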
fig, axes = plt.subplots(1, 3, subplot_kw={'xticks': (), 'yticks':()}, figsize=(10, 3))
for ax, (X, y) in zip(axes, [(X_blobs, y_blobs), (X_circles, y_circles), (X_moons, y_moons)]):
ax.scatter(X[:, 0], X[:, 1], c=y, s=10)
fig, axes = plt.subplots(2, 2, subplot_kw={'xticks': (), 'yticks':()}, figsize=(10, 10))
for ax, eps in zip(axes.ravel(), [.15, .2, .3, .4]):
dbs = DBSCAN(eps=eps).fit(X_circles)
ari = adjusted_rand_score(y_circles, dbs.labels_)
nmi = normalized_mutual_info_score(y_circles, dbs.labels_)
ami = adjusted_mutual_info_score(y_circles, dbs.labels_)
ax.set_title("eps: {:.2f} ARI: {:.2f} NMI: {:.2f} AMI: {:.2f}".format(eps, ari, nmi, ami))
ax.scatter(X_circles[:, 0], X_circles[:, 1], c=dbs.labels_, s=20)
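The eps grid above is hand-picked. A common heuristic, sketched here as an addition (n_neighbors=5 mirrors DBSCAN's default min_samples), is to look for a knee in the sorted k-nearest-neighbor distances:

from sklearn.neighbors import NearestNeighbors
# distance to the 5th neighbor; since we query the training points,
# the first "neighbor" of each point is the point itself at distance 0
nn = NearestNeighbors(n_neighbors=5).fit(X_circles)
dist, _ = nn.kneighbors(X_circles)
plt.plot(np.sort(dist[:, -1]))
plt.xlabel("sample (sorted)")
plt.ylabel("distance to 5th neighbor")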
fig, axes = plt.subplots(2, 3, subplot_kw={'xticks': (), 'yticks':()}, figsize=(15, 10))
for ax, n_clusters in zip(axes.ravel(), [5, 6, 8, 10, 11, 12]):
    km = KMeans(n_clusters=n_clusters).fit(X_circles)
    ari = adjusted_rand_score(y_circles, km.labels_)
    nmi = normalized_mutual_info_score(y_circles, km.labels_)
    ami = adjusted_mutual_info_score(y_circles, km.labels_)
    ax.set_title("n_clusters: {} ARI: {:.2f} NMI: {:.2f} AMI: {:.2f}".format(n_clusters, ari, nmi, ami))
    ax.scatter(X_circles[:, 0], X_circles[:, 1], c=km.labels_, s=20)
from sklearn.datasets import load_digits
digits = load_digits()
aris, nmis, amis = [], [], []
for i in range(1, 30):
km = KMeans(n_clusters=i).fit(digits.data)
ari = adjusted_rand_score(digits.target, km.labels_)
nmi = normalized_mutual_info_score(digits.target, km.labels_)
ami = adjusted_mutual_info_score(digits.target, km.labels_)
aris.append(ari)
nmis.append(nmi)
amis.append(ami)
plt.plot(range(1, 30), aris, label="ARI")
plt.plot(range(1, 30), amis, label="AMI")
plt.plot(range(1, 30), nmis, label="NMI")
plt.legend()
plt.xlabel("n_clusters")
fig, axes = plt.subplots(2, 2, subplot_kw={'xticks': (), 'yticks':()}, figsize=(7, 7))
agg = AgglomerativeClustering(n_clusters=3, connectivity=lambda x: kneighbors_graph(x, 10))
# X and y still hold the moons data from the loop above
for ax, alg in zip(axes.ravel(), [KMeans(n_clusters=5), KMeans(n_clusters=3), agg, DBSCAN(eps=.3)]):
    alg.fit(X)
ax.scatter(X[:, 0], X[:, 1], c=alg.labels_, s=10)
#ax.set_aspect("equal")
ari = adjusted_rand_score(y, alg.labels_)
silhouette = silhouette_score(X, alg.labels_)
ax.set_title("ARI: {:.2f} Silhouette: {:.2f}".format(ari, silhouette))
fig, axes = plt.subplots(3, 3, subplot_kw={'xticks': (), 'yticks':()}, figsize=(10, 10))
agg = AgglomerativeClustering(n_clusters=3, connectivity=lambda x: kneighbors_graph(x, 10))
for axes2, alg in zip(axes, [KMeans(n_clusters=3), DBSCAN(eps=.3), agg]):
for ax, (X, y) in zip(axes2, [(X_blobs, y_blobs), (X_circles, y_circles), (X_moons, y_moons)]):
alg.fit(X)
ax.scatter(X[:, 0], X[:, 1], c=alg.labels_, s=10)
#ax.set_aspect("equal")
ari = adjusted_rand_score(y, alg.labels_)
silhouette = silhouette_score(X, alg.labels_)
ax.set_title("ARI: {:.2f} Silhouette: {:.2f}".format(ari, silhouette))
from sklearn.metrics import silhouette_samples
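For reference (this is the standard definition, stated here as an addition): for a sample with mean intra-cluster distance $a$ and mean distance $b$ to the points of the nearest other cluster, the silhouette coefficient is

$$s = \frac{b - a}{\max(a, b)},$$

so $s$ ranges from -1 (likely misassigned) to +1 (well inside its own cluster).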
def silhouette_plot(X, cluster_labels, ax=None):
silhouette_scores = silhouette_samples(X, cluster_labels)
if ax is None:
ax = plt.gca()
y_lower = 10
inliers = cluster_labels != -1
X = X[inliers]
cluster_labels = cluster_labels[inliers]
silhouette_scores = silhouette_scores[inliers]
labels = np.unique(cluster_labels)
    # matplotlib renamed the Vega10/Vega20 colormaps to tab10/tab20
    cm = plt.cm.tab10 if len(labels) <= 10 else plt.cm.tab20
for i in labels:
# Aggregate the silhouette scores for samples belonging to
# cluster i, and sort them
ith_cluster_silhouette_values = \
silhouette_scores[cluster_labels == i]
ith_cluster_silhouette_values.sort()
size_cluster_i = ith_cluster_silhouette_values.shape[0]
y_upper = y_lower + size_cluster_i
color = cm(i)
ax.fill_betweenx(np.arange(y_lower, y_upper),
0, ith_cluster_silhouette_values,
facecolor=color, edgecolor=color, alpha=0.7)
# Label the silhouette plots with their cluster numbers at the middle
ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
# Compute the new y_lower for next plot
        y_lower = y_upper + 10  # leave a 10-sample gap between clusters
X, y = make_blobs(n_samples=1000, centers=4, random_state=14)
plt.scatter(X[:, 0], X[:, 1], s=10)
fig, axes = plt.subplots(2, 5, subplot_kw={'xticks': (), 'yticks':()}, figsize=(10, 5))
for ax, n_clusters in zip(axes.T, [2, 3, 4, 5, 8]):
km = KMeans(n_clusters=n_clusters)
km.fit(X)
    ax[0].scatter(X[:, 0], X[:, 1], c=plt.cm.tab10(km.labels_), s=10)
silhouette_plot(X, km.labels_, ax=ax[1])
ax[0].set_title("score: {:.2f}".format(silhouette_score(X, km.labels_)))
rng = np.random.RandomState(1)
X, y = make_blobs(n_samples=500, centers=10, random_state=rng, cluster_std=[rng.gamma(2) for i in range(10)])
plt.scatter(X[:, 0], X[:, 1], s=5, alpha=.6)
plt.gca().set_aspect("equal")
#plt.scatter(X[:, 0], X[:, 1], c=plt.cm.tab10(y), s=5, alpha=.6)
xlim = plt.xlim()
fig, axes = plt.subplots(4, 5, subplot_kw={'xticks': (), 'yticks':()}, figsize=(10, 10))
axes = np.hstack([axes[:2], axes[2:]]).T
for ax, n_clusters in zip(axes, range(3, 13)):
km = KMeans(n_clusters=n_clusters)
km.fit(X)
    ax[0].scatter(X[:, 0], X[:, 1], c=plt.cm.tab10(km.labels_), s=10)
silhouette_plot(X, km.labels_, ax=ax[1])
ax[0].set_title("score: {:.2f}".format(silhouette_score(X, km.labels_)))
fig, axes = plt.subplots(4, 5, subplot_kw={'xticks': (), 'yticks':()}, figsize=(10, 10))
axes = np.hstack([axes[:2], axes[2:]]).T
for ax, eps in zip(axes, np.linspace(0.1, 2, 10)):
db = DBSCAN(eps=eps)
db.fit(X)
    colors = plt.cm.tab10(db.labels_)
colors[db.labels_ == -1] = [.5, .5, .5, 1]
ax[0].scatter(X[:, 0], X[:, 1], c=colors, s=10)
silhouette_plot(X, db.labels_, ax=ax[1])
ax[0].set_title("score: {:.2f}".format(silhouette_score(X, db.labels_)))
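A subtlety worth noting (added): silhouette_plot drops DBSCAN's noise points, but the silhouette_score in the titles treats the noise label -1 as one more cluster. A sketch of scoring only the clustered points (eps chosen here purely for illustration):

db = DBSCAN(eps=0.5).fit(X)
mask = db.labels_ != -1
if len(np.unique(db.labels_[mask])) > 1:
    print(silhouette_score(X, db.labels_))              # noise counted as a cluster
    print(silhouette_score(X[mask], db.labels_[mask]))  # clustered points only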
fig, axes = plt.subplots(1, 5, subplot_kw={'xticks': (), 'yticks':()}, figsize=(10, 5))
for ax, n_clusters in zip(axes, [2, 5, 10, 15, 20]):
km = KMeans(n_clusters=n_clusters)
km.fit(digits.data)
silhouette_plot(digits.data, km.labels_, ax=ax)
ax.set_title("n_clusters={}\n score: {:.2f}".format(n_clusters, silhouette_score(digits.data, km.labels_)))
Cluster Stability
from sklearn.base import clone
from sklearn.utils import check_random_state
def cluster_stability(X, est, n_iter=20, random_state=None):
    # actually use the random_state argument instead of the global rng
    rng = check_random_state(random_state)
    labels = []
indices = []
for i in range(n_iter):
# draw bootstrap samples, store indices
sample_indices = rng.randint(0, X.shape[0], X.shape[0])
indices.append(sample_indices)
est = clone(est)
if hasattr(est, "random_state"):
# randomize estimator if possible
est.random_state = rng.randint(1e5)
X_bootstrap = X[sample_indices]
est.fit(X_bootstrap)
# store clustering outcome using original indices
        relabel = -np.ones(X.shape[0], dtype=int)
relabel[sample_indices] = est.labels_
labels.append(relabel)
scores = []
for l, i in zip(labels, indices):
for k, j in zip(labels, indices):
# we also compute the diagonal which is a bit silly
in_both = np.intersect1d(i, j)
scores.append(adjusted_rand_score(l[in_both], k[in_both]))
return np.mean(scores)
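As a quick illustration (added; the k values are chosen for the blob data created at the top of the notebook, which has three centers):

# stability of k-means with a good and a bad number of clusters
for k in [3, 7]:
    print(k, cluster_stability(X_blobs, KMeans(n_clusters=k, n_init=10),
                               n_iter=10, random_state=0))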
X, y = make_blobs(n_samples=200, centers=4, random_state=14)
fig, axes = plt.subplots(2, 3, subplot_kw={'xticks': (), 'yticks':()}, figsize=(10, 7))
for ax, n_clusters in zip(axes.ravel(), [2, 3, 4, 5, 8, 10]):
km = KMeans(n_clusters=n_clusters)
km.fit(X)
    ax.scatter(X[:, 0], X[:, 1], c=plt.cm.tab10(km.labels_), s=10)
ax.set_title("KM(k={}) stability: {:.2f}".format(n_clusters, cluster_stability(X, KMeans(n_clusters=n_clusters, n_init=10), n_iter=20)))
rng = np.random.RandomState(1)
X, y = make_blobs(n_samples=500, centers=10, random_state=rng, cluster_std=[rng.gamma(2) for i in range(10)])
fig, axes = plt.subplots(2, 3, subplot_kw={'xticks': (), 'yticks':()}, figsize=(10, 7))
for ax, n_clusters in zip(axes.ravel(), [2, 4, 6, 8, 10, 12]):
km = KMeans(n_clusters=n_clusters)
km.fit(X)
    ax.scatter(X[:, 0], X[:, 1], c=plt.cm.tab10(km.labels_), s=10)
ax.set_title("KM(k={}) stability: {:.2f}".format(n_clusters, cluster_stability(X, KMeans(n_clusters=n_clusters, n_init=10))))
stability = []
silhouette = []
ari = []
cluster_range = range(2, 30, 2)
for n_clusters in cluster_range:
km = KMeans(n_clusters=n_clusters)
stability.append(cluster_stability(digits.data, km))
km.fit(digits.data)
silhouette.append(silhouette_score(digits.data, km.labels_))
ari.append(adjusted_rand_score(digits.target, km.labels_))
st, = plt.plot(cluster_range, stability, label="stability", c='k')
plt.yticks(())
si, = plt.twinx().plot(cluster_range, silhouette, label="silhouette")
plt.yticks(())
ar, = plt.twinx().plot(cluster_range, ari, label="ari", c='r')
plt.yticks(())
plt.legend([st, si, ar], ["Stability", "Silhouette", "ARI"])
plt.xlabel("n_clusters")
plt.title("Scanning n_clusters with different scores")
st, = plt.plot(cluster_range, stability, label="stability", c='k')
plt.ylabel("stability")
ax2 = plt.twinx()
si, = ax2.plot(cluster_range, silhouette, label="silhouette")
ar, = ax2.plot(cluster_range, ari, label="ari", c='r')
ax2.set_ylabel("ARI, Silhouette")
plt.legend([st, si, ar], ["Stability", "Silhouette", "ARI"])
plt.xlabel("n_clusters")
plt.title("Scanning n_clusters with different scores")
# stability for different clustering algorithms on digits
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
km_stability = []
agg_stability = []
dbscan_stability = []
# keep enough components to explain 90% of the variance
X = PCA(n_components=.9).fit_transform(digits.data / 16.)
cluster_range = range(2, 26, 2)
for n_clusters in cluster_range:
print(n_clusters)
km = KMeans(n_clusters=n_clusters, n_init=10, init="random")
km_stability.append(cluster_stability(X, km))
agg_stability.append(cluster_stability(X, AgglomerativeClustering(n_clusters=n_clusters)))
db_stability = []
n_clusters_db = []
for eps in np.linspace(.2, 2, 20):
print(eps)
db_stability.append(cluster_stability(X, DBSCAN(eps=eps)))
    # note: len(unique) counts DBSCAN's noise label -1 as a cluster
    n_clusters_db.append(len(np.unique(DBSCAN(eps=eps).fit(X).labels_)))
cluster_range = range(2, 26, 2)
plt.plot(cluster_range, km_stability, label="k-means")
plt.plot(cluster_range, agg_stability, label="agglomerative")
plt.plot(n_clusters_db, db_stability, label="DBSCAN")
for eps, n_clusters, stability in zip(np.linspace(.2, 2, 20), n_clusters_db, db_stability):
plt.text(n_clusters, stability, "{:.2f}".format(eps))
plt.legend()
plt.xlabel("n_clusters")
plt.ylabel("stability")
from sklearn.manifold import TSNE
X_tsne = TSNE().fit_transform(digits.data / 16.)
fig, axes = plt.subplots(2, 3, subplot_kw={'xticks': (), 'yticks': ()}, figsize=(10, 5))
for ax, est in zip(axes.T, [DBSCAN(eps=1.2), AgglomerativeClustering(n_clusters=10), KMeans(n_clusters=10)]):
    # X still holds the PCA-reduced digits
    est.fit(X)
    ax[0].set_title(est.__class__.__name__)
    ax[0].scatter(X[:, 0], X[:, 1], c=plt.cm.tab10(est.labels_), s=5, alpha=.7)
    ax[1].scatter(X_tsne[:, 0], X_tsne[:, 1], c=plt.cm.tab10(est.labels_), s=5, alpha=.7)
axes[0, 0].set_ylabel("PCA")
axes[1, 0].set_ylabel("TSNE")
dbscan = DBSCAN(eps=1.0, min_samples=5).fit(X)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=plt.cm.tab20(dbscan.labels_), s=5, alpha=.7)
np.bincount(dbscan.labels_ + 1)
fig, axes = plt.subplots(2, 5, subplot_kw={'xticks': (), 'yticks': ()}, figsize=(10, 5))
km = KMeans(n_clusters=10).fit(digits.data)
for ax, center in zip(axes.ravel(), km.cluster_centers_):
ax.imshow(center.reshape(8, 8), cmap='gray_r')
fig.suptitle("K-Means cluster centers")
agg = AgglomerativeClustering(n_clusters=10).fit(digits.data)
fig, axes = plt.subplots(5, 10, subplot_kw={'xticks': (), 'yticks': ()}, figsize=(10, 5))
for ax, cluster in zip(axes.T, range(10)):
    cluster_mask = agg.labels_ == cluster
for a, im in zip(ax, digits.data[cluster_mask]):
a.imshow(im.reshape(8, 8), cmap='gray_r')
# PCA is not really necessary, but we know a good eps value
# when using PCA.
X = PCA(n_components=.9).fit_transform(digits.data / 16.)
dbscan = DBSCAN(eps=1.2).fit(X)
clusters = np.unique(dbscan.labels_)
n_clusters = len(clusters)
fig, axes = plt.subplots(5, n_clusters, subplot_kw={'xticks': (), 'yticks': ()}, figsize=(10, 5))
for ax, cluster in zip(axes.T, clusters):
    cluster_mask = dbscan.labels_ == cluster
ax[0].set_title(np.sum(cluster_mask))
for a, im in zip(ax, digits.data[cluster_mask]):
a.imshow(im.reshape(8, 8), cmap='gray_r')
Adult
import os
# The file has no headers naming the columns, so we pass header=None
# and provide the column names explicitly in "names"
adult_path = os.path.join("adult.data")
data = pd.read_csv(
adult_path, header=None, index_col=False,
names=['age', 'workclass', 'fnlwgt', 'education', 'education-num',
'marital-status', 'occupation', 'relationship', 'race', 'gender',
'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
'income'])
data = data.drop("fnlwgt", axis=1)
data.head()
|   | age | workclass | education | education-num | marital-status | occupation | relationship | race | gender | capital-gain | capital-loss | hours-per-week | native-country | income |
|---|-----|-----------|-----------|---------------|----------------|------------|--------------|------|--------|--------------|--------------|----------------|----------------|--------|
| 0 | 39 | State-gov | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
| 1 | 50 | Self-emp-not-inc | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
| 2 | 38 | Private | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
| 3 | 53 | Private | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
| 4 | 28 | Private | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
data_ = data.drop("income", axis=1)
categorical = data_.columns[data_.dtypes != "int64"]
data_dummies = pd.get_dummies(data_[categorical],
columns=categorical)
data_dummies.shape
(32561, 102)
# Cast the uint8 dummies to float before scaling to avoid the dtype
# conversion warning; this is big, so MiniBatchKMeans or subsampling
# would be more appropriate for speed.
X = data_dummies.values.astype(float)
X = scale(X)
adult_pca = PCA(n_components=2).fit_transform(X)
km = KMeans(n_clusters=3).fit(X)
km.labels_
plt.scatter(adult_pca[:, 0], adult_pca[:, 1], c=km.labels_, alpha=.7)
stability = []
silhouette = []
ari = []
cluster_range = range(2, 15, 2)
for n_clusters in cluster_range:
    print(n_clusters)
    km = KMeans(n_clusters=n_clusters)
    stability.append(cluster_stability(X, km, n_iter=10))
    km.fit(X)
    # computing all pairwise distances on 32k samples is what made the
    # original run hang; subsample via sample_size instead
    silhouette.append(silhouette_score(X, km.labels_, sample_size=5000, random_state=0))
    ari.append(adjusted_rand_score(data.income, km.labels_))
# The original run was interrupted here: silhouette_score builds the full
# pairwise distance matrix, which on 32,561 samples appears to hang (it is
# merely very slow, not a crash). The sample_size argument above avoids this.
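As suggested above, MiniBatchKMeans is a drop-in alternative that scales far better here; a sketch with illustrative parameters:

from sklearn.cluster import MiniBatchKMeans
# fits on small random batches instead of the full 32k samples per step
mbkm = MiniBatchKMeans(n_clusters=3, batch_size=1000, random_state=0).fit(X)
plt.scatter(adult_pca[:, 0], adult_pca[:, 1], c=mbkm.labels_, s=5, alpha=.7)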
st, = plt.plot(cluster_range, stability, label="stability", c='k')
plt.yticks(())
si, = plt.twinx().plot(cluster_range, silhouette, label="silhouette")
plt.yticks(())
ar, = plt.twinx().plot(cluster_range, ari, label="ari", c='r')
plt.yticks(())
plt.legend([st, si, ar], ["Stability", "Silhouette", "ARI"])
plt.xlabel("n_clusters")
plt.title("Scanning n_clusters with different scores")
Breast Cancer
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = data.data
stability = []
silhouette = []
ari = []
cluster_range = range(2, 30, 2)
for n_clusters in cluster_range:
km = KMeans(n_clusters=n_clusters, n_init=10, init="random")
stability.append(cluster_stability(X, km, n_iter=10))
km.fit(X)
silhouette.append(silhouette_score(X, km.labels_))
ari.append(adjusted_rand_score(data.target, km.labels_))
st, = plt.plot(cluster_range, stability, label="stability", c='k')
plt.yticks(())
si, = plt.twinx().plot(cluster_range, silhouette, label="silhouette")
plt.yticks(())
ar, = plt.twinx().plot(cluster_range, ari, label="ari", c='r')
plt.yticks(())
plt.legend([st, si, ar], ["Stability", "Silhouette", "ARI"])
plt.xlabel("n_clusters")
plt.title("Scanning n_clusters with different scores")
from sklearn.datasets import fetch_lfw_people
people = fetch_lfw_people(min_faces_per_person=20, resize=0.7)
image_shape = people.images[0].shape
fig, axes = plt.subplots(2, 5, figsize=(15, 8), subplot_kw={'xticks': (), 'yticks': ()})
for target, image, ax in zip(people.target, people.images, axes.ravel()):
ax.imshow(image, cmap='gray')
ax.set_title(people.target_names[target])
# use at most 50 images per person - otherwise the set is dominated by George W. Bush
mask = np.zeros(people.target.shape, dtype=bool)
for target in np.unique(people.target):
mask[np.where(people.target == target)[0][:50]] = 1
X_people = people.data[mask]
y_people = people.target[mask]
# scale the grayscale values to be between 0 and 1
# instead of 0 and 255 for better numeric stability
X_people = X_people / 255.
from sklearn.decomposition import PCA
X = PCA(n_components=100, whiten=True).fit_transform(X_people)
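A quick check, added here: with whiten=True each retained component is scaled to roughly unit variance, so no single direction dominates the Euclidean distances k-means relies on.

print(X.std(axis=0)[:5])  # approximately [1. 1. 1. 1. 1.]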
stability = []
silhouette = []
ari = []
cluster_range = range(2, 50, 4)
for n_clusters in cluster_range:
print(n_clusters)
km = KMeans(n_clusters=n_clusters, n_init=5, init="random")
stability.append(cluster_stability(X, km, n_iter=10))
km.fit(X)
silhouette.append(silhouette_score(X, km.labels_))
ari.append(adjusted_rand_score(y_people, km.labels_))
st, = plt.plot(cluster_range, stability, label="stability", c='k')
plt.ylabel("stability")
ax2 = plt.twinx()
si, = ax2.plot(cluster_range, silhouette, label="silhouette")
ar, = ax2.plot(cluster_range, ari, label="ari", c='r')
ax2.set_ylabel("ARI, Silhouette")
plt.legend([st, si, ar], ["Stability", "Silhouette", "ARI"])
plt.xlabel("n_clusters")
plt.title("Scanning n_clusters with different scores")
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
km = KMeans(n_init=1, init="random")
pipe = make_pipeline(km, LogisticRegression())
param_grid = {'kmeans__n_clusters': [10, 50, 100, 200, 500, 1000]}
grid = GridSearchCV(pipe, param_grid, cv=5, verbose=True)
grid.fit(X, y_people)
results = pd.DataFrame(grid.cv_results_)
res_pivot = results.pivot_table(values=['mean_test_score', 'mean_train_score'],
index="param_kmeans__n_clusters")
res_pivot.plot()
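Why this works: inside the pipeline, KMeans acts as a transformer, so LogisticRegression is trained on distances to the cluster centers rather than on the raw inputs. A small check, added for illustration:

# KMeans.transform returns the distance from each sample to each
# cluster center, giving n_clusters features per sample
km_demo = KMeans(n_clusters=10, n_init=1, init="random", random_state=0).fit(X)
print(km_demo.transform(X).shape)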
bank_notes = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt", header=None)
bank_data = bank_notes.drop(4, axis=1)
pd.plotting.scatter_matrix(bank_data, c=bank_notes[4], alpha=.6)
bank_pca = PCA().fit_transform(scale(bank_data))
plt.scatter(bank_pca[:, 0], bank_pca[:, 1], c=bank_notes[4])
bank_tsne = TSNE(perplexity=400).fit_transform(scale(bank_data))
plt.scatter(bank_tsne[:, 0], bank_tsne[:, 1], c=bank_notes[4], alpha=.5)
X = scale(bank_data)
y = bank_notes[4]
stability = []
silhouette = []
ari = []
cluster_range = range(2, 30, 2)
for n_clusters in cluster_range:
km = KMeans(n_clusters=n_clusters, n_init=10, init="random")
stability.append(cluster_stability(X, km))
km.fit(X)
silhouette.append(silhouette_score(X, km.labels_))
ari.append(adjusted_rand_score(y, km.labels_))
st, = plt.plot(cluster_range, stability, label="stability", c='k')
plt.ylabel("stability")
ax2 = plt.twinx()
si, = ax2.plot(cluster_range, silhouette, label="silhouette")
ar, = ax2.plot(cluster_range, ari, label="ari", c='r')
ax2.set_ylabel("ARI, Silhouette")
plt.legend([st, si, ar], ["Stability", "Silhouette", "ARI"])
plt.xlabel("n_clusters")
plt.title("Scanning n_clusters with different scores")