Clustering + Label propagation

 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Apply matplotlib's built-in 'ggplot' style sheet to every figure drawn below.
plt.style.use('ggplot')

Parameters

# Seed reused for NumPy's global RNG and for every estimator/splitter below.
RANDOM_STATE = 42
# Fraction of the training set assumed to be hand-labeled (5%).
LABELED_RATIO = 0.05
# Number of clusters; if enough manual labeling is affordable, prefer a large value.
N_CLUSTERS = 50

1. Load data

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG so later np.random draws are repeatable.
np.random.seed(RANDOM_STATE)

# Load the digits dataset and make a class-stratified train/test split.
X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Snapshot the raw splits (the representative-image plot reads X_train_org).
X_train_org = X_train.copy()
X_test_org = X_test.copy()

# Notebook display: shapes of the four arrays.
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((1347, 64), (1347,), (450, 64), (450,))

2. Label propagation

2.1 Select representative image

from sklearn.cluster import KMeans

# Cluster the training images. N_CLUSTERS is the parameter declared at the top
# of the notebook (the previously used name `opt_n_clusters` was never defined).
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE)

# fit_transform returns, for every sample, its distance to each cluster centre:
# one row per sample, one column per cluster.
X_train_affinity = kmeans.fit_transform(X_train)

# For each cluster (column), pick the sample closest to that centroid.
idxs_representative    = np.argmin(X_train_affinity, axis=0)
X_train_representative = X_train[idxs_representative]

2.2 Manual labeling (expert)

# Raw (unmodified) pixels of the representative samples, for display.
X_train_org_representative = X_train_org[idxs_representative]

# One small panel per cluster, all in a single row. squeeze=False keeps `axes`
# a 2-D array even when N_CLUSTERS == 1, so `axes.flat` always works.
fig, axes = plt.subplots(1, N_CLUSTERS, figsize=(15, 1), squeeze=False)
for idx_ax, ax in enumerate(axes.flat):
    if idx_ax < len(X_train_org_representative):
        # Each sample is a flattened 8x8 image.
        ax.imshow(X_train_org_representative[idx_ax].reshape(8, 8), 'binary')
    ax.axis('off')

[Output figure: a single row of representative digit images, one per cluster]

# Hand-assigned digit labels for the representative images, in cluster order:
# entry i is the label given to the representative of cluster i (50 entries).
# These values are tied to the specific clustering run that produced the
# figure above; re-check them if the clustering changes.
y_train_representative = np.array([4, 8, 7, 6, 0, 3, 2, 1, 1, 3, 5, 0, 2, 6, 3, 5, 5, 7, 9, 4, 4, 1, 8, 1, 0, 7, 7, 1, 2, 1, 2, 3, 2, 0, 4, 9, 5, 7, 8, 9, 1, 5, 4, 9, 6, 6, 5, 1, 5, 8])

2.3 Label propagation

2.3.1 Propagation for all data

# Give every training sample the manual label of its cluster's representative.
# kmeans.labels_ holds each sample's cluster index, so indexing the per-cluster
# label array with it propagates all labels at once. This keeps the integer
# dtype of the labels and avoids both the uninitialised np.empty buffer and the
# undefined `opt_n_clusters` loop bound.
y_train_propagated_all = y_train_representative[kmeans.labels_]

2.3.2 Propagation for reliable data

def get_propagated_reliable(reliable_ratio):
    """Return the most reliable propagated training samples and their labels.

    For every k-means cluster, the members are ranked by distance to their own
    centroid and the closest ``ceil(reliable_ratio * cluster_size)`` of them
    are kept. Each kept sample receives the manual label of its cluster's
    representative.

    Reads the notebook globals ``kmeans``, ``X_train_affinity``, ``X_train``
    and ``y_train_representative``.

    Args:
        reliable_ratio: Fraction (0-1) of each cluster to keep, nearest first.

    Returns:
        Tuple ``(X_selected, y_selected)`` with samples grouped by cluster in
        cluster-index order.
    """
    X_parts, y_parts = [], []
    # Use the fitted model's own cluster count (the previously referenced
    # `opt_n_clusters` was never defined).
    for idx_cluster in range(kmeans.n_clusters):
        in_cluster = kmeans.labels_ == idx_cluster
        # Positions within the cluster, sorted by distance to its centroid.
        order = np.argsort(X_train_affinity[in_cluster, idx_cluster])
        n_keep = int(np.ceil(reliable_ratio * len(order)))
        keep = order[:n_keep]
        X_parts.append(X_train[in_cluster][keep])
        y_parts.append(np.repeat(y_train_representative[idx_cluster], len(keep)))
    return np.concatenate(X_parts), np.concatenate(y_parts)

3. Evaluation

# Random baseline: label LABELED_RATIO of the training set, chosen uniformly.
# replace=False is required so that no sample is drawn twice; the default
# (sampling with replacement) can yield fewer distinct labeled samples than
# the stated ratio. NOTE: this changes the drawn sample, so metrics recorded
# from earlier runs of the "Random" experiment will differ after re-running.
idxs_labeled = np.random.choice(len(X_train), int(LABELED_RATIO*len(X_train)), replace=False)
X_train_random, y_train_random = X_train[idxs_labeled], y_train[idxs_labeled]
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

model = SVC(random_state=RANDOM_STATE)
result = pd.DataFrame(columns=['train_accuracy', 'train_f1_score', 'test_accuracy', 'test_f1_score'])

# (experiment name, training features, training labels) for each labeling strategy.
experiments = [
    ('Full (100%)', X_train, y_train),
    (f'Random ({100*LABELED_RATIO}%)', X_train_random, y_train_random),
    (f'Representative ({100*LABELED_RATIO}%)', X_train_representative, y_train_representative),
]
# Build each propagated training set once (features and labels together)
# instead of recomputing it separately for X and for y.
for reliable_ratio in (0.2, 0.4, 0.6, 0.8, 1):
    X_propagated, y_propagated = get_propagated_reliable(reliable_ratio)
    experiments.append((f'Propagated ({reliable_ratio:.0%})', X_propagated, y_propagated))

# Loop variables are named X_fit / y_fit so the full dataset held in the
# notebook-level X and y is not overwritten.
for exp_name, X_fit, y_fit in experiments:
    model.fit(X_fit, y_fit)
    # Always score against the true labels of the complete train and test splits.
    y_train_pred = model.predict(X_train)
    y_test_pred  = model.predict(X_test)
    result.loc[exp_name] = [
        accuracy_score(y_train, y_train_pred),
        f1_score(y_train, y_train_pred, average='macro'),
        accuracy_score(y_test, y_test_pred),
        f1_score(y_test, y_test_pred, average='macro'),
    ]
result.index.name = model.__class__.__name__
result
train_accuracy train_f1_score test_accuracy test_f1_score
SVC
Full (100%) 0.998515 0.998524 0.991111 0.991071
Random (5.0%) 0.732739 0.716331 0.722222 0.698227
Representative (5.0%) 0.859688 0.854053 0.848889 0.835384
Propagated (20%) 0.947290 0.947040 0.940000 0.939079
Propagated (40%) 0.960653 0.960496 0.955556 0.954870
Propagated (60%) 0.962880 0.962813 0.957778 0.957274
Propagated (80%) 0.959911 0.959987 0.960000 0.959517
Propagated (100%) 0.956941 0.956851 0.953333 0.952689