tools, lockfile, deps
This commit is contained in:
274
tools/plot_scripts/background_ml_semisupervised.py
Normal file
274
tools/plot_scripts/background_ml_semisupervised.py
Normal file
@@ -0,0 +1,274 @@
|
||||
"""
|
||||
Downloads the cats_vs_dogs dataset, extracts deep features using MobileNetV2,
|
||||
then generates two clustering comparison illustrations using two different embedding pipelines:
|
||||
- PCA followed by t-SNE (saved as: semi_supervised_clustering_tsne.png)
|
||||
- PCA followed by UMAP (saved as: semi_supervised_clustering_umap.png)
|
||||
|
||||
Each illustration compares:
|
||||
- Unsupervised clustering using the deep embedding + KMeans
|
||||
- Semi-supervised clustering using Label Spreading with a few labeled seeds
|
||||
|
||||
This script saves outputs in a datetime folder and also copies the latest outputs
|
||||
to a "latest" folder. All versions of the outputs and scripts are archived.
|
||||
"""
|
||||
|
||||
import random
|
||||
import shutil
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
import tensorflow_datasets as tfds
|
||||
import umap
|
||||
from scipy.optimize import linear_sum_assignment
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.manifold import TSNE
|
||||
from sklearn.semi_supervised import LabelSpreading
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# CONFIGURATION
|
||||
# -----------------------------------------------------------------------------
|
||||
# Number of dataset images that are embedded and clustered in the demo.
UNSUP_SAMPLES = 200
# Number of points whose true label is revealed to Label Spreading.
# Despite the name, this is the total number of seeds, not a per-class count.
N_LABELED_CLASS = 20

# Root folder below which every artifact of this script is written.
output_path = Path("/home/fedex/mt/plots/background_ml_semisupervised")
latest_folder_path = output_path.joinpath("latest")
archive_folder_path = output_path.joinpath("archive")

# Each run writes into its own folder named after the start time of the run
# (evaluated once, when the module is loaded).
datetime_folder_name = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"
output_datetime_path = output_path.joinpath(datetime_folder_name)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# UTILITIES
|
||||
# -----------------------------------------------------------------------------
|
||||
def ensure_dir(directory: Path) -> None:
    """Create `directory` (and any missing parents) if it does not exist yet.

    Args:
        directory: Target folder. A `pathlib.Path` is expected, but plain
            strings and other path-like objects are accepted as well.

    Calling this on an already existing directory is a no-op.
    """
    # Coerce so that callers passing a str do not fail with AttributeError.
    Path(directory).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
def cluster_accuracy(y_true, y_pred):
    """
    Compute clustering accuracy by determining the optimal mapping
    between predicted clusters and true labels using the Hungarian algorithm.

    Args:
        y_true: Array-like of ground-truth class labels (cast to int64).
        y_pred: Array-like of predicted cluster ids (cast to int64), with the
            same number of elements as `y_true`.

    Returns:
        Fraction of samples (float in [0, 1]) that are correctly assigned
        under the best one-to-one matching of cluster ids to class labels.

    Raises:
        ValueError: If the inputs are empty or differ in size.
    """
    # np.asarray lets callers pass plain lists/tuples as well as arrays.
    y_true = np.asarray(y_true).astype(np.int64).ravel()
    y_pred = np.asarray(y_pred).astype(np.int64).ravel()
    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true and y_pred must have the same size, got {y_true.size} and {y_pred.size}"
        )
    if y_true.size == 0:
        # Accuracy is undefined without samples; fail loudly instead of returning NaN.
        raise ValueError("cluster_accuracy requires at least one sample")

    # Map every label / cluster id to its row / column index in one pass each.
    labels, true_idx = np.unique(y_true, return_inverse=True)
    clusters, pred_idx = np.unique(y_pred, return_inverse=True)

    # contingency[i, j] = number of samples with label i assigned to cluster j.
    contingency = np.zeros((labels.size, clusters.size), dtype=np.int64)
    np.add.at(contingency, (true_idx, pred_idx), 1)

    # linear_sum_assignment minimizes cost, so negate to maximize the overlap.
    row_ind, col_ind = linear_sum_assignment(-contingency)
    return float(contingency[row_ind, col_ind].sum() / y_true.size)
|
||||
|
||||
|
||||
def _class_legend_handles(scatter, classes):
    """Build one legend marker per class, colored exactly as `scatter` draws that class."""
    return [
        plt.Line2D(
            [],
            [],
            marker="o",
            linestyle="",
            # Reuse the scatter's own normalization so legend and points agree.
            color=scatter.cmap(scatter.norm(cl)),
            label=f"Class {cl}",
            markersize=6,
        )
        for cl in classes
    ]


def plot_clustering_comparison_embedding(
    embedding, y_true, outpath, method_name="", seed=0
):
    """
    Given a 2D data embedding (e.g., from PCA+t-SNE or PCA+UMAP), this function:
      - Performs unsupervised clustering with KMeans (2 clusters).
      - Performs semi-supervised classification with Label Spreading using a
        few randomly chosen labeled seeds (N_LABELED_CLASS in total).
      - Computes the KMeans accuracy via the Hungarian algorithm (cluster ids
        are arbitrary) and the Label Spreading accuracy as plain agreement
        with the true labels (its predictions already are class labels).
      - Plots the decision regions from both methods overlaid with the true labels.
      - Annotates the plot with the accuracy results and saves it to `outpath`.

    Args:
        embedding: Array-like of shape (n_samples, 2).
        y_true: Array-like of n_samples integer class labels.
        outpath: Destination of the saved figure.
        method_name: Used in the plot titles to indicate which embedding is shown.
        seed: Seed for choosing the labeled points. The default makes the
            figure reproducible, like the `random_state=0` used for KMeans;
            pass None to draw a different seed set on every call.
    """
    embedding = np.asarray(embedding)
    y_true = np.asarray(y_true)
    n = embedding.shape[0]
    classes = np.unique(y_true)

    # Choose the points whose labels are revealed; never ask for more than exist.
    rng = random.Random(seed)
    labeled_idx = rng.sample(range(n), min(N_LABELED_CLASS, n))

    # Unsupervised clustering using KMeans on all embedded data
    km = KMeans(n_clusters=2, random_state=0).fit(embedding)
    unsup_pred = km.predict(embedding)
    unsup_accuracy = cluster_accuracy(y_true, unsup_pred)

    # Create a grid over the space for decision boundaries
    x_min, x_max = embedding[:, 0].min() - 1, embedding[:, 0].max() + 1
    y_min, y_max = embedding[:, 1].min() - 1, embedding[:, 1].max() + 1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
    grid = np.c_[xx.ravel(), yy.ravel()]
    pred_unsup = km.predict(grid).reshape(xx.shape)

    # Semi-supervised: -1 marks "unlabeled" for LabelSpreading.
    # NOTE(review): LabelSpreading runs with its default kernel settings; confirm
    # they suit the scale of the embedding coordinates.
    y_train = np.full(n, -1, dtype=int)
    y_train[labeled_idx] = y_true[labeled_idx]
    ls = LabelSpreading().fit(embedding, y_train)
    semi_pred = ls.predict(embedding)
    # Predictions are real class labels, so compare directly. A Hungarian
    # re-mapping could swap labels and overstate the accuracy.
    semi_accuracy = float(np.mean(semi_pred == y_true))
    pred_semi = ls.predict(grid).reshape(xx.shape)

    cmap = plt.cm.coolwarm

    fig, axes = plt.subplots(1, 2, figsize=(12, 6))

    # Unsupervised plot:
    ax = axes[0]
    ax.contourf(xx, yy, pred_unsup, alpha=0.2, cmap=cmap)
    points = ax.scatter(
        embedding[:, 0],
        embedding[:, 1],
        c=y_true,
        cmap=cmap,
        s=30,
        alpha=0.8,
        edgecolor="k",
    )
    ax.set_title(
        f"{method_name}\nUnsupervised (KMeans) - Acc: {unsup_accuracy:.2f}", fontsize=10
    )
    ax.set_xlabel("Dim 1")
    ax.set_ylabel("Dim 2")
    ax.legend(handles=_class_legend_handles(points, classes), loc="upper right")

    # Semi-supervised plot:
    ax = axes[1]
    ax.contourf(xx, yy, pred_semi, alpha=0.2, cmap=cmap)
    points = ax.scatter(
        embedding[:, 0],
        embedding[:, 1],
        c=y_true,
        cmap=cmap,
        s=30,
        alpha=0.8,
        edgecolor="none",
    )
    # Draw the seeds on top: larger, outlined, and colored on the same scale
    # as the full point set (explicit norm, since the seeds alone might not
    # span every class).
    ax.scatter(
        embedding[labeled_idx, 0],
        embedding[labeled_idx, 1],
        c=y_true[labeled_idx],
        cmap=cmap,
        norm=points.norm,
        s=80,
        edgecolor="k",
        marker="o",
        label="Labeled Seeds",
    )
    ax.set_title(
        f"{method_name}\nSemi-Supervised (Label Spreading) - Acc: {semi_accuracy:.2f}",
        fontsize=10,
    )
    ax.set_xlabel("Dim 1")
    ax.set_ylabel("Dim 2")
    handles = _class_legend_handles(points, classes)
    handles.append(
        plt.Line2D(
            [],
            [],
            marker="o",
            linestyle="",
            # Hollow, outlined marker: seeds are shown as outlined points.
            color="black",
            markerfacecolor="none",
            label="Labeled Seed",
            markersize=8,
        )
    )
    ax.legend(handles=handles, loc="upper right")

    # Operate on this figure explicitly instead of pyplot's implicit "current" one.
    fig.tight_layout()
    fig.savefig(outpath, dpi=150)
    plt.close(fig)
    print(f"✔ Saved clustering comparison illustration → {outpath}")
    print(f"Unsupervised Accuracy: {unsup_accuracy:.2f}")
    print(f"Semi-Supervised Accuracy: {semi_accuracy:.2f}")
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# MAIN WITH PRE-TRAINED CNN FEATURE EXTRACTION
|
||||
# -----------------------------------------------------------------------------
|
||||
def main():
    """Run the demo end to end.

    Steps: load `UNSUP_SAMPLES` images of cats_vs_dogs, extract MobileNetV2
    features, reduce them with PCA, embed them with t-SNE and with UMAP, save
    one comparison figure per embedding, then refresh the "latest" folder and
    move this run's timestamped folder into the archive.
    """
    # Create output directories
    for directory in (
        output_path,
        output_datetime_path,
        latest_folder_path,
        archive_folder_path,
    ):
        ensure_dir(directory)

    print("▶ Loading cats_vs_dogs dataset...")
    ds = tfds.load("cats_vs_dogs", split="train", as_supervised=True)
    # Seed the shuffle so the same images are drawn on every run, matching the
    # fixed random_state used by PCA / t-SNE / UMAP below. Only the first
    # UNSUP_SAMPLES items are consumed, so no cache is needed.
    ds = ds.shuffle(1000, seed=0, reshuffle_each_iteration=False).take(UNSUP_SAMPLES)

    # Load a pre-trained CNN (MobileNetV2) for feature extraction.
    cnn_model = tf.keras.applications.MobileNetV2(
        include_top=False, weights="imagenet", pooling="avg", input_shape=(224, 224, 3)
    )

    def preprocess(img, lbl):
        # Resize to the network's input size, then apply MobileNetV2 scaling.
        img_resized = tf.image.resize(img, (224, 224))
        return tf.keras.applications.mobilenet_v2.preprocess_input(img_resized), lbl

    # Extract deep features batch-wise instead of one image per forward pass.
    feature_batches = []
    label_batches = []
    for batch_imgs, batch_lbls in ds.map(preprocess).batch(32):
        feature_batches.append(cnn_model(batch_imgs, training=False).numpy())
        label_batches.append(batch_lbls.numpy())
    X = np.concatenate(feature_batches)
    y_true = np.concatenate(label_batches)

    # First, apply PCA to reduce dimensionality to 50 (PCA cannot produce more
    # components than there are samples or features, so clamp for small runs).
    n_components = min(50, *X.shape)
    X_pca = PCA(n_components=n_components, random_state=0).fit_transform(X)

    # Then compute embedding with t-SNE
    X_tsne = TSNE(n_components=2, random_state=0, init="pca").fit_transform(X_pca)
    outfile_tsne = output_datetime_path / "semi_supervised_clustering_tsne.png"
    plot_clustering_comparison_embedding(
        X_tsne, y_true, outfile_tsne, "CNN + PCA + t-SNE"
    )

    # Then compute embedding with UMAP
    X_umap = umap.UMAP(n_components=2, random_state=0).fit_transform(X_pca)
    outfile_umap = output_datetime_path / "semi_supervised_clustering_umap.png"
    plot_clustering_comparison_embedding(
        X_umap, y_true, outfile_umap, "CNN + PCA + UMAP"
    )

    # -------------------------------------------------------------------------
    # Update the 'latest' results folder: remove previous and copy current outputs
    # -------------------------------------------------------------------------
    shutil.rmtree(latest_folder_path, ignore_errors=True)
    ensure_dir(latest_folder_path)
    for file in output_datetime_path.iterdir():
        shutil.copy2(file, latest_folder_path)

    # Copy this script to preserve the code used for the outputs
    script_path = Path(__file__)
    shutil.copy2(script_path, output_datetime_path)
    shutil.copy2(script_path, latest_folder_path)

    # Archive the outputs: the timestamped folder is moved into the archive.
    # str() keeps shutil.move working on Python versions older than 3.9,
    # which do not accept Path objects as the source.
    shutil.move(str(output_datetime_path), str(archive_folder_path))


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user