tools, lockfile, deps
339 tools/plot_scripts/ae_elbow_lenet.py (new file)
@@ -0,0 +1,339 @@
import pickle
import shutil
import unittest
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
from tabulate import tabulate

# Configuration
results_folders = {
    "LeNet": {
        "path": Path(
            "/home/fedex/mt/projects/thesis-kowalczyk-jan/Deep-SAD-PyTorch/test/DeepSAD/subter_ae_elbow_v2/"
        ),
        "batch_size": 256,
    },
    "LeNet Efficient": {
        "path": Path(
            "/home/fedex/mt/projects/thesis-kowalczyk-jan/Deep-SAD-PyTorch/test/DeepSAD/subter_efficient_ae_elbow"
        ),
        "batch_size": 64,
    },
}
output_path = Path("/home/fedex/mt/plots/ae_elbow_lenet")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name

# Create output directories
output_path.mkdir(exist_ok=True, parents=True)
output_datetime_path.mkdir(exist_ok=True, parents=True)
latest_folder_path.mkdir(exist_ok=True, parents=True)
archive_folder_path.mkdir(exist_ok=True, parents=True)


def calculate_batch_mean_loss(scores, batch_size):
    """Calculate mean loss over batches similar to the original testing code."""
    n_samples = len(scores)
    n_batches = (n_samples + batch_size - 1) // batch_size

    batch_losses = []
    for i in range(0, n_samples, batch_size):
        batch_scores = scores[i : i + batch_size]
        batch_losses.append(np.mean(batch_scores))

    return np.sum(batch_losses) / n_batches
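

# Example (hypothetical numbers): with batch_size=4 and 6 scores, the function
# averages two batch means (one full batch of 4, one partial batch of 2), so it
# can differ slightly from a plain np.mean when the last batch is partial -
# which is why the k-fold losses below are recomputed with this helper.
#
#   >>> calculate_batch_mean_loss(np.array([1.0, 1.0, 1.0, 1.0, 3.0, 3.0]), 4)
#   2.0            # (mean([1, 1, 1, 1]) + mean([3, 3])) / 2
#   >>> np.mean([1.0, 1.0, 1.0, 1.0, 3.0, 3.0])
#   1.666...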


def test_loss_calculation(results, batch_size):
    """Test if our loss calculation matches the original implementation."""
    test = unittest.TestCase()
    folds = results["ae_results"]
    dim = results["dimension"]

    for fold_key in folds:
        fold_data = folds[fold_key]["test"]
        scores = np.array(fold_data["scores"])
        original_loss = fold_data["loss"]
        calculated_loss = calculate_batch_mean_loss(scores, batch_size)

        try:
            test.assertAlmostEqual(
                original_loss,
                calculated_loss,
                places=5,
                msg=f"Loss mismatch for dim={dim}, {fold_key}",
            )
        except AssertionError as e:
            print(f"Warning: {str(e)}")
            print(f"Original: {original_loss:.6f}, Calculated: {calculated_loss:.6f}")
            raise


def plot_loss_curve(dims, means, stds, title, color, output_path):
    """Create and save a single loss curve plot."""
    plt.figure(figsize=(8, 5))
    plt.plot(dims, means, marker="o", color=color, label="Mean Test Loss")
    plt.fill_between(
        dims,
        np.array(means) - np.array(stds),
        np.array(means) + np.array(stds),
        color=color,
        alpha=0.2,
        label="Std Dev",
    )
    plt.xlabel("Latent Dimension")
    plt.ylabel("Test Loss")
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.xticks(dims)
    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches="tight")
    plt.close()


def plot_multi_loss_curve(arch_results, title, output_path, colors=None):
    """Create and save a loss curve plot with multiple architectures.

    Args:
        arch_results: Dict of format {arch_name: (dims, means, stds)}
        title: Plot title
        output_path: Where to save the plot
        colors: Optional dict of colors for each architecture
    """
    plt.figure(figsize=(10, 6))

    if colors is None:
        colors = {
            "LeNet": "blue",
            "LeNet Asymmetric": "red",
        }

    # Get unique dimensions across all architectures
    all_dims = sorted(
        set(dim for _, (dims, _, _) in arch_results.items() for dim in dims)
    )

    for arch_name, (dims, means, stds) in arch_results.items():
        color = colors.get(arch_name, "gray")
        plt.plot(dims, means, marker="o", color=color, label=arch_name)
        plt.fill_between(
            dims,
            np.array(means) - np.array(stds),
            np.array(means) + np.array(stds),
            color=color,
            alpha=0.2,
        )

    plt.xlabel("Latent Dimension")
    plt.ylabel("Test Loss")
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.xticks(all_dims)  # Set x-axis ticks to match data points
    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches="tight")
    plt.close()
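

# Example call (hypothetical values) matching the expected
# {arch_name: (dims, means, stds)} format documented above:
#
#   plot_multi_loss_curve(
#       {
#           "LeNet": ([32, 64, 128], [0.021, 0.018, 0.017], [0.002, 0.001, 0.001]),
#           "LeNet Efficient": ([32, 64, 128], [0.025, 0.020, 0.019], [0.003, 0.002, 0.002]),
#       },
#       "Test Loss vs. Latent Dimension",
#       Path("/tmp/example.png"),
#   )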


def evaluate_autoencoder_loss():
    """Main function to evaluate autoencoder loss across different latent dimensions."""
    # Results storage for each architecture
    arch_results = {
        name: {"dims": [], "normal": [], "anomaly": []} for name in results_folders
    }

    # Process each architecture
    for arch_name, config in results_folders.items():
        results_folder = config["path"]
        batch_size = config["batch_size"]
        result_files = sorted(
            results_folder.glob("ae_elbow_results_subter_*_kfold.pkl")
        )

        dimensions = []
        normal_means = []
        normal_stds = []
        anomaly_means = []
        anomaly_stds = []

        # Verify loss calculation
        print(
            f"\nVerifying loss calculation for {arch_name} (batch_size={batch_size})..."
        )
        for result_file in result_files:
            with open(result_file, "rb") as f:
                results = pickle.load(f)
            test_loss_calculation(results, batch_size)
        print(f"Loss calculation verified successfully for {arch_name}!")

        # Process files for this architecture
        for result_file in result_files:
            with open(result_file, "rb") as f:
                results = pickle.load(f)
            dim = int(results["dimension"])
            folds = results["ae_results"]

            normal_fold_losses = []
            anomaly_fold_losses = []

            all_scores = []  # Collect all scores for overall calculation
            all_fold_scores = []  # Collect all fold scores for std calculation

            for fold_key in folds:
                fold_data = folds[fold_key]["test"]
                scores = np.array(fold_data["scores"])
                labels = np.array(fold_data["labels_exp_based"])

                normal_scores = scores[labels == 1]
                anomaly_scores = scores[labels == -1]

                normal_fold_losses.append(
                    calculate_batch_mean_loss(normal_scores, batch_size)
                )
                anomaly_fold_losses.append(
                    calculate_batch_mean_loss(anomaly_scores, batch_size)
                )

                all_scores.append(scores)  # Add scores to all_scores
                all_fold_scores.append(fold_data["scores"])  # Add fold scores

            dimensions.append(dim)
            normal_means.append(np.mean(normal_fold_losses))
            normal_stds.append(np.std(normal_fold_losses))
            anomaly_means.append(np.mean(anomaly_fold_losses))
            anomaly_stds.append(np.std(anomaly_fold_losses))

        # Sort by dimension
        sorted_data = sorted(
            zip(dimensions, normal_means, normal_stds, anomaly_means, anomaly_stds)
        )
        dims, n_means, n_stds, a_means, a_stds = zip(*sorted_data)

        # Store results for this architecture
        arch_results[arch_name] = {
            "dims": dims,
            "normal": (dims, n_means, n_stds),
            "anomaly": (dims, a_means, a_stds),
            "overall": (
                dims,
                [
                    calculate_batch_mean_loss(scores, batch_size)
                    for scores in all_scores
                ],  # Use all scores
                [
                    np.std(
                        [
                            calculate_batch_mean_loss(fold_scores, batch_size)
                            for fold_scores in fold_scores_list
                        ]
                    )
                    for fold_scores_list in all_fold_scores
                ],
            ),
        }

    # Create the three plots with all architectures
    plot_multi_loss_curve(
        {name: results["normal"] for name, results in arch_results.items()},
        "Normal Class Test Loss vs. Latent Dimension",
        output_datetime_path / "ae_elbow_test_loss_normal.png",
    )

    plot_multi_loss_curve(
        {name: results["anomaly"] for name, results in arch_results.items()},
        "Anomaly Class Test Loss vs. Latent Dimension",
        output_datetime_path / "ae_elbow_test_loss_anomaly.png",
    )

    plot_multi_loss_curve(
        {name: results["overall"] for name, results in arch_results.items()},
        "Overall Test Loss vs. Latent Dimension",
        output_datetime_path / "ae_elbow_test_loss_overall.png",
    )


def print_loss_comparison(results_folders):
    """Print comparison tables of original vs calculated losses for each architecture."""
    print("\nLoss Comparison Tables")
    print("=" * 80)

    for arch_name, config in results_folders.items():
        results_folder = config["path"]
        batch_size = config["batch_size"]
        result_files = sorted(
            results_folder.glob("ae_elbow_results_subter_*_kfold.pkl")
        )

        # Prepare table data
        table_data = []
        headers = ["Dimension", "Original", "Calculated", "Diff"]

        for result_file in result_files:
            with open(result_file, "rb") as f:
                results = pickle.load(f)

            dim = int(results["dimension"])
            folds = results["ae_results"]

            # Calculate mean original loss across folds
            orig_losses = []
            calc_losses = []
            for fold_key in folds:
                fold_data = folds[fold_key]["test"]
                orig_losses.append(fold_data["loss"])
                calc_losses.append(
                    calculate_batch_mean_loss(np.array(fold_data["scores"]), batch_size)
                )

            orig_mean = np.mean(orig_losses)
            calc_mean = np.mean(calc_losses)
            diff = abs(orig_mean - calc_mean)

            table_data.append([dim, orig_mean, calc_mean, diff])

        # Sort by dimension
        table_data.sort(key=lambda x: x[0])

        print(f"\n{arch_name}:")
        print(
            tabulate(
                table_data,
                headers=headers,
                floatfmt=".6f",
                tablefmt="pipe",
                numalign="right",
            )
        )

        print("\n" + "=" * 80)


if __name__ == "__main__":
    # Print loss comparisons for all architectures
    print_loss_comparison(results_folders)

    # Run main analysis
    evaluate_autoencoder_loss()

    # Archive management
    # Delete current latest folder
    shutil.rmtree(latest_folder_path, ignore_errors=True)
    latest_folder_path.mkdir(exist_ok=True, parents=True)

    # Copy contents to latest folder
    for file in output_datetime_path.iterdir():
        shutil.copy2(file, latest_folder_path)

    # Copy this script for reference
    shutil.copy2(__file__, output_datetime_path)
    shutil.copy2(__file__, latest_folder_path)

    # Move output to archive
    shutil.move(output_datetime_path, archive_folder_path)

223 tools/plot_scripts/background_ml_illustrations.py (new file)
@@ -0,0 +1,223 @@
"""
Downloads the cats_vs_dogs dataset, then generates four PNGs:
- supervised_grid.png
- unsupervised_clusters.png
- semi_supervised_classification.png
- semi_supervised_clustering.png
"""

import os
import random

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.semi_supervised import LabelSpreading

# -----------------------------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------------------------
NUM_SUPERVISED = 16
GRID_ROWS = 4
GRID_COLS = 4

UNSUP_SAMPLES = 200

# how many labeled points to "seed" semi-sup methods
N_LABELED_CLASS = 10  # for classification demo
N_SEEDS_PER_CLASS = 3  # for clustering demo

OUTPUT_DIR = "outputs"


# -----------------------------------------------------------------------------
# UTILITIES
# -----------------------------------------------------------------------------
def ensure_dir(path):
    os.makedirs(path, exist_ok=True)


# -----------------------------------------------------------------------------
# 1) Supervised grid
# -----------------------------------------------------------------------------
def plot_supervised_grid(ds, info, num, rows, cols, outpath):
    plt.figure(figsize=(cols * 2, rows * 2))
    for i, (img, lbl) in enumerate(ds.take(num)):
        ax = plt.subplot(rows, cols, i + 1)
        ax.imshow(img.numpy().astype("uint8"))
        ax.axis("off")
        cname = info.features["label"].int2str(lbl.numpy())
        ax.set_title(cname, fontsize=9)
    plt.tight_layout()
    plt.savefig(outpath, dpi=150)
    plt.close()
    print(f"✔ Saved supervised grid → {outpath}")


# -----------------------------------------------------------------------------
# 2) Unsupervised clustering (PCA + KMeans)
# -----------------------------------------------------------------------------
def plot_unsupervised_clusters(ds, outpath):
    imgs = []
    for img, _ in ds.take(UNSUP_SAMPLES):
        arr = (
            tf.image.resize(img, (64, 64)).numpy().astype("float32").mean(axis=2)
        )  # resize and grayscale to speed up
        imgs.append(arr.ravel() / 255.0)
    X = np.stack(imgs)
    pca = PCA(n_components=2, random_state=0)
    X2 = pca.fit_transform(X)

    km = KMeans(n_clusters=2, random_state=0)
    clusters = km.fit_predict(X2)

    plt.figure(figsize=(6, 6))
    plt.scatter(X2[:, 0], X2[:, 1], c=clusters, s=15, alpha=0.6)
    plt.title("Unsupervised: PCA + KMeans")
    plt.xlabel("PCA 1")
    plt.ylabel("PCA 2")
    plt.tight_layout()
    plt.savefig(outpath, dpi=150)
    plt.close()
    print(f"✔ Saved unsupervised clusters → {outpath}")

    return X2, clusters  # return for reuse


# -----------------------------------------------------------------------------
# 3) Semi-supervised classification demo
# -----------------------------------------------------------------------------
def plot_semi_supervised_classification(X2, y_true, outpath):
    n = X2.shape[0]
    all_idx = list(range(n))
    labeled_idx = random.sample(all_idx, N_LABELED_CLASS)
    unlabeled_idx = list(set(all_idx) - set(labeled_idx))

    # pure supervised
    clf = LogisticRegression().fit(X2[labeled_idx], y_true[labeled_idx])

    # semi-supervised
    y_train = np.full(n, -1, dtype=int)
    y_train[labeled_idx] = y_true[labeled_idx]
    ls = LabelSpreading().fit(X2, y_train)

    # grid for decision boundary
    x_min, x_max = X2[:, 0].min() - 1, X2[:, 0].max() + 1
    y_min, y_max = X2[:, 1].min() - 1, X2[:, 1].max() + 1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
    grid = np.c_[xx.ravel(), yy.ravel()]
    pred_sup = clf.predict(grid).reshape(xx.shape)
    pred_semi = ls.predict(grid).reshape(xx.shape)

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    for ax, Z, title in zip(
        axes, [pred_sup, pred_semi], ["Supervised only", "Semi-supervised"]
    ):
        ax.contourf(xx, yy, Z, alpha=0.3)
        ax.scatter(X2[unlabeled_idx, 0], X2[unlabeled_idx, 1], s=20, alpha=0.4)
        ax.scatter(
            X2[labeled_idx, 0],
            X2[labeled_idx, 1],
            c=y_true[labeled_idx],
            s=80,
            edgecolor="k",
        )
        ax.set_title(title)
        ax.set_xlabel("PCA 1")
        ax.set_ylabel("PCA 2")
    plt.tight_layout()
    plt.savefig(outpath, dpi=150)
    plt.close()
    print(f"✔ Saved semi-supervised classification → {outpath}")


# -----------------------------------------------------------------------------
# 4) Semi-supervised clustering (seeded KMeans)
# -----------------------------------------------------------------------------
def plot_semi_supervised_clustering(X2, y_true, outpath):
    # pick a few seeds per class
    cats = np.where(y_true == 0)[0]
    dogs = np.where(y_true == 1)[0]
    seeds = list(np.random.choice(cats, N_SEEDS_PER_CLASS, replace=False)) + list(
        np.random.choice(dogs, N_SEEDS_PER_CLASS, replace=False)
    )

    # pure KMeans
    km1 = KMeans(n_clusters=2, random_state=0).fit(X2)
    lab1 = km1.labels_

    # seeded: init centers from seed means
    ctr0 = X2[seeds[:N_SEEDS_PER_CLASS]].mean(axis=0)
    ctr1 = X2[seeds[N_SEEDS_PER_CLASS:]].mean(axis=0)
    km2 = KMeans(
        n_clusters=2, init=np.vstack([ctr0, ctr1]), n_init=1, random_state=0
    ).fit(X2)
    lab2 = km2.labels_

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    for ax, labels, title in zip(axes, [lab1, lab2], ["Pure KMeans", "Seeded KMeans"]):
        ax.scatter(X2[:, 0], X2[:, 1], c=labels, s=20, alpha=0.6)
        ax.scatter(
            X2[seeds, 0],
            X2[seeds, 1],
            c=y_true[seeds],
            edgecolor="k",
            marker="x",
            s=100,
            linewidths=2,
        )
        ax.set_title(title)
        ax.set_xlabel("PCA 1")
        ax.set_ylabel("PCA 2")
    plt.tight_layout()
    plt.savefig(outpath, dpi=150)
    plt.close()
    print(f"✔ Saved semi-supervised clustering → {outpath}")


# -----------------------------------------------------------------------------
# MAIN
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    ensure_dir(OUTPUT_DIR)

    # load
    print("▶ Loading cats_vs_dogs...")
    ds, info = tfds.load(
        "cats_vs_dogs", split="train", with_info=True, as_supervised=True
    )
    ds = ds.shuffle(1000, reshuffle_each_iteration=False).cache()

    # supervised
    plot_supervised_grid(
        ds,
        info,
        NUM_SUPERVISED,
        GRID_ROWS,
        GRID_COLS,
        os.path.join(OUTPUT_DIR, "supervised_grid.png"),
    )

    # unsupervised
    # also returns X2 for downstream demos
    X2, _ = plot_unsupervised_clusters(
        ds, os.path.join(OUTPUT_DIR, "unsupervised_clusters.png")
    )

    # collect true labels for that same subset
    # (we need a y_true array for semi-sup demos)
    y_true = np.array([lbl.numpy() for _, lbl in ds.take(UNSUP_SAMPLES)])

    # semi-supervised classification
    plot_semi_supervised_classification(
        X2, y_true, os.path.join(OUTPUT_DIR, "semi_supervised_classification.png")
    )

    # semi-supervised clustering
    plot_semi_supervised_clustering(
        X2, y_true, os.path.join(OUTPUT_DIR, "semi_supervised_clustering.png")
    )

274 tools/plot_scripts/background_ml_semisupervised.py (new file)
@@ -0,0 +1,274 @@
"""
Downloads the cats_vs_dogs dataset, extracts deep features using MobileNetV2,
then generates two clustering comparison illustrations using two different embedding pipelines:
- PCA followed by t-SNE (saved as: semi_supervised_clustering_tsne.png)
- PCA followed by UMAP (saved as: semi_supervised_clustering_umap.png)

Each illustration compares:
- Unsupervised clustering using the deep embedding + KMeans
- Semi-supervised clustering using Label Spreading with a few labeled seeds

This script saves outputs in a datetime folder and also copies the latest outputs
to a "latest" folder. All versions of the outputs and scripts are archived.
"""

import random
import shutil
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import umap
from scipy.optimize import linear_sum_assignment
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.semi_supervised import LabelSpreading

# -----------------------------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------------------------
UNSUP_SAMPLES = 200  # number of samples to use for the demo
N_LABELED_CLASS = 20  # number of labeled seeds for the semi-supervised approach

output_path = Path("/home/fedex/mt/plots/background_ml_semisupervised")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name


# -----------------------------------------------------------------------------
# UTILITIES
# -----------------------------------------------------------------------------
def ensure_dir(directory: Path):
    directory.mkdir(exist_ok=True, parents=True)


def cluster_accuracy(y_true, y_pred):
    """
    Compute clustering accuracy by determining the optimal mapping
    between predicted clusters and true labels using the Hungarian algorithm.
    """
    y_true = y_true.astype(np.int64)
    y_pred = y_pred.astype(np.int64)
    labels = np.unique(y_true)
    clusters = np.unique(y_pred)
    contingency = np.zeros((labels.size, clusters.size), dtype=np.int64)
    for i, label in enumerate(labels):
        for j, cluster in enumerate(clusters):
            contingency[i, j] = np.sum((y_true == label) & (y_pred == cluster))
    row_ind, col_ind = linear_sum_assignment(-contingency)
    accuracy = contingency[row_ind, col_ind].sum() / y_true.size
    return accuracy
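

# Example (hypothetical labels): cluster ids are arbitrary, so a prediction that
# is a pure relabeling of the ground truth should still score 1.0. The Hungarian
# assignment on the negated contingency matrix finds that best mapping:
#
#   >>> cluster_accuracy(np.array([0, 0, 1, 1]), np.array([1, 1, 0, 0]))
#   1.0
#   >>> cluster_accuracy(np.array([0, 0, 1, 1]), np.array([1, 0, 0, 0]))
#   0.75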


def plot_clustering_comparison_embedding(embedding, y_true, outpath, method_name=""):
    """
    Given a 2D data embedding (e.g., from PCA+t-SNE or PCA+UMAP), this function:
    - Performs unsupervised clustering with KMeans.
    - Performs semi-supervised clustering with Label Spreading using a few labeled seeds.
    - Computes accuracy via the Hungarian algorithm.
    - Plots the decision boundaries from both methods overlaid with the true labels.
    - Annotates the plot with the accuracy results.

    The 'method_name' is used in the plot title to indicate which embedding is used.
    """
    n = embedding.shape[0]
    all_idx = list(range(n))
    labeled_idx = random.sample(all_idx, N_LABELED_CLASS)

    # Unsupervised clustering using KMeans on all embedded data
    km = KMeans(n_clusters=2, random_state=0).fit(embedding)
    unsup_pred = km.predict(embedding)
    unsup_accuracy = cluster_accuracy(y_true, unsup_pred)

    # Create a grid over the space for decision boundaries
    x_min, x_max = embedding[:, 0].min() - 1, embedding[:, 0].max() + 1
    y_min, y_max = embedding[:, 1].min() - 1, embedding[:, 1].max() + 1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
    grid = np.c_[xx.ravel(), yy.ravel()]
    pred_unsup = km.predict(grid).reshape(xx.shape)

    # Semi-supervised clustering using Label Spreading with labeled seeds
    y_train = np.full(n, -1, dtype=int)
    y_train[labeled_idx] = y_true[labeled_idx]
    ls = LabelSpreading().fit(embedding, y_train)
    semi_pred = ls.predict(embedding)
    semi_accuracy = cluster_accuracy(y_true, semi_pred)
    pred_semi = ls.predict(grid).reshape(xx.shape)

    cmap = plt.cm.coolwarm

    fig, axes = plt.subplots(1, 2, figsize=(12, 6))

    # Unsupervised plot:
    ax = axes[0]
    ax.contourf(xx, yy, pred_unsup, alpha=0.2, cmap=cmap)
    sc1 = ax.scatter(
        embedding[:, 0],
        embedding[:, 1],
        c=y_true,
        cmap=cmap,
        s=30,
        alpha=0.8,
        edgecolor="k",
    )
    ax.set_title(
        f"{method_name}\nUnsupervised (KMeans) - Acc: {unsup_accuracy:.2f}", fontsize=10
    )
    ax.set_xlabel("Dim 1")
    ax.set_ylabel("Dim 2")
    handles = []
    for cl in np.unique(y_true):
        handles.append(
            plt.Line2D(
                [],
                [],
                marker="o",
                linestyle="",
                color=cmap(cl / (np.max(y_true) + 1)),
                label=f"Class {cl}",
                markersize=6,
            )
        )
    ax.legend(handles=handles, loc="upper right")

    # Semi-supervised plot:
    ax = axes[1]
    ax.contourf(xx, yy, pred_semi, alpha=0.2, cmap=cmap)
    sc2 = ax.scatter(
        embedding[:, 0],
        embedding[:, 1],
        c=y_true,
        cmap=cmap,
        s=30,
        alpha=0.8,
        edgecolor="none",
    )
    sc3 = ax.scatter(
        embedding[labeled_idx, 0],
        embedding[labeled_idx, 1],
        c=y_true[labeled_idx],
        cmap=cmap,
        s=80,
        edgecolor="k",
        marker="o",
        label="Labeled Seeds",
    )
    ax.set_title(
        f"{method_name}\nSemi-Supervised (Label Spreading) - Acc: {semi_accuracy:.2f}",
        fontsize=10,
    )
    ax.set_xlabel("Dim 1")
    ax.set_ylabel("Dim 2")
    handles = []
    for cl in np.unique(y_true):
        handles.append(
            plt.Line2D(
                [],
                [],
                marker="o",
                linestyle="",
                color=cmap(cl / (np.max(y_true) + 1)),
                label=f"Class {cl}",
                markersize=6,
            )
        )
    handles.append(
        plt.Line2D(
            [],
            [],
            marker="o",
            linestyle="",
            color="black",
            label="Labeled Seed",
            markersize=8,
        )
    )
    ax.legend(handles=handles, loc="upper right")

    plt.tight_layout()
    plt.savefig(outpath, dpi=150)
    plt.close()
    print(f"✔ Saved clustering comparison illustration → {outpath}")
    print(f"Unsupervised Accuracy: {unsup_accuracy:.2f}")
    print(f"Semi-Supervised Accuracy: {semi_accuracy:.2f}")


# -----------------------------------------------------------------------------
# MAIN WITH PRE-TRAINED CNN FEATURE EXTRACTION
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    # Create output directories
    ensure_dir(output_path)
    ensure_dir(output_datetime_path)
    ensure_dir(latest_folder_path)
    ensure_dir(archive_folder_path)

    print("▶ Loading cats_vs_dogs dataset...")
    ds, info = tfds.load(
        "cats_vs_dogs", split="train", with_info=True, as_supervised=True
    )
    ds = ds.shuffle(1000, reshuffle_each_iteration=False).cache()

    # Load a pre-trained CNN (MobileNetV2) for feature extraction.
    cnn_model = tf.keras.applications.MobileNetV2(
        include_top=False, weights="imagenet", pooling="avg", input_shape=(224, 224, 3)
    )

    # Extract deep features for all samples.
    features_list = []
    labels = []
    for img, lbl in ds.take(UNSUP_SAMPLES):
        # Resize to 224x224 and keep 3 channels.
        img_resized = tf.image.resize(img, (224, 224))
        # Preprocess the image for MobileNetV2.
        img_preprocessed = tf.keras.applications.mobilenet_v2.preprocess_input(
            img_resized
        )
        # Expand dims for batch, run through CNN, then squeeze.
        features = cnn_model(tf.expand_dims(img_preprocessed, axis=0))
        features = features.numpy().squeeze()
        features_list.append(features)
        labels.append(lbl.numpy())
    X = np.stack(features_list)
    y_true = np.array(labels)

    # First, apply PCA to reduce dimensionality to 50
    pca_50 = PCA(n_components=50, random_state=0).fit_transform(X)

    # Then compute embedding with t-SNE
    from sklearn.manifold import TSNE

    X_tsne = TSNE(n_components=2, random_state=0, init="pca").fit_transform(pca_50)
    outfile_tsne = output_datetime_path / "semi_supervised_clustering_tsne.png"
    plot_clustering_comparison_embedding(
        X_tsne, y_true, outfile_tsne, "CNN + PCA + t-SNE"
    )

    # Then compute embedding with UMAP
    X_umap = umap.UMAP(n_components=2, random_state=0).fit_transform(pca_50)
    outfile_umap = output_datetime_path / "semi_supervised_clustering_umap.png"
    plot_clustering_comparison_embedding(
        X_umap, y_true, outfile_umap, "CNN + PCA + UMAP"
    )

    # -----------------------------------------------------------------------------
    # Update the 'latest' results folder: remove previous and copy current outputs
    # -----------------------------------------------------------------------------
    shutil.rmtree(latest_folder_path, ignore_errors=True)
    ensure_dir(latest_folder_path)
    for file in output_datetime_path.iterdir():
        shutil.copy2(file, latest_folder_path)

    # Copy this script to preserve the code used for the outputs
    script_path = Path(__file__)
    shutil.copy2(script_path, output_datetime_path)
    shutil.copy2(script_path, latest_folder_path)

    # Archive the outputs
    shutil.move(output_datetime_path, archive_folder_path)

99 tools/plot_scripts/background_ml_supervised.py (new file)
@@ -0,0 +1,99 @@
"""
Downloads the cats_vs_dogs dataset, then generates a supervised grid image:
- supervised_grid.png

This script saves outputs in a datetime folder and also copies the latest outputs to a "latest" folder.
All versions of the outputs and scripts are archived.
"""

from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import tensorflow_datasets as tfds

# -----------------------------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------------------------
# Number of supervised samples and grid dimensions
NUM_SUPERVISED = 16
GRID_ROWS = 4
GRID_COLS = 4

# Output directories for saving plots and scripts
output_path = Path("/home/fedex/mt/plots/background_ml_supervised")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name


# -----------------------------------------------------------------------------
# UTILITIES
# -----------------------------------------------------------------------------
def ensure_dir(directory: Path):
    directory.mkdir(exist_ok=True, parents=True)


# Create required output directories
ensure_dir(output_path)
ensure_dir(output_datetime_path)
ensure_dir(latest_folder_path)
ensure_dir(archive_folder_path)


# -----------------------------------------------------------------------------
# Supervised grid plot
# -----------------------------------------------------------------------------
def plot_supervised_grid(ds, info, num, rows, cols, outpath):
    """
    Plots a grid of images from the dataset with their corresponding labels.
    """
    plt.figure(figsize=(cols * 2, rows * 2))
    for i, (img, lbl) in enumerate(ds.take(num)):
        ax = plt.subplot(rows, cols, i + 1)
        ax.imshow(img.numpy().astype("uint8"))
        ax.axis("off")
        cname = info.features["label"].int2str(lbl.numpy())
        ax.set_title(cname, fontsize=9)
    plt.tight_layout()
    plt.savefig(outpath, dpi=150)
    plt.close()
    print(f"✔ Saved supervised grid → {outpath}")


# -----------------------------------------------------------------------------
# MAIN
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    # Download and prepare the dataset
    print("▶ Loading cats_vs_dogs dataset...")
    ds, info = tfds.load(
        "cats_vs_dogs", split="train", with_info=True, as_supervised=True
    )
    ds = ds.shuffle(1000, reshuffle_each_iteration=False).cache()

    # Generate the supervised grid image
    supervised_outfile = output_datetime_path / "supervised_grid.png"
    plot_supervised_grid(
        ds, info, NUM_SUPERVISED, GRID_ROWS, GRID_COLS, supervised_outfile
    )

    # -----------------------------------------------------------------------------
    # Update the 'latest' results folder: remove previous and copy current outputs
    # -----------------------------------------------------------------------------
    import shutil

    shutil.rmtree(latest_folder_path, ignore_errors=True)
    ensure_dir(latest_folder_path)
    for file in output_datetime_path.iterdir():
        shutil.copy2(file, latest_folder_path)

    # Copy this script to the output folder and to the latest folder to preserve the used code
    script_path = Path(__file__)
    shutil.copy2(script_path, output_datetime_path)
    shutil.copy2(script_path, latest_folder_path)

    # Move the output datetime folder to the archive folder for record keeping
    shutil.move(output_datetime_path, archive_folder_path)

105 tools/plot_scripts/background_ml_unsupervised.py (new file)
@@ -0,0 +1,105 @@
"""
Downloads the cats_vs_dogs dataset, then generates an unsupervised clusters image:
- unsupervised_clusters.png

This script saves outputs in a datetime folder and also copies the latest outputs to a "latest" folder.
All versions of the outputs and scripts are archived.
"""

import shutil
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# -----------------------------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------------------------
UNSUP_SAMPLES = 200

output_path = Path("/home/fedex/mt/plots/background_ml_unsupervised")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name


# -----------------------------------------------------------------------------
# UTILITIES
# -----------------------------------------------------------------------------
def ensure_dir(directory: Path):
    directory.mkdir(exist_ok=True, parents=True)


# Create required output directories
ensure_dir(output_path)
ensure_dir(output_datetime_path)
ensure_dir(latest_folder_path)
ensure_dir(archive_folder_path)


# -----------------------------------------------------------------------------
# Unsupervised Clustering Plot (PCA + KMeans)
# -----------------------------------------------------------------------------
def plot_unsupervised_clusters(ds, outpath):
    """
    Processes a subset of images from the dataset, reduces their dimensionality with PCA,
    applies KMeans clustering, and saves a scatterplot of the clusters.
    """
    imgs = []
    for img, _ in ds.take(UNSUP_SAMPLES):
        # resize to 64x64, convert to grayscale by averaging channels
        arr = tf.image.resize(img, (64, 64)).numpy().astype("float32").mean(axis=2)
        imgs.append(arr.ravel() / 255.0)
    X = np.stack(imgs)
    pca = PCA(n_components=2, random_state=0)
    X2 = pca.fit_transform(X)

    km = KMeans(n_clusters=2, random_state=0)
    clusters = km.fit_predict(X2)

    plt.figure(figsize=(6, 6))
    plt.scatter(X2[:, 0], X2[:, 1], c=clusters, s=15, alpha=0.6)
    plt.title("Unsupervised: PCA + KMeans")
    plt.xlabel("PCA 1")
    plt.ylabel("PCA 2")
    plt.tight_layout()
    plt.savefig(outpath, dpi=150)
    plt.close()
    print(f"✔ Saved unsupervised clusters → {outpath}")


# -----------------------------------------------------------------------------
# MAIN
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    # Load and prepare the dataset
    print("▶ Loading cats_vs_dogs dataset...")
    ds, _ = tfds.load("cats_vs_dogs", split="train", with_info=True, as_supervised=True)
    ds = ds.shuffle(1000, reshuffle_each_iteration=False).cache()

    # Generate the unsupervised clusters image
    unsupervised_outfile = output_datetime_path / "unsupervised_clusters.png"
    plot_unsupervised_clusters(ds, unsupervised_outfile)

    # -----------------------------------------------------------------------------
    # Update the 'latest' results folder: remove previous and copy current outputs
    # -----------------------------------------------------------------------------
    shutil.rmtree(latest_folder_path, ignore_errors=True)
    ensure_dir(latest_folder_path)
    for file in output_datetime_path.iterdir():
        shutil.copy2(file, latest_folder_path)

    # Copy this script to the output folder and to the latest folder to preserve the used code
    script_path = Path(__file__)
    shutil.copy2(script_path, output_datetime_path)
    shutil.copy2(script_path, latest_folder_path)

    # Move the output datetime folder to the archive folder for record keeping
    shutil.move(output_datetime_path, archive_folder_path)

255 tools/plot_scripts/data_anomalies_timeline.py (new file)
@@ -0,0 +1,255 @@
import json
import pickle
import shutil
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np

# define data paths
all_data_path = Path("/home/fedex/mt/data/subter")
output_path = Path("/home/fedex/mt/plots/data_anomalies_timeline")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name
cache_path = output_path

# if output does not exist, create it
output_path.mkdir(exist_ok=True, parents=True)
output_datetime_path.mkdir(exist_ok=True, parents=True)
latest_folder_path.mkdir(exist_ok=True, parents=True)
archive_folder_path.mkdir(exist_ok=True, parents=True)

data_resolution = 32 * 2048

# find all bag files and sort them correctly by name
normal_experiment_paths, anomaly_experiment_paths = [], []
for bag_file_path in all_data_path.iterdir():
    if bag_file_path.suffix != ".bag":
        continue
    if "smoke" in bag_file_path.name:
        anomaly_experiment_paths.append(bag_file_path)
    else:
        normal_experiment_paths.append(bag_file_path)

# Load manually labeled frames
with open(
    cache_path / "manually_labeled_anomaly_frames.json", "r"
) as frame_borders_file:
    manually_labeled_anomaly_frames_json = json.load(frame_borders_file)
if not manually_labeled_anomaly_frames_json:
    print("No manually labeled anomaly frames found. Exiting...")
    exit(1)
manually_labeled_anomaly_frames = {}
try:
    for file in manually_labeled_anomaly_frames_json["files"]:
        if file["filename"] not in (
            p.with_suffix(".npy").name for p in anomaly_experiment_paths
        ):
            print(
                f"File {file['filename']} from manually labeled frames not found in anomaly experiments. Exiting..."
            )
            exit(1)
        manually_labeled_anomaly_frames[file["filename"]] = (
            file["semi_target_begin_frame"],
            file["semi_target_end_frame"],
        )
except KeyError as e:
    print(f"Missing key in manually labeled frames JSON: {e}")
    exit(1)


def plot_combined_timeline(
    normal_experiment_paths, anomaly_experiment_paths, title, num_bins=50
):
    """Plot both missing points and near-sensor measurements over normalized timeline"""
    # Sort experiments by filesize first (to match original processing order)
    normal_experiment_paths = sorted(
        normal_experiment_paths, key=lambda path: path.stat().st_size
    )
    anomaly_experiment_paths = sorted(
        anomaly_experiment_paths, key=lambda path: path.stat().st_size
    )

    # Get largest normal experiment and moving anomaly experiments
    baseline_path = normal_experiment_paths[-3]  # largest normal experiment
    moving_exp_indices = [
        i
        for i, path in enumerate(anomaly_experiment_paths)
        if "stationary" not in path.name
    ]
    moving_anomaly_paths = [anomaly_experiment_paths[i] for i in moving_exp_indices]

    # Load missing points data
    missing_points_cache = Path(cache_path / "missing_points.pkl")
    if not missing_points_cache.exists():
        print("Missing points cache not found!")
        return

    # Load near-sensor data (using 500mm threshold)
    near_sensor_cache = Path(cache_path / "particles_near_sensor_counts_500.pkl")
    if not near_sensor_cache.exists():
        print("Near-sensor measurements cache not found!")
        return

    # Load both cached datasets
    with open(missing_points_cache, "rb") as file:
        missing_points_normal, missing_points_anomaly = pickle.load(file)
    with open(near_sensor_cache, "rb") as file:
        near_sensor_normal, near_sensor_anomaly = pickle.load(file)

    # Get data for baseline and moving experiments
    missing_data = [missing_points_normal[-3]] + [
        missing_points_anomaly[i] for i in moving_exp_indices
    ]
    near_sensor_data = [near_sensor_normal[-3]] + [
        near_sensor_anomaly[i] for i in moving_exp_indices
    ]
    all_paths = [baseline_path] + moving_anomaly_paths

    # Create figure with two y-axes and dynamic layout
    fig, ax1 = plt.subplots(figsize=(12, 6), constrained_layout=True)
    ax2 = ax1.twinx()

    # Color schemes - gray for baseline, colors for anomaly experiments
    experiment_colors = ["#808080"] + ["#1f77b4", "#ff7f0e", "#2ca02c"]  # gray + colors

    # First create custom legend handles for the metrics
    from matplotlib.lines import Line2D

    metric_legend = [
        Line2D([0], [0], color="gray", linestyle="-", label="Missing Points"),
        Line2D([0], [0], color="gray", linestyle="--", label="Near-Sensor (<0.5m)"),
        Line2D([0], [0], color="gray", linestyle=":", label="Manually Labeled Borders"),
    ]

    # Plot each experiment's data
    for i, (missing_exp, near_sensor_exp) in enumerate(
        zip(missing_data, near_sensor_data)
    ):
        # Get experiment name without the full path
        exp_name = all_paths[i].stem
        # Shorten experiment name if needed
        exp_name = exp_name.replace("experiment_smoke_", "exp_")

        # Convert both to percentages
        missing_pct = np.array(missing_exp) / data_resolution * 100
        near_sensor_pct = np.array(near_sensor_exp) / data_resolution * 100

        # Create normalized timeline bins for both
        exp_len = len(missing_pct)
        bins = np.linspace(0, exp_len - 1, num_bins)
        missing_binned = np.zeros(num_bins)
        near_sensor_binned = np.zeros(num_bins)

        # Bin both datasets
        for bin_idx in range(num_bins):
            if bin_idx == num_bins - 1:
                start_idx = int(bins[bin_idx])
                end_idx = exp_len
            else:
                start_idx = int(bins[bin_idx])
                end_idx = int(bins[bin_idx + 1])

            missing_binned[bin_idx] = np.mean(missing_pct[start_idx:end_idx])
            near_sensor_binned[bin_idx] = np.mean(near_sensor_pct[start_idx:end_idx])

        # Plot both metrics with same color but different line styles
        color = experiment_colors[i]
        ax1.plot(
            range(num_bins),
            missing_binned,
            color=color,
            linestyle="-",
            alpha=0.6,
            label=exp_name,
        )
        ax2.plot(
            range(num_bins), near_sensor_binned, color=color, linestyle="--", alpha=0.6
        )

        # Add vertical lines for manually labeled frames if available
        if all_paths[i].with_suffix(".npy").name in manually_labeled_anomaly_frames:
            begin_frame, end_frame = manually_labeled_anomaly_frames[
                all_paths[i].with_suffix(".npy").name
            ]
            # Convert frame numbers to normalized timeline positions
            begin_pos = (begin_frame / exp_len) * (num_bins - 1)
            end_pos = (end_frame / exp_len) * (num_bins - 1)

            # Add vertical lines with matching color and loose dotting
            ax1.axvline(
                x=begin_pos,
                color=color,
                linestyle=":",
                alpha=0.6,
            )
            ax1.axvline(
                x=end_pos,
                color=color,
                linestyle=":",
                alpha=0.6,
            )

    # Customize axes
    ax1.set_xlabel("Normalized Timeline")
    ax1.set_xticks(np.linspace(0, num_bins - 1, 5))
    ax1.set_xticklabels([f"{x:.0f}%" for x in np.linspace(0, 100, 5)])

    ax1.set_ylabel("Missing Points (%)")
    ax2.set_ylabel("Points with <0.5m Range (%)")

    plt.title(title)

    # Create legends without fixed positions
    # First get all lines and labels for experiments
    lines1, labels1 = ax1.get_legend_handles_labels()

    # Combine both legends into one
    all_handles = (
        lines1
        + [Line2D([0], [0], color="gray", linestyle="-", label="", alpha=0)]
        + metric_legend
    )
    all_labels = (
        labels1
        + [""]
        + ["Missing Points", "Points Near Sensor (<0.5m)", "Manually Labeled Borders"]
    )

    # Create single legend in top right corner with consistent margins
    fig.legend(all_handles, all_labels, loc="upper right", borderaxespad=4.8)

    plt.grid(True, alpha=0.3)

    # Save figure letting matplotlib handle the layout
    plt.savefig(output_datetime_path / "combined_anomalies_timeline.png", dpi=150)
    plt.close()
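

# The normalized-timeline binning used above can be sketched as a small helper
# (equivalent in spirit to the inline loop; hypothetical standalone version):
#
#   def bin_timeline(values, num_bins=50):
#       """Average a per-frame series into num_bins equal slices of its duration."""
#       edges = np.linspace(0, len(values), num_bins + 1).astype(int)
#       return np.array([np.mean(values[a:b]) for a, b in zip(edges[:-1], edges[1:])])
#
# so experiments of different lengths become comparable on a common 0-100% axis.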


# Generate the combined timeline plot
plot_combined_timeline(
    normal_experiment_paths,
    anomaly_experiment_paths,
    "Lidar Degradation Indicators Throughout Experiments\n(Baseline and Moving Anomaly Experiments)",
)

# delete current latest folder
shutil.rmtree(latest_folder_path, ignore_errors=True)

# create new latest folder
latest_folder_path.mkdir(exist_ok=True, parents=True)

# copy contents of output folder to the latest folder
for file in output_datetime_path.iterdir():
    shutil.copy2(file, latest_folder_path)

# copy this python script to preserve the code used
shutil.copy2(__file__, output_datetime_path)
shutil.copy2(__file__, latest_folder_path)

# move output date folder to archive
shutil.move(output_datetime_path, archive_folder_path)

178 tools/plot_scripts/data_icp_rsme.py (new file)
@@ -0,0 +1,178 @@
import pickle
import shutil
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import open3d as o3d
from pointcloudset import Dataset
from pointcloudset.io.pointcloud.open3d import to_open3d
from rich.progress import track

# define data path containing the bag files
all_data_path = Path("/home/fedex/mt/data/subter")

output_path = Path("/home/fedex/mt/plots/data_icp_rmse")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name

# if output does not exist, create it
output_path.mkdir(exist_ok=True, parents=True)
output_datetime_path.mkdir(exist_ok=True, parents=True)
latest_folder_path.mkdir(exist_ok=True, parents=True)
archive_folder_path.mkdir(exist_ok=True, parents=True)

# Add cache folder
cache_folder = output_path / "cache"
cache_folder.mkdir(exist_ok=True, parents=True)


def get_cache_path(experiment_path: Path) -> Path:
    """Convert experiment bag path to cache pkl path"""
    return cache_folder / f"{experiment_path.stem}.pkl"


# find all bag files and sort them correctly by name (experiments with smoke in the name are anomalies)
normal_experiment_paths, anomaly_experiment_paths = [], []
for bag_file_path in all_data_path.iterdir():
    if bag_file_path.suffix != ".bag":
        continue
    if "smoke" in bag_file_path.name:
        anomaly_experiment_paths.append(bag_file_path)
    else:
        normal_experiment_paths.append(bag_file_path)

# sort anomaly and normal experiments by filesize, ascending
anomaly_experiment_paths.sort(key=lambda path: path.stat().st_size)
normal_experiment_paths.sort(key=lambda path: path.stat().st_size)


def calculate_icp_rmse(experiment_path):
    """Calculate ICP RMSE between consecutive frames in an experiment"""
    cache_path = get_cache_path(experiment_path)

    # Check cache first
    if cache_path.exists():
        with open(cache_path, "rb") as file:
            return pickle.load(file)

    dataset = Dataset.from_file(experiment_path, topic="/ouster/points")
    rmse_values = []

    # Convert iterator to list for progress tracking
    pointclouds = list(dataset)

    # Get consecutive pairs of point clouds with progress bar
    for i in track(
        range(len(pointclouds) - 1),
        description=f"Processing {experiment_path.name}",
        total=len(pointclouds) - 1,
    ):
        # Get consecutive point clouds and convert to Open3D format
        source = to_open3d(pointclouds[i])
        target = to_open3d(pointclouds[i + 1])

        # Apply point-to-point ICP (much faster than point-to-plane)
        result = o3d.pipelines.registration.registration_icp(
            source,
            target,
            max_correspondence_distance=0.2,  # 20cm max correspondence
            init=np.identity(4),  # Identity matrix as initial transformation
            estimation_method=o3d.pipelines.registration.TransformationEstimationPointToPoint(),
            criteria=o3d.pipelines.registration.ICPConvergenceCriteria(
                max_iteration=50
            ),
        )

        rmse_values.append(result.inlier_rmse)

    # Cache the results
    with open(cache_path, "wb") as file:
        pickle.dump(rmse_values, file, protocol=pickle.HIGHEST_PROTOCOL)

    return rmse_values
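

# Usage sketch (hypothetical path): results are cached per experiment, so a
# second call returns the pickled RMSE list without re-running ICP.
#
#   rmse = calculate_icp_rmse(Path("/home/fedex/mt/data/subter/example.bag"))
#   print(len(rmse), np.mean(rmse))  # one inlier RMSE value per consecutive frame pair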


def plot_statistical_comparison(normal_paths, anomaly_paths, output_path):
    """Create statistical comparison plots between normal and anomalous experiments"""
    # Collect all RMSE values
    normal_rmse = []
    anomaly_rmse = []

    print("Loading cached RMSE values...")
    for path in normal_paths:
        normal_rmse.extend(calculate_icp_rmse(path))
    for path in anomaly_paths:
        anomaly_rmse.extend(calculate_icp_rmse(path))

    # Convert to numpy arrays
    normal_rmse = np.array(normal_rmse)
    anomaly_rmse = np.array(anomaly_rmse)

    # Create figure with two subplots side by side
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Box plot
    data = [normal_rmse, anomaly_rmse]
    ax1.boxplot(data, labels=["Normal", "Anomalous"])
    ax1.set_ylabel("ICP RMSE (meters)")
    ax1.set_title("Box Plot Comparison")
    ax1.grid(True, alpha=0.3)

    # Violin plot
    ax2.violinplot(data)
    ax2.set_xticks([1, 2])
    ax2.set_xticklabels(["Normal", "Anomalous"])
    ax2.set_ylabel("ICP RMSE (meters)")
    ax2.set_title("Violin Plot Comparison")
    ax2.grid(True, alpha=0.3)

    plt.suptitle(
        "ICP RMSE Statistical Comparison: Normal vs Anomalous Experiments", fontsize=14
    )
    plt.tight_layout()

    # Save both linear and log scale versions
    plt.savefig(output_path / "icp_rmse_distribution_linear.png", dpi=150)

    # Log scale version
    ax1.set_yscale("log")
    ax2.set_yscale("log")
    plt.savefig(output_path / "icp_rmse_distribution_log.png", dpi=150)
    plt.close()

    # Print some basic statistics
    print("\nBasic Statistics:")
    print(
        f"Normal experiments - mean: {np.mean(normal_rmse):.4f}, std: {np.std(normal_rmse):.4f}"
    )
    print(
        f"Anomaly experiments - mean: {np.mean(anomaly_rmse):.4f}, std: {np.std(anomaly_rmse):.4f}"
    )


# Replace all plotting code with just this:
plot_statistical_comparison(
    normal_experiment_paths, anomaly_experiment_paths, output_datetime_path
)

# delete current latest folder
shutil.rmtree(latest_folder_path, ignore_errors=True)

# create new latest folder
latest_folder_path.mkdir(exist_ok=True, parents=True)

# copy contents of output folder to the latest folder
for file in output_datetime_path.iterdir():
    shutil.copy2(file, latest_folder_path)

# copy this python script to preserve the code used
shutil.copy2(__file__, output_datetime_path)
shutil.copy2(__file__, latest_folder_path)

# move output date folder to archive
shutil.move(output_datetime_path, archive_folder_path)

134
tools/plot_scripts/data_missing_points_anomalies.py
Normal file
134
tools/plot_scripts/data_missing_points_anomalies.py
Normal file
@@ -0,0 +1,134 @@
import pickle
import shutil
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
from pointcloudset import Dataset

# define data paths
all_data_path = Path("/home/fedex/mt/data/subter")
output_path = Path("/home/fedex/mt/plots/data_missing_points_anomalies")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name

# if output does not exist, create it
output_path.mkdir(exist_ok=True, parents=True)
output_datetime_path.mkdir(exist_ok=True, parents=True)
latest_folder_path.mkdir(exist_ok=True, parents=True)
archive_folder_path.mkdir(exist_ok=True, parents=True)

data_resolution = 32 * 2048

# find all bag files and split them into normal and anomalous experiments
normal_experiment_paths, anomaly_experiment_paths = [], []
for bag_file_path in all_data_path.iterdir():
    if bag_file_path.suffix != ".bag":
        continue
    if "smoke" in bag_file_path.name:
        anomaly_experiment_paths.append(bag_file_path)
    else:
        normal_experiment_paths.append(bag_file_path)
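# Note: classification relies purely on the file name; recordings with "smoke" in their
# name are treated as anomalous, everything else as normal.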


def plot_timeline_comparison(anomaly_experiment_paths, title, num_bins=50):
    """Plot missing points percentage over normalized timeline for moving anomaly experiments"""
    # Sort experiments by filesize first (to match original processing order)
    anomaly_experiment_paths = sorted(
        anomaly_experiment_paths, key=lambda path: path.stat().st_size
    )

    # Filter out stationary experiments
    moving_exp_indices = [
        i
        for i, path in enumerate(anomaly_experiment_paths)
        if "stationary" not in path.name
    ]
    moving_anomaly_paths = [anomaly_experiment_paths[i] for i in moving_exp_indices]

    # Try to load cached data from original script's location
    cache_path = Path("/home/fedex/mt/plots/data_missing_points/missing_points.pkl")
    if not cache_path.exists():
        print("No cached data found. Please run the original script first.")
        return

    with open(cache_path, "rb") as file:
        _, missing_points_anomaly = pickle.load(file)
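    # Note (assumption): the cached pickle is expected to hold a
    # (missing_points_normal, missing_points_anomaly) tuple of per-experiment lists of
    # per-scan missing-point counts; only the anomaly half is used here, indexed in the
    # same file-size order as in the original data_missing_points script.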

    # Get data for moving experiments only (using original indices)
    moving_anomaly_data = [missing_points_anomaly[i] for i in moving_exp_indices]

    # Create figure
    plt.figure(figsize=(12, 6))

    # Plot each experiment's timeline
    for i, exp_data in enumerate(moving_anomaly_data):
        # Convert to percentage
        percentages = np.array(exp_data) / data_resolution * 100

        # Create normalized timeline bins
        exp_len = len(percentages)
        bins = np.linspace(0, exp_len - 1, num_bins)
        binned_data = np.zeros(num_bins)

        # Bin the data
        for bin_idx in range(num_bins):
            if bin_idx == num_bins - 1:
                start_idx = int(bins[bin_idx])
                end_idx = exp_len
            else:
                start_idx = int(bins[bin_idx])
                end_idx = int(bins[bin_idx + 1])

            binned_data[bin_idx] = np.mean(percentages[start_idx:end_idx])

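        # A roughly equivalent but more compact binning (not bit-identical at bin edges)
        # could use numpy directly, e.g.:
        #     binned_data = np.array(
        #         [chunk.mean() for chunk in np.array_split(percentages, num_bins)]
        #     )
        # The explicit loop above is kept because it reproduces the original bin boundaries.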
        # Plot with slight transparency to show overlaps
        plt.plot(
            range(num_bins),
            binned_data,
            alpha=0.6,
            label=f"Experiment {moving_anomaly_paths[i].stem}",
        )

    plt.title(title)
    plt.xlabel("Normalized Timeline")
    # Add percentage ticks on x-axis
    plt.xticks(
        np.linspace(0, num_bins - 1, 5), [f"{x:.0f}%" for x in np.linspace(0, 100, 5)]
    )
    plt.ylabel("Missing Points (%)")
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.tight_layout()

    # Save the plot
    plt.savefig(output_datetime_path / "missing_points_timeline.png", dpi=150)
    plt.close()


# Generate the timeline comparison plot
plot_timeline_comparison(
    anomaly_experiment_paths,
    "Missing Lidar Measurements Over Time\n(Moving Anomaly Experiments Only)",
)

# delete current latest folder
shutil.rmtree(latest_folder_path, ignore_errors=True)

# create new latest folder
latest_folder_path.mkdir(exist_ok=True, parents=True)

# copy contents of output folder to the latest folder
for file in output_datetime_path.iterdir():
    shutil.copy2(file, latest_folder_path)

# copy this python script to preserve the code used
shutil.copy2(__file__, output_datetime_path)
shutil.copy2(__file__, latest_folder_path)

# move output date folder to archive
shutil.move(output_datetime_path, archive_folder_path)
149
tools/plot_scripts/data_particles_near_sensor_anomalies.py
Normal file
@@ -0,0 +1,149 @@
import pickle
import shutil
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
from pointcloudset import Dataset

# define data paths
all_data_path = Path("/home/fedex/mt/data/subter")
output_path = Path("/home/fedex/mt/plots/data_particles_near_sensor_anomalies")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name

# if output does not exist, create it
output_path.mkdir(exist_ok=True, parents=True)
output_datetime_path.mkdir(exist_ok=True, parents=True)
latest_folder_path.mkdir(exist_ok=True, parents=True)
archive_folder_path.mkdir(exist_ok=True, parents=True)

data_resolution = 32 * 2048

# find all bag files and split them into normal and anomalous experiments
normal_experiment_paths, anomaly_experiment_paths = [], []
for bag_file_path in all_data_path.iterdir():
    if bag_file_path.suffix != ".bag":
        continue
    if "smoke" in bag_file_path.name:
        anomaly_experiment_paths.append(bag_file_path)
    else:
        normal_experiment_paths.append(bag_file_path)


def plot_timeline_comparison(
    anomaly_experiment_paths, title, range_threshold, num_bins=50
):
    """Plot near-sensor measurements percentage over normalized timeline for moving anomaly experiments"""
    # Sort experiments by filesize first (to match original processing order)
    anomaly_experiment_paths = sorted(
        anomaly_experiment_paths, key=lambda path: path.stat().st_size
    )

    # Filter out stationary experiments
    moving_exp_indices = [
        i
        for i, path in enumerate(anomaly_experiment_paths)
        if "stationary" not in path.name
    ]
    moving_anomaly_paths = [anomaly_experiment_paths[i] for i in moving_exp_indices]

    # Try to load cached data
    cache_path = (
        Path("/home/fedex/mt/plots/data_particles_near_sensor")
        / f"particles_near_sensor_counts_{range_threshold}.pkl"
    )
    if not cache_path.exists():
        print(
            f"No cached data found for range threshold {range_threshold}. Please run the original script first."
        )
        return

    with open(cache_path, "rb") as file:
        _, particles_near_sensor_anomaly = pickle.load(file)

    # Get data for moving experiments only (using original indices)
    moving_anomaly_data = [particles_near_sensor_anomaly[i] for i in moving_exp_indices]

    # Create figure
    plt.figure(figsize=(12, 6))

    # Plot each experiment's timeline
    for i, exp_data in enumerate(moving_anomaly_data):
        # Convert to percentage
        percentages = np.array(exp_data) / data_resolution * 100

        # Create normalized timeline bins
        exp_len = len(percentages)
        bins = np.linspace(0, exp_len - 1, num_bins)
        binned_data = np.zeros(num_bins)

        # Bin the data
        for bin_idx in range(num_bins):
            if bin_idx == num_bins - 1:
                start_idx = int(bins[bin_idx])
                end_idx = exp_len
            else:
                start_idx = int(bins[bin_idx])
                end_idx = int(bins[bin_idx + 1])

            binned_data[bin_idx] = np.mean(percentages[start_idx:end_idx])

        # Plot with slight transparency to show overlaps
        plt.plot(
            range(num_bins),
            binned_data,
            alpha=0.6,
            label=f"Experiment {moving_anomaly_paths[i].stem}",
        )

    plt.title(f"{title}\n(Range Threshold: {range_threshold / 1000:.1f}m)")
    plt.xlabel("Normalized Timeline")
    # Add percentage ticks on x-axis
    plt.xticks(
        np.linspace(0, num_bins - 1, 5), [f"{x:.0f}%" for x in np.linspace(0, 100, 5)]
    )
    plt.ylabel("Near-Sensor Measurements (%)")
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.tight_layout()

    # Save the plot
    plt.savefig(
        output_datetime_path
        / f"near_sensor_measurements_timeline_{range_threshold}.png",
        dpi=150,
    )
    plt.close()


# Generate timeline comparison plots for each range threshold
range_thresholds = [500, 750, 1000, 1250, 1500]
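# Note: the thresholds are given in millimetres (0.5 m to 1.5 m); the plot title converts
# them to metres via range_threshold / 1000.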
for rt in range_thresholds:
    print(f"Processing range threshold {rt}...")
    plot_timeline_comparison(
        anomaly_experiment_paths,
        "Near-Sensor Lidar Measurements Over Time\n(Moving Anomaly Experiments Only)",
        rt,
    )

# delete current latest folder
shutil.rmtree(latest_folder_path, ignore_errors=True)

# create new latest folder
latest_folder_path.mkdir(exist_ok=True, parents=True)

# copy contents of output folder to the latest folder
for file in output_datetime_path.iterdir():
    shutil.copy2(file, latest_folder_path)

# copy this python script to preserve the code used
shutil.copy2(__file__, output_datetime_path)
shutil.copy2(__file__, latest_folder_path)

# move output date folder to archive
shutil.move(output_datetime_path, archive_folder_path)