tools, lockfile, deps

Jan Kowalczyk
2025-08-13 14:17:12 +02:00
parent cd4dc583e8
commit ef311d862e
17 changed files with 4325 additions and 0 deletions

View File

@@ -0,0 +1,339 @@
import pickle
import shutil
import unittest
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from tabulate import tabulate
# Configuration
results_folders = {
"LeNet": {
"path": Path(
"/home/fedex/mt/projects/thesis-kowalczyk-jan/Deep-SAD-PyTorch/test/DeepSAD/subter_ae_elbow_v2/"
),
"batch_size": 256,
},
"LeNet Efficient": {
"path": Path(
"/home/fedex/mt/projects/thesis-kowalczyk-jan/Deep-SAD-PyTorch/test/DeepSAD/subter_efficient_ae_elbow"
),
"batch_size": 64,
},
}
output_path = Path("/home/fedex/mt/plots/ae_elbow_lenet")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name
# Create output directories
output_path.mkdir(exist_ok=True, parents=True)
output_datetime_path.mkdir(exist_ok=True, parents=True)
latest_folder_path.mkdir(exist_ok=True, parents=True)
archive_folder_path.mkdir(exist_ok=True, parents=True)
def calculate_batch_mean_loss(scores, batch_size):
"""Calculate mean loss over batches similar to the original testing code."""
n_samples = len(scores)
n_batches = (n_samples + batch_size - 1) // batch_size
batch_losses = []
for i in range(0, n_samples, batch_size):
batch_scores = scores[i : i + batch_size]
batch_losses.append(np.mean(batch_scores))
return np.sum(batch_losses) / n_batches
def test_loss_calculation(results, batch_size):
"""Test if our loss calculation matches the original implementation."""
test = unittest.TestCase()
folds = results["ae_results"]
dim = results["dimension"]
for fold_key in folds:
fold_data = folds[fold_key]["test"]
scores = np.array(fold_data["scores"])
original_loss = fold_data["loss"]
calculated_loss = calculate_batch_mean_loss(scores, batch_size)
try:
test.assertAlmostEqual(
original_loss,
calculated_loss,
places=5,
msg=f"Loss mismatch for dim={dim}, {fold_key}",
)
except AssertionError as e:
print(f"Warning: {str(e)}")
print(f"Original: {original_loss:.6f}, Calculated: {calculated_loss:.6f}")
raise
def plot_loss_curve(dims, means, stds, title, color, output_path):
"""Create and save a single loss curve plot."""
plt.figure(figsize=(8, 5))
plt.plot(dims, means, marker="o", color=color, label="Mean Test Loss")
plt.fill_between(
dims,
np.array(means) - np.array(stds),
np.array(means) + np.array(stds),
color=color,
alpha=0.2,
label="Std Dev",
)
plt.xlabel("Latent Dimension")
plt.ylabel("Test Loss")
plt.title(title)
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(dims)
plt.tight_layout()
plt.savefig(output_path, dpi=150, bbox_inches="tight")
plt.close()
def plot_multi_loss_curve(arch_results, title, output_path, colors=None):
"""Create and save a loss curve plot with multiple architectures.
Args:
arch_results: Dict of format {arch_name: (dims, means, stds)}
title: Plot title
output_path: Where to save the plot
colors: Optional dict of colors for each architecture
"""
plt.figure(figsize=(10, 6))
if colors is None:
colors = {
"LeNet": "blue",
"LeNet Asymmetric": "red",
}
# Get unique dimensions across all architectures
all_dims = sorted(
set(dim for _, (dims, _, _) in arch_results.items() for dim in dims)
)
for arch_name, (dims, means, stds) in arch_results.items():
color = colors.get(arch_name, "gray")
plt.plot(dims, means, marker="o", color=color, label=arch_name)
plt.fill_between(
dims,
np.array(means) - np.array(stds),
np.array(means) + np.array(stds),
color=color,
alpha=0.2,
)
plt.xlabel("Latent Dimension")
plt.ylabel("Test Loss")
plt.title(title)
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(all_dims) # Set x-axis ticks to match data points
plt.tight_layout()
plt.savefig(output_path, dpi=150, bbox_inches="tight")
plt.close()
def evaluate_autoencoder_loss():
"""Main function to evaluate autoencoder loss across different latent dimensions."""
# Results storage for each architecture
arch_results = {
name: {"dims": [], "normal": [], "anomaly": []} for name in results_folders
}
# Process each architecture
for arch_name, config in results_folders.items():
results_folder = config["path"]
batch_size = config["batch_size"]
result_files = sorted(
results_folder.glob("ae_elbow_results_subter_*_kfold.pkl")
)
dimensions = []
normal_means = []
normal_stds = []
anomaly_means = []
anomaly_stds = []
overall_means = []
overall_stds = []
# Verify loss calculation
print(
f"\nVerifying loss calculation for {arch_name} (batch_size={batch_size})..."
)
for result_file in result_files:
with open(result_file, "rb") as f:
results = pickle.load(f)
test_loss_calculation(results, batch_size)
print(f"Loss calculation verified successfully for {arch_name}!")
# Process files for this architecture
for result_file in result_files:
with open(result_file, "rb") as f:
results = pickle.load(f)
dim = int(results["dimension"])
folds = results["ae_results"]
normal_fold_losses = []
anomaly_fold_losses = []
overall_fold_losses = []  # per-fold loss over all samples, for the overall plot
for fold_key in folds:
fold_data = folds[fold_key]["test"]
scores = np.array(fold_data["scores"])
labels = np.array(fold_data["labels_exp_based"])
normal_scores = scores[labels == 1]
anomaly_scores = scores[labels == -1]
normal_fold_losses.append(
calculate_batch_mean_loss(normal_scores, batch_size)
)
anomaly_fold_losses.append(
calculate_batch_mean_loss(anomaly_scores, batch_size)
)
overall_fold_losses.append(calculate_batch_mean_loss(scores, batch_size))
dimensions.append(dim)
normal_means.append(np.mean(normal_fold_losses))
normal_stds.append(np.std(normal_fold_losses))
anomaly_means.append(np.mean(anomaly_fold_losses))
anomaly_stds.append(np.std(anomaly_fold_losses))
overall_means.append(np.mean(overall_fold_losses))
overall_stds.append(np.std(overall_fold_losses))
# Sort by dimension
sorted_data = sorted(
zip(
dimensions,
normal_means,
normal_stds,
anomaly_means,
anomaly_stds,
overall_means,
overall_stds,
)
)
dims, n_means, n_stds, a_means, a_stds, o_means, o_stds = zip(*sorted_data)
# Store results for this architecture
arch_results[arch_name] = {
"dims": dims,
"normal": (dims, n_means, n_stds),
"anomaly": (dims, a_means, a_stds),
"overall": (dims, o_means, o_stds),
}
# Create the three plots with all architectures
plot_multi_loss_curve(
{name: results["normal"] for name, results in arch_results.items()},
"Normal Class Test Loss vs. Latent Dimension",
output_datetime_path / "ae_elbow_test_loss_normal.png",
)
plot_multi_loss_curve(
{name: results["anomaly"] for name, results in arch_results.items()},
"Anomaly Class Test Loss vs. Latent Dimension",
output_datetime_path / "ae_elbow_test_loss_anomaly.png",
)
plot_multi_loss_curve(
{name: results["overall"] for name, results in arch_results.items()},
"Overall Test Loss vs. Latent Dimension",
output_datetime_path / "ae_elbow_test_loss_overall.png",
)
def print_loss_comparison(results_folders):
"""Print comparison tables of original vs calculated losses for each architecture."""
print("\nLoss Comparison Tables")
print("=" * 80)
for arch_name, config in results_folders.items():
results_folder = config["path"]
batch_size = config["batch_size"]
result_files = sorted(
results_folder.glob("ae_elbow_results_subter_*_kfold.pkl")
)
# Prepare table data
table_data = []
headers = ["Dimension", "Original", "Calculated", "Diff"]
for result_file in result_files:
with open(result_file, "rb") as f:
results = pickle.load(f)
dim = int(results["dimension"])
folds = results["ae_results"]
# Calculate mean original loss across folds
orig_losses = []
calc_losses = []
for fold_key in folds:
fold_data = folds[fold_key]["test"]
orig_losses.append(fold_data["loss"])
calc_losses.append(
calculate_batch_mean_loss(np.array(fold_data["scores"]), batch_size)
)
orig_mean = np.mean(orig_losses)
calc_mean = np.mean(calc_losses)
diff = abs(orig_mean - calc_mean)
table_data.append([dim, orig_mean, calc_mean, diff])
# Sort by dimension
table_data.sort(key=lambda x: x[0])
print(f"\n{arch_name}:")
print(
tabulate(
table_data,
headers=headers,
floatfmt=".6f",
tablefmt="pipe",
numalign="right",
)
)
print("\n" + "=" * 80)
if __name__ == "__main__":
# Print loss comparisons for all architectures
print_loss_comparison(results_folders)
# Run main analysis
evaluate_autoencoder_loss()
# Archive management
# Delete current latest folder
shutil.rmtree(latest_folder_path, ignore_errors=True)
latest_folder_path.mkdir(exist_ok=True, parents=True)
# Copy contents to latest folder
for file in output_datetime_path.iterdir():
shutil.copy2(file, latest_folder_path)
# Copy this script for reference
shutil.copy2(__file__, output_datetime_path)
shutil.copy2(__file__, latest_folder_path)
# Move output to archive
shutil.move(output_datetime_path, archive_folder_path)

View File

@@ -0,0 +1,223 @@
"""
Downloads the cats_vs_dogs dataset, then generates four PNGs:
- supervised_grid.png
- unsupervised_clusters.png
- semi_supervised_classification.png
- semi_supervised_clustering.png
"""
import os
import random
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.semi_supervised import LabelSpreading
# -----------------------------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------------------------
NUM_SUPERVISED = 16
GRID_ROWS = 4
GRID_COLS = 4
UNSUP_SAMPLES = 200
# how many labeled points to “seed” semi-sup methods
N_LABELED_CLASS = 10 # for classification demo
N_SEEDS_PER_CLASS = 3 # for clustering demo
OUTPUT_DIR = "outputs"
# -----------------------------------------------------------------------------
# UTILITIES
# -----------------------------------------------------------------------------
def ensure_dir(path):
os.makedirs(path, exist_ok=True)
# -----------------------------------------------------------------------------
# 1) Supervised grid
# -----------------------------------------------------------------------------
def plot_supervised_grid(ds, info, num, rows, cols, outpath):
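"""Plot a rows x cols grid of sample images titled with their class names."""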
plt.figure(figsize=(cols * 2, rows * 2))
for i, (img, lbl) in enumerate(ds.take(num)):
ax = plt.subplot(rows, cols, i + 1)
ax.imshow(img.numpy().astype("uint8"))
ax.axis("off")
cname = info.features["label"].int2str(lbl.numpy())
ax.set_title(cname, fontsize=9)
plt.tight_layout()
plt.savefig(outpath, dpi=150)
plt.close()
print(f"✔ Saved supervised grid → {outpath}")
# -----------------------------------------------------------------------------
# 2) Unsupervised clustering (PCA + KMeans)
# -----------------------------------------------------------------------------
def plot_unsupervised_clusters(ds, outpath):
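"""Downscale each image to a 64x64 grayscale vector, project to 2-D with PCA,
cluster with KMeans and save a scatter plot colored by cluster assignment."""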
imgs = []
for img, _ in ds.take(UNSUP_SAMPLES):
arr = (
tf.image.resize(img, (64, 64)).numpy().astype("float32").mean(axis=2)
) # resize and grayscale to speed up
imgs.append(arr.ravel() / 255.0)
X = np.stack(imgs)
pca = PCA(n_components=2, random_state=0)
X2 = pca.fit_transform(X)
km = KMeans(n_clusters=2, random_state=0)
clusters = km.fit_predict(X2)
plt.figure(figsize=(6, 6))
plt.scatter(X2[:, 0], X2[:, 1], c=clusters, s=15, alpha=0.6)
plt.title("Unsupervised: PCA + KMeans")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.tight_layout()
plt.savefig(outpath, dpi=150)
plt.close()
print(f"✔ Saved unsupervised clusters → {outpath}")
return X2, clusters # return for reuse
# -----------------------------------------------------------------------------
# 3) Semisupervised classification demo
# -----------------------------------------------------------------------------
def plot_semi_supervised_classification(X2, y_true, outpath):
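"""Compare a purely supervised LogisticRegression trained on a few labeled points
against LabelSpreading, which also propagates labels through the unlabeled points,
and plot both decision boundaries in the 2-D PCA space."""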
n = X2.shape[0]
all_idx = list(range(n))
labeled_idx = random.sample(all_idx, N_LABELED_CLASS)
unlabeled_idx = list(set(all_idx) - set(labeled_idx))
# pure supervised
clf = LogisticRegression().fit(X2[labeled_idx], y_true[labeled_idx])
# semisupervised
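# scikit-learn's semi-supervised estimators treat the label -1 as "unlabeled",
# so only the sampled seed points keep their true labels in y_train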
y_train = np.full(n, -1, dtype=int)
y_train[labeled_idx] = y_true[labeled_idx]
ls = LabelSpreading().fit(X2, y_train)
# grid for decision boundary
x_min, x_max = X2[:, 0].min() - 1, X2[:, 0].max() + 1
y_min, y_max = X2[:, 1].min() - 1, X2[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
grid = np.c_[xx.ravel(), yy.ravel()]
pred_sup = clf.predict(grid).reshape(xx.shape)
pred_semi = ls.predict(grid).reshape(xx.shape)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, Z, title in zip(
axes, [pred_sup, pred_semi], ["Supervised only", "Semi-supervised"]
):
ax.contourf(xx, yy, Z, alpha=0.3)
ax.scatter(X2[unlabeled_idx, 0], X2[unlabeled_idx, 1], s=20, alpha=0.4)
ax.scatter(
X2[labeled_idx, 0],
X2[labeled_idx, 1],
c=y_true[labeled_idx],
s=80,
edgecolor="k",
)
ax.set_title(title)
ax.set_xlabel("PCA 1")
ax.set_ylabel("PCA 2")
plt.tight_layout()
plt.savefig(outpath, dpi=150)
plt.close()
print(f"✔ Saved semi-supervised classification → {outpath}")
# -----------------------------------------------------------------------------
# 4) Semisupervised clustering (seeded KMeans)
# -----------------------------------------------------------------------------
def plot_semi_supervised_clustering(X2, y_true, outpath):
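"""Contrast plain KMeans with KMeans initialised from the mean of a few labeled
seed points per class, and highlight the seed points in both scatter plots."""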
# pick a few seeds per class
cats = np.where(y_true == 0)[0]
dogs = np.where(y_true == 1)[0]
seeds = list(np.random.choice(cats, N_SEEDS_PER_CLASS, replace=False)) + list(
np.random.choice(dogs, N_SEEDS_PER_CLASS, replace=False)
)
# pure KMeans
km1 = KMeans(n_clusters=2, random_state=0).fit(X2)
lab1 = km1.labels_
# seeded: init centers from seed means
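# with an explicit init array, n_init=1 makes KMeans run exactly once from the
# seed-derived centers instead of performing multiple random restarts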
ctr0 = X2[seeds[:N_SEEDS_PER_CLASS]].mean(axis=0)
ctr1 = X2[seeds[N_SEEDS_PER_CLASS:]].mean(axis=0)
km2 = KMeans(
n_clusters=2, init=np.vstack([ctr0, ctr1]), n_init=1, random_state=0
).fit(X2)
lab2 = km2.labels_
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, labels, title in zip(axes, [lab1, lab2], ["Pure KMeans", "Seeded KMeans"]):
ax.scatter(X2[:, 0], X2[:, 1], c=labels, s=20, alpha=0.6)
ax.scatter(
X2[seeds, 0],
X2[seeds, 1],
c=y_true[seeds],
edgecolor="k",
marker="x",
s=100,
linewidths=2,
)
ax.set_title(title)
ax.set_xlabel("PCA 1")
ax.set_ylabel("PCA 2")
plt.tight_layout()
plt.savefig(outpath, dpi=150)
plt.close()
print(f"✔ Saved semi-supervised clustering → {outpath}")
# -----------------------------------------------------------------------------
# MAIN
# -----------------------------------------------------------------------------
if __name__ == "__main__":
ensure_dir(OUTPUT_DIR)
# load
print("▶ Loading cats_vs_dogs...")
ds, info = tfds.load(
"cats_vs_dogs", split="train", with_info=True, as_supervised=True
)
ds = ds.shuffle(1000, reshuffle_each_iteration=False).cache()
# supervised
plot_supervised_grid(
ds,
info,
NUM_SUPERVISED,
GRID_ROWS,
GRID_COLS,
os.path.join(OUTPUT_DIR, "supervised_grid.png"),
)
# unsupervised
# also returns X2 for downstream demos
X2, _ = plot_unsupervised_clusters(
ds, os.path.join(OUTPUT_DIR, "unsupervised_clusters.png")
)
# collect true labels for that same subset
# (we need a y_true array for semi-sup demos)
y_true = np.array([lbl.numpy() for _, lbl in ds.take(UNSUP_SAMPLES)])
# semi-supervised classification
plot_semi_supervised_classification(
X2, y_true, os.path.join(OUTPUT_DIR, "semi_supervised_classification.png")
)
# semi-supervised clustering
plot_semi_supervised_clustering(
X2, y_true, os.path.join(OUTPUT_DIR, "semi_supervised_clustering.png")
)

View File

@@ -0,0 +1,274 @@
"""
Downloads the cats_vs_dogs dataset, extracts deep features using MobileNetV2,
then generates two clustering comparison illustrations using two different embedding pipelines:
- PCA followed by t-SNE (saved as: semi_supervised_clustering_tsne.png)
- PCA followed by UMAP (saved as: semi_supervised_clustering_umap.png)
Each illustration compares:
- Unsupervised clustering using the deep embedding + KMeans
- Semi-supervised clustering using Label Spreading with a few labeled seeds
This script saves outputs in a datetime folder and also copies the latest outputs
to a "latest" folder. All versions of the outputs and scripts are archived.
"""
import random
import shutil
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import umap
from scipy.optimize import linear_sum_assignment
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.semi_supervised import LabelSpreading
# -----------------------------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------------------------
UNSUP_SAMPLES = 200 # number of samples to use for the demo
N_LABELED_CLASS = 20 # number of labeled seeds for the semi-supervised approach
output_path = Path("/home/fedex/mt/plots/background_ml_semisupervised")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name
# -----------------------------------------------------------------------------
# UTILITIES
# -----------------------------------------------------------------------------
def ensure_dir(directory: Path):
directory.mkdir(exist_ok=True, parents=True)
def cluster_accuracy(y_true, y_pred):
"""
Compute clustering accuracy by determining the optimal mapping
between predicted clusters and true labels using the Hungarian algorithm.
"""
y_true = y_true.astype(np.int64)
y_pred = y_pred.astype(np.int64)
labels = np.unique(y_true)
clusters = np.unique(y_pred)
contingency = np.zeros((labels.size, clusters.size), dtype=np.int64)
for i, label in enumerate(labels):
for j, cluster in enumerate(clusters):
contingency[i, j] = np.sum((y_true == label) & (y_pred == cluster))
row_ind, col_ind = linear_sum_assignment(-contingency)
accuracy = contingency[row_ind, col_ind].sum() / y_true.size
return accuracy
def plot_clustering_comparison_embedding(embedding, y_true, outpath, method_name=""):
"""
Given a 2D data embedding (e.g., from PCA+t-SNE or PCA+UMAP), this function:
- Performs unsupervised clustering with KMeans.
- Performs semi-supervised clustering with Label Spreading using a few labeled seeds.
- Computes accuracy via the Hungarian algorithm.
- Plots the decision boundaries from both methods overlaid with the true labels.
- Annotates the plot with the accuracy results.
The 'method_name' is used in the plot title to indicate which embedding is used.
"""
n = embedding.shape[0]
all_idx = list(range(n))
labeled_idx = random.sample(all_idx, N_LABELED_CLASS)
# Unsupervised clustering using KMeans on all embedded data
km = KMeans(n_clusters=2, random_state=0).fit(embedding)
unsup_pred = km.predict(embedding)
unsup_accuracy = cluster_accuracy(y_true, unsup_pred)
# Create a grid over the space for decision boundaries
x_min, x_max = embedding[:, 0].min() - 1, embedding[:, 0].max() + 1
y_min, y_max = embedding[:, 1].min() - 1, embedding[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
grid = np.c_[xx.ravel(), yy.ravel()]
pred_unsup = km.predict(grid).reshape(xx.shape)
# Semi-supervised clustering using Label Spreading with labeled seeds
y_train = np.full(n, -1, dtype=int)
y_train[labeled_idx] = y_true[labeled_idx]
ls = LabelSpreading().fit(embedding, y_train)
semi_pred = ls.predict(embedding)
semi_accuracy = cluster_accuracy(y_true, semi_pred)
pred_semi = ls.predict(grid).reshape(xx.shape)
cmap = plt.cm.coolwarm
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
# Unsupervised plot:
ax = axes[0]
ax.contourf(xx, yy, pred_unsup, alpha=0.2, cmap=cmap)
sc1 = ax.scatter(
embedding[:, 0],
embedding[:, 1],
c=y_true,
cmap=cmap,
s=30,
alpha=0.8,
edgecolor="k",
)
ax.set_title(
f"{method_name}\nUnsupervised (KMeans) - Acc: {unsup_accuracy:.2f}", fontsize=10
)
ax.set_xlabel("Dim 1")
ax.set_ylabel("Dim 2")
handles = []
for cl in np.unique(y_true):
handles.append(
plt.Line2D(
[],
[],
marker="o",
linestyle="",
color=cmap(cl / max(np.max(y_true), 1)),  # match the scatter's 0..1 color normalization
label=f"Class {cl}",
markersize=6,
)
)
ax.legend(handles=handles, loc="upper right")
# Semi-supervised plot:
ax = axes[1]
ax.contourf(xx, yy, pred_semi, alpha=0.2, cmap=cmap)
sc2 = ax.scatter(
embedding[:, 0],
embedding[:, 1],
c=y_true,
cmap=cmap,
s=30,
alpha=0.8,
edgecolor="none",
)
sc3 = ax.scatter(
embedding[labeled_idx, 0],
embedding[labeled_idx, 1],
c=y_true[labeled_idx],
cmap=cmap,
s=80,
edgecolor="k",
marker="o",
label="Labeled Seeds",
)
ax.set_title(
f"{method_name}\nSemi-Supervised (Label Spreading) - Acc: {semi_accuracy:.2f}",
fontsize=10,
)
ax.set_xlabel("Dim 1")
ax.set_ylabel("Dim 2")
handles = []
for cl in np.unique(y_true):
handles.append(
plt.Line2D(
[],
[],
marker="o",
linestyle="",
color=cmap(cl / max(np.max(y_true), 1)),  # match the scatter's 0..1 color normalization
label=f"Class {cl}",
markersize=6,
)
)
handles.append(
plt.Line2D(
[],
[],
marker="o",
linestyle="",
color="black",
label="Labeled Seed",
markersize=8,
)
)
ax.legend(handles=handles, loc="upper right")
plt.tight_layout()
plt.savefig(outpath, dpi=150)
plt.close()
print(f"✔ Saved clustering comparison illustration → {outpath}")
print(f"Unsupervised Accuracy: {unsup_accuracy:.2f}")
print(f"Semi-Supervised Accuracy: {semi_accuracy:.2f}")
# -----------------------------------------------------------------------------
# MAIN WITH PRE-TRAINED CNN FEATURE EXTRACTION
# -----------------------------------------------------------------------------
if __name__ == "__main__":
# Create output directories
ensure_dir(output_path)
ensure_dir(output_datetime_path)
ensure_dir(latest_folder_path)
ensure_dir(archive_folder_path)
print("▶ Loading cats_vs_dogs dataset...")
ds, info = tfds.load(
"cats_vs_dogs", split="train", with_info=True, as_supervised=True
)
ds = ds.shuffle(1000, reshuffle_each_iteration=False).cache()
# Load a pre-trained CNN (MobileNetV2) for feature extraction.
cnn_model = tf.keras.applications.MobileNetV2(
include_top=False, weights="imagenet", pooling="avg", input_shape=(224, 224, 3)
)
# Extract deep features for all samples.
features_list = []
labels = []
for img, lbl in ds.take(UNSUP_SAMPLES):
# Resize to 224x224 and keep 3 channels.
img_resized = tf.image.resize(img, (224, 224))
# Preprocess the image for MobileNetV2.
img_preprocessed = tf.keras.applications.mobilenet_v2.preprocess_input(
img_resized
)
# Expand dims for batch, run through CNN, then squeeze.
features = cnn_model(tf.expand_dims(img_preprocessed, axis=0))
features = features.numpy().squeeze()
features_list.append(features)
labels.append(lbl.numpy())
X = np.stack(features_list)
y_true = np.array(labels)
# First, apply PCA to reduce dimensionality to 50
pca_50 = PCA(n_components=50, random_state=0).fit_transform(X)
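# Reducing to 50 principal components first is a common preprocessing step that
# speeds up t-SNE/UMAP and suppresses noise in the high-dimensional MobileNetV2
# features (1280-D with average pooling).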
# Then compute embedding with t-SNE (TSNE is imported at the top of the script)
X_tsne = TSNE(n_components=2, random_state=0, init="pca").fit_transform(pca_50)
outfile_tsne = output_datetime_path / "semi_supervised_clustering_tsne.png"
plot_clustering_comparison_embedding(
X_tsne, y_true, outfile_tsne, "CNN + PCA + t-SNE"
)
# Then compute embedding with UMAP
X_umap = umap.UMAP(n_components=2, random_state=0).fit_transform(pca_50)
outfile_umap = output_datetime_path / "semi_supervised_clustering_umap.png"
plot_clustering_comparison_embedding(
X_umap, y_true, outfile_umap, "CNN + PCA + UMAP"
)
# -----------------------------------------------------------------------------
# Update the 'latest' results folder: remove previous and copy current outputs
# -----------------------------------------------------------------------------
shutil.rmtree(latest_folder_path, ignore_errors=True)
ensure_dir(latest_folder_path)
for file in output_datetime_path.iterdir():
shutil.copy2(file, latest_folder_path)
# Copy this script to preserve the code used for the outputs
script_path = Path(__file__)
shutil.copy2(script_path, output_datetime_path)
shutil.copy2(script_path, latest_folder_path)
# Archive the outputs
shutil.move(output_datetime_path, archive_folder_path)

View File

@@ -0,0 +1,99 @@
"""
Downloads the cats_vs_dogs dataset, then generates a supervised grid image:
- supervised_grid.png
This script saves outputs in a datetime folder and also copies the latest outputs to a "latest" folder.
All versions of the outputs and scripts are archived.
"""
import shutil
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds
# -----------------------------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------------------------
# Number of supervised samples and grid dimensions
NUM_SUPERVISED = 16
GRID_ROWS = 4
GRID_COLS = 4
# Output directories for saving plots and scripts
output_path = Path("/home/fedex/mt/plots/background_ml_supervised")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name
# -----------------------------------------------------------------------------
# UTILITIES
# -----------------------------------------------------------------------------
def ensure_dir(directory: Path):
directory.mkdir(exist_ok=True, parents=True)
# Create required output directories
ensure_dir(output_path)
ensure_dir(output_datetime_path)
ensure_dir(latest_folder_path)
ensure_dir(archive_folder_path)
# -----------------------------------------------------------------------------
# Supervised grid plot
# -----------------------------------------------------------------------------
def plot_supervised_grid(ds, info, num, rows, cols, outpath):
"""
Plots a grid of images from the dataset with their corresponding labels.
"""
plt.figure(figsize=(cols * 2, rows * 2))
for i, (img, lbl) in enumerate(ds.take(num)):
ax = plt.subplot(rows, cols, i + 1)
ax.imshow(img.numpy().astype("uint8"))
ax.axis("off")
cname = info.features["label"].int2str(lbl.numpy())
ax.set_title(cname, fontsize=9)
plt.tight_layout()
plt.savefig(outpath, dpi=150)
plt.close()
print(f"✔ Saved supervised grid → {outpath}")
# -----------------------------------------------------------------------------
# MAIN
# -----------------------------------------------------------------------------
if __name__ == "__main__":
# Download and prepare the dataset
print("▶ Loading cats_vs_dogs dataset...")
ds, info = tfds.load(
"cats_vs_dogs", split="train", with_info=True, as_supervised=True
)
ds = ds.shuffle(1000, reshuffle_each_iteration=False).cache()
# Generate the supervised grid image
supervised_outfile = output_datetime_path / "supervised_grid.png"
plot_supervised_grid(
ds, info, NUM_SUPERVISED, GRID_ROWS, GRID_COLS, supervised_outfile
)
# -----------------------------------------------------------------------------
# Update the 'latest' results folder: remove previous and copy current outputs
# -----------------------------------------------------------------------------
shutil.rmtree(latest_folder_path, ignore_errors=True)
ensure_dir(latest_folder_path)
for file in output_datetime_path.iterdir():
shutil.copy2(file, latest_folder_path)
# Copy this script to the output folder and to the latest folder to preserve the used code
script_path = Path(__file__)
shutil.copy2(script_path, output_datetime_path)
shutil.copy2(script_path, latest_folder_path)
# Move the output datetime folder to the archive folder for record keeping
shutil.move(output_datetime_path, archive_folder_path)

View File

@@ -0,0 +1,105 @@
"""
Downloads the cats_vs_dogs dataset, then generates an unsupervised clusters image:
- unsupervised_clusters.png
This script saves outputs in a datetime folder and also copies the latest outputs to a "latest" folder.
All versions of the outputs and scripts are archived.
"""
import shutil
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# -----------------------------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------------------------
UNSUP_SAMPLES = 200
output_path = Path("/home/fedex/mt/plots/background_ml_unsupervised")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name
# -----------------------------------------------------------------------------
# UTILITIES
# -----------------------------------------------------------------------------
def ensure_dir(directory: Path):
directory.mkdir(exist_ok=True, parents=True)
# Create required output directories
ensure_dir(output_path)
ensure_dir(output_datetime_path)
ensure_dir(latest_folder_path)
ensure_dir(archive_folder_path)
# -----------------------------------------------------------------------------
# Unsupervised Clustering Plot (PCA + KMeans)
# -----------------------------------------------------------------------------
def plot_unsupervised_clusters(ds, outpath):
"""
Processes a subset of images from the dataset, reduces their dimensionality with PCA,
applies KMeans clustering, and saves a scatterplot of the clusters.
"""
imgs = []
for img, _ in ds.take(UNSUP_SAMPLES):
# resize to 64x64, convert to grayscale by averaging channels
arr = tf.image.resize(img, (64, 64)).numpy().astype("float32").mean(axis=2)
imgs.append(arr.ravel() / 255.0)
X = np.stack(imgs)
pca = PCA(n_components=2, random_state=0)
X2 = pca.fit_transform(X)
km = KMeans(n_clusters=2, random_state=0)
clusters = km.fit_predict(X2)
plt.figure(figsize=(6, 6))
plt.scatter(X2[:, 0], X2[:, 1], c=clusters, s=15, alpha=0.6)
plt.title("Unsupervised: PCA + KMeans")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.tight_layout()
plt.savefig(outpath, dpi=150)
plt.close()
print(f"✔ Saved unsupervised clusters → {outpath}")
# -----------------------------------------------------------------------------
# MAIN
# -----------------------------------------------------------------------------
if __name__ == "__main__":
# Load and prepare the dataset
print("▶ Loading cats_vs_dogs dataset...")
ds, _ = tfds.load("cats_vs_dogs", split="train", with_info=True, as_supervised=True)
ds = ds.shuffle(1000, reshuffle_each_iteration=False).cache()
# Generate the unsupervised clusters image
unsupervised_outfile = output_datetime_path / "unsupervised_clusters.png"
plot_unsupervised_clusters(ds, unsupervised_outfile)
# -----------------------------------------------------------------------------
# Update the 'latest' results folder: remove previous and copy current outputs
# -----------------------------------------------------------------------------
shutil.rmtree(latest_folder_path, ignore_errors=True)
ensure_dir(latest_folder_path)
for file in output_datetime_path.iterdir():
shutil.copy2(file, latest_folder_path)
# Copy this script to the output folder and to the latest folder to preserve the used code
script_path = Path(__file__)
shutil.copy2(script_path, output_datetime_path)
shutil.copy2(script_path, latest_folder_path)
# Move the output datetime folder to the archive folder for record keeping
shutil.move(output_datetime_path, archive_folder_path)

View File

@@ -0,0 +1,255 @@
import json
import pickle
import shutil
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
# define data paths
all_data_path = Path("/home/fedex/mt/data/subter")
output_path = Path("/home/fedex/mt/plots/data_anomalies_timeline")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name
cache_path = output_path
# if output does not exist, create it
output_path.mkdir(exist_ok=True, parents=True)
output_datetime_path.mkdir(exist_ok=True, parents=True)
latest_folder_path.mkdir(exist_ok=True, parents=True)
archive_folder_path.mkdir(exist_ok=True, parents=True)
data_resolution = 32 * 2048
# find all bag files and sort them correctly by name
normal_experiment_paths, anomaly_experiment_paths = [], []
for bag_file_path in all_data_path.iterdir():
if bag_file_path.suffix != ".bag":
continue
if "smoke" in bag_file_path.name:
anomaly_experiment_paths.append(bag_file_path)
else:
normal_experiment_paths.append(bag_file_path)
# Load manually labeled frames
with open(
cache_path / "manually_labeled_anomaly_frames.json", "r"
) as frame_borders_file:
manually_labeled_anomaly_frames_json = json.load(frame_borders_file)
if not manually_labeled_anomaly_frames_json:
print("No manually labeled anomaly frames found. Exiting...")
exit(1)
manually_labeled_anomaly_frames = {}
try:
for file in manually_labeled_anomaly_frames_json["files"]:
if file["filename"] not in (
p.with_suffix(".npy").name for p in anomaly_experiment_paths
):
print(
f"File {file['filename']} from manually labeled frames not found in anomaly experiments. Exiting..."
)
exit(1)
manually_labeled_anomaly_frames[file["filename"]] = (
file["semi_target_begin_frame"],
file["semi_target_end_frame"],
)
except KeyError as e:
print(f"Missing key in manually labeled frames JSON: {e}")
exit(1)
def plot_combined_timeline(
normal_experiment_paths, anomaly_experiment_paths, title, num_bins=50
):
"""Plot both missing points and near-sensor measurements over normalized timeline"""
# Sort experiments by filesize first (to match original processing order)
normal_experiment_paths = sorted(
normal_experiment_paths, key=lambda path: path.stat().st_size
)
anomaly_experiment_paths = sorted(
anomaly_experiment_paths, key=lambda path: path.stat().st_size
)
# Get largest normal experiment and moving anomaly experiments
baseline_path = normal_experiment_paths[-3]  # third-largest normal experiment, used as baseline
moving_exp_indices = [
i
for i, path in enumerate(anomaly_experiment_paths)
if "stationary" not in path.name
]
moving_anomaly_paths = [anomaly_experiment_paths[i] for i in moving_exp_indices]
# Load missing points data
missing_points_cache = Path(cache_path / "missing_points.pkl")
if not missing_points_cache.exists():
print("Missing points cache not found!")
return
# Load near-sensor data (using 500mm threshold)
near_sensor_cache = Path(cache_path / "particles_near_sensor_counts_500.pkl")
if not near_sensor_cache.exists():
print("Near-sensor measurements cache not found!")
return
# Load both cached datasets
with open(missing_points_cache, "rb") as file:
missing_points_normal, missing_points_anomaly = pickle.load(file)
with open(near_sensor_cache, "rb") as file:
near_sensor_normal, near_sensor_anomaly = pickle.load(file)
# Get data for baseline and moving experiments
missing_data = [missing_points_normal[-3]] + [
missing_points_anomaly[i] for i in moving_exp_indices
]
near_sensor_data = [near_sensor_normal[-3]] + [
near_sensor_anomaly[i] for i in moving_exp_indices
]
all_paths = [baseline_path] + moving_anomaly_paths
# Create figure with two y-axes and dynamic layout
fig, ax1 = plt.subplots(figsize=(12, 6), constrained_layout=True)
ax2 = ax1.twinx()
# Color schemes - gray for baseline, colors for anomaly experiments
experiment_colors = ["#808080"] + ["#1f77b4", "#ff7f0e", "#2ca02c"] # gray + colors
# First create custom legend handles for the metrics
from matplotlib.lines import Line2D
metric_legend = [
Line2D([0], [0], color="gray", linestyle="-", label="Missing Points"),
Line2D([0], [0], color="gray", linestyle="--", label="Near-Sensor (<0.5m)"),
Line2D([0], [0], color="gray", linestyle=":", label="Manually Labeled Borders"),
]
# Plot each experiment's data
for i, (missing_exp, near_sensor_exp) in enumerate(
zip(missing_data, near_sensor_data)
):
# Get experiment name without the full path
exp_name = all_paths[i].stem
# Shorten experiment name if needed
exp_name = exp_name.replace("experiment_smoke_", "exp_")
# Convert both to percentages
missing_pct = np.array(missing_exp) / data_resolution * 100
near_sensor_pct = np.array(near_sensor_exp) / data_resolution * 100
# Create normalized timeline bins for both
exp_len = len(missing_pct)
bins = np.linspace(0, exp_len - 1, num_bins)
missing_binned = np.zeros(num_bins)
near_sensor_binned = np.zeros(num_bins)
# Bin both datasets
for bin_idx in range(num_bins):
if bin_idx == num_bins - 1:
start_idx = int(bins[bin_idx])
end_idx = exp_len
else:
start_idx = int(bins[bin_idx])
end_idx = int(bins[bin_idx + 1])
missing_binned[bin_idx] = np.mean(missing_pct[start_idx:end_idx])
near_sensor_binned[bin_idx] = np.mean(near_sensor_pct[start_idx:end_idx])
# Plot both metrics with same color but different line styles
color = experiment_colors[i]
ax1.plot(
range(num_bins),
missing_binned,
color=color,
linestyle="-",
alpha=0.6,
label=exp_name,
)
ax2.plot(
range(num_bins), near_sensor_binned, color=color, linestyle="--", alpha=0.6
)
# Add vertical lines for manually labeled frames if available
if all_paths[i].with_suffix(".npy").name in manually_labeled_anomaly_frames:
begin_frame, end_frame = manually_labeled_anomaly_frames[
all_paths[i].with_suffix(".npy").name
]
# Convert frame numbers to normalized timeline positions
begin_pos = (begin_frame / exp_len) * (num_bins - 1)
end_pos = (end_frame / exp_len) * (num_bins - 1)
# Add vertical lines with matching color and loose dotting
ax1.axvline(
x=begin_pos,
color=color,
linestyle=":",
alpha=0.6,
)
ax1.axvline(
x=end_pos,
color=color,
linestyle=":",
alpha=0.6,
)
# Customize axes
ax1.set_xlabel("Normalized Timeline")
ax1.set_xticks(np.linspace(0, num_bins - 1, 5))
ax1.set_xticklabels([f"{x:.0f}%" for x in np.linspace(0, 100, 5)])
ax1.set_ylabel("Missing Points (%)")
ax2.set_ylabel("Points with <0.5m Range (%)")
plt.title(title)
# Create legends without fixed positions
# First get all lines and labels for experiments
lines1, labels1 = ax1.get_legend_handles_labels()
# Combine both legends into one
all_handles = (
lines1
+ [Line2D([0], [0], color="gray", linestyle="-", label="", alpha=0)]
+ metric_legend
)
all_labels = (
labels1
+ [""]
+ ["Missing Points", "Points Near Sensor (<0.5m)", "Manually Labeled Borders"]
)
# Create single legend in top right corner with consistent margins
fig.legend(all_handles, all_labels, loc="upper right", borderaxespad=4.8)
plt.grid(True, alpha=0.3)
# Save figure letting matplotlib handle the layout
plt.savefig(output_datetime_path / "combined_anomalies_timeline.png", dpi=150)
plt.close()
# Generate the combined timeline plot
plot_combined_timeline(
normal_experiment_paths,
anomaly_experiment_paths,
"Lidar Degradation Indicators Throughout Experiments\n(Baseline and Moving Anomaly Experiments)",
)
# delete current latest folder
shutil.rmtree(latest_folder_path, ignore_errors=True)
# create new latest folder
latest_folder_path.mkdir(exist_ok=True, parents=True)
# copy contents of output folder to the latest folder
for file in output_datetime_path.iterdir():
shutil.copy2(file, latest_folder_path)
# copy this python script to preserve the code used
shutil.copy2(__file__, output_datetime_path)
shutil.copy2(__file__, latest_folder_path)
# move output date folder to archive
shutil.move(output_datetime_path, archive_folder_path)

View File

@@ -0,0 +1,178 @@
import pickle
import shutil
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import open3d as o3d
from pointcloudset import Dataset
from pointcloudset.io.pointcloud.open3d import to_open3d
from rich.progress import track
# define data path containing the bag files
all_data_path = Path("/home/fedex/mt/data/subter")
output_path = Path("/home/fedex/mt/plots/data_icp_rmse")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name
# if output does not exist, create it
output_path.mkdir(exist_ok=True, parents=True)
output_datetime_path.mkdir(exist_ok=True, parents=True)
latest_folder_path.mkdir(exist_ok=True, parents=True)
archive_folder_path.mkdir(exist_ok=True, parents=True)
# Add cache folder
cache_folder = output_path / "cache"
cache_folder.mkdir(exist_ok=True, parents=True)
def get_cache_path(experiment_path: Path) -> Path:
"""Convert experiment bag path to cache pkl path"""
return cache_folder / f"{experiment_path.stem}.pkl"
# find all bag files and sort them correctly by name (experiments with smoke in the name are anomalies)
normal_experiment_paths, anomaly_experiment_paths = [], []
for bag_file_path in all_data_path.iterdir():
if bag_file_path.suffix != ".bag":
continue
if "smoke" in bag_file_path.name:
anomaly_experiment_paths.append(bag_file_path)
else:
normal_experiment_paths.append(bag_file_path)
# sort anomaly and normal experiments by filesize, ascending
anomaly_experiment_paths.sort(key=lambda path: path.stat().st_size)
normal_experiment_paths.sort(key=lambda path: path.stat().st_size)
def calculate_icp_rmse(experiment_path):
"""Calculate ICP RMSE between consecutive frames in an experiment"""
cache_path = get_cache_path(experiment_path)
# Check cache first
if cache_path.exists():
with open(cache_path, "rb") as file:
return pickle.load(file)
dataset = Dataset.from_file(experiment_path, topic="/ouster/points")
rmse_values = []
# Convert iterator to list for progress tracking
pointclouds = list(dataset)
# Get consecutive pairs of point clouds with progress bar
for i in track(
range(len(pointclouds) - 1),
description=f"Processing {experiment_path.name}",
total=len(pointclouds) - 1,
):
# Get consecutive point clouds and convert to Open3D format
source = to_open3d(pointclouds[i])
target = to_open3d(pointclouds[i + 1])
# Apply point-to-point ICP (much faster than point-to-plane)
result = o3d.pipelines.registration.registration_icp(
source,
target,
max_correspondence_distance=0.2, # 20cm max correspondence
init=np.identity(4), # Identity matrix as initial transformation
estimation_method=o3d.pipelines.registration.TransformationEstimationPointToPoint(),
criteria=o3d.pipelines.registration.ICPConvergenceCriteria(
max_iteration=50
),
)
rmse_values.append(result.inlier_rmse)
# Cache the results
with open(cache_path, "wb") as file:
pickle.dump(rmse_values, file, protocol=pickle.HIGHEST_PROTOCOL)
return rmse_values
def plot_statistical_comparison(normal_paths, anomaly_paths, output_path):
"""Create statistical comparison plots between normal and anomalous experiments"""
# Collect all RMSE values
normal_rmse = []
anomaly_rmse = []
print("Loading cached RMSE values...")
for path in normal_paths:
normal_rmse.extend(calculate_icp_rmse(path))
for path in anomaly_paths:
anomaly_rmse.extend(calculate_icp_rmse(path))
# Convert to numpy arrays
normal_rmse = np.array(normal_rmse)
anomaly_rmse = np.array(anomaly_rmse)
# Create figure with two subplots side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
# Box plot
data = [normal_rmse, anomaly_rmse]
ax1.boxplot(data, labels=["Normal", "Anomalous"])
ax1.set_ylabel("ICP RMSE (meters)")
ax1.set_title("Box Plot Comparison")
ax1.grid(True, alpha=0.3)
# Violin plot
ax2.violinplot(data)
ax2.set_xticks([1, 2])
ax2.set_xticklabels(["Normal", "Anomalous"])
ax2.set_ylabel("ICP RMSE (meters)")
ax2.set_title("Violin Plot Comparison")
ax2.grid(True, alpha=0.3)
plt.suptitle(
"ICP RMSE Statistical Comparison: Normal vs Anomalous Experiments", fontsize=14
)
plt.tight_layout()
# Save both linear and log scale versions
plt.savefig(output_path / "icp_rmse_distribution_linear.png", dpi=150)
# Log scale version
ax1.set_yscale("log")
ax2.set_yscale("log")
plt.savefig(output_path / "icp_rmse_distribution_log.png", dpi=150)
plt.close()
# Print some basic statistics
print("\nBasic Statistics:")
print(
f"Normal experiments - mean: {np.mean(normal_rmse):.4f}, std: {np.std(normal_rmse):.4f}"
)
print(
f"Anomaly experiments - mean: {np.mean(anomaly_rmse):.4f}, std: {np.std(anomaly_rmse):.4f}"
)
# Generate the statistical comparison plots
plot_statistical_comparison(
normal_experiment_paths, anomaly_experiment_paths, output_datetime_path
)
# delete current latest folder
shutil.rmtree(latest_folder_path, ignore_errors=True)
# create new latest folder
latest_folder_path.mkdir(exist_ok=True, parents=True)
# copy contents of output folder to the latest folder
for file in output_datetime_path.iterdir():
shutil.copy2(file, latest_folder_path)
# copy this python script to preserve the code used
shutil.copy2(__file__, output_datetime_path)
shutil.copy2(__file__, latest_folder_path)
# move output date folder to archive
shutil.move(output_datetime_path, archive_folder_path)

View File

@@ -0,0 +1,134 @@
import pickle
import shutil
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
# define data paths
all_data_path = Path("/home/fedex/mt/data/subter")
output_path = Path("/home/fedex/mt/plots/data_missing_points_anomalies")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name
# if output does not exist, create it
output_path.mkdir(exist_ok=True, parents=True)
output_datetime_path.mkdir(exist_ok=True, parents=True)
latest_folder_path.mkdir(exist_ok=True, parents=True)
archive_folder_path.mkdir(exist_ok=True, parents=True)
data_resolution = 32 * 2048
# find all bag files and sort them correctly by name
normal_experiment_paths, anomaly_experiment_paths = [], []
for bag_file_path in all_data_path.iterdir():
if bag_file_path.suffix != ".bag":
continue
if "smoke" in bag_file_path.name:
anomaly_experiment_paths.append(bag_file_path)
else:
normal_experiment_paths.append(bag_file_path)
def plot_timeline_comparison(anomaly_experiment_paths, title, num_bins=50):
"""Plot missing points percentage over normalized timeline for moving anomaly experiments"""
# Sort experiments by filesize first (to match original processing order)
anomaly_experiment_paths = sorted(
anomaly_experiment_paths, key=lambda path: path.stat().st_size
)
# Filter out stationary experiments
moving_exp_indices = [
i
for i, path in enumerate(anomaly_experiment_paths)
if "stationary" not in path.name
]
moving_anomaly_paths = [anomaly_experiment_paths[i] for i in moving_exp_indices]
# Try to load cached data from original script's location
cache_path = Path("/home/fedex/mt/plots/data_missing_points/missing_points.pkl")
if not cache_path.exists():
print("No cached data found. Please run the original script first.")
return
with open(cache_path, "rb") as file:
_, missing_points_anomaly = pickle.load(file)
# Get data for moving experiments only (using original indices)
moving_anomaly_data = [missing_points_anomaly[i] for i in moving_exp_indices]
# Create figure
plt.figure(figsize=(12, 6))
# Plot each experiment's timeline
for i, exp_data in enumerate(moving_anomaly_data):
# Convert to percentage
percentages = np.array(exp_data) / data_resolution * 100
# Create normalized timeline bins
exp_len = len(percentages)
bins = np.linspace(0, exp_len - 1, num_bins)
binned_data = np.zeros(num_bins)
# Bin the data
for bin_idx in range(num_bins):
if bin_idx == num_bins - 1:
start_idx = int(bins[bin_idx])
end_idx = exp_len
else:
start_idx = int(bins[bin_idx])
end_idx = int(bins[bin_idx + 1])
binned_data[bin_idx] = np.mean(percentages[start_idx:end_idx])
# Plot with slight transparency to show overlaps
plt.plot(
range(num_bins),
binned_data,
alpha=0.6,
label=f"Experiment {moving_anomaly_paths[i].stem}",
)
plt.title(title)
plt.xlabel("Normalized Timeline")
# Add percentage ticks on x-axis
plt.xticks(
np.linspace(0, num_bins - 1, 5), [f"{x:.0f}%" for x in np.linspace(0, 100, 5)]
)
plt.ylabel("Missing Points (%)")
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()
# Save the plot
plt.savefig(output_datetime_path / "missing_points_timeline.png", dpi=150)
plt.close()
# Generate the timeline comparison plot
plot_timeline_comparison(
anomaly_experiment_paths,
"Missing Lidar Measurements Over Time\n(Moving Anomaly Experiments Only)",
)
# delete current latest folder
shutil.rmtree(latest_folder_path, ignore_errors=True)
# create new latest folder
latest_folder_path.mkdir(exist_ok=True, parents=True)
# copy contents of output folder to the latest folder
for file in output_datetime_path.iterdir():
shutil.copy2(file, latest_folder_path)
# copy this python script to preserve the code used
shutil.copy2(__file__, output_datetime_path)
shutil.copy2(__file__, latest_folder_path)
# move output date folder to archive
shutil.move(output_datetime_path, archive_folder_path)

View File

@@ -0,0 +1,149 @@
import pickle
import shutil
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
# define data paths
all_data_path = Path("/home/fedex/mt/data/subter")
output_path = Path("/home/fedex/mt/plots/data_particles_near_sensor_anomalies")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name
# if output does not exist, create it
output_path.mkdir(exist_ok=True, parents=True)
output_datetime_path.mkdir(exist_ok=True, parents=True)
latest_folder_path.mkdir(exist_ok=True, parents=True)
archive_folder_path.mkdir(exist_ok=True, parents=True)
data_resolution = 32 * 2048
# find all bag files and sort them correctly by name
normal_experiment_paths, anomaly_experiment_paths = [], []
for bag_file_path in all_data_path.iterdir():
if bag_file_path.suffix != ".bag":
continue
if "smoke" in bag_file_path.name:
anomaly_experiment_paths.append(bag_file_path)
else:
normal_experiment_paths.append(bag_file_path)
def plot_timeline_comparison(
anomaly_experiment_paths, title, range_threshold, num_bins=50
):
"""Plot near-sensor measurements percentage over normalized timeline for moving anomaly experiments"""
# Sort experiments by filesize first (to match original processing order)
anomaly_experiment_paths = sorted(
anomaly_experiment_paths, key=lambda path: path.stat().st_size
)
# Filter out stationary experiments
moving_exp_indices = [
i
for i, path in enumerate(anomaly_experiment_paths)
if "stationary" not in path.name
]
moving_anomaly_paths = [anomaly_experiment_paths[i] for i in moving_exp_indices]
# Try to load cached data
cache_path = (
Path("/home/fedex/mt/plots/data_particles_near_sensor")
/ f"particles_near_sensor_counts_{range_threshold}.pkl"
)
if not cache_path.exists():
print(
f"No cached data found for range threshold {range_threshold}. Please run the original script first."
)
return
with open(cache_path, "rb") as file:
_, particles_near_sensor_anomaly = pickle.load(file)
# Get data for moving experiments only (using original indices)
moving_anomaly_data = [particles_near_sensor_anomaly[i] for i in moving_exp_indices]
# Create figure
plt.figure(figsize=(12, 6))
# Plot each experiment's timeline
for i, exp_data in enumerate(moving_anomaly_data):
# Convert to percentage
percentages = np.array(exp_data) / data_resolution * 100
# Create normalized timeline bins
exp_len = len(percentages)
bins = np.linspace(0, exp_len - 1, num_bins)
binned_data = np.zeros(num_bins)
# Bin the data
for bin_idx in range(num_bins):
if bin_idx == num_bins - 1:
start_idx = int(bins[bin_idx])
end_idx = exp_len
else:
start_idx = int(bins[bin_idx])
end_idx = int(bins[bin_idx + 1])
binned_data[bin_idx] = np.mean(percentages[start_idx:end_idx])
# Plot with slight transparency to show overlaps
plt.plot(
range(num_bins),
binned_data,
alpha=0.6,
label=f"Experiment {moving_anomaly_paths[i].stem}",
)
plt.title(f"{title}\n(Range Threshold: {range_threshold / 1000:.1f}m)")
plt.xlabel("Normalized Timeline")
# Add percentage ticks on x-axis
plt.xticks(
np.linspace(0, num_bins - 1, 5), [f"{x:.0f}%" for x in np.linspace(0, 100, 5)]
)
plt.ylabel("Near-Sensor Measurements (%)")
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()
# Save the plot
plt.savefig(
output_datetime_path
/ f"near_sensor_measurements_timeline_{range_threshold}.png",
dpi=150,
)
plt.close()
# Generate timeline comparison plots for each range threshold
range_thresholds = [500, 750, 1000, 1250, 1500]
for rt in range_thresholds:
print(f"Processing range threshold {rt}...")
plot_timeline_comparison(
anomaly_experiment_paths,
"Near-Sensor Lidar Measurements Over Time\n(Moving Anomaly Experiments Only)",
rt,
)
# delete current latest folder
shutil.rmtree(latest_folder_path, ignore_errors=True)
# create new latest folder
latest_folder_path.mkdir(exist_ok=True, parents=True)
# copy contents of output folder to the latest folder
for file in output_datetime_path.iterdir():
shutil.copy2(file, latest_folder_path)
# copy this python script to preserve the code used
shutil.copy2(__file__, output_datetime_path)
shutil.copy2(__file__, latest_folder_path)
# move output date folder to archive
shutil.move(output_datetime_path, archive_folder_path)