tools, lockfile, deps

Jan Kowalczyk
2025-08-13 14:17:12 +02:00
parent cd4dc583e8
commit ef311d862e
17 changed files with 4325 additions and 0 deletions

View File

@@ -0,0 +1,339 @@
import pickle
import shutil
import unittest
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from tabulate import tabulate
# Configuration
results_folders = {
"LeNet": {
"path": Path(
"/home/fedex/mt/projects/thesis-kowalczyk-jan/Deep-SAD-PyTorch/test/DeepSAD/subter_ae_elbow_v2/"
),
"batch_size": 256,
},
"LeNet Efficient": {
"path": Path(
"/home/fedex/mt/projects/thesis-kowalczyk-jan/Deep-SAD-PyTorch/test/DeepSAD/subter_efficient_ae_elbow"
),
"batch_size": 64,
},
}
output_path = Path("/home/fedex/mt/plots/ae_elbow_lenet")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name
# Create output directories
output_path.mkdir(exist_ok=True, parents=True)
output_datetime_path.mkdir(exist_ok=True, parents=True)
latest_folder_path.mkdir(exist_ok=True, parents=True)
archive_folder_path.mkdir(exist_ok=True, parents=True)
def calculate_batch_mean_loss(scores, batch_size):
"""Calculate mean loss over batches similar to the original testing code."""
n_samples = len(scores)
n_batches = (n_samples + batch_size - 1) // batch_size
batch_losses = []
for i in range(0, n_samples, batch_size):
batch_scores = scores[i : i + batch_size]
batch_losses.append(np.mean(batch_scores))
return np.sum(batch_losses) / n_batches
def test_loss_calculation(results, batch_size):
"""Test if our loss calculation matches the original implementation."""
test = unittest.TestCase()
folds = results["ae_results"]
dim = results["dimension"]
for fold_key in folds:
fold_data = folds[fold_key]["test"]
scores = np.array(fold_data["scores"])
original_loss = fold_data["loss"]
calculated_loss = calculate_batch_mean_loss(scores, batch_size)
try:
test.assertAlmostEqual(
original_loss,
calculated_loss,
places=5,
msg=f"Loss mismatch for dim={dim}, {fold_key}",
)
except AssertionError as e:
print(f"Warning: {str(e)}")
print(f"Original: {original_loss:.6f}, Calculated: {calculated_loss:.6f}")
raise
def plot_loss_curve(dims, means, stds, title, color, output_path):
"""Create and save a single loss curve plot."""
plt.figure(figsize=(8, 5))
plt.plot(dims, means, marker="o", color=color, label="Mean Test Loss")
plt.fill_between(
dims,
np.array(means) - np.array(stds),
np.array(means) + np.array(stds),
color=color,
alpha=0.2,
label="Std Dev",
)
plt.xlabel("Latent Dimension")
plt.ylabel("Test Loss")
plt.title(title)
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(dims)
plt.tight_layout()
plt.savefig(output_path, dpi=150, bbox_inches="tight")
plt.close()
def plot_multi_loss_curve(arch_results, title, output_path, colors=None):
"""Create and save a loss curve plot with multiple architectures.
Args:
arch_results: Dict of format {arch_name: (dims, means, stds)}
title: Plot title
output_path: Where to save the plot
colors: Optional dict of colors for each architecture
"""
plt.figure(figsize=(10, 6))
if colors is None:
colors = {
"LeNet": "blue",
"LeNet Asymmetric": "red",
}
# Get unique dimensions across all architectures
all_dims = sorted(
set(dim for _, (dims, _, _) in arch_results.items() for dim in dims)
)
for arch_name, (dims, means, stds) in arch_results.items():
color = colors.get(arch_name, "gray")
plt.plot(dims, means, marker="o", color=color, label=arch_name)
plt.fill_between(
dims,
np.array(means) - np.array(stds),
np.array(means) + np.array(stds),
color=color,
alpha=0.2,
)
plt.xlabel("Latent Dimension")
plt.ylabel("Test Loss")
plt.title(title)
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(all_dims) # Set x-axis ticks to match data points
plt.tight_layout()
plt.savefig(output_path, dpi=150, bbox_inches="tight")
plt.close()
def evaluate_autoencoder_loss():
"""Main function to evaluate autoencoder loss across different latent dimensions."""
# Results storage for each architecture
arch_results = {
name: {"dims": [], "normal": [], "anomaly": []} for name in results_folders
}
# Process each architecture
for arch_name, config in results_folders.items():
results_folder = config["path"]
batch_size = config["batch_size"]
result_files = sorted(
results_folder.glob("ae_elbow_results_subter_*_kfold.pkl")
)
dimensions = []
normal_means = []
normal_stds = []
anomaly_means = []
anomaly_stds = []
overall_means = []
overall_stds = []
# Verify loss calculation
print(
f"\nVerifying loss calculation for {arch_name} (batch_size={batch_size})..."
)
for result_file in result_files:
with open(result_file, "rb") as f:
results = pickle.load(f)
test_loss_calculation(results, batch_size)
print(f"Loss calculation verified successfully for {arch_name}!")
# Process files for this architecture
for result_file in result_files:
with open(result_file, "rb") as f:
results = pickle.load(f)
dim = int(results["dimension"])
folds = results["ae_results"]
normal_fold_losses = []
anomaly_fold_losses = []
overall_fold_losses = []  # per-fold loss over all samples, for the overall plot
for fold_key in folds:
fold_data = folds[fold_key]["test"]
scores = np.array(fold_data["scores"])
labels = np.array(fold_data["labels_exp_based"])
normal_scores = scores[labels == 1]
anomaly_scores = scores[labels == -1]
normal_fold_losses.append(
calculate_batch_mean_loss(normal_scores, batch_size)
)
anomaly_fold_losses.append(
calculate_batch_mean_loss(anomaly_scores, batch_size)
)
overall_fold_losses.append(calculate_batch_mean_loss(scores, batch_size))
dimensions.append(dim)
normal_means.append(np.mean(normal_fold_losses))
normal_stds.append(np.std(normal_fold_losses))
anomaly_means.append(np.mean(anomaly_fold_losses))
anomaly_stds.append(np.std(anomaly_fold_losses))
overall_means.append(np.mean(overall_fold_losses))
overall_stds.append(np.std(overall_fold_losses))
# Sort by dimension
sorted_data = sorted(
zip(
dimensions,
normal_means,
normal_stds,
anomaly_means,
anomaly_stds,
overall_means,
overall_stds,
)
)
dims, n_means, n_stds, a_means, a_stds, o_means, o_stds = zip(*sorted_data)
# Store results for this architecture
arch_results[arch_name] = {
"dims": dims,
"normal": (dims, n_means, n_stds),
"anomaly": (dims, a_means, a_stds),
"overall": (dims, o_means, o_stds),
}
# Create the three plots with all architectures
plot_multi_loss_curve(
{name: results["normal"] for name, results in arch_results.items()},
"Normal Class Test Loss vs. Latent Dimension",
output_datetime_path / "ae_elbow_test_loss_normal.png",
)
plot_multi_loss_curve(
{name: results["anomaly"] for name, results in arch_results.items()},
"Anomaly Class Test Loss vs. Latent Dimension",
output_datetime_path / "ae_elbow_test_loss_anomaly.png",
)
plot_multi_loss_curve(
{name: results["overall"] for name, results in arch_results.items()},
"Overall Test Loss vs. Latent Dimension",
output_datetime_path / "ae_elbow_test_loss_overall.png",
)
def print_loss_comparison(results_folders):
"""Print comparison tables of original vs calculated losses for each architecture."""
print("\nLoss Comparison Tables")
print("=" * 80)
for arch_name, config in results_folders.items():
results_folder = config["path"]
batch_size = config["batch_size"]
result_files = sorted(
results_folder.glob("ae_elbow_results_subter_*_kfold.pkl")
)
# Prepare table data
table_data = []
headers = ["Dimension", "Original", "Calculated", "Diff"]
for result_file in result_files:
with open(result_file, "rb") as f:
results = pickle.load(f)
dim = int(results["dimension"])
folds = results["ae_results"]
# Calculate mean original loss across folds
orig_losses = []
calc_losses = []
for fold_key in folds:
fold_data = folds[fold_key]["test"]
orig_losses.append(fold_data["loss"])
calc_losses.append(
calculate_batch_mean_loss(np.array(fold_data["scores"]), batch_size)
)
orig_mean = np.mean(orig_losses)
calc_mean = np.mean(calc_losses)
diff = abs(orig_mean - calc_mean)
table_data.append([dim, orig_mean, calc_mean, diff])
# Sort by dimension
table_data.sort(key=lambda x: x[0])
print(f"\n{arch_name}:")
print(
tabulate(
table_data,
headers=headers,
floatfmt=".6f",
tablefmt="pipe",
numalign="right",
)
)
print("\n" + "=" * 80)
if __name__ == "__main__":
# Print loss comparisons for all architectures
print_loss_comparison(results_folders)
# Run main analysis
evaluate_autoencoder_loss()
# Archive management
# Delete current latest folder
shutil.rmtree(latest_folder_path, ignore_errors=True)
latest_folder_path.mkdir(exist_ok=True, parents=True)
# Copy contents to latest folder
for file in output_datetime_path.iterdir():
shutil.copy2(file, latest_folder_path)
# Copy this script for reference
shutil.copy2(__file__, output_datetime_path)
shutil.copy2(__file__, latest_folder_path)
# Move output to archive
shutil.move(output_datetime_path, archive_folder_path)

View File

@@ -0,0 +1,223 @@
"""
Downloads the cats_vs_dogs dataset, then generates four PNGs:
- supervised_grid.png
- unsupervised_clusters.png
- semi_supervised_classification.png
- semi_supervised_clustering.png
"""
import os
import random
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.semi_supervised import LabelSpreading
# -----------------------------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------------------------
NUM_SUPERVISED = 16
GRID_ROWS = 4
GRID_COLS = 4
UNSUP_SAMPLES = 200
# how many labeled points to “seed” semi-sup methods
N_LABELED_CLASS = 10 # for classification demo
N_SEEDS_PER_CLASS = 3 # for clustering demo
OUTPUT_DIR = "outputs"
# -----------------------------------------------------------------------------
# UTILITIES
# -----------------------------------------------------------------------------
def ensure_dir(path):
os.makedirs(path, exist_ok=True)
# -----------------------------------------------------------------------------
# 1) Supervised grid
# -----------------------------------------------------------------------------
def plot_supervised_grid(ds, info, num, rows, cols, outpath):
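"""Plot a rows x cols grid of sample images titled with their class names."""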
plt.figure(figsize=(cols * 2, rows * 2))
for i, (img, lbl) in enumerate(ds.take(num)):
ax = plt.subplot(rows, cols, i + 1)
ax.imshow(img.numpy().astype("uint8"))
ax.axis("off")
cname = info.features["label"].int2str(lbl.numpy())
ax.set_title(cname, fontsize=9)
plt.tight_layout()
plt.savefig(outpath, dpi=150)
plt.close()
print(f"✔ Saved supervised grid → {outpath}")
# -----------------------------------------------------------------------------
# 2) Unsupervised clustering (PCA + KMeans)
# -----------------------------------------------------------------------------
def plot_unsupervised_clusters(ds, outpath):
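"""Downscale each image to a 64x64 grayscale vector, project to 2-D with PCA,
cluster with KMeans and save a scatter plot colored by cluster assignment."""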
imgs = []
for img, _ in ds.take(UNSUP_SAMPLES):
arr = (
tf.image.resize(img, (64, 64)).numpy().astype("float32").mean(axis=2)
) # resize and grayscale to speed up
imgs.append(arr.ravel() / 255.0)
X = np.stack(imgs)
pca = PCA(n_components=2, random_state=0)
X2 = pca.fit_transform(X)
km = KMeans(n_clusters=2, random_state=0)
clusters = km.fit_predict(X2)
plt.figure(figsize=(6, 6))
plt.scatter(X2[:, 0], X2[:, 1], c=clusters, s=15, alpha=0.6)
plt.title("Unsupervised: PCA + KMeans")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.tight_layout()
plt.savefig(outpath, dpi=150)
plt.close()
print(f"✔ Saved unsupervised clusters → {outpath}")
return X2, clusters # return for reuse
# -----------------------------------------------------------------------------
# 3) Semisupervised classification demo
# -----------------------------------------------------------------------------
def plot_semi_supervised_classification(X2, y_true, outpath):
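"""Compare a purely supervised LogisticRegression trained on a few labeled points
against LabelSpreading, which also propagates labels through the unlabeled points,
and plot both decision boundaries in the 2-D PCA space."""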
n = X2.shape[0]
all_idx = list(range(n))
labeled_idx = random.sample(all_idx, N_LABELED_CLASS)
unlabeled_idx = list(set(all_idx) - set(labeled_idx))
# pure supervised
clf = LogisticRegression().fit(X2[labeled_idx], y_true[labeled_idx])
# semisupervised
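# scikit-learn's semi-supervised estimators treat the label -1 as "unlabeled",
# so only the sampled seed points keep their true labels in y_train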
y_train = np.full(n, -1, dtype=int)
y_train[labeled_idx] = y_true[labeled_idx]
ls = LabelSpreading().fit(X2, y_train)
# grid for decision boundary
x_min, x_max = X2[:, 0].min() - 1, X2[:, 0].max() + 1
y_min, y_max = X2[:, 1].min() - 1, X2[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
grid = np.c_[xx.ravel(), yy.ravel()]
pred_sup = clf.predict(grid).reshape(xx.shape)
pred_semi = ls.predict(grid).reshape(xx.shape)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, Z, title in zip(
axes, [pred_sup, pred_semi], ["Supervised only", "Semi-supervised"]
):
ax.contourf(xx, yy, Z, alpha=0.3)
ax.scatter(X2[unlabeled_idx, 0], X2[unlabeled_idx, 1], s=20, alpha=0.4)
ax.scatter(
X2[labeled_idx, 0],
X2[labeled_idx, 1],
c=y_true[labeled_idx],
s=80,
edgecolor="k",
)
ax.set_title(title)
ax.set_xlabel("PCA 1")
ax.set_ylabel("PCA 2")
plt.tight_layout()
plt.savefig(outpath, dpi=150)
plt.close()
print(f"✔ Saved semi-supervised classification → {outpath}")
# -----------------------------------------------------------------------------
# 4) Semisupervised clustering (seeded KMeans)
# -----------------------------------------------------------------------------
def plot_semi_supervised_clustering(X2, y_true, outpath):
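"""Contrast plain KMeans with KMeans initialised from the mean of a few labeled
seed points per class, and highlight the seed points in both scatter plots."""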
# pick a few seeds per class
cats = np.where(y_true == 0)[0]
dogs = np.where(y_true == 1)[0]
seeds = list(np.random.choice(cats, N_SEEDS_PER_CLASS, replace=False)) + list(
np.random.choice(dogs, N_SEEDS_PER_CLASS, replace=False)
)
# pure KMeans
km1 = KMeans(n_clusters=2, random_state=0).fit(X2)
lab1 = km1.labels_
# seeded: init centers from seed means
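# with an explicit init array, n_init=1 makes KMeans run exactly once from the
# seed-derived centers instead of performing multiple random restarts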
ctr0 = X2[seeds[:N_SEEDS_PER_CLASS]].mean(axis=0)
ctr1 = X2[seeds[N_SEEDS_PER_CLASS:]].mean(axis=0)
km2 = KMeans(
n_clusters=2, init=np.vstack([ctr0, ctr1]), n_init=1, random_state=0
).fit(X2)
lab2 = km2.labels_
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, labels, title in zip(axes, [lab1, lab2], ["Pure KMeans", "Seeded KMeans"]):
ax.scatter(X2[:, 0], X2[:, 1], c=labels, s=20, alpha=0.6)
ax.scatter(
X2[seeds, 0],
X2[seeds, 1],
c=y_true[seeds],
edgecolor="k",
marker="x",
s=100,
linewidths=2,
)
ax.set_title(title)
ax.set_xlabel("PCA 1")
ax.set_ylabel("PCA 2")
plt.tight_layout()
plt.savefig(outpath, dpi=150)
plt.close()
print(f"✔ Saved semi-supervised clustering → {outpath}")
# -----------------------------------------------------------------------------
# MAIN
# -----------------------------------------------------------------------------
if __name__ == "__main__":
ensure_dir(OUTPUT_DIR)
# load
print("▶ Loading cats_vs_dogs...")
ds, info = tfds.load(
"cats_vs_dogs", split="train", with_info=True, as_supervised=True
)
ds = ds.shuffle(1000, reshuffle_each_iteration=False).cache()
# supervised
plot_supervised_grid(
ds,
info,
NUM_SUPERVISED,
GRID_ROWS,
GRID_COLS,
os.path.join(OUTPUT_DIR, "supervised_grid.png"),
)
# unsupervised
# also returns X2 for downstream demos
X2, _ = plot_unsupervised_clusters(
ds, os.path.join(OUTPUT_DIR, "unsupervised_clusters.png")
)
# collect true labels for that same subset
# (we need a y_true array for semi-sup demos)
y_true = np.array([lbl.numpy() for _, lbl in ds.take(UNSUP_SAMPLES)])
# semi-supervised classification
plot_semi_supervised_classification(
X2, y_true, os.path.join(OUTPUT_DIR, "semi_supervised_classification.png")
)
# semi-supervised clustering
plot_semi_supervised_clustering(
X2, y_true, os.path.join(OUTPUT_DIR, "semi_supervised_clustering.png")
)

View File

@@ -0,0 +1,274 @@
"""
Downloads the cats_vs_dogs dataset, extracts deep features using MobileNetV2,
then generates two clustering comparison illustrations using two different embedding pipelines:
- PCA followed by t-SNE (saved as: semi_supervised_clustering_tsne.png)
- PCA followed by UMAP (saved as: semi_supervised_clustering_umap.png)
Each illustration compares:
- Unsupervised clustering using the deep embedding + KMeans
- Semi-supervised clustering using Label Spreading with a few labeled seeds
This script saves outputs in a datetime folder and also copies the latest outputs
to a "latest" folder. All versions of the outputs and scripts are archived.
"""
import random
import shutil
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import umap
from scipy.optimize import linear_sum_assignment
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.semi_supervised import LabelSpreading
# -----------------------------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------------------------
UNSUP_SAMPLES = 200 # number of samples to use for the demo
N_LABELED_CLASS = 20 # number of labeled seeds for the semi-supervised approach
output_path = Path("/home/fedex/mt/plots/background_ml_semisupervised")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name
# -----------------------------------------------------------------------------
# UTILITIES
# -----------------------------------------------------------------------------
def ensure_dir(directory: Path):
directory.mkdir(exist_ok=True, parents=True)
def cluster_accuracy(y_true, y_pred):
"""
Compute clustering accuracy by determining the optimal mapping
between predicted clusters and true labels using the Hungarian algorithm.
"""
y_true = y_true.astype(np.int64)
y_pred = y_pred.astype(np.int64)
labels = np.unique(y_true)
clusters = np.unique(y_pred)
contingency = np.zeros((labels.size, clusters.size), dtype=np.int64)
for i, label in enumerate(labels):
for j, cluster in enumerate(clusters):
contingency[i, j] = np.sum((y_true == label) & (y_pred == cluster))
row_ind, col_ind = linear_sum_assignment(-contingency)
accuracy = contingency[row_ind, col_ind].sum() / y_true.size
return accuracy
def plot_clustering_comparison_embedding(embedding, y_true, outpath, method_name=""):
"""
Given a 2D data embedding (e.g., from PCA+t-SNE or PCA+UMAP), this function:
- Performs unsupervised clustering with KMeans.
- Performs semi-supervised clustering with Label Spreading using a few labeled seeds.
- Computes accuracy via the Hungarian algorithm.
- Plots the decision boundaries from both methods overlaid with the true labels.
- Annotates the plot with the accuracy results.
The 'method_name' is used in the plot title to indicate which embedding is used.
"""
n = embedding.shape[0]
all_idx = list(range(n))
labeled_idx = random.sample(all_idx, N_LABELED_CLASS)
# Unsupervised clustering using KMeans on all embedded data
km = KMeans(n_clusters=2, random_state=0).fit(embedding)
unsup_pred = km.predict(embedding)
unsup_accuracy = cluster_accuracy(y_true, unsup_pred)
# Create a grid over the space for decision boundaries
x_min, x_max = embedding[:, 0].min() - 1, embedding[:, 0].max() + 1
y_min, y_max = embedding[:, 1].min() - 1, embedding[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
grid = np.c_[xx.ravel(), yy.ravel()]
pred_unsup = km.predict(grid).reshape(xx.shape)
# Semi-supervised clustering using Label Spreading with labeled seeds
y_train = np.full(n, -1, dtype=int)
y_train[labeled_idx] = y_true[labeled_idx]
ls = LabelSpreading().fit(embedding, y_train)
semi_pred = ls.predict(embedding)
semi_accuracy = cluster_accuracy(y_true, semi_pred)
pred_semi = ls.predict(grid).reshape(xx.shape)
cmap = plt.cm.coolwarm
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
# Unsupervised plot:
ax = axes[0]
ax.contourf(xx, yy, pred_unsup, alpha=0.2, cmap=cmap)
sc1 = ax.scatter(
embedding[:, 0],
embedding[:, 1],
c=y_true,
cmap=cmap,
s=30,
alpha=0.8,
edgecolor="k",
)
ax.set_title(
f"{method_name}\nUnsupervised (KMeans) - Acc: {unsup_accuracy:.2f}", fontsize=10
)
ax.set_xlabel("Dim 1")
ax.set_ylabel("Dim 2")
handles = []
for cl in np.unique(y_true):
handles.append(
plt.Line2D(
[],
[],
marker="o",
linestyle="",
color=cmap(cl / max(np.max(y_true), 1)),  # match the scatter's 0..1 color normalization
label=f"Class {cl}",
markersize=6,
)
)
ax.legend(handles=handles, loc="upper right")
# Semi-supervised plot:
ax = axes[1]
ax.contourf(xx, yy, pred_semi, alpha=0.2, cmap=cmap)
sc2 = ax.scatter(
embedding[:, 0],
embedding[:, 1],
c=y_true,
cmap=cmap,
s=30,
alpha=0.8,
edgecolor="none",
)
sc3 = ax.scatter(
embedding[labeled_idx, 0],
embedding[labeled_idx, 1],
c=y_true[labeled_idx],
cmap=cmap,
s=80,
edgecolor="k",
marker="o",
label="Labeled Seeds",
)
ax.set_title(
f"{method_name}\nSemi-Supervised (Label Spreading) - Acc: {semi_accuracy:.2f}",
fontsize=10,
)
ax.set_xlabel("Dim 1")
ax.set_ylabel("Dim 2")
handles = []
for cl in np.unique(y_true):
handles.append(
plt.Line2D(
[],
[],
marker="o",
linestyle="",
color=cmap(cl / max(np.max(y_true), 1)),  # match the scatter's 0..1 color normalization
label=f"Class {cl}",
markersize=6,
)
)
handles.append(
plt.Line2D(
[],
[],
marker="o",
linestyle="",
color="black",
label="Labeled Seed",
markersize=8,
)
)
ax.legend(handles=handles, loc="upper right")
plt.tight_layout()
plt.savefig(outpath, dpi=150)
plt.close()
print(f"✔ Saved clustering comparison illustration → {outpath}")
print(f"Unsupervised Accuracy: {unsup_accuracy:.2f}")
print(f"Semi-Supervised Accuracy: {semi_accuracy:.2f}")
# -----------------------------------------------------------------------------
# MAIN WITH PRE-TRAINED CNN FEATURE EXTRACTION
# -----------------------------------------------------------------------------
if __name__ == "__main__":
# Create output directories
ensure_dir(output_path)
ensure_dir(output_datetime_path)
ensure_dir(latest_folder_path)
ensure_dir(archive_folder_path)
print("▶ Loading cats_vs_dogs dataset...")
ds, info = tfds.load(
"cats_vs_dogs", split="train", with_info=True, as_supervised=True
)
ds = ds.shuffle(1000, reshuffle_each_iteration=False).cache()
# Load a pre-trained CNN (MobileNetV2) for feature extraction.
cnn_model = tf.keras.applications.MobileNetV2(
include_top=False, weights="imagenet", pooling="avg", input_shape=(224, 224, 3)
)
# Extract deep features for all samples.
features_list = []
labels = []
for img, lbl in ds.take(UNSUP_SAMPLES):
# Resize to 224x224 and keep 3 channels.
img_resized = tf.image.resize(img, (224, 224))
# Preprocess the image for MobileNetV2.
img_preprocessed = tf.keras.applications.mobilenet_v2.preprocess_input(
img_resized
)
# Expand dims for batch, run through CNN, then squeeze.
features = cnn_model(tf.expand_dims(img_preprocessed, axis=0))
features = features.numpy().squeeze()
features_list.append(features)
labels.append(lbl.numpy())
X = np.stack(features_list)
y_true = np.array(labels)
# First, apply PCA to reduce dimensionality to 50
pca_50 = PCA(n_components=50, random_state=0).fit_transform(X)
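# Reducing to 50 principal components first is a common preprocessing step that
# speeds up t-SNE/UMAP and suppresses noise in the high-dimensional MobileNetV2
# features (1280-D with average pooling).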
# Then compute embedding with t-SNE (TSNE is imported at the top of the script)
X_tsne = TSNE(n_components=2, random_state=0, init="pca").fit_transform(pca_50)
outfile_tsne = output_datetime_path / "semi_supervised_clustering_tsne.png"
plot_clustering_comparison_embedding(
X_tsne, y_true, outfile_tsne, "CNN + PCA + t-SNE"
)
# Then compute embedding with UMAP
X_umap = umap.UMAP(n_components=2, random_state=0).fit_transform(pca_50)
outfile_umap = output_datetime_path / "semi_supervised_clustering_umap.png"
plot_clustering_comparison_embedding(
X_umap, y_true, outfile_umap, "CNN + PCA + UMAP"
)
# -----------------------------------------------------------------------------
# Update the 'latest' results folder: remove previous and copy current outputs
# -----------------------------------------------------------------------------
shutil.rmtree(latest_folder_path, ignore_errors=True)
ensure_dir(latest_folder_path)
for file in output_datetime_path.iterdir():
shutil.copy2(file, latest_folder_path)
# Copy this script to preserve the code used for the outputs
script_path = Path(__file__)
shutil.copy2(script_path, output_datetime_path)
shutil.copy2(script_path, latest_folder_path)
# Archive the outputs
shutil.move(output_datetime_path, archive_folder_path)

View File

@@ -0,0 +1,99 @@
"""
Downloads the cats_vs_dogs dataset, then generates a supervised grid image:
- supervised_grid.png
This script saves outputs in a datetime folder and also copies the latest outputs to a "latest" folder.
All versions of the outputs and scripts are archived.
"""
import shutil
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds
# -----------------------------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------------------------
# Number of supervised samples and grid dimensions
NUM_SUPERVISED = 16
GRID_ROWS = 4
GRID_COLS = 4
# Output directories for saving plots and scripts
output_path = Path("/home/fedex/mt/plots/background_ml_supervised")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name
# -----------------------------------------------------------------------------
# UTILITIES
# -----------------------------------------------------------------------------
def ensure_dir(directory: Path):
directory.mkdir(exist_ok=True, parents=True)
# Create required output directories
ensure_dir(output_path)
ensure_dir(output_datetime_path)
ensure_dir(latest_folder_path)
ensure_dir(archive_folder_path)
# -----------------------------------------------------------------------------
# Supervised grid plot
# -----------------------------------------------------------------------------
def plot_supervised_grid(ds, info, num, rows, cols, outpath):
"""
Plots a grid of images from the dataset with their corresponding labels.
"""
plt.figure(figsize=(cols * 2, rows * 2))
for i, (img, lbl) in enumerate(ds.take(num)):
ax = plt.subplot(rows, cols, i + 1)
ax.imshow(img.numpy().astype("uint8"))
ax.axis("off")
cname = info.features["label"].int2str(lbl.numpy())
ax.set_title(cname, fontsize=9)
plt.tight_layout()
plt.savefig(outpath, dpi=150)
plt.close()
print(f"✔ Saved supervised grid → {outpath}")
# -----------------------------------------------------------------------------
# MAIN
# -----------------------------------------------------------------------------
if __name__ == "__main__":
# Download and prepare the dataset
print("▶ Loading cats_vs_dogs dataset...")
ds, info = tfds.load(
"cats_vs_dogs", split="train", with_info=True, as_supervised=True
)
ds = ds.shuffle(1000, reshuffle_each_iteration=False).cache()
# Generate the supervised grid image
supervised_outfile = output_datetime_path / "supervised_grid.png"
plot_supervised_grid(
ds, info, NUM_SUPERVISED, GRID_ROWS, GRID_COLS, supervised_outfile
)
# -----------------------------------------------------------------------------
# Update the 'latest' results folder: remove previous and copy current outputs
# -----------------------------------------------------------------------------
shutil.rmtree(latest_folder_path, ignore_errors=True)
ensure_dir(latest_folder_path)
for file in output_datetime_path.iterdir():
shutil.copy2(file, latest_folder_path)
# Copy this script to the output folder and to the latest folder to preserve the used code
script_path = Path(__file__)
shutil.copy2(script_path, output_datetime_path)
shutil.copy2(script_path, latest_folder_path)
# Move the output datetime folder to the archive folder for record keeping
shutil.move(output_datetime_path, archive_folder_path)

View File

@@ -0,0 +1,105 @@
"""
Downloads the cats_vs_dogs dataset, then generates an unsupervised clusters image:
- unsupervised_clusters.png
This script saves outputs in a datetime folder and also copies the latest outputs to a "latest" folder.
All versions of the outputs and scripts are archived.
"""
import shutil
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# -----------------------------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------------------------
UNSUP_SAMPLES = 200
output_path = Path("/home/fedex/mt/plots/background_ml_unsupervised")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name
# -----------------------------------------------------------------------------
# UTILITIES
# -----------------------------------------------------------------------------
def ensure_dir(directory: Path):
directory.mkdir(exist_ok=True, parents=True)
# Create required output directories
ensure_dir(output_path)
ensure_dir(output_datetime_path)
ensure_dir(latest_folder_path)
ensure_dir(archive_folder_path)
# -----------------------------------------------------------------------------
# Unsupervised Clustering Plot (PCA + KMeans)
# -----------------------------------------------------------------------------
def plot_unsupervised_clusters(ds, outpath):
"""
Processes a subset of images from the dataset, reduces their dimensionality with PCA,
applies KMeans clustering, and saves a scatterplot of the clusters.
"""
imgs = []
for img, _ in ds.take(UNSUP_SAMPLES):
# resize to 64x64, convert to grayscale by averaging channels
arr = tf.image.resize(img, (64, 64)).numpy().astype("float32").mean(axis=2)
imgs.append(arr.ravel() / 255.0)
X = np.stack(imgs)
pca = PCA(n_components=2, random_state=0)
X2 = pca.fit_transform(X)
km = KMeans(n_clusters=2, random_state=0)
clusters = km.fit_predict(X2)
plt.figure(figsize=(6, 6))
plt.scatter(X2[:, 0], X2[:, 1], c=clusters, s=15, alpha=0.6)
plt.title("Unsupervised: PCA + KMeans")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.tight_layout()
plt.savefig(outpath, dpi=150)
plt.close()
print(f"✔ Saved unsupervised clusters → {outpath}")
# -----------------------------------------------------------------------------
# MAIN
# -----------------------------------------------------------------------------
if __name__ == "__main__":
# Load and prepare the dataset
print("▶ Loading cats_vs_dogs dataset...")
ds, _ = tfds.load("cats_vs_dogs", split="train", with_info=True, as_supervised=True)
ds = ds.shuffle(1000, reshuffle_each_iteration=False).cache()
# Generate the unsupervised clusters image
unsupervised_outfile = output_datetime_path / "unsupervised_clusters.png"
plot_unsupervised_clusters(ds, unsupervised_outfile)
# -----------------------------------------------------------------------------
# Update the 'latest' results folder: remove previous and copy current outputs
# -----------------------------------------------------------------------------
shutil.rmtree(latest_folder_path, ignore_errors=True)
ensure_dir(latest_folder_path)
for file in output_datetime_path.iterdir():
shutil.copy2(file, latest_folder_path)
# Copy this script to the output folder and to the latest folder to preserve the used code
script_path = Path(__file__)
shutil.copy2(script_path, output_datetime_path)
shutil.copy2(script_path, latest_folder_path)
# Move the output datetime folder to the archive folder for record keeping
shutil.move(output_datetime_path, archive_folder_path)

View File

@@ -0,0 +1,255 @@
import json
import pickle
import shutil
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
# define data paths
all_data_path = Path("/home/fedex/mt/data/subter")
output_path = Path("/home/fedex/mt/plots/data_anomalies_timeline")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name
cache_path = output_path
# if output does not exist, create it
output_path.mkdir(exist_ok=True, parents=True)
output_datetime_path.mkdir(exist_ok=True, parents=True)
latest_folder_path.mkdir(exist_ok=True, parents=True)
archive_folder_path.mkdir(exist_ok=True, parents=True)
data_resolution = 32 * 2048
# find all bag files and sort them correctly by name
normal_experiment_paths, anomaly_experiment_paths = [], []
for bag_file_path in all_data_path.iterdir():
if bag_file_path.suffix != ".bag":
continue
if "smoke" in bag_file_path.name:
anomaly_experiment_paths.append(bag_file_path)
else:
normal_experiment_paths.append(bag_file_path)
# Load manually labeled frames
with open(
cache_path / "manually_labeled_anomaly_frames.json", "r"
) as frame_borders_file:
manually_labeled_anomaly_frames_json = json.load(frame_borders_file)
if not manually_labeled_anomaly_frames_json:
print("No manually labeled anomaly frames found. Exiting...")
exit(1)
manually_labeled_anomaly_frames = {}
try:
for file in manually_labeled_anomaly_frames_json["files"]:
if file["filename"] not in (
p.with_suffix(".npy").name for p in anomaly_experiment_paths
):
print(
f"File {file['filename']} from manually labeled frames not found in anomaly experiments. Exiting..."
)
exit(1)
manually_labeled_anomaly_frames[file["filename"]] = (
file["semi_target_begin_frame"],
file["semi_target_end_frame"],
)
except KeyError as e:
print(f"Missing key in manually labeled frames JSON: {e}")
exit(1)
def plot_combined_timeline(
normal_experiment_paths, anomaly_experiment_paths, title, num_bins=50
):
"""Plot both missing points and near-sensor measurements over normalized timeline"""
# Sort experiments by filesize first (to match original processing order)
normal_experiment_paths = sorted(
normal_experiment_paths, key=lambda path: path.stat().st_size
)
anomaly_experiment_paths = sorted(
anomaly_experiment_paths, key=lambda path: path.stat().st_size
)
# Get largest normal experiment and moving anomaly experiments
baseline_path = normal_experiment_paths[-3]  # third-largest normal experiment, used as baseline
moving_exp_indices = [
i
for i, path in enumerate(anomaly_experiment_paths)
if "stationary" not in path.name
]
moving_anomaly_paths = [anomaly_experiment_paths[i] for i in moving_exp_indices]
# Load missing points data
missing_points_cache = Path(cache_path / "missing_points.pkl")
if not missing_points_cache.exists():
print("Missing points cache not found!")
return
# Load near-sensor data (using 500mm threshold)
near_sensor_cache = Path(cache_path / "particles_near_sensor_counts_500.pkl")
if not near_sensor_cache.exists():
print("Near-sensor measurements cache not found!")
return
# Load both cached datasets
with open(missing_points_cache, "rb") as file:
missing_points_normal, missing_points_anomaly = pickle.load(file)
with open(near_sensor_cache, "rb") as file:
near_sensor_normal, near_sensor_anomaly = pickle.load(file)
# Get data for baseline and moving experiments
missing_data = [missing_points_normal[-3]] + [
missing_points_anomaly[i] for i in moving_exp_indices
]
near_sensor_data = [near_sensor_normal[-3]] + [
near_sensor_anomaly[i] for i in moving_exp_indices
]
all_paths = [baseline_path] + moving_anomaly_paths
# Create figure with two y-axes and dynamic layout
fig, ax1 = plt.subplots(figsize=(12, 6), constrained_layout=True)
ax2 = ax1.twinx()
# Color schemes - gray for baseline, colors for anomaly experiments
experiment_colors = ["#808080"] + ["#1f77b4", "#ff7f0e", "#2ca02c"] # gray + colors
# First create custom legend handles for the metrics
from matplotlib.lines import Line2D
metric_legend = [
Line2D([0], [0], color="gray", linestyle="-", label="Missing Points"),
Line2D([0], [0], color="gray", linestyle="--", label="Near-Sensor (<0.5m)"),
Line2D([0], [0], color="gray", linestyle=":", label="Manually Labeled Borders"),
]
# Plot each experiment's data
for i, (missing_exp, near_sensor_exp) in enumerate(
zip(missing_data, near_sensor_data)
):
# Get experiment name without the full path
exp_name = all_paths[i].stem
# Shorten experiment name if needed
exp_name = exp_name.replace("experiment_smoke_", "exp_")
# Convert both to percentages
missing_pct = np.array(missing_exp) / data_resolution * 100
near_sensor_pct = np.array(near_sensor_exp) / data_resolution * 100
# Create normalized timeline bins for both
exp_len = len(missing_pct)
bins = np.linspace(0, exp_len - 1, num_bins)
missing_binned = np.zeros(num_bins)
near_sensor_binned = np.zeros(num_bins)
# Bin both datasets
for bin_idx in range(num_bins):
if bin_idx == num_bins - 1:
start_idx = int(bins[bin_idx])
end_idx = exp_len
else:
start_idx = int(bins[bin_idx])
end_idx = int(bins[bin_idx + 1])
missing_binned[bin_idx] = np.mean(missing_pct[start_idx:end_idx])
near_sensor_binned[bin_idx] = np.mean(near_sensor_pct[start_idx:end_idx])
# Plot both metrics with same color but different line styles
color = experiment_colors[i]
ax1.plot(
range(num_bins),
missing_binned,
color=color,
linestyle="-",
alpha=0.6,
label=exp_name,
)
ax2.plot(
range(num_bins), near_sensor_binned, color=color, linestyle="--", alpha=0.6
)
# Add vertical lines for manually labeled frames if available
if all_paths[i].with_suffix(".npy").name in manually_labeled_anomaly_frames:
begin_frame, end_frame = manually_labeled_anomaly_frames[
all_paths[i].with_suffix(".npy").name
]
# Convert frame numbers to normalized timeline positions
begin_pos = (begin_frame / exp_len) * (num_bins - 1)
end_pos = (end_frame / exp_len) * (num_bins - 1)
# Add vertical lines with matching color and loose dotting
ax1.axvline(
x=begin_pos,
color=color,
linestyle=":",
alpha=0.6,
)
ax1.axvline(
x=end_pos,
color=color,
linestyle=":",
alpha=0.6,
)
# Customize axes
ax1.set_xlabel("Normalized Timeline")
ax1.set_xticks(np.linspace(0, num_bins - 1, 5))
ax1.set_xticklabels([f"{x:.0f}%" for x in np.linspace(0, 100, 5)])
ax1.set_ylabel("Missing Points (%)")
ax2.set_ylabel("Points with <0.5m Range (%)")
plt.title(title)
# Create legends without fixed positions
# First get all lines and labels for experiments
lines1, labels1 = ax1.get_legend_handles_labels()
# Combine both legends into one
all_handles = (
lines1
+ [Line2D([0], [0], color="gray", linestyle="-", label="", alpha=0)]
+ metric_legend
)
all_labels = (
labels1
+ [""]
+ ["Missing Points", "Points Near Sensor (<0.5m)", "Manually Labeled Borders"]
)
# Create single legend in top right corner with consistent margins
fig.legend(all_handles, all_labels, loc="upper right", borderaxespad=4.8)
plt.grid(True, alpha=0.3)
# Save figure letting matplotlib handle the layout
plt.savefig(output_datetime_path / "combined_anomalies_timeline.png", dpi=150)
plt.close()
# Generate the combined timeline plot
plot_combined_timeline(
normal_experiment_paths,
anomaly_experiment_paths,
"Lidar Degradation Indicators Throughout Experiments\n(Baseline and Moving Anomaly Experiments)",
)
# delete current latest folder
shutil.rmtree(latest_folder_path, ignore_errors=True)
# create new latest folder
latest_folder_path.mkdir(exist_ok=True, parents=True)
# copy contents of output folder to the latest folder
for file in output_datetime_path.iterdir():
shutil.copy2(file, latest_folder_path)
# copy this python script to preserve the code used
shutil.copy2(__file__, output_datetime_path)
shutil.copy2(__file__, latest_folder_path)
# move output date folder to archive
shutil.move(output_datetime_path, archive_folder_path)

View File

@@ -0,0 +1,178 @@
import pickle
import shutil
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import open3d as o3d
from pointcloudset import Dataset
from pointcloudset.io.pointcloud.open3d import to_open3d
from rich.progress import track
# define data path containing the bag files
all_data_path = Path("/home/fedex/mt/data/subter")
output_path = Path("/home/fedex/mt/plots/data_icp_rmse")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name
# if output does not exist, create it
output_path.mkdir(exist_ok=True, parents=True)
output_datetime_path.mkdir(exist_ok=True, parents=True)
latest_folder_path.mkdir(exist_ok=True, parents=True)
archive_folder_path.mkdir(exist_ok=True, parents=True)
# Add cache folder
cache_folder = output_path / "cache"
cache_folder.mkdir(exist_ok=True, parents=True)
def get_cache_path(experiment_path: Path) -> Path:
"""Convert experiment bag path to cache pkl path"""
return cache_folder / f"{experiment_path.stem}.pkl"
# find all bag files and sort them correctly by name (experiments with smoke in the name are anomalies)
normal_experiment_paths, anomaly_experiment_paths = [], []
for bag_file_path in all_data_path.iterdir():
if bag_file_path.suffix != ".bag":
continue
if "smoke" in bag_file_path.name:
anomaly_experiment_paths.append(bag_file_path)
else:
normal_experiment_paths.append(bag_file_path)
# sort anomaly and normal experiments by filesize, ascending
anomaly_experiment_paths.sort(key=lambda path: path.stat().st_size)
normal_experiment_paths.sort(key=lambda path: path.stat().st_size)
def calculate_icp_rmse(experiment_path):
"""Calculate ICP RMSE between consecutive frames in an experiment"""
cache_path = get_cache_path(experiment_path)
# Check cache first
if cache_path.exists():
with open(cache_path, "rb") as file:
return pickle.load(file)
dataset = Dataset.from_file(experiment_path, topic="/ouster/points")
rmse_values = []
# Convert iterator to list for progress tracking
pointclouds = list(dataset)
# Get consecutive pairs of point clouds with progress bar
for i in track(
range(len(pointclouds) - 1),
description=f"Processing {experiment_path.name}",
total=len(pointclouds) - 1,
):
# Get consecutive point clouds and convert to Open3D format
source = to_open3d(pointclouds[i])
target = to_open3d(pointclouds[i + 1])
# Apply point-to-point ICP (much faster than point-to-plane)
result = o3d.pipelines.registration.registration_icp(
source,
target,
max_correspondence_distance=0.2, # 20cm max correspondence
init=np.identity(4), # Identity matrix as initial transformation
estimation_method=o3d.pipelines.registration.TransformationEstimationPointToPoint(),
criteria=o3d.pipelines.registration.ICPConvergenceCriteria(
max_iteration=50
),
)
rmse_values.append(result.inlier_rmse)
# Cache the results
with open(cache_path, "wb") as file:
pickle.dump(rmse_values, file, protocol=pickle.HIGHEST_PROTOCOL)
return rmse_values
def plot_statistical_comparison(normal_paths, anomaly_paths, output_path):
"""Create statistical comparison plots between normal and anomalous experiments"""
# Collect all RMSE values
normal_rmse = []
anomaly_rmse = []
print("Loading cached RMSE values...")
for path in normal_paths:
normal_rmse.extend(calculate_icp_rmse(path))
for path in anomaly_paths:
anomaly_rmse.extend(calculate_icp_rmse(path))
# Convert to numpy arrays
normal_rmse = np.array(normal_rmse)
anomaly_rmse = np.array(anomaly_rmse)
# Create figure with two subplots side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
# Box plot
data = [normal_rmse, anomaly_rmse]
ax1.boxplot(data, labels=["Normal", "Anomalous"])
ax1.set_ylabel("ICP RMSE (meters)")
ax1.set_title("Box Plot Comparison")
ax1.grid(True, alpha=0.3)
# Violin plot
ax2.violinplot(data)
ax2.set_xticks([1, 2])
ax2.set_xticklabels(["Normal", "Anomalous"])
ax2.set_ylabel("ICP RMSE (meters)")
ax2.set_title("Violin Plot Comparison")
ax2.grid(True, alpha=0.3)
plt.suptitle(
"ICP RMSE Statistical Comparison: Normal vs Anomalous Experiments", fontsize=14
)
plt.tight_layout()
# Save both linear and log scale versions
plt.savefig(output_path / "icp_rmse_distribution_linear.png", dpi=150)
# Log scale version
ax1.set_yscale("log")
ax2.set_yscale("log")
plt.savefig(output_path / "icp_rmse_distribution_log.png", dpi=150)
plt.close()
# Print some basic statistics
print("\nBasic Statistics:")
print(
f"Normal experiments - mean: {np.mean(normal_rmse):.4f}, std: {np.std(normal_rmse):.4f}"
)
print(
f"Anomaly experiments - mean: {np.mean(anomaly_rmse):.4f}, std: {np.std(anomaly_rmse):.4f}"
)
# Generate the statistical comparison plots
plot_statistical_comparison(
normal_experiment_paths, anomaly_experiment_paths, output_datetime_path
)
# delete current latest folder
shutil.rmtree(latest_folder_path, ignore_errors=True)
# create new latest folder
latest_folder_path.mkdir(exist_ok=True, parents=True)
# copy contents of output folder to the latest folder
for file in output_datetime_path.iterdir():
shutil.copy2(file, latest_folder_path)
# copy this python script to preserve the code used
shutil.copy2(__file__, output_datetime_path)
shutil.copy2(__file__, latest_folder_path)
# move output date folder to archive
shutil.move(output_datetime_path, archive_folder_path)

View File

@@ -0,0 +1,134 @@
import pickle
import shutil
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
# define data paths
all_data_path = Path("/home/fedex/mt/data/subter")
output_path = Path("/home/fedex/mt/plots/data_missing_points_anomalies")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name
# if output does not exist, create it
output_path.mkdir(exist_ok=True, parents=True)
output_datetime_path.mkdir(exist_ok=True, parents=True)
latest_folder_path.mkdir(exist_ok=True, parents=True)
archive_folder_path.mkdir(exist_ok=True, parents=True)
data_resolution = 32 * 2048
# find all bag files and sort them correctly by name
normal_experiment_paths, anomaly_experiment_paths = [], []
for bag_file_path in all_data_path.iterdir():
if bag_file_path.suffix != ".bag":
continue
if "smoke" in bag_file_path.name:
anomaly_experiment_paths.append(bag_file_path)
else:
normal_experiment_paths.append(bag_file_path)
def plot_timeline_comparison(anomaly_experiment_paths, title, num_bins=50):
"""Plot missing points percentage over normalized timeline for moving anomaly experiments"""
# Sort experiments by filesize first (to match original processing order)
anomaly_experiment_paths = sorted(
anomaly_experiment_paths, key=lambda path: path.stat().st_size
)
# Filter out stationary experiments
moving_exp_indices = [
i
for i, path in enumerate(anomaly_experiment_paths)
if "stationary" not in path.name
]
moving_anomaly_paths = [anomaly_experiment_paths[i] for i in moving_exp_indices]
# Try to load cached data from original script's location
cache_path = Path("/home/fedex/mt/plots/data_missing_points/missing_points.pkl")
if not cache_path.exists():
print("No cached data found. Please run the original script first.")
return
with open(cache_path, "rb") as file:
_, missing_points_anomaly = pickle.load(file)
# Get data for moving experiments only (using original indices)
moving_anomaly_data = [missing_points_anomaly[i] for i in moving_exp_indices]
# Create figure
plt.figure(figsize=(12, 6))
# Plot each experiment's timeline
for i, exp_data in enumerate(moving_anomaly_data):
# Convert to percentage
percentages = np.array(exp_data) / data_resolution * 100
# Create normalized timeline bins
exp_len = len(percentages)
bins = np.linspace(0, exp_len - 1, num_bins)
binned_data = np.zeros(num_bins)
# Bin the data
for bin_idx in range(num_bins):
if bin_idx == num_bins - 1:
start_idx = int(bins[bin_idx])
end_idx = exp_len
else:
start_idx = int(bins[bin_idx])
end_idx = int(bins[bin_idx + 1])
binned_data[bin_idx] = np.mean(percentages[start_idx:end_idx])
# Plot with slight transparency to show overlaps
plt.plot(
range(num_bins),
binned_data,
alpha=0.6,
label=f"Experiment {moving_anomaly_paths[i].stem}",
)
plt.title(title)
plt.xlabel("Normalized Timeline")
# Add percentage ticks on x-axis
plt.xticks(
np.linspace(0, num_bins - 1, 5), [f"{x:.0f}%" for x in np.linspace(0, 100, 5)]
)
plt.ylabel("Missing Points (%)")
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()
# Save the plot
plt.savefig(output_datetime_path / "missing_points_timeline.png", dpi=150)
plt.close()
# Generate the timeline comparison plot
plot_timeline_comparison(
anomaly_experiment_paths,
"Missing Lidar Measurements Over Time\n(Moving Anomaly Experiments Only)",
)
# delete current latest folder
shutil.rmtree(latest_folder_path, ignore_errors=True)
# create new latest folder
latest_folder_path.mkdir(exist_ok=True, parents=True)
# copy contents of output folder to the latest folder
for file in output_datetime_path.iterdir():
shutil.copy2(file, latest_folder_path)
# copy this python script to preserve the code used
shutil.copy2(__file__, output_datetime_path)
shutil.copy2(__file__, latest_folder_path)
# move output date folder to archive
shutil.move(output_datetime_path, archive_folder_path)

View File

@@ -0,0 +1,149 @@
import pickle
import shutil
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
# define data paths
all_data_path = Path("/home/fedex/mt/data/subter")
output_path = Path("/home/fedex/mt/plots/data_particles_near_sensor_anomalies")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name
# if output does not exist, create it
output_path.mkdir(exist_ok=True, parents=True)
output_datetime_path.mkdir(exist_ok=True, parents=True)
latest_folder_path.mkdir(exist_ok=True, parents=True)
archive_folder_path.mkdir(exist_ok=True, parents=True)
data_resolution = 32 * 2048
# find all bag files and sort them correctly by name
normal_experiment_paths, anomaly_experiment_paths = [], []
for bag_file_path in all_data_path.iterdir():
if bag_file_path.suffix != ".bag":
continue
if "smoke" in bag_file_path.name:
anomaly_experiment_paths.append(bag_file_path)
else:
normal_experiment_paths.append(bag_file_path)
def plot_timeline_comparison(
anomaly_experiment_paths, title, range_threshold, num_bins=50
):
"""Plot near-sensor measurements percentage over normalized timeline for moving anomaly experiments"""
# Sort experiments by filesize first (to match original processing order)
anomaly_experiment_paths = sorted(
anomaly_experiment_paths, key=lambda path: path.stat().st_size
)
# Filter out stationary experiments
moving_exp_indices = [
i
for i, path in enumerate(anomaly_experiment_paths)
if "stationary" not in path.name
]
moving_anomaly_paths = [anomaly_experiment_paths[i] for i in moving_exp_indices]
# Try to load cached data
cache_path = (
Path("/home/fedex/mt/plots/data_particles_near_sensor")
/ f"particles_near_sensor_counts_{range_threshold}.pkl"
)
if not cache_path.exists():
print(
f"No cached data found for range threshold {range_threshold}. Please run the original script first."
)
return
with open(cache_path, "rb") as file:
_, particles_near_sensor_anomaly = pickle.load(file)
# Get data for moving experiments only (using original indices)
moving_anomaly_data = [particles_near_sensor_anomaly[i] for i in moving_exp_indices]
# Create figure
plt.figure(figsize=(12, 6))
# Plot each experiment's timeline
for i, exp_data in enumerate(moving_anomaly_data):
# Convert to percentage
percentages = np.array(exp_data) / data_resolution * 100
# Create normalized timeline bins
exp_len = len(percentages)
bins = np.linspace(0, exp_len - 1, num_bins)
binned_data = np.zeros(num_bins)
# Bin the data
for bin_idx in range(num_bins):
if bin_idx == num_bins - 1:
start_idx = int(bins[bin_idx])
end_idx = exp_len
else:
start_idx = int(bins[bin_idx])
end_idx = int(bins[bin_idx + 1])
binned_data[bin_idx] = np.mean(percentages[start_idx:end_idx])
# Plot with slight transparency to show overlaps
plt.plot(
range(num_bins),
binned_data,
alpha=0.6,
label=f"Experiment {moving_anomaly_paths[i].stem}",
)
plt.title(f"{title}\n(Range Threshold: {range_threshold / 1000:.1f}m)")
plt.xlabel("Normalized Timeline")
# Add percentage ticks on x-axis
plt.xticks(
np.linspace(0, num_bins - 1, 5), [f"{x:.0f}%" for x in np.linspace(0, 100, 5)]
)
plt.ylabel("Near-Sensor Measurements (%)")
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()
# Save the plot
plt.savefig(
output_datetime_path
/ f"near_sensor_measurements_timeline_{range_threshold}.png",
dpi=150,
)
plt.close()
# Generate timeline comparison plots for each range threshold
range_thresholds = [500, 750, 1000, 1250, 1500]
for rt in range_thresholds:
print(f"Processing range threshold {rt}...")
plot_timeline_comparison(
anomaly_experiment_paths,
"Near-Sensor Lidar Measurements Over Time\n(Moving Anomaly Experiments Only)",
rt,
)
# delete current latest folder
shutil.rmtree(latest_folder_path, ignore_errors=True)
# create new latest folder
latest_folder_path.mkdir(exist_ok=True, parents=True)
# copy contents of output folder to the latest folder
for file in output_datetime_path.iterdir():
shutil.copy2(file, latest_folder_path)
# copy this python script to preserve the code used
shutil.copy2(__file__, output_datetime_path)
shutil.copy2(__file__, latest_folder_path)
# move output date folder to archive
shutil.move(output_datetime_path, archive_folder_path)