data loading and plotting for results wip

Jan Kowalczyk
2025-09-03 14:55:54 +02:00
parent 3d968c305c
commit ed80faf1e2
16 changed files with 2732 additions and 952 deletions


@@ -1,118 +1,176 @@
import pickle
# ae_elbow_from_df.py
from __future__ import annotations
import json
import shutil
import unittest
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from tabulate import tabulate
import polars as pl
# Configuration
results_folders = {
"LeNet": {
"path": Path(
"/home/fedex/mt/projects/thesis-kowalczyk-jan/Deep-SAD-PyTorch/test/DeepSAD/subter_ae_elbow_v2/"
),
"batch_size": 256,
},
"LeNet Efficient": {
"path": Path(
"/home/fedex/mt/projects/thesis-kowalczyk-jan/Deep-SAD-PyTorch/test/DeepSAD/subter_efficient_ae_elbow"
),
"batch_size": 64,
},
}
output_path = Path("/home/fedex/mt/plots/ae_elbow_lenet")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# CHANGE THIS IMPORT IF YOUR LOADER MODULE IS NAMED DIFFERENTLY
from load_results import load_pretraining_results_dataframe
latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name
# ----------------------------
# Config
# ----------------------------
ROOT = Path("/home/fedex/mt/results/done") # experiments root you pass to the loader
OUTPUT_DIR = Path("/home/fedex/mt/plots/ae_elbow_lenet_from_df")
# Create output directories
output_path.mkdir(exist_ok=True, parents=True)
output_datetime_path.mkdir(exist_ok=True, parents=True)
latest_folder_path.mkdir(exist_ok=True, parents=True)
archive_folder_path.mkdir(exist_ok=True, parents=True)
# Which label field to use from the DF; "labels_exp_based" or "labels_manual_based"
LABEL_FIELD = "labels_exp_based"
def calculate_batch_mean_loss(scores, batch_size):
"""Calculate mean loss over batches similar to the original testing code."""
n_samples = len(scores)
n_batches = (n_samples + batch_size - 1) // batch_size
batch_losses = []
for i in range(0, n_samples, batch_size):
batch_scores = scores[i : i + batch_size]
batch_losses.append(np.mean(batch_scores))
return np.sum(batch_losses) / n_batches
# ----------------------------
# Helpers
# ----------------------------
def canonicalize_network(name: str) -> str:
"""Map various net_name strings to clean labels for plotting."""
low = (name or "").lower()
if "lenet" in low:
return "LeNet"
if "efficient" in low:
return "Efficient"
# fallback: show whatever was stored
return name or "unknown"
def test_loss_calculation(results, batch_size):
"""Test if our loss calculation matches the original implementation."""
test = unittest.TestCase()
folds = results["ae_results"]
dim = results["dimension"]
for fold_key in folds:
fold_data = folds[fold_key]["test"]
scores = np.array(fold_data["scores"])
original_loss = fold_data["loss"]
calculated_loss = calculate_batch_mean_loss(scores, batch_size)
try:
test.assertAlmostEqual(
original_loss,
calculated_loss,
places=5,
msg=f"Loss mismatch for dim={dim}, {fold_key}",
)
except AssertionError as e:
print(f"Warning: {str(e)}")
print(f"Original: {original_loss:.6f}, Calculated: {calculated_loss:.6f}")
raise
def calculate_batch_mean_loss(scores: np.ndarray, batch_size: int) -> float:
"""Mean of per-batch means (matches how the original test loss was computed)."""
n = len(scores)
if n == 0:
return np.nan
if batch_size <= 0:
batch_size = n # single batch fallback
n_batches = (n + batch_size - 1) // batch_size
acc = 0.0
for i in range(0, n, batch_size):
acc += float(np.mean(scores[i : i + batch_size]))
return acc / n_batches
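# Worked example of the mean-of-batch-means definition (a sketch, not part of the
# pipeline): with scores [1, 2, 3, 4, 5] and batch_size=2 the batches are
# [1, 2], [3, 4], [5], giving (1.5 + 3.5 + 5.0) / 3 = 10/3 ~ 3.33 -- not the
# plain mean 3.0, because the short last batch is weighted like a full one.
def _check_calculate_batch_mean_loss() -> None:
    scores = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
    assert abs(calculate_batch_mean_loss(scores, 2) - 10.0 / 3.0) < 1e-12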
def plot_loss_curve(dims, means, stds, title, color, output_path):
"""Create and save a single loss curve plot."""
plt.figure(figsize=(8, 5))
plt.plot(dims, means, marker="o", color=color, label="Mean Test Loss")
plt.fill_between(
dims,
np.array(means) - np.array(stds),
np.array(means) + np.array(stds),
color=color,
alpha=0.2,
label="Std Dev",
)
plt.xlabel("Latent Dimension")
plt.ylabel("Test Loss")
plt.title(title)
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(dims)
plt.tight_layout()
plt.savefig(output_path, dpi=150, bbox_inches="tight")
plt.close()
def extract_batch_size(cfg_json: str) -> int:
"""
Prefer AE batch size; fall back to general batch_size; then a safe default.
We only rely on config_json (no lifted fields).
"""
try:
cfg = json.loads(cfg_json) if cfg_json else {}
except Exception:
cfg = {}
return int(cfg.get("ae_batch_size") or cfg.get("batch_size") or 256)
def build_arch_curves_from_df(
df: pl.DataFrame,
label_field: str = "labels_exp_based",
only_nets: set[str] | None = None,
):
"""
From the AE pretraining DF, compute (dims, means, stds) for normal/anomaly/overall
grouped by network and latent_dim. Returns:
{ net_label: {
"normal": (dims, means, stds),
"anomaly": (dims, means, stds),
"overall": (dims, means, stds),
} }
"""
if "split" not in df.columns:
raise ValueError("Expected 'split' column in AE dataframe.")
if "scores" not in df.columns:
raise ValueError("Expected 'scores' column in AE dataframe.")
if "network" not in df.columns or "latent_dim" not in df.columns:
raise ValueError("Expected 'network' and 'latent_dim' columns in AE dataframe.")
if label_field not in df.columns:
raise ValueError(f"Expected '{label_field}' column in AE dataframe.")
# Keep only test split
df = df.filter(pl.col("split") == "test")
groups: dict[tuple[str, int], dict[str, list[float]]] = {}
for row in df.iter_rows(named=True):
net_label = canonicalize_network(row["network"])
if only_nets and net_label not in only_nets:
continue
dim = int(row["latent_dim"])
batch_size = extract_batch_size(row.get("config_json"))
scores = np.asarray(row["scores"] or [], dtype=float)
labels = row.get(label_field)
labels = np.asarray(labels, dtype=int) if labels is not None else None
overall_loss = calculate_batch_mean_loss(scores, batch_size)
# Split by labels if available; otherwise we only aggregate overall
normal_loss = np.nan
anomaly_loss = np.nan
if labels is not None and labels.size == scores.size:
normal_scores = scores[labels == 1]
anomaly_scores = scores[labels == -1]
if normal_scores.size > 0:
normal_loss = calculate_batch_mean_loss(normal_scores, batch_size)
if anomaly_scores.size > 0:
anomaly_loss = calculate_batch_mean_loss(anomaly_scores, batch_size)
key = (net_label, dim)
if key not in groups:
groups[key] = {"normal": [], "anomaly": [], "overall": []}
groups[key]["overall"].append(overall_loss)
groups[key]["normal"].append(normal_loss)
groups[key]["anomaly"].append(anomaly_loss)
# Aggregate across folds -> per (net, dim) mean/std
per_net_dims: dict[str, set[int]] = {}
for net, dim in groups:
per_net_dims.setdefault(net, set()).add(dim)
result: dict[str, dict[str, tuple[list[int], list[float], list[float]]]] = {}
for net, dims in per_net_dims.items():
dims_sorted = sorted(dims)
def collect(kind: str):
means, stds = [], []
for d in dims_sorted:
xs = [
x
for (n2, d2), v in groups.items()
if n2 == net and d2 == d
for x in v[kind]
if x is not None and not np.isnan(x)
]
if len(xs) == 0:
means.append(np.nan)
stds.append(np.nan)
else:
means.append(float(np.mean(xs)))
stds.append(float(np.std(xs)))
return dims_sorted, means, stds
result[net] = {
"normal": collect("normal"),
"anomaly": collect("anomaly"),
"overall": collect("overall"),
}
return result
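# Minimal smoke test on a synthetic frame (all values below are made up; the real
# schema comes from load_results). Handy for exercising the grouping/aggregation
# logic without any experiment results on disk: the two rows act as two folds of
# the same (network, latent_dim) group.
def _smoke_test_build_arch_curves() -> None:
    df = pl.DataFrame(
        {
            "split": ["test", "test"],
            "network": ["lenet", "lenet"],
            "latent_dim": [32, 32],
            "scores": [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]],
            "labels_exp_based": [[1, 1, -1], [1, -1, -1]],
            "config_json": ['{"ae_batch_size": 2}'] * 2,
        }
    )
    curves = build_arch_curves_from_df(df)
    dims, means, stds = curves["LeNet"]["overall"]
    assert dims == [32] and len(means) == 1 and len(stds) == 1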
def plot_multi_loss_curve(arch_results, title, output_path, colors=None):
"""Create and save a loss curve plot with multiple architectures.
    Args:
        arch_results: {arch_name: (dims, means, stds)}
        title: plot title
        output_path: where to save the plot
        colors: optional mapping from architecture name to a matplotlib color
    """
plt.figure(figsize=(10, 6))
# default color map if not provided
if colors is None:
colors = {
"LeNet": "blue",
"LeNet Asymmetric": "red",
"LeNet": "tab:blue",
"Efficient": "tab:orange",
}
# Get unique dimensions across all architectures
@@ -121,219 +179,91 @@ def plot_multi_loss_curve(arch_results, title, output_path, colors=None):
)
for arch_name, (dims, means, stds) in arch_results.items():
color = colors.get(arch_name, "gray")
plt.plot(dims, means, marker="o", color=color, label=arch_name)
plt.fill_between(
dims,
np.array(means) - np.array(stds),
np.array(means) + np.array(stds),
color=color,
alpha=0.2,
)
color = colors.get(arch_name)
# Plot line
if color is None:
plt.plot(dims, means, marker="o", label=arch_name)
plt.fill_between(
dims,
np.array(means) - np.array(stds),
np.array(means) + np.array(stds),
alpha=0.2,
)
else:
plt.plot(dims, means, marker="o", color=color, label=arch_name)
plt.fill_between(
dims,
np.array(means) - np.array(stds),
np.array(means) + np.array(stds),
color=color,
alpha=0.2,
)
plt.xlabel("Latent Dimension")
plt.xlabel("Latent Dimensionality")
plt.ylabel("Test Loss")
plt.title(title)
plt.legend()
plt.grid(True, alpha=0.3)
    plt.xticks(all_dims)  # set x-axis ticks to match the data points
plt.tight_layout()
plt.savefig(output_path, dpi=150, bbox_inches="tight")
plt.close()
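# Example invocation (paths and numbers are illustrative only): each architecture
# label maps to parallel lists of dims, means, and stds, in the same shape that
# build_arch_curves_from_df returns per kind.
# plot_multi_loss_curve(
#     {"LeNet": ([16, 32, 64], [0.12, 0.10, 0.09], [0.01, 0.01, 0.02])},
#     "Overall Test Loss vs. Latent Dimensionality",
#     Path("/tmp/ae_elbow_demo.png"),
# )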
def evaluate_autoencoder_loss():
"""Main function to evaluate autoencoder loss across different latent dimensions."""
# Results storage for each architecture
arch_results = {
name: {"dims": [], "normal": [], "anomaly": []} for name in results_folders
}
def main():
# Load AE DF (uses your cache if enabled in the loader)
df = load_pretraining_results_dataframe(ROOT, allow_cache=True, include_train=False)
# Process each architecture
for arch_name, config in results_folders.items():
results_folder = config["path"]
batch_size = config["batch_size"]
result_files = sorted(
results_folder.glob("ae_elbow_results_subter_*_kfold.pkl")
)
# Optional: filter to just LeNet vs Efficient; drop this set() to plot all nets
wanted_nets = {"LeNet", "Efficient"}
dimensions = []
normal_means = []
normal_stds = []
anomaly_means = []
anomaly_stds = []
curves = build_arch_curves_from_df(
df,
label_field=LABEL_FIELD,
only_nets=wanted_nets,
)
# Verify loss calculation
print(
f"\nVerifying loss calculation for {arch_name} (batch_size={batch_size})..."
)
for result_file in result_files:
with open(result_file, "rb") as f:
results = pickle.load(f)
test_loss_calculation(results, batch_size)
print(f"Loss calculation verified successfully for {arch_name}!")
# Prepare output dirs
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
ts_dir = OUTPUT_DIR / "archive" / datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
ts_dir.mkdir(parents=True, exist_ok=True)
# Process files for this architecture
for result_file in result_files:
with open(result_file, "rb") as f:
results = pickle.load(f)
dim = int(results["dimension"])
folds = results["ae_results"]
def pick(kind: str):
# kind in {"normal","anomaly","overall"}
return {name: payload[kind] for name, payload in curves.items()}
normal_fold_losses = []
anomaly_fold_losses = []
all_scores = [] # Collect all scores for overall calculation
all_fold_scores = [] # Collect all fold scores for std calculation
for fold_key in folds:
fold_data = folds[fold_key]["test"]
scores = np.array(fold_data["scores"])
labels = np.array(fold_data["labels_exp_based"])
normal_scores = scores[labels == 1]
anomaly_scores = scores[labels == -1]
normal_fold_losses.append(
calculate_batch_mean_loss(normal_scores, batch_size)
)
anomaly_fold_losses.append(
calculate_batch_mean_loss(anomaly_scores, batch_size)
)
all_scores.append(scores) # Add scores to all_scores
all_fold_scores.append(fold_data["scores"]) # Add fold scores
dimensions.append(dim)
normal_means.append(np.mean(normal_fold_losses))
normal_stds.append(np.std(normal_fold_losses))
anomaly_means.append(np.mean(anomaly_fold_losses))
anomaly_stds.append(np.std(anomaly_fold_losses))
# Sort by dimension
sorted_data = sorted(
zip(dimensions, normal_means, normal_stds, anomaly_means, anomaly_stds)
)
dims, n_means, n_stds, a_means, a_stds = zip(*sorted_data)
# Store results for this architecture
arch_results[arch_name] = {
"dims": dims,
"normal": (dims, n_means, n_stds),
"anomaly": (dims, a_means, a_stds),
"overall": (
dims,
[
calculate_batch_mean_loss(scores, batch_size)
for scores in all_scores
], # Use all scores
[
np.std(
[
calculate_batch_mean_loss(fold_scores, batch_size)
for fold_scores in fold_scores_list
]
)
for fold_scores_list in all_fold_scores
],
),
}
# Create the three plots with all architectures
plot_multi_loss_curve(
{name: results["normal"] for name, results in arch_results.items()},
"Normal Class Test Loss vs. Latent Dimension",
output_datetime_path / "ae_elbow_test_loss_normal.png",
pick("normal"),
"Normal Class Test Loss vs. Latent Dimensionality",
ts_dir / "ae_elbow_test_loss_normal.png",
)
plot_multi_loss_curve(
{name: results["anomaly"] for name, results in arch_results.items()},
"Anomaly Class Test Loss vs. Latent Dimension",
output_datetime_path / "ae_elbow_test_loss_anomaly.png",
pick("anomaly"),
"Anomaly Class Test Loss vs. Latent Dimensionality",
ts_dir / "ae_elbow_test_loss_anomaly.png",
)
plot_multi_loss_curve(
{name: results["overall"] for name, results in arch_results.items()},
"Overall Test Loss vs. Latent Dimension",
output_datetime_path / "ae_elbow_test_loss_overall.png",
pick("overall"),
"Overall Test Loss vs. Latent Dimensionality",
ts_dir / "ae_elbow_test_loss_overall.png",
)
# Copy this script to preserve the code used for the outputs
script_path = Path(__file__)
shutil.copy2(script_path, ts_dir)
def print_loss_comparison(results_folders):
"""Print comparison tables of original vs calculated losses for each architecture."""
print("\nLoss Comparison Tables")
print("=" * 80)
# Optionally mirror latest
latest = OUTPUT_DIR / "latest"
latest.mkdir(exist_ok=True, parents=True)
for f in ts_dir.iterdir():
if f.is_file():
shutil.copy2(f, latest / f.name)
for arch_name, config in results_folders.items():
results_folder = config["path"]
batch_size = config["batch_size"]
result_files = sorted(
results_folder.glob("ae_elbow_results_subter_*_kfold.pkl")
)
# Prepare table data
table_data = []
headers = ["Dimension", "Original", "Calculated", "Diff"]
for result_file in result_files:
with open(result_file, "rb") as f:
results = pickle.load(f)
dim = int(results["dimension"])
folds = results["ae_results"]
# Calculate mean original loss across folds
orig_losses = []
calc_losses = []
for fold_key in folds:
fold_data = folds[fold_key]["test"]
orig_losses.append(fold_data["loss"])
calc_losses.append(
calculate_batch_mean_loss(np.array(fold_data["scores"]), batch_size)
)
orig_mean = np.mean(orig_losses)
calc_mean = np.mean(calc_losses)
diff = abs(orig_mean - calc_mean)
table_data.append([dim, orig_mean, calc_mean, diff])
# Sort by dimension
table_data.sort(key=lambda x: x[0])
print(f"\n{arch_name}:")
print(
tabulate(
table_data,
headers=headers,
floatfmt=".6f",
tablefmt="pipe",
numalign="right",
)
)
print("\n" + "=" * 80)
print(f"Saved plots to: {ts_dir}")
print(f"Also updated: {latest}")
if __name__ == "__main__":
# Print loss comparisons for all architectures
print_loss_comparison(results_folders)
# Run main analysis
evaluate_autoencoder_loss()
# Archive management
# Delete current latest folder
shutil.rmtree(latest_folder_path, ignore_errors=True)
latest_folder_path.mkdir(exist_ok=True, parents=True)
# Copy contents to latest folder
for file in output_datetime_path.iterdir():
shutil.copy2(file, latest_folder_path)
# Copy this script for reference
shutil.copy2(__file__, output_datetime_path)
shutil.copy2(__file__, latest_folder_path)
# Move output to archive
shutil.move(output_datetime_path, archive_folder_path)
main()