data loading and plotting for results wip

Jan Kowalczyk
2025-09-03 14:55:54 +02:00
parent 3d968c305c
commit ed80faf1e2
16 changed files with 2732 additions and 952 deletions


@@ -1,118 +1,176 @@
import pickle
# ae_elbow_from_df.py
from __future__ import annotations
import json
import shutil
import unittest
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from tabulate import tabulate
import polars as pl
# Configuration
results_folders = {
"LeNet": {
"path": Path(
"/home/fedex/mt/projects/thesis-kowalczyk-jan/Deep-SAD-PyTorch/test/DeepSAD/subter_ae_elbow_v2/"
),
"batch_size": 256,
},
"LeNet Efficient": {
"path": Path(
"/home/fedex/mt/projects/thesis-kowalczyk-jan/Deep-SAD-PyTorch/test/DeepSAD/subter_efficient_ae_elbow"
),
"batch_size": 64,
},
}
output_path = Path("/home/fedex/mt/plots/ae_elbow_lenet")
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# CHANGE THIS IMPORT IF YOUR LOADER MODULE IS NAMED DIFFERENTLY
from load_results import load_pretraining_results_dataframe
latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name
# ----------------------------
# Config
# ----------------------------
ROOT = Path("/home/fedex/mt/results/done") # experiments root you pass to the loader
OUTPUT_DIR = Path("/home/fedex/mt/plots/ae_elbow_lenet_from_df")
# Create output directories
output_path.mkdir(exist_ok=True, parents=True)
output_datetime_path.mkdir(exist_ok=True, parents=True)
latest_folder_path.mkdir(exist_ok=True, parents=True)
archive_folder_path.mkdir(exist_ok=True, parents=True)
# Which label field to use from the DF; "labels_exp_based" or "labels_manual_based"
LABEL_FIELD = "labels_exp_based"
def calculate_batch_mean_loss(scores, batch_size):
"""Calculate mean loss over batches similar to the original testing code."""
n_samples = len(scores)
n_batches = (n_samples + batch_size - 1) // batch_size
batch_losses = []
for i in range(0, n_samples, batch_size):
batch_scores = scores[i : i + batch_size]
batch_losses.append(np.mean(batch_scores))
return np.sum(batch_losses) / n_batches
# ----------------------------
# Helpers
# ----------------------------
def canonicalize_network(name: str) -> str:
"""Map various net_name strings to clean labels for plotting."""
low = (name or "").lower()
if "lenet" in low:
return "LeNet"
if "efficient" in low:
return "Efficient"
# fallback: show whatever was stored
return name or "unknown"
def test_loss_calculation(results, batch_size):
"""Test if our loss calculation matches the original implementation."""
test = unittest.TestCase()
folds = results["ae_results"]
dim = results["dimension"]
for fold_key in folds:
fold_data = folds[fold_key]["test"]
scores = np.array(fold_data["scores"])
original_loss = fold_data["loss"]
calculated_loss = calculate_batch_mean_loss(scores, batch_size)
try:
test.assertAlmostEqual(
original_loss,
calculated_loss,
places=5,
msg=f"Loss mismatch for dim={dim}, {fold_key}",
)
except AssertionError as e:
print(f"Warning: {str(e)}")
print(f"Original: {original_loss:.6f}, Calculated: {calculated_loss:.6f}")
raise
def calculate_batch_mean_loss(scores: np.ndarray, batch_size: int) -> float:
"""Mean of per-batch means (matches how the original test loss was computed)."""
n = len(scores)
if n == 0:
return np.nan
if batch_size <= 0:
batch_size = n # single batch fallback
n_batches = (n + batch_size - 1) // batch_size
acc = 0.0
for i in range(0, n, batch_size):
acc += float(np.mean(scores[i : i + batch_size]))
return acc / n_batches
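# Worked example of the mean-of-batch-means definition (a sketch, not part of the
# pipeline): with scores [1, 2, 3, 4, 5] and batch_size=2 the batches are
# [1, 2], [3, 4], [5], giving (1.5 + 3.5 + 5.0) / 3 = 10/3 ~ 3.33 -- not the
# plain mean 3.0, because the short last batch is weighted like a full one.
def _check_calculate_batch_mean_loss() -> None:
    scores = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
    assert abs(calculate_batch_mean_loss(scores, 2) - 10.0 / 3.0) < 1e-12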
def plot_loss_curve(dims, means, stds, title, color, output_path):
"""Create and save a single loss curve plot."""
plt.figure(figsize=(8, 5))
plt.plot(dims, means, marker="o", color=color, label="Mean Test Loss")
plt.fill_between(
dims,
np.array(means) - np.array(stds),
np.array(means) + np.array(stds),
color=color,
alpha=0.2,
label="Std Dev",
)
plt.xlabel("Latent Dimension")
plt.ylabel("Test Loss")
plt.title(title)
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(dims)
plt.tight_layout()
plt.savefig(output_path, dpi=150, bbox_inches="tight")
plt.close()
def extract_batch_size(cfg_json: str) -> int:
"""
Prefer AE batch size; fall back to general batch_size; then a safe default.
We only rely on config_json (no lifted fields).
"""
try:
cfg = json.loads(cfg_json) if cfg_json else {}
except Exception:
cfg = {}
return int(cfg.get("ae_batch_size") or cfg.get("batch_size") or 256)
def build_arch_curves_from_df(
df: pl.DataFrame,
label_field: str = "labels_exp_based",
only_nets: set[str] | None = None,
):
"""
From the AE pretraining DF, compute (dims, means, stds) for normal/anomaly/overall
grouped by network and latent_dim. Returns:
{ net_label: {
"normal": (dims, means, stds),
"anomaly": (dims, means, stds),
"overall": (dims, means, stds),
} }
"""
if "split" not in df.columns:
raise ValueError("Expected 'split' column in AE dataframe.")
if "scores" not in df.columns:
raise ValueError("Expected 'scores' column in AE dataframe.")
if "network" not in df.columns or "latent_dim" not in df.columns:
raise ValueError("Expected 'network' and 'latent_dim' columns in AE dataframe.")
if label_field not in df.columns:
raise ValueError(f"Expected '{label_field}' column in AE dataframe.")
# Keep only test split
df = df.filter(pl.col("split") == "test")
groups: dict[tuple[str, int], dict[str, list[float]]] = {}
for row in df.iter_rows(named=True):
net_label = canonicalize_network(row["network"])
if only_nets and net_label not in only_nets:
continue
dim = int(row["latent_dim"])
batch_size = extract_batch_size(row.get("config_json"))
scores = np.asarray(row["scores"] or [], dtype=float)
labels = row.get(label_field)
labels = np.asarray(labels, dtype=int) if labels is not None else None
overall_loss = calculate_batch_mean_loss(scores, batch_size)
# Split by labels if available; otherwise we only aggregate overall
normal_loss = np.nan
anomaly_loss = np.nan
if labels is not None and labels.size == scores.size:
normal_scores = scores[labels == 1]
anomaly_scores = scores[labels == -1]
if normal_scores.size > 0:
normal_loss = calculate_batch_mean_loss(normal_scores, batch_size)
if anomaly_scores.size > 0:
anomaly_loss = calculate_batch_mean_loss(anomaly_scores, batch_size)
key = (net_label, dim)
if key not in groups:
groups[key] = {"normal": [], "anomaly": [], "overall": []}
groups[key]["overall"].append(overall_loss)
groups[key]["normal"].append(normal_loss)
groups[key]["anomaly"].append(anomaly_loss)
# Aggregate across folds -> per (net, dim) mean/std
per_net_dims: dict[str, set[int]] = {}
for net, dim in groups:
per_net_dims.setdefault(net, set()).add(dim)
result: dict[str, dict[str, tuple[list[int], list[float], list[float]]]] = {}
for net, dims in per_net_dims.items():
dims_sorted = sorted(dims)
def collect(kind: str):
means, stds = [], []
for d in dims_sorted:
xs = [
x
for (n2, d2), v in groups.items()
if n2 == net and d2 == d
for x in v[kind]
if x is not None and not np.isnan(x)
]
if len(xs) == 0:
means.append(np.nan)
stds.append(np.nan)
else:
means.append(float(np.mean(xs)))
stds.append(float(np.std(xs)))
return dims_sorted, means, stds
result[net] = {
"normal": collect("normal"),
"anomaly": collect("anomaly"),
"overall": collect("overall"),
}
return result
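# Minimal smoke test on a synthetic frame (all values below are made up; the real
# schema comes from load_results). Handy for exercising the grouping/aggregation
# logic without any experiment results on disk: the two rows act as two folds of
# the same (network, latent_dim) group.
def _smoke_test_build_arch_curves() -> None:
    df = pl.DataFrame(
        {
            "split": ["test", "test"],
            "network": ["lenet", "lenet"],
            "latent_dim": [32, 32],
            "scores": [[0.1, 0.2, 0.3], [0.2, 0.3, 0.4]],
            "labels_exp_based": [[1, 1, -1], [1, -1, -1]],
            "config_json": ['{"ae_batch_size": 2}'] * 2,
        }
    )
    curves = build_arch_curves_from_df(df)
    dims, means, stds = curves["LeNet"]["overall"]
    assert dims == [32] and len(means) == 1 and len(stds) == 1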
def plot_multi_loss_curve(arch_results, title, output_path, colors=None):
"""Create and save a loss curve plot with multiple architectures.
    Args:
        arch_results: {arch_name: (dims, means, stds)}
        title: plot title
        output_path: where to save the plot
        colors: optional mapping from architecture name to a matplotlib color
    """
plt.figure(figsize=(10, 6))
# default color map if not provided
if colors is None:
colors = {
"LeNet": "blue",
"LeNet Asymmetric": "red",
"LeNet": "tab:blue",
"Efficient": "tab:orange",
}
# Get unique dimensions across all architectures
@@ -121,219 +179,91 @@ def plot_multi_loss_curve(arch_results, title, output_path, colors=None):
)
for arch_name, (dims, means, stds) in arch_results.items():
color = colors.get(arch_name, "gray")
plt.plot(dims, means, marker="o", color=color, label=arch_name)
plt.fill_between(
dims,
np.array(means) - np.array(stds),
np.array(means) + np.array(stds),
color=color,
alpha=0.2,
)
color = colors.get(arch_name)
# Plot line
if color is None:
plt.plot(dims, means, marker="o", label=arch_name)
plt.fill_between(
dims,
np.array(means) - np.array(stds),
np.array(means) + np.array(stds),
alpha=0.2,
)
else:
plt.plot(dims, means, marker="o", color=color, label=arch_name)
plt.fill_between(
dims,
np.array(means) - np.array(stds),
np.array(means) + np.array(stds),
color=color,
alpha=0.2,
)
plt.xlabel("Latent Dimension")
plt.xlabel("Latent Dimensionality")
plt.ylabel("Test Loss")
plt.title(title)
plt.legend()
plt.grid(True, alpha=0.3)
    plt.xticks(all_dims)  # set x-axis ticks to match the data points
plt.tight_layout()
plt.savefig(output_path, dpi=150, bbox_inches="tight")
plt.close()
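# Example invocation (paths and numbers are illustrative only): each architecture
# label maps to parallel lists of dims, means, and stds, in the same shape that
# build_arch_curves_from_df returns per kind.
# plot_multi_loss_curve(
#     {"LeNet": ([16, 32, 64], [0.12, 0.10, 0.09], [0.01, 0.01, 0.02])},
#     "Overall Test Loss vs. Latent Dimensionality",
#     Path("/tmp/ae_elbow_demo.png"),
# )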
def evaluate_autoencoder_loss():
"""Main function to evaluate autoencoder loss across different latent dimensions."""
# Results storage for each architecture
arch_results = {
name: {"dims": [], "normal": [], "anomaly": []} for name in results_folders
}
def main():
# Load AE DF (uses your cache if enabled in the loader)
df = load_pretraining_results_dataframe(ROOT, allow_cache=True, include_train=False)
# Process each architecture
for arch_name, config in results_folders.items():
results_folder = config["path"]
batch_size = config["batch_size"]
result_files = sorted(
results_folder.glob("ae_elbow_results_subter_*_kfold.pkl")
)
# Optional: filter to just LeNet vs Efficient; drop this set() to plot all nets
wanted_nets = {"LeNet", "Efficient"}
dimensions = []
normal_means = []
normal_stds = []
anomaly_means = []
anomaly_stds = []
curves = build_arch_curves_from_df(
df,
label_field=LABEL_FIELD,
only_nets=wanted_nets,
)
# Verify loss calculation
print(
f"\nVerifying loss calculation for {arch_name} (batch_size={batch_size})..."
)
for result_file in result_files:
with open(result_file, "rb") as f:
results = pickle.load(f)
test_loss_calculation(results, batch_size)
print(f"Loss calculation verified successfully for {arch_name}!")
# Prepare output dirs
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
ts_dir = OUTPUT_DIR / "archive" / datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
ts_dir.mkdir(parents=True, exist_ok=True)
# Process files for this architecture
for result_file in result_files:
with open(result_file, "rb") as f:
results = pickle.load(f)
dim = int(results["dimension"])
folds = results["ae_results"]
def pick(kind: str):
# kind in {"normal","anomaly","overall"}
return {name: payload[kind] for name, payload in curves.items()}
normal_fold_losses = []
anomaly_fold_losses = []
all_scores = [] # Collect all scores for overall calculation
all_fold_scores = [] # Collect all fold scores for std calculation
for fold_key in folds:
fold_data = folds[fold_key]["test"]
scores = np.array(fold_data["scores"])
labels = np.array(fold_data["labels_exp_based"])
normal_scores = scores[labels == 1]
anomaly_scores = scores[labels == -1]
normal_fold_losses.append(
calculate_batch_mean_loss(normal_scores, batch_size)
)
anomaly_fold_losses.append(
calculate_batch_mean_loss(anomaly_scores, batch_size)
)
all_scores.append(scores) # Add scores to all_scores
all_fold_scores.append(fold_data["scores"]) # Add fold scores
dimensions.append(dim)
normal_means.append(np.mean(normal_fold_losses))
normal_stds.append(np.std(normal_fold_losses))
anomaly_means.append(np.mean(anomaly_fold_losses))
anomaly_stds.append(np.std(anomaly_fold_losses))
# Sort by dimension
sorted_data = sorted(
zip(dimensions, normal_means, normal_stds, anomaly_means, anomaly_stds)
)
dims, n_means, n_stds, a_means, a_stds = zip(*sorted_data)
# Store results for this architecture
arch_results[arch_name] = {
"dims": dims,
"normal": (dims, n_means, n_stds),
"anomaly": (dims, a_means, a_stds),
"overall": (
dims,
[
calculate_batch_mean_loss(scores, batch_size)
for scores in all_scores
], # Use all scores
[
np.std(
[
calculate_batch_mean_loss(fold_scores, batch_size)
for fold_scores in fold_scores_list
]
)
for fold_scores_list in all_fold_scores
],
),
}
# Create the three plots with all architectures
plot_multi_loss_curve(
{name: results["normal"] for name, results in arch_results.items()},
"Normal Class Test Loss vs. Latent Dimension",
output_datetime_path / "ae_elbow_test_loss_normal.png",
pick("normal"),
"Normal Class Test Loss vs. Latent Dimensionality",
ts_dir / "ae_elbow_test_loss_normal.png",
)
plot_multi_loss_curve(
{name: results["anomaly"] for name, results in arch_results.items()},
"Anomaly Class Test Loss vs. Latent Dimension",
output_datetime_path / "ae_elbow_test_loss_anomaly.png",
pick("anomaly"),
"Anomaly Class Test Loss vs. Latent Dimensionality",
ts_dir / "ae_elbow_test_loss_anomaly.png",
)
plot_multi_loss_curve(
{name: results["overall"] for name, results in arch_results.items()},
"Overall Test Loss vs. Latent Dimension",
output_datetime_path / "ae_elbow_test_loss_overall.png",
pick("overall"),
"Overall Test Loss vs. Latent Dimensionality",
ts_dir / "ae_elbow_test_loss_overall.png",
)
# Copy this script to preserve the code used for the outputs
script_path = Path(__file__)
shutil.copy2(script_path, ts_dir)
def print_loss_comparison(results_folders):
"""Print comparison tables of original vs calculated losses for each architecture."""
print("\nLoss Comparison Tables")
print("=" * 80)
# Optionally mirror latest
latest = OUTPUT_DIR / "latest"
latest.mkdir(exist_ok=True, parents=True)
for f in ts_dir.iterdir():
if f.is_file():
shutil.copy2(f, latest / f.name)
for arch_name, config in results_folders.items():
results_folder = config["path"]
batch_size = config["batch_size"]
result_files = sorted(
results_folder.glob("ae_elbow_results_subter_*_kfold.pkl")
)
# Prepare table data
table_data = []
headers = ["Dimension", "Original", "Calculated", "Diff"]
for result_file in result_files:
with open(result_file, "rb") as f:
results = pickle.load(f)
dim = int(results["dimension"])
folds = results["ae_results"]
# Calculate mean original loss across folds
orig_losses = []
calc_losses = []
for fold_key in folds:
fold_data = folds[fold_key]["test"]
orig_losses.append(fold_data["loss"])
calc_losses.append(
calculate_batch_mean_loss(np.array(fold_data["scores"]), batch_size)
)
orig_mean = np.mean(orig_losses)
calc_mean = np.mean(calc_losses)
diff = abs(orig_mean - calc_mean)
table_data.append([dim, orig_mean, calc_mean, diff])
# Sort by dimension
table_data.sort(key=lambda x: x[0])
print(f"\n{arch_name}:")
print(
tabulate(
table_data,
headers=headers,
floatfmt=".6f",
tablefmt="pipe",
numalign="right",
)
)
print("\n" + "=" * 80)
print(f"Saved plots to: {ts_dir}")
print(f"Also updated: {latest}")
if __name__ == "__main__":
# Print loss comparisons for all architectures
print_loss_comparison(results_folders)
# Run main analysis
evaluate_autoencoder_loss()
# Archive management
# Delete current latest folder
shutil.rmtree(latest_folder_path, ignore_errors=True)
latest_folder_path.mkdir(exist_ok=True, parents=True)
# Copy contents to latest folder
for file in output_datetime_path.iterdir():
shutil.copy2(file, latest_folder_path)
# Copy this script for reference
shutil.copy2(__file__, output_datetime_path)
shutil.copy2(__file__, latest_folder_path)
# Move output to archive
shutil.move(output_datetime_path, archive_folder_path)
main()