tools, lockfile, deps

2025-08-13 14:17:12 +02:00
parent cd4dc583e8
commit ef311d862e
17 changed files with 4325 additions and 0 deletions
--- a/tools/ae_elbow_eval.py
+++ b/tools/ae_elbow_eval.py
@@ -0,0 +1,174 @@
+# Loads autoencoder training results from pickle files, evaluates them, and visualizes the test loss per latent dimension to find the training elbow
+
+import pickle
+import unittest
+from pathlib import Path
+from typing import Dict, List
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+# Folder holding the pickled elbow-search results; the plots are saved here as well
+results_folder = Path(
+    "/home/fedex/mt/projects/thesis-kowalczyk-jan/Deep-SAD-PyTorch/test/DeepSAD/subter_ae_elbow/"
+)
+
+# Collect every result file that matches the naming pattern, in sorted path order
+result_files = list(
+    results_folder.glob("ae_elbow_results_subter_LeNet_dim_*_kfold.pkl")
+)
+result_files.sort()
+
+# Per-file aggregates for both classes, filled in parallel (one entry per result file)
+dimensions: List[int] = []
+normal_means: List[float] = []
+normal_stds: List[float] = []
+anomaly_means: List[float] = []
+anomaly_stds: List[float] = []
+
+# Default batch size used when averaging per-sample scores into a loss value.
+# NOTE(review): presumably has to match the batch size used at test time for the
+# verification against the stored loss to succeed - confirm.
+BATCH_SIZE = 256
+
+
+def calculate_batch_mean_loss(scores, batch_size=BATCH_SIZE):
+    """Average per-batch mean scores, similar to the original testing code.
+
+    ``scores`` is cut into consecutive chunks of ``batch_size`` elements (the
+    last chunk may be shorter), each chunk is averaged, and the chunk means are
+    then averaged with equal weight. Because a short final chunk counts as much
+    as a full one, the result can differ from the plain mean of ``scores``.
+
+    Args:
+        scores: Sequence or array of per-sample scores.
+        batch_size: Number of samples per batch; must be positive.
+
+    Returns:
+        The mean of the batch means, or NaN when ``scores`` is empty.
+
+    Raises:
+        ValueError: If ``batch_size`` is not positive.
+    """
+    if batch_size <= 0:
+        raise ValueError(f"batch_size must be positive, got {batch_size}")
+
+    scores = np.asarray(scores)
+    n_samples = len(scores)
+    if n_samples == 0:
+        # No batches to average: return NaN explicitly instead of dividing 0 by 0.
+        return float("nan")
+
+    # One mean per consecutive batch; the slice end may run past the array.
+    batch_losses = [
+        np.mean(scores[start : start + batch_size])
+        for start in range(0, n_samples, batch_size)
+    ]
+
+    return np.sum(batch_losses) / len(batch_losses)
+
+
+def test_loss_calculation(results: Dict, batch_size: int = BATCH_SIZE) -> None:
+    """Check that the recomputed loss matches the loss stored with the results.
+
+    For every fold in ``results["ae_results"]`` the stored test loss is compared
+    with the batch-mean loss recomputed from the stored per-sample scores.
+
+    Args:
+        results: Unpickled result dictionary. The keys read here are
+            ``"dimension"`` and ``"ae_results"``; each fold entry must provide
+            ``["test"]["scores"]`` and ``["test"]["loss"]``.
+        batch_size: Batch size used when recomputing the loss.
+
+    Raises:
+        AssertionError: If the stored and recomputed loss of any fold differ
+            after rounding the difference to 5 decimal places. The mismatch is
+            printed before the error is re-raised.
+    """
+    test = unittest.TestCase()
+    folds = results["ae_results"]
+    dim = results["dimension"]
+
+    for fold_key in folds:
+        fold_data = folds[fold_key]["test"]
+        scores = np.array(fold_data["scores"])
+        original_loss = fold_data["loss"]
+        # Forward batch_size so a caller-supplied value is actually honoured.
+        calculated_loss = calculate_batch_mean_loss(scores, batch_size)
+
+        try:
+            test.assertAlmostEqual(
+                original_loss,
+                calculated_loss,
+                places=5,
+                msg=f"Loss mismatch for dim={dim}, {fold_key}",
+            )
+        except AssertionError as e:
+            # Show both values for easier debugging, then let the failure propagate.
+            print(f"Warning: {e}")
+            print(f"Original: {original_loss:.6f}, Calculated: {calculated_loss:.6f}")
+            raise
+
+
+# Fail early with a clear message: without any result file the aggregation below
+# would only fail later with an opaque "not enough values to unpack" error.
+if not result_files:
+    raise FileNotFoundError(f"No result files found in {results_folder}")
+
+# Load, verify and aggregate in a single pass so every pickle is read only once.
+# NOTE: pickle.load can execute arbitrary code - only run this on trusted result files.
+print("Verifying loss calculation implementation...")
+for result_file in result_files:
+    with open(result_file, "rb") as f:
+        results = pickle.load(f)
+
+    # Raises AssertionError if the recomputed loss does not match the stored one
+    test_loss_calculation(results)
+
+    dim = int(results["dimension"])
+    folds = results["ae_results"]
+
+    normal_fold_losses = []
+    anomaly_fold_losses = []
+
+    for fold_key in folds:
+        fold_data = folds[fold_key]["test"]
+        scores = np.array(fold_data["scores"])
+        labels = np.array(fold_data["labels_exp_based"])
+
+        # Label 1 selects normal samples and -1 anomalies; other labels are ignored
+        normal_scores = scores[labels == 1]
+        anomaly_scores = scores[labels == -1]
+
+        # Calculate losses using batch means
+        normal_fold_losses.append(calculate_batch_mean_loss(normal_scores))
+        anomaly_fold_losses.append(calculate_batch_mean_loss(anomaly_scores))
+
+    # Mean and spread across the folds of this latent dimension
+    dimensions.append(dim)
+    normal_means.append(np.mean(normal_fold_losses))
+    normal_stds.append(np.std(normal_fold_losses))
+    anomaly_means.append(np.mean(anomaly_fold_losses))
+    anomaly_stds.append(np.std(anomaly_fold_losses))
+print("Loss calculation verified successfully!")
+
+# Sort all series together by latent dimension
+dims, n_means, n_stds, a_means, a_stds = zip(
+    *sorted(zip(dimensions, normal_means, normal_stds, anomaly_means, anomaly_stds))
+)
+
+# Overall curve: unweighted average of the normal and anomaly values per dimension
+# (class sizes are not taken into account, and the stds are averaged, not pooled)
+means = [(n + a) / 2 for n, a in zip(n_means, a_means)]
+stds = [(ns + as_) / 2 for ns, as_ in zip(n_stds, a_stds)]
+
+
+def plot_loss_curve(dims, means, stds, title, color, output_path):
+    """Draw one mean-loss curve with a +/- one std band and write it to disk.
+
+    Args:
+        dims: Latent dimensions, used as x values and as x-tick positions
+        means: Mean loss per latent dimension
+        stds: Standard deviation per latent dimension
+        title: Title shown above the plot
+        color: Color of both the line and the shaded band
+        output_path: Destination of the saved figure
+    """
+    mean_values = np.array(means)
+    std_values = np.array(stds)
+
+    fig, ax = plt.subplots(figsize=(8, 5))
+
+    # Mean curve plus a shaded band of one standard deviation around it
+    ax.plot(dims, means, marker="o", color=color, label="Mean Test Loss")
+    ax.fill_between(
+        dims,
+        mean_values - std_values,
+        mean_values + std_values,
+        color=color,
+        alpha=0.2,
+        label="Std Dev",
+    )
+
+    ax.set_xlabel("Latent Dimension")
+    ax.set_ylabel("Test Loss")
+    ax.set_title(title)
+    ax.legend()
+    ax.grid(True, alpha=0.3)
+    ax.set_xticks(dims)  # one tick exactly at every data point
+
+    fig.tight_layout()
+    fig.savefig(output_path, dpi=150, bbox_inches="tight")
+    plt.close(fig)
+
+
+# Render the overall, normal-class and anomaly-class curves into the results folder
+for curve_means, curve_stds, curve_title, curve_color, file_name in (
+    (
+        means,
+        stds,
+        "Overall Test Loss vs. Latent Dimension",
+        "blue",
+        "ae_elbow_test_loss_overall.png",
+    ),
+    (
+        n_means,
+        n_stds,
+        "Normal Class Test Loss vs. Latent Dimension",
+        "green",
+        "ae_elbow_test_loss_normal.png",
+    ),
+    (
+        a_means,
+        a_stds,
+        "Anomaly Class Test Loss vs. Latent Dimension",
+        "red",
+        "ae_elbow_test_loss_anomaly.png",
+    ),
+):
+    plot_loss_curve(
+        dims,
+        curve_means,
+        curve_stds,
+        curve_title,
+        curve_color,
+        results_folder / file_name,
+    )