340 lines
11 KiB
Python
340 lines
11 KiB
Python
import pickle
|
|
import shutil
|
|
import unittest
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
from tabulate import tabulate
|
|
|
|
# Configuration
# Maps each architecture's display name to the folder containing its k-fold
# elbow-experiment pickles and the batch size used during the original test
# run (required to reproduce the batch-mean loss calculation exactly).
results_folders = {
    "LeNet": {
        "path": Path(
            "/home/fedex/mt/projects/thesis-kowalczyk-jan/Deep-SAD-PyTorch/test/DeepSAD/subter_ae_elbow_v2/"
        ),
        "batch_size": 256,
    },
    "LeNet Efficient": {
        "path": Path(
            "/home/fedex/mt/projects/thesis-kowalczyk-jan/Deep-SAD-PyTorch/test/DeepSAD/subter_efficient_ae_elbow"
        ),
        "batch_size": 64,
    },
}
output_path = Path("/home/fedex/mt/plots/ae_elbow_lenet")
# Timestamp used as a unique folder name so every run is archived separately.
datetime_folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# "latest" mirrors the most recent run; "archive" accumulates timestamped runs.
latest_folder_path = output_path / "latest"
archive_folder_path = output_path / "archive"
output_datetime_path = output_path / datetime_folder_name

# Create output directories
output_path.mkdir(exist_ok=True, parents=True)
output_datetime_path.mkdir(exist_ok=True, parents=True)
latest_folder_path.mkdir(exist_ok=True, parents=True)
archive_folder_path.mkdir(exist_ok=True, parents=True)
|
|
|
|
|
|
def calculate_batch_mean_loss(scores, batch_size):
    """Calculate mean loss over batches similar to the original testing code.

    The original test loop averages the loss within each batch and then
    averages those batch losses, which weights a partial final batch the
    same as a full one. This function reproduces that exact weighting.

    Args:
        scores: 1-D array-like of per-sample scores (losses).
        batch_size: Batch size used during the original test run.

    Returns:
        Mean of the per-batch mean scores, or NaN for empty input (the
        previous implementation divided by zero in that case).
    """
    n_samples = len(scores)
    if n_samples == 0:
        # Guard: no samples means no batches; NaN propagates visibly instead
        # of raising ZeroDivisionError.
        return float("nan")

    batch_means = [
        np.mean(scores[i : i + batch_size]) for i in range(0, n_samples, batch_size)
    ]
    # len(batch_means) == ceil(n_samples / batch_size), so this equals the
    # original sum(batch_losses) / n_batches.
    return np.mean(batch_means)
|
|
|
|
|
|
def test_loss_calculation(results, batch_size):
    """Test if our loss calculation matches the original implementation.

    Recomputes the batch-mean loss for every fold's test split and compares
    it (to 5 decimal places) against the loss stored in the pickle; raises
    AssertionError on the first mismatch after printing both values.
    """
    checker = unittest.TestCase()
    dim = results["dimension"]

    for fold_key, fold_entry in results["ae_results"].items():
        test_split = fold_entry["test"]
        reference = test_split["loss"]
        recomputed = calculate_batch_mean_loss(
            np.array(test_split["scores"]), batch_size
        )

        try:
            checker.assertAlmostEqual(
                reference,
                recomputed,
                places=5,
                msg=f"Loss mismatch for dim={dim}, {fold_key}",
            )
        except AssertionError as e:
            # Surface both values before re-raising so the mismatch is
            # diagnosable from the console output.
            print(f"Warning: {str(e)}")
            print(f"Original: {reference:.6f}, Calculated: {recomputed:.6f}")
            raise
|
|
|
|
|
|
def plot_loss_curve(dims, means, stds, title, color, output_path):
    """Create and save a single loss curve plot.

    Plots the mean test loss per latent dimension with a +/- one standard
    deviation band, then writes the figure to `output_path` and closes it.
    """
    mean_arr = np.array(means)
    std_arr = np.array(stds)

    plt.figure(figsize=(8, 5))
    plt.plot(dims, means, marker="o", color=color, label="Mean Test Loss")
    plt.fill_between(
        dims,
        mean_arr - std_arr,
        mean_arr + std_arr,
        color=color,
        alpha=0.2,
        label="Std Dev",
    )

    plt.xlabel("Latent Dimension")
    plt.ylabel("Test Loss")
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)
    # One tick per evaluated dimension.
    plt.xticks(dims)
    plt.tight_layout()

    plt.savefig(output_path, dpi=150, bbox_inches="tight")
    plt.close()
|
|
|
|
|
|
def plot_multi_loss_curve(arch_results, title, output_path, colors=None):
    """Create and save a loss curve plot with multiple architectures.

    Args:
        arch_results: Dict of format {arch_name: (dims, means, stds)}
        title: Plot title
        output_path: Where to save the plot
        colors: Optional dict of colors for each architecture; architectures
            without an entry fall back to gray.
    """
    plt.figure(figsize=(10, 6))

    if colors is None:
        # Default palette keyed by the architecture names actually used in
        # `results_folders`. The previous default keyed the second entry as
        # "LeNet Asymmetric", which matched nothing, so that curve silently
        # rendered in the gray fallback color.
        colors = {
            "LeNet": "blue",
            "LeNet Efficient": "red",
        }

    # Get unique dimensions across all architectures
    all_dims = sorted(
        {dim for dims, _, _ in arch_results.values() for dim in dims}
    )

    for arch_name, (dims, means, stds) in arch_results.items():
        color = colors.get(arch_name, "gray")
        plt.plot(dims, means, marker="o", color=color, label=arch_name)
        # +/- one standard deviation band around the mean curve.
        plt.fill_between(
            dims,
            np.array(means) - np.array(stds),
            np.array(means) + np.array(stds),
            color=color,
            alpha=0.2,
        )

    plt.xlabel("Latent Dimension")
    plt.ylabel("Test Loss")
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.xticks(all_dims)  # Set x-axis ticks to match data points
    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches="tight")
    plt.close()
|
|
|
|
|
|
def _load_pickle(result_file):
    """Load and return one pickled k-fold result file."""
    with open(result_file, "rb") as f:
        return pickle.load(f)


def evaluate_autoencoder_loss():
    """Main function to evaluate autoencoder loss across different latent dimensions.

    For every architecture in `results_folders`: loads the per-dimension
    k-fold pickles, verifies the batch-mean loss reconstruction against the
    stored losses, aggregates normal / anomaly / overall fold losses per
    dimension, and writes three comparison plots.

    BUGFIX: the previous version built the "overall" series from
    `all_scores` / `all_fold_scores` accumulators that were re-initialized
    inside the per-file loop, so only the LAST dimension's folds survived —
    mis-paired against the full `dims` tuple. Worse, the std computation
    iterated individual float scores into `calculate_batch_mean_loss`
    (which calls `len()`), raising TypeError at runtime. Overall stats are
    now computed per dimension alongside the normal/anomaly stats.
    """
    arch_results = {}

    # Process each architecture
    for arch_name, config in results_folders.items():
        results_folder = config["path"]
        batch_size = config["batch_size"]
        result_files = sorted(
            results_folder.glob("ae_elbow_results_subter_*_kfold.pkl")
        )

        # Verify loss calculation before trusting the recomputed values.
        print(
            f"\nVerifying loss calculation for {arch_name} (batch_size={batch_size})..."
        )
        for result_file in result_files:
            test_loss_calculation(_load_pickle(result_file), batch_size)
        print(f"Loss calculation verified successfully for {arch_name}!")

        dimensions = []
        normal_means, normal_stds = [], []
        anomaly_means, anomaly_stds = [], []
        overall_means, overall_stds = [], []

        # Process files for this architecture (one file per latent dimension).
        for result_file in result_files:
            results = _load_pickle(result_file)
            dim = int(results["dimension"])
            folds = results["ae_results"]

            normal_fold_losses = []
            anomaly_fold_losses = []
            overall_fold_losses = []

            for fold_key in folds:
                fold_data = folds[fold_key]["test"]
                scores = np.array(fold_data["scores"])
                labels = np.array(fold_data["labels_exp_based"])

                # Label convention in the pickles: 1 = normal, -1 = anomaly.
                normal_fold_losses.append(
                    calculate_batch_mean_loss(scores[labels == 1], batch_size)
                )
                anomaly_fold_losses.append(
                    calculate_batch_mean_loss(scores[labels == -1], batch_size)
                )
                overall_fold_losses.append(
                    calculate_batch_mean_loss(scores, batch_size)
                )

            dimensions.append(dim)
            normal_means.append(np.mean(normal_fold_losses))
            normal_stds.append(np.std(normal_fold_losses))
            anomaly_means.append(np.mean(anomaly_fold_losses))
            anomaly_stds.append(np.std(anomaly_fold_losses))
            overall_means.append(np.mean(overall_fold_losses))
            overall_stds.append(np.std(overall_fold_losses))

        # Sort every per-dimension series together by dimension.
        sorted_data = sorted(
            zip(
                dimensions,
                normal_means,
                normal_stds,
                anomaly_means,
                anomaly_stds,
                overall_means,
                overall_stds,
            )
        )
        dims, n_means, n_stds, a_means, a_stds, o_means, o_stds = zip(*sorted_data)

        # Store results for this architecture
        arch_results[arch_name] = {
            "dims": dims,
            "normal": (dims, n_means, n_stds),
            "anomaly": (dims, a_means, a_stds),
            "overall": (dims, o_means, o_stds),
        }

    # Create the three plots with all architectures
    plot_multi_loss_curve(
        {name: results["normal"] for name, results in arch_results.items()},
        "Normal Class Test Loss vs. Latent Dimension",
        output_datetime_path / "ae_elbow_test_loss_normal.png",
    )

    plot_multi_loss_curve(
        {name: results["anomaly"] for name, results in arch_results.items()},
        "Anomaly Class Test Loss vs. Latent Dimension",
        output_datetime_path / "ae_elbow_test_loss_anomaly.png",
    )

    plot_multi_loss_curve(
        {name: results["overall"] for name, results in arch_results.items()},
        "Overall Test Loss vs. Latent Dimension",
        output_datetime_path / "ae_elbow_test_loss_overall.png",
    )
|
|
|
|
|
|
def print_loss_comparison(results_folders):
    """Print comparison tables of original vs calculated losses for each architecture.

    For every architecture, loads each per-dimension pickle, averages the
    stored fold losses and the recomputed batch-mean losses, and prints a
    pipe-formatted table (Dimension / Original / Calculated / Diff) sorted
    by dimension.
    """
    print("\nLoss Comparison Tables")
    print("=" * 80)

    for arch_name, config in results_folders.items():
        batch_size = config["batch_size"]
        result_files = sorted(
            config["path"].glob("ae_elbow_results_subter_*_kfold.pkl")
        )

        headers = ["Dimension", "Original", "Calculated", "Diff"]
        table_data = []

        for result_file in result_files:
            with open(result_file, "rb") as f:
                results = pickle.load(f)

            # Average the stored vs recomputed losses across folds.
            orig_losses = []
            calc_losses = []
            for fold_entry in results["ae_results"].values():
                test_split = fold_entry["test"]
                orig_losses.append(test_split["loss"])
                calc_losses.append(
                    calculate_batch_mean_loss(
                        np.array(test_split["scores"]), batch_size
                    )
                )

            orig_mean = np.mean(orig_losses)
            calc_mean = np.mean(calc_losses)
            table_data.append(
                [
                    int(results["dimension"]),
                    orig_mean,
                    calc_mean,
                    abs(orig_mean - calc_mean),
                ]
            )

        # Sort rows by dimension before printing.
        table_data.sort(key=lambda row: row[0])

        print(f"\n{arch_name}:")
        print(
            tabulate(
                table_data,
                headers=headers,
                floatfmt=".6f",
                tablefmt="pipe",
                numalign="right",
            )
        )

    print("\n" + "=" * 80)
|
|
|
|
|
|
if __name__ == "__main__":
    # Print loss comparisons for all architectures
    print_loss_comparison(results_folders)

    # Run main analysis
    evaluate_autoencoder_loss()

    # Archive management
    # Delete current latest folder so it mirrors only this run's output.
    shutil.rmtree(latest_folder_path, ignore_errors=True)
    latest_folder_path.mkdir(exist_ok=True, parents=True)

    # Copy contents to latest folder
    # NOTE(review): iterdir() is non-recursive and copy2 fails on
    # subdirectories — assumes the run produced only flat files; confirm.
    for file in output_datetime_path.iterdir():
        shutil.copy2(file, latest_folder_path)

    # Copy this script for reference
    shutil.copy2(__file__, output_datetime_path)
    shutil.copy2(__file__, latest_folder_path)

    # Move output to archive
    # NOTE(review): shutil.move raises if the archive already contains an
    # entry with this timestamped name — unlikely but possible on reruns
    # within the same second.
    shutil.move(output_datetime_path, archive_folder_path)
|