Jan Kowalczyk
2025-09-10 19:41:00 +02:00
parent ef0c36eed5
commit cf15d5501e
17 changed files with 1198 additions and 720 deletions

View File

@@ -12,7 +12,7 @@ import numpy as np
import polars as pl
# CHANGE THIS IMPORT IF YOUR LOADER MODULE IS NAMED DIFFERENTLY
from load_results import load_pretraining_results_dataframe
from plot_scripts.load_results import load_pretraining_results_dataframe
# ----------------------------
# Config
@@ -212,7 +212,7 @@ def plot_multi_loss_curve(arch_results, title, output_path, colors=None):
def main():
# Load AE DF (uses your cache if enabled in the loader)
df = load_pretraining_results_dataframe(ROOT, allow_cache=True, include_train=False)
df = load_pretraining_results_dataframe(ROOT, allow_cache=True)
# Optional: filter to just LeNet vs Efficient; drop this set() to plot all nets
wanted_nets = {"LeNet", "Efficient"}

View File

@@ -0,0 +1,544 @@
import json
import math
from typing import Any, Dict, Iterable, List, Optional, Tuple
import polars as pl
Number = (int, float)
FLOAT_DTYPES = {pl.Float32, pl.Float64}
SIMPLE_CASTABLE_DTYPES = (
pl.Int8,
pl.Int16,
pl.Int32,
pl.Int64,
pl.UInt8,
pl.UInt16,
pl.UInt32,
pl.UInt64,
pl.Float32,
pl.Float64,
pl.Utf8,
pl.Boolean,
pl.Date,
pl.Datetime,
pl.Time,
pl.Duration,
)
def _is_nan(x):
try:
return isinstance(x, float) and math.isnan(x)
except Exception:
return False
def _repr_safe(v):
try:
return json.dumps(v, default=str, ensure_ascii=False)
except Exception:
return repr(v)
def _to_python(v):
"""
Convert any leaf-ish object to plain Python types:
- pl.Series -> list (or scalar if length==1)
- objects with .to_list()/.tolist() -> list
- dict stays dict; list/tuple become list
"""
# Polars Series
if isinstance(v, pl.Series):
seq = v.to_list()
return seq[0] if len(seq) == 1 else seq
# Numpy scalars/arrays or anything with tolist()
if hasattr(v, "tolist"):
try:
return v.tolist()
except Exception:
pass
# Polars expressions should not appear here; if one does, it falls through and is
# stringified later by _repr_safe
# Anything iterable that isn't list/dict/str -> convert carefully
if isinstance(v, tuple):
return [_to_python(x) for x in v]
if isinstance(v, list):
return [_to_python(x) for x in v]
if isinstance(v, dict):
return {k: _to_python(val) for k, val in v.items()}
return v
def _safe_equal(a, b):
"""
Return a plain bool saying whether a and b are equal,
without ever producing a vector/Series.
"""
# exact same object
if a is b:
return True
# normalize
a_n = _to_python(a)
b_n = _to_python(b)
# handle NaNs
if _is_nan(a_n) and _is_nan(b_n):
return True
# plain scalars/containers
try:
eq = a_n == b_n
if isinstance(eq, bool):
return eq
except Exception:
pass
# fallback: compare stable JSON-ish reprs
return _repr_safe(a_n) == _repr_safe(b_n)
def _num_close(a: float, b: float, atol: float, rtol: float) -> bool:
# NaN==NaN treated equal
if _is_nan(a) and _is_nan(b):
return True
return abs(a - b) <= (atol + rtol * abs(b))
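# Worked illustration of the atol/rtol rule (hypothetical numbers); the same check is
# applied inline to numeric leaves in _recursive_leaf_diffs below:
#   _num_close(1.23, 1.29, atol=0.1, rtol=0.0)    -> True   (|diff| = 0.06 <= 0.1)
#   _num_close(1.23, 1.40, atol=0.1, rtol=0.0)    -> False  (|diff| = 0.17 >  0.1)
#   _num_close(100.0, 100.9, atol=0.0, rtol=0.01) -> True   (|diff| = 0.9 <= 0.01 * 100.9)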
# NOTE: values obtained via Series[i] / .to_list() are usually plain Python already,
# so _to_python above often no-ops for them.
def _iter_dict_keys(d: Dict[str, Any]) -> Iterable[str]:
# stable order, useful for predictable output
return sorted(d.keys())
def _recursive_leaf_diffs(a, b, path, out, float_atol, float_rtol):
# treat None==None
if a is None and b is None:
return
# normalize early
a = _to_python(a)
b = _to_python(b)
# tuples -> lists
if isinstance(a, tuple):
a = list(a)
if isinstance(b, tuple):
b = list(b)
# numbers
if isinstance(a, (int, float)) and isinstance(b, (int, float)):
if _is_nan(a) and _is_nan(b):
return
# |a-b| <= atol + rtol*|b|
if abs(float(a) - float(b)) > (float_atol + float_rtol * abs(float(b))):
out.append(
{
"path": path or "$",
"left": a,
"right": b,
"abs_delta": abs(float(a) - float(b)),
}
)
return
# exact types for strings/bools
if type(a) is type(b) and isinstance(a, (str, bool)):
if not _safe_equal(a, b):
out.append({"path": path or "$", "left": a, "right": b, "abs_delta": None})
return
# lists
if isinstance(a, list) and isinstance(b, list):
if len(a) != len(b):
out.append(
{
"path": f"{path or '$'}.length",
"left": len(a),
"right": len(b),
"abs_delta": None,
}
)
n = min(len(a), len(b))
for i in range(n):
_recursive_leaf_diffs(
a[i], b[i], f"{path or '$'}[{i}]", out, float_atol, float_rtol
)
for i in range(n, len(a)):
out.append(
{
"path": f"{path or '$'}[{i}]",
"left": a[i],
"right": None,
"abs_delta": None,
}
)
for i in range(n, len(b)):
out.append(
{
"path": f"{path or '$'}[{i}]",
"left": None,
"right": b[i],
"abs_delta": None,
}
)
return
# dicts
if isinstance(a, dict) and isinstance(b, dict):
keys = sorted(set(a.keys()) | set(b.keys()))
for k in keys:
ak = a.get(k, None)
bk = b.get(k, None)
if k not in a:
out.append(
{
"path": f"{path or '$'}.{k}",
"left": None,
"right": bk,
"abs_delta": None,
}
)
elif k not in b:
out.append(
{
"path": f"{path or '$'}.{k}",
"left": ak,
"right": None,
"abs_delta": None,
}
)
else:
_recursive_leaf_diffs(
ak, bk, f"{path or '$'}.{k}", out, float_atol, float_rtol
)
return
# fallback (type mismatch / opaque objects)
if not _safe_equal(a, b):
out.append({"path": path or "$", "left": a, "right": b, "abs_delta": None})
def _boolean_mask_simple_equals(s1: pl.Series, s2: pl.Series) -> pl.Series:
both_null = s1.is_null() & s2.is_null()
return ((s1 == s2) | both_null).fill_null(True)
def _boolean_mask_float_close(
s1: pl.Series, s2: pl.Series, atol: float, rtol: float
) -> pl.Series:
both_null = s1.is_null() & s2.is_null()
both_nan = s1.is_nan() & s2.is_nan()
abs_diff = (s1 - s2).abs()
near = abs_diff <= (atol + rtol * s2.abs())
return (near | both_null | both_nan).fill_null(False)
def _candidate_rows_for_nested(col_left: pl.Series, col_right: pl.Series) -> List[int]:
"""
Cheap way to find rows that might differ for nested types:
compare JSON dumps of values. This is only a prefilter.
"""
a = col_left.to_list()
b = col_right.to_list()
cand = []
for i, (x, y) in enumerate(zip(a, b)):
if _repr_safe(x) != _repr_safe(y):
cand.append(i)
return cand
def recursive_diff_frames(
left: pl.DataFrame,
right: pl.DataFrame,
ignore: Optional[List[str]] = None,
float_atol: float = 0.0,
float_rtol: float = 0.0,
max_rows_per_column: int = 20,
max_leafs_per_row: int = 200,
) -> Tuple[pl.DataFrame, pl.DataFrame]:
"""
Deep diff DataFrames, recursing into List/Struct/dict-like values.
Returns (diff_summary, diff_leaves).
- diff_summary: [column, n_rows_with_diffs]
- diff_leaves: [column, row, path, left, right, abs_delta]
left/right are stored as text (Utf8): JSON representations where possible.
"""
ignore = set(ignore or [])
# basic guards
if left.height != right.height:
raise ValueError(f"Row count differs: {left.height} vs {right.height}")
lcols = set(left.columns) - ignore
rcols = set(right.columns) - ignore
if lcols != rcols:
raise ValueError(
f"Column sets differ after ignoring.\nleft_only={sorted(lcols - rcols)}\nright_only={sorted(rcols - lcols)}"
)
cols = sorted(lcols)
summary_rows: List[Tuple[str, int]] = []
leaves_rows: List[Dict[str, Any]] = []
for c in cols:
s1, s2 = left[c], right[c]
# Fast path for simple, non-nested types with vectorized comparison
simple_dtype = (
s1.dtype in SIMPLE_CASTABLE_DTYPES and s2.dtype in SIMPLE_CASTABLE_DTYPES
)
is_floaty = s1.dtype in FLOAT_DTYPES and s2.dtype in FLOAT_DTYPES
if simple_dtype and not is_floaty:
equal_mask = _boolean_mask_simple_equals(s1, s2)
diff_idx = [i for i, ok in enumerate(equal_mask) if not ok]
elif simple_dtype and is_floaty:
close_mask = _boolean_mask_float_close(s1, s2, float_atol, float_rtol)
diff_idx = [i for i, ok in enumerate(close_mask) if not ok]
else:
# nested or exotic dtype → candidate rows via JSON compare
diff_idx = _candidate_rows_for_nested(s1, s2)
if not diff_idx:
continue
summary_rows.append((c, len(diff_idx)))
# limit how many rows per column we fully expand
for row in diff_idx[:max_rows_per_column]:
a = s1[row]
b = s2[row]
leaf_diffs: List[Dict[str, Any]] = []
_recursive_leaf_diffs(
a,
b,
path="",
out=leaf_diffs,
float_atol=float_atol,
float_rtol=float_rtol,
)
# Numeric leaves within tolerance were already dropped inside _recursive_leaf_diffs,
# so anything left in leaf_diffs is a genuine difference (it may well be empty for nested rows).
# cap the number of leaf diffs to avoid explosion
for d in leaf_diffs[:max_leafs_per_row]:
left_norm = _repr_safe(_to_python(d["left"])) # -> str
right_norm = _repr_safe(_to_python(d["right"])) # -> str
abs_delta_val = d.get("abs_delta", None)
try:
abs_delta_norm = (
float(abs_delta_val) if abs_delta_val is not None else None
)
except Exception:
abs_delta_norm = None # just in case something weird sneaks in
leaves_rows.append(
{
"column": str(c),
"row": int(row),
"path": str(d["path"] or "$"),
"left": left_norm, # str
"right": right_norm, # str
"abs_delta": abs_delta_norm, # float or None
}
)
diff_summary = (
pl.DataFrame(summary_rows, schema=["column", "n_rows_with_diffs"]).sort(
"n_rows_with_diffs", descending=True
)
if summary_rows
else pl.DataFrame(
{
"column": pl.Series([], pl.Utf8),
"n_rows_with_diffs": pl.Series([], pl.Int64),
}
)
)
# Build diff_leaves with stable schema; stringify complex left/right to avoid concat issues
if leaves_rows:
diff_leaves = pl.DataFrame(
{
"column": [r["column"] for r in leaves_rows],
"row": pl.Series([r["row"] for r in leaves_rows], dtype=pl.Int64),
"path": [r["path"] for r in leaves_rows],
"left": [r["left"] for r in leaves_rows], # Utf8
"right": [r["right"] for r in leaves_rows], # Utf8
"abs_delta": pl.Series(
[r["abs_delta"] for r in leaves_rows], dtype=pl.Float64
),
},
schema={
"column": pl.Utf8,
"row": pl.Int64,
"path": pl.Utf8,
"left": pl.Utf8,
"right": pl.Utf8,
"abs_delta": pl.Float64,
},
)
else:
diff_leaves = pl.DataFrame(
schema={
"column": pl.Utf8,
"row": pl.Int64,
"path": pl.Utf8,
"left": pl.Utf8,
"right": pl.Utf8,
"abs_delta": pl.Float64,
}
)
return diff_summary, diff_leaves
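# --- Illustrative usage sketch (hypothetical two-row frames; values chosen only to show the output shape) ---
if __name__ == "__main__":
    left_df = pl.DataFrame({"id": [1, 2], "scores": [[0.10, 0.20], [0.30, 0.40]]})
    right_df = pl.DataFrame({"id": [1, 2], "scores": [[0.10, 0.21], [0.30, 0.40]]})
    # Only the nested "scores" column differs, and only beyond the 0.005 absolute tolerance.
    summary, leaves = recursive_diff_frames(left_df, right_df, float_atol=0.005)
    print(summary)  # -> one row: column="scores", n_rows_with_diffs=1
    print(leaves)   # -> one leaf: row=0, path="$[1]", left="0.2", right="0.21", abs_delta≈0.01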
# FLOAT_DTYPES = {pl.Float32, pl.Float64}
# def diff_frames(
# left: pl.DataFrame,
# right: pl.DataFrame,
# ignore: Optional[List[str]] = None,
# float_atol: float = 0.0,
# float_rtol: float = 0.0,
# sample: int = 20,
# ) -> Tuple[pl.DataFrame, pl.DataFrame]:
# ignore = set(ignore or [])
# if left.height != right.height:
# raise ValueError(f"Row count differs: {left.height} vs {right.height}")
# lcols = set(left.columns) - ignore
# rcols = set(right.columns) - ignore
# if lcols != rcols:
# raise ValueError(
# f"Column sets differ after ignoring.\nleft_only={sorted(lcols - rcols)}\nright_only={sorted(rcols - lcols)}"
# )
# cols = sorted(lcols)
# row_idx = pl.Series("row", range(left.height), dtype=pl.Int64)
# def _float_diff_mask(s1: pl.Series, s2: pl.Series) -> pl.Series:
# both_null = s1.is_null() & s2.is_null()
# both_nan = s1.is_nan() & s2.is_nan()
# abs_diff = (s1 - s2).abs()
# near = abs_diff <= (float_atol + float_rtol * s2.abs())
# return ~(near | both_null | both_nan)
# def _nonfloat_diff_mask(s1: pl.Series, s2: pl.Series) -> pl.Series:
# both_null = s1.is_null() & s2.is_null()
# return ~((s1 == s2) | both_null).fill_null(True)
# examples_frames = []
# summary_rows = []
# for c in cols:
# s1, s2 = left[c], right[c]
# if s1.dtype in FLOAT_DTYPES and s2.dtype in FLOAT_DTYPES:
# diff_mask = _float_diff_mask(s1, s2)
# abs_delta = (s1 - s2).abs()
# else:
# diff_mask = _nonfloat_diff_mask(s1, s2)
# abs_delta = None
# diff_mask = diff_mask.cast(pl.Boolean)
# n_diff = int(diff_mask.sum())
# if n_diff == 0:
# continue
# summary_rows.append((c, n_diff))
# k = min(sample, n_diff)
# idx = row_idx.filter(diff_mask)[:k]
# def to_utf8_safe(s: pl.Series) -> pl.Series:
# # Fast path for simple scalars
# if s.dtype in (
# pl.Int8,
# pl.Int16,
# pl.Int32,
# pl.Int64,
# pl.UInt8,
# pl.UInt16,
# pl.UInt32,
# pl.UInt64,
# pl.Float32,
# pl.Float64,
# pl.Utf8,
# pl.Boolean,
# pl.Date,
# pl.Datetime,
# pl.Time,
# pl.Duration,
# ):
# return s.cast(pl.Utf8)
# # Fallback for nested/complex types: List, Struct, etc.
# return s.map_elements(
# lambda v: json.dumps(v, default=str, allow_nan=True),
# return_dtype=pl.Utf8,
# )
# ex_left = to_utf8_safe(s1.filter(diff_mask)[:k])
# ex_right = to_utf8_safe(s2.filter(diff_mask)[:k])
# ex = pl.DataFrame(
# {
# "column": [c] * k,
# "row": idx,
# "left": ex_left,
# "right": ex_right,
# "dtype_left": [str(s1.dtype)] * k,
# "dtype_right": [str(s2.dtype)] * k,
# }
# )
# # unify schema: always have abs_delta as Float64 (None for non-floats)
# if abs_delta is not None:
# ex = ex.with_columns(
# abs_delta.filter(diff_mask)[:k].cast(pl.Float64).alias("abs_delta")
# )
# else:
# ex = ex.with_columns(pl.lit(None, dtype=pl.Float64).alias("abs_delta"))
# examples_frames.append(ex)
# diff_summary = (
# pl.DataFrame(summary_rows, schema=["column", "n_different"]).sort(
# "n_different", descending=True
# )
# if summary_rows
# else pl.DataFrame(
# {
# "column": pl.Series([], pl.Utf8),
# "n_different": pl.Series([], pl.Int64),
# }
# )
# )
# diff_examples = (
# pl.concat(examples_frames) if examples_frames else pl.DataFrame()
# )
# return diff_summary, diff_examples
# # --- usage ---
# # diff_summary: one row per column with a count of differing rows
# # diff_examples: sample rows showing left/right values (and abs_delta for floats)
# summary, examples = diff_frames(
# df1, df2, ignore=["timestamp"], float_atol=0.1, float_rtol=0.0, sample=25
# )
# print(summary) # which columns differ and how much
# print(examples) # sample mismatches with row indices

View File

@@ -3,10 +3,12 @@ from __future__ import annotations
import json
import pickle
from pathlib import Path
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
import polars as pl
from diff_df import recursive_diff_frames
from polars.testing import assert_frame_equal
# ------------------------------------------------------------
# Config you can tweak
@@ -75,7 +77,8 @@ PRETRAIN_SCHEMA = {
"fold": pl.Int32,
"split": pl.Utf8, # "train" | "test"
# timings and optimization
"time": pl.Float64,
"train_time": pl.Float64,
"test_time": pl.Float64,
"loss": pl.Float64,
# per-sample arrays (as lists)
"indices": pl.List(pl.Int32),
@@ -247,6 +250,14 @@ def read_pickle(p: Path) -> Any:
# ------------------------------------------------------------
# Extractors for each model
# ------------------------------------------------------------
counting = {
(label_method, eval_method): []
for label_method in ["exp_based", "manual_based"]
for eval_method in ["roc", "prc"]
}
def rows_from_deepsad(data: dict, evals: List[str]) -> Dict[str, dict]:
"""
deepsad under data['test'][eval], with extra per-eval arrays and AP present.
@@ -257,6 +268,8 @@ def rows_from_deepsad(data: dict, evals: List[str]) -> Dict[str, dict]:
evd = test.get(ev)
if not isinstance(evd, dict):
continue
counting[(ev, "roc")].append(len(evd["roc"][0]))
counting[(ev, "prc")].append(len(evd["prc"][0]))
out[ev] = {
"auc": float(evd["auc"])
if "auc" in evd and evd["auc"] is not None
@@ -444,7 +457,6 @@ def load_results_dataframe(root: Path, allow_cache: bool = True) -> pl.DataFrame
def load_pretraining_results_dataframe(
root: Path,
allow_cache: bool = True,
include_train: bool = False, # <— default: store only TEST to keep cache tiny
keep_file_names: bool = False, # <— drop file_names by default; they're repeated
parquet_compression: str = "zstd",
parquet_compression_level: int = 7, # <— stronger compression than default
@@ -484,9 +496,6 @@ def load_pretraining_results_dataframe(
semi_anomalous = int(cfg.get("num_known_outlier"))
k = int(cfg.get("k_fold_num"))
# Only test split by default (include_train=False)
splits = ("train", "test") if include_train else ("test",)
for fold in range(k):
pkl = exp_dir / f"results_ae_{fold}.pkl"
if not pkl.exists():
@@ -498,57 +507,53 @@ def load_pretraining_results_dataframe(
print(f"[warn] failed to read {pkl.name}: {e}")
continue
for split in splits:
splitd = data.get(split)
if not isinstance(splitd, dict):
continue
train_time = data.get("train", {}).get("time")
data = data.get("test", {})
rows.append(
{
"network": network,
"latent_dim": latent_dim,
"semi_normals": semi_normals,
"semi_anomalous": semi_anomalous,
"model": "ae",
"fold": fold,
"split": split,
"time": float(splitd.get("time"))
if splitd.get("time") is not None
else None,
"loss": float(splitd.get("loss"))
if splitd.get("loss") is not None
else None,
# ints as Int32, scores as Float32 to save space
"indices": normalize_int_list(splitd.get("indices")),
"labels_exp_based": normalize_int_list(
splitd.get("labels_exp_based")
),
"labels_manual_based": normalize_int_list(
splitd.get("labels_manual_based")
),
"semi_targets": normalize_int_list(splitd.get("semi_targets")),
"file_ids": normalize_int_list(splitd.get("file_ids")),
"frame_ids": normalize_int_list(splitd.get("frame_ids")),
"scores": (
None
if splitd.get("scores") is None
else [
float(x)
for x in (
splitd["scores"].tolist()
if isinstance(splitd["scores"], np.ndarray)
else splitd["scores"]
)
]
),
"file_names": normalize_file_names(splitd.get("file_names"))
if keep_file_names
else None,
"folder": str(exp_dir),
"k_fold_num": k,
"config_json": cfg_json,
}
)
rows.append(
{
"network": network,
"latent_dim": latent_dim,
"semi_normals": semi_normals,
"semi_anomalous": semi_anomalous,
"model": "ae",
"fold": fold,
"train_time": train_time,
"test_time": data.get("time"),
"loss": float(data.get("loss"))
if data.get("loss") is not None
else None,
# ints as Int32, scores as Float32 to save space
"indices": normalize_int_list(data.get("indices")),
"labels_exp_based": normalize_int_list(
data.get("labels_exp_based")
),
"labels_manual_based": normalize_int_list(
data.get("labels_manual_based")
),
"semi_targets": normalize_int_list(data.get("semi_targets")),
"file_ids": normalize_int_list(data.get("file_ids")),
"frame_ids": normalize_int_list(data.get("frame_ids")),
"scores": (
None
if data.get("scores") is None
else [
float(x)
for x in (
data["scores"].tolist()
if isinstance(data["scores"], np.ndarray)
else data["scores"]
)
]
),
"file_names": normalize_file_names(data.get("file_names"))
if keep_file_names
else None,
"folder": str(exp_dir),
"k_fold_num": k,
"config_json": cfg_json,
}
)
if not rows:
return pl.DataFrame(schema=PRETRAIN_SCHEMA)
@@ -561,7 +566,7 @@ def load_pretraining_results_dataframe(
pl.col(
"latent_dim", "semi_normals", "semi_anomalous", "fold", "k_fold_num"
).cast(pl.Int32),
pl.col("time", "loss").cast(pl.Float64),
pl.col("test_time", "train_time", "loss").cast(pl.Float64),
pl.col("scores").cast(pl.List(pl.Float32)), # ensure downcast took
)
@@ -585,12 +590,53 @@ def load_pretraining_results_dataframe(
def main():
root = Path("/home/fedex/mt/results/done")
df = load_results_dataframe(root, allow_cache=True)
print(df.shape, df.head())
root = Path("/home/fedex/mt/results/copy")
df1 = load_results_dataframe(root, allow_cache=True)
exit(0)
df_pre = load_pretraining_results_dataframe(root, allow_cache=True)
print("pretraining:", df_pre.shape, df_pre.head())
retest_root = Path("/home/fedex/mt/results/copy/retest_nodrop")
df2 = load_results_dataframe(retest_root, allow_cache=False).drop("folder")
# exact schema & shape first (optional but helpful messages)
assert df1.shape == df2.shape, f"Shape differs: {df1.shape} vs {df2.shape}"
assert set(df1.columns) == set(df2.columns), (
f"Column sets differ: {df1.columns} vs {df2.columns}"
)
# allow small float diffs, ignore column order differences if you want
df1_sorted = df1.select(sorted(df1.columns))
df2_sorted = df2.select(sorted(df2.columns))
# Optionally pre-align/sort both frames by a stable key before diffing.
summary, leaves = recursive_diff_frames(
df1,
df2,
ignore=["timestamp"], # columns to ignore
float_atol=0.1, # absolute tolerance for floats
float_rtol=0.0, # relative tolerance for floats
max_rows_per_column=20, # limit expansion per column
max_leafs_per_row=200, # cap leaves per row
)
pl.Config.set_fmt_table_cell_list_len(100)
pl.Config.set_tbl_rows(100)
print(summary) # which columns differ & how many rows
print(leaves) # exact nested paths + scalar diffs
# check_exact=False lets us use atol/rtol for floats
assert_frame_equal(
df1_sorted,
df2_sorted,
check_exact=False,
atol=0.1, # absolute tolerance for floats
rtol=0.0, # relative tolerance (set if you want % based)
check_dtypes=True, # set False if you only care about values
)
print("DataFrames match within tolerance ✅")
# df_pre = load_pretraining_results_dataframe(root, allow_cache=True)
# print("pretraining:", df_pre.shape, df_pre.head())
if __name__ == "__main__":

View File

@@ -10,7 +10,7 @@ import polars as pl
from matplotlib.lines import Line2D
# CHANGE THIS IMPORT IF YOUR LOADER MODULE IS NAMED DIFFERENTLY
from load_results import load_results_dataframe
from plot_scripts.load_results import load_results_dataframe
# ----------------------------
# Config

View File

@@ -12,7 +12,7 @@ from matplotlib.lines import Line2D
from scipy.stats import sem, t
# CHANGE THIS IMPORT IF YOUR LOADER MODULE NAME IS DIFFERENT
from load_results import load_results_dataframe
from plot_scripts.load_results import load_results_dataframe
# ---------------------------------
# Config

View File

@@ -0,0 +1,704 @@
#!/usr/bin/env python3
from __future__ import annotations
import json
import shutil
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Optional
import numpy as np
import pandas as pd
import polars as pl
from load_results import (
load_pretraining_results_dataframe,
load_results_dataframe,
)
# ----------------------------
# Config
# ----------------------------
RESULTS_ROOT = Path("/home/fedex/mt/results/done") # folder with experiment subdirs
OUTPUT_DIR = Path("/home/fedex/mt/plots/setup_runtime_tables") # where .tex goes
# If you want to optionally prefer a specific network label for baselines in column names,
# set to a substring to detect (e.g. "efficient"). If None, keep network as-is.
BASELINE_NETWORK_HINT: Optional[str] = None # e.g., "efficient" or None
# ----------------------------
# Helpers
# ----------------------------
def _net_label_for_display(net: str | None) -> str:
s = (net or "").lower()
if "effic" in s:
return "Efficient"
if "lenet" in s:
return "LeNet"
return net or ""
def _fmt_mean_std_n(
mean: float | None, std: float | None, n: int | None, unit: str = ""
) -> str:
if mean is None or (isinstance(mean, float) and (np.isnan(mean) or np.isinf(mean))):
return "-"
base = f"{mean:.2f}"
if std is not None and not (
isinstance(std, float) and (np.isnan(std) or np.isinf(std))
):
base = f"{base} ± {std:.2f}"
if unit:
base = f"{base} {unit}"
if n is not None and n > 0:
base = f"{base} (n={n})"
return base
def _fmt_pair(n: int, m: int) -> str:
return f"{n}/{m}"
def _fmt_mean_std(mean: float | None, std: float | None, n: int | None) -> str:
if mean is None or (isinstance(mean, float) and (np.isnan(mean) or np.isinf(mean))):
return "-"
if std is None or (isinstance(std, float) and (np.isnan(std) or np.isinf(std))):
return f"{mean:.2f}"
if n is None or n < 1:
return f"{mean:.2f} ± {std:.2f}"
return f"{mean:.2f} ± {std:.2f} (n={n})"
def _parse_cfg(cfg_json: Optional[str]) -> Dict[str, Any]:
if not cfg_json:
return {}
try:
return json.loads(cfg_json)
except Exception:
return {}
def _key_params(model: str, cfg: Dict[str, Any]) -> str:
"""Compact, model-specific parameter string for the table."""
if model == "deepsad":
bs = cfg.get("batch_size")
ne = cfg.get("n_epochs")
lr = cfg.get("lr")
wd = cfg.get("weight_decay")
return f"bs={bs}, epochs={ne}, lr={lr}, wd={wd}"
if model == "isoforest":
est = cfg.get("isoforest_n_estimators")
ms = cfg.get("isoforest_max_samples")
cont = cfg.get("isoforest_contamination")
return f"n_estimators={est}, max_samples={ms}, cont={cont}"
if model == "ocsvm":
ker = cfg.get("ocsvm_kernel")
nu = cfg.get("ocsvm_nu")
return f"kernel={ker}, nu={nu}"
return "-"
def _method_col_name(model: str, network: str) -> str:
"""
Column heading for pivot tables:
- deepsad carries the network (e.g., 'DeepSAD / LeNet')
- baselines carry their own model name; optionally annotate network
"""
label = model.lower()
if label == "deepsad":
return f"DeepSAD / {network}"
# baselines; optionally simplify/standardize network name
if (
BASELINE_NETWORK_HINT
and BASELINE_NETWORK_HINT.lower() not in (network or "").lower()
):
# If you want to collapse baseline duplicates to a single name, you can force it here
return model.capitalize()
# Otherwise, keep network variant explicit
return f"{model.capitalize()} / {network}"
def _prepare_per_fold_metrics(df: pl.DataFrame) -> pl.DataFrame:
"""
Returns one row per (folder, model, fold) with:
- train_time, test_time
- n_test (len(scores))
- n_epochs (from config_json; DeepSAD only)
- latency_ms = 1000 * test_time / n_test
- time_per_epoch = train_time / n_epochs (DeepSAD only)
"""
base = (
df.select(
"folder",
"network",
"model",
"latent_dim",
"semi_normals",
"semi_anomalous",
"fold",
"train_time",
"test_time",
"scores",
"config_json",
)
.with_columns(
n_test=pl.col("scores").list.len(),
n_epochs=pl.col("config_json")
.str.json_path_match("$.n_epochs")
.cast(pl.Int64),
)
.drop("scores")
)
# de-dup across evals
uniq = base.unique(subset=["folder", "model", "fold"])
# derived metrics
uniq = uniq.with_columns(
latency_ms=pl.when((pl.col("test_time") > 0) & (pl.col("n_test") > 0))
.then(1000.0 * pl.col("test_time") / pl.col("n_test"))
.otherwise(None)
.cast(pl.Float64),
time_per_epoch=pl.when(
(pl.col("model") == "deepsad") & (pl.col("n_epochs") > 0)
)
.then(pl.col("train_time") / pl.col("n_epochs"))
.otherwise(None)
.cast(pl.Float64),
network_disp=pl.col("network")
.cast(pl.Utf8)
.map_elements(_net_label_for_display, return_dtype=pl.Utf8),
)
return uniq
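# Numeric sketch of the derived per-fold metrics (hypothetical fold):
#   test_time=2.5 s over n_test=5000 samples  -> latency_ms = 1000 * 2.5 / 5000 = 0.5 ms/sample
#   train_time=600 s over n_epochs=50         -> time_per_epoch = 600 / 50 = 12.0 s/epoch (DeepSAD only)
# Baselines (isoforest/ocsvm) carry no n_epochs, so their time_per_epoch stays null.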
def _prepare_aggregates(df: pl.DataFrame) -> pl.DataFrame:
"""
Deduplicate across evals, then aggregate times across folds for each
(network, model, latent_dim, semi_normals, semi_anomalous).
"""
# Keep only columns we need
base = df.select(
"folder",
"network",
"model",
"latent_dim",
"semi_normals",
"semi_anomalous",
"fold",
"train_time",
"test_time",
"config_json",
)
# Drop duplicates across evals: same (folder, model, fold) should have identical timings
uniq = base.unique(subset=["folder", "model", "fold"]).with_columns(
# Keep network as plain Utf8; display-label normalization (if any) happens downstream
pl.col("network").cast(pl.Utf8)
)
# Group across folds
agg = (
uniq.group_by(
["network", "model", "latent_dim", "semi_normals", "semi_anomalous"]
)
.agg(
pl.len().alias("n_folds"),
pl.col("train_time").mean().alias("train_mean"),
pl.col("train_time").std(ddof=1).alias("train_std"),
pl.col("test_time").mean().alias("test_mean"),
pl.col("test_time").std(ddof=1).alias("test_std"),
pl.col("config_json")
.first()
.alias("config_json"), # one exemplar cfg per group
)
.sort(["semi_normals", "semi_anomalous", "latent_dim", "network", "model"])
)
return agg
def make_training_runtime_table(df: pl.DataFrame) -> str:
"""
Returns a LaTeX table (string) for TRAIN runtimes: mean ± std (seconds) across folds.
Rows: Semi (N/O), Latent Dim
Columns: methods split (DeepSAD/LeNet, DeepSAD/Efficient, IsoForest[/net], OCSVM[/net])
"""
agg = _prepare_aggregates(df)
# Prepare display strings and column keys
tbl = agg.with_columns(
pl.format("{}/{}", pl.col("semi_normals"), pl.col("semi_anomalous")).alias(
"semi"
),
pl.col("model").cast(pl.Utf8),
pl.col("network").cast(pl.Utf8),
pl.col("latent_dim").cast(pl.Int64),
# return_dtype=pl.Utf8 pins the output dtype of map_elements
pl.struct(["train_mean", "train_std", "n_folds"])
.map_elements(
lambda s: _fmt_mean_std(s["train_mean"], s["train_std"], s["n_folds"]),
return_dtype=pl.Utf8,
)
.alias("train_fmt"),
# same: pin return_dtype for the method-name mapping
pl.struct(["model", "network"])
.map_elements(
lambda s: _method_col_name(s["model"], s["network"]),
return_dtype=pl.Utf8,
)
.alias("method"),
).select("semi", "latent_dim", "method", "train_fmt")
# Pivot to wide form: one cell per (semi, latent_dim, method)
wide = tbl.pivot(
values="train_fmt",
index=["semi", "latent_dim"],
columns="method",
aggregate_function="first",
).sort(["semi", "latent_dim"])
# Fill missing with '-' and export
pdf = wide.fill_null("-").to_pandas()
pdf.index = pd.MultiIndex.from_frame(pdf[["semi", "latent_dim"]])
pdf = pdf.drop(columns=["semi", "latent_dim"])
latex = pdf.to_latex(
index=True,
escape=True,
na_rep="-",
multicolumn=True,
multicolumn_format="c",
bold_rows=False,
caption="Training runtime (seconds): mean ± std across folds (n in parentheses).",
label="tab:train_runtimes",
)
return latex
def make_inference_runtime_table(df: pl.DataFrame) -> str:
"""
Returns a LaTeX table (string) for TEST/INFERENCE runtimes: mean ± std (seconds) across folds.
Same layout as training table.
"""
agg = _prepare_aggregates(df)
tbl = agg.with_columns(
pl.format("{}/{}", pl.col("semi_normals"), pl.col("semi_anomalous")).alias(
"semi"
),
pl.col("model").cast(pl.Utf8),
pl.col("network").cast(pl.Utf8),
pl.col("latent_dim").cast(pl.Int64),
pl.struct(["test_mean", "test_std", "n_folds"])
.map_elements(
lambda s: _fmt_mean_std(s["test_mean"], s["test_std"], s["n_folds"]),
return_dtype=pl.Utf8,
)
.alias("test_fmt"),
pl.struct(["model", "network"])
.map_elements(
lambda s: _method_col_name(s["model"], s["network"]),
return_dtype=pl.Utf8,
)
.alias("method"),
).select("semi", "latent_dim", "method", "test_fmt")
wide = tbl.pivot(
values="test_fmt",
index=["semi", "latent_dim"],
columns="method",
aggregate_function="first",
).sort(["semi", "latent_dim"])
pdf = wide.fill_null("-").to_pandas()
pdf.index = pd.MultiIndex.from_frame(pdf[["semi", "latent_dim"]])
pdf = pdf.drop(columns=["semi", "latent_dim"])
latex = pdf.to_latex(
index=True,
escape=True,
na_rep="-",
multicolumn=True,
multicolumn_format="c",
bold_rows=False,
caption="Inference/Test runtime (seconds): mean ± std across folds (n in parentheses).",
label="tab:test_runtimes",
)
return latex
def make_longform_train_table_with_params(df: pl.DataFrame) -> str:
"""
(Optional) Long-form table that includes a 'Params' column extracted from config_json.
Useful if you want to show per-model settings alongside the runtimes.
"""
agg = _prepare_aggregates(df)
# Build params column from JSON for readability
long = (
agg.with_columns(
pl.format("{}/{}", pl.col("semi_normals"), pl.col("semi_anomalous")).alias(
"semi"
),
pl.col("latent_dim").cast(pl.Int64),
pl.struct(["model", "config_json"])
.map_elements(
lambda s: _key_params(s["model"], _parse_cfg(s["config_json"])),
return_dtype=pl.Utf8,
)
.alias("params"),
pl.struct(["train_mean", "train_std", "n_folds"])
.map_elements(
lambda s: _fmt_mean_std(s["train_mean"], s["train_std"], s["n_folds"])
)
.alias("train_time_fmt"),
)
.select(
"network",
"model",
"latent_dim",
"semi",
"params",
"train_time_fmt",
)
.sort(["semi", "latent_dim", "network", "model"])
)
pdf = long.to_pandas()
pdf.rename(
columns={
"network": "Network",
"model": "Method",
"latent_dim": "Latent Dim",
"semi": "Semi (N/O)",
"params": "Params",
"train_time_fmt": "Train time [s] (mean ± std)",
},
inplace=True,
)
latex = pdf.to_latex(
index=False,
escape=True,
longtable=False,
caption="Training runtime with key parameters.",
label="tab:train_runtimes_params",
)
return latex
def make_training_runtime_table_compact(df: pl.DataFrame) -> str:
per_fold = _prepare_per_fold_metrics(df)
# DeepSAD: keep LeNet vs Efficient, collapse semis
ds = (
per_fold.filter(pl.col("model") == "deepsad")
.group_by(["model", "network_disp", "latent_dim"])
.agg(
n=pl.len(),
train_mean=pl.mean("train_time"),
train_std=pl.std("train_time", ddof=1),
tpe_mean=pl.mean("time_per_epoch"),
tpe_std=pl.std("time_per_epoch", ddof=1),
)
.with_columns(
method=pl.format("DeepSAD / {}", pl.col("network_disp")),
)
)
# Baselines: collapse networks & semis; only vary by latent_dim
bl = (
per_fold.filter(pl.col("model").is_in(["isoforest", "ocsvm"]))
.group_by(["model", "latent_dim"])
.agg(
n=pl.len(),
train_mean=pl.mean("train_time"),
train_std=pl.std("train_time", ddof=1),
)
.with_columns(
method=pl.when(pl.col("model") == "isoforest")
.then(pl.lit("IsoForest"))
.when(pl.col("model") == "ocsvm")
.then(pl.lit("OCSVM"))
.otherwise(pl.lit("Baseline"))
)
)
# --- Standardize schemas before concat ---
ds_std = ds.select(
pl.col("latent_dim").cast(pl.Int64),
pl.col("method").cast(pl.Utf8),
pl.col("train_mean").cast(pl.Float64),
pl.col("train_std").cast(pl.Float64),
pl.col("tpe_mean").cast(pl.Float64),
pl.col("tpe_std").cast(pl.Float64),
pl.col("n").cast(pl.Int64),
)
bl_std = bl.select(
pl.col("latent_dim").cast(pl.Int64),
pl.col("method").cast(pl.Utf8),
pl.col("train_mean").cast(pl.Float64),
pl.col("train_std").cast(pl.Float64),
pl.lit(None, dtype=pl.Float64).alias("tpe_mean"),
pl.lit(None, dtype=pl.Float64).alias("tpe_std"),
pl.col("n").cast(pl.Int64),
)
agg = pl.concat([ds_std, bl_std], how="vertical")
# Format cell: total [s]; DeepSAD also appends (italic) per-epoch
def _fmt_train_cell(s: dict) -> str:
total = _fmt_mean_std_n(s["train_mean"], s["train_std"], s["n"], "s")
if s.get("tpe_mean") is None or (
isinstance(s.get("tpe_mean"), float) and np.isnan(s["tpe_mean"])
):
return total
tpe = _fmt_mean_std_n(s["tpe_mean"], s["tpe_std"], None, "s/epoch")
return f"{total} (\\textit{{{tpe}}})"
tbl = agg.with_columns(
pl.struct(["train_mean", "train_std", "tpe_mean", "tpe_std", "n"])
.map_elements(_fmt_train_cell, return_dtype=pl.Utf8)
.alias("train_fmt"),
).select("latent_dim", "method", "train_fmt")
# Pivot and order columns nicely
wide = tbl.pivot(
values="train_fmt",
index=["latent_dim"],
columns="method",
aggregate_function="first",
).sort("latent_dim")
pdf = wide.fill_null("-").to_pandas().set_index("latent_dim")
desired_cols = [
c
for c in ["DeepSAD / LeNet", "DeepSAD / Efficient", "IsoForest", "OCSVM"]
if c in pdf.columns
]
if desired_cols:
pdf = pdf.reindex(columns=desired_cols)
latex = pdf.to_latex(
index=True,
escape=True,
na_rep="-",
multicolumn=True,
multicolumn_format="c",
bold_rows=False,
caption="Training runtime: total seconds (mean ± std). DeepSAD cells also show \\textit{seconds per epoch} in parentheses.",
label="tab:train_runtimes_compact",
)
return latex
def make_inference_latency_table_compact(df: pl.DataFrame) -> str:
per_fold = _prepare_per_fold_metrics(df)
# DeepSAD: keep networks; collapse semis
ds = (
per_fold.filter(pl.col("model") == "deepsad")
.group_by(["model", "network_disp", "latent_dim"])
.agg(
n=pl.len(),
lat_mean=pl.mean("latency_ms"),
lat_std=pl.std("latency_ms", ddof=1),
)
.with_columns(
method=pl.format("DeepSAD / {}", pl.col("network_disp")),
)
)
# Baselines: collapse networks & semis
bl = (
per_fold.filter(pl.col("model").is_in(["isoforest", "ocsvm"]))
.group_by(["model", "latent_dim"])
.agg(
n=pl.len(),
lat_mean=pl.mean("latency_ms"),
lat_std=pl.std("latency_ms", ddof=1),
)
.with_columns(
method=pl.when(pl.col("model") == "isoforest")
.then(pl.lit("IsoForest"))
.when(pl.col("model") == "ocsvm")
.then(pl.lit("OCSVM"))
.otherwise(pl.lit("Baseline"))
)
)
# --- Standardize schemas before concat ---
ds_std = ds.select(
pl.col("latent_dim").cast(pl.Int64),
pl.col("method").cast(pl.Utf8),
pl.col("lat_mean").cast(pl.Float64),
pl.col("lat_std").cast(pl.Float64),
pl.col("n").cast(pl.Int64),
)
bl_std = bl.select(
pl.col("latent_dim").cast(pl.Int64),
pl.col("method").cast(pl.Utf8),
pl.col("lat_mean").cast(pl.Float64),
pl.col("lat_std").cast(pl.Float64),
pl.col("n").cast(pl.Int64),
)
agg = pl.concat([ds_std, bl_std], how="vertical")
def _fmt_lat_cell(s: dict) -> str:
return _fmt_mean_std_n(s["lat_mean"], s["lat_std"], s["n"], "ms")
tbl = agg.with_columns(
pl.struct(["lat_mean", "lat_std", "n"])
.map_elements(_fmt_lat_cell, return_dtype=pl.Utf8)
.alias("lat_fmt"),
).select("latent_dim", "method", "lat_fmt")
wide = tbl.pivot(
values="lat_fmt",
index=["latent_dim"],
columns="method",
aggregate_function="first",
).sort("latent_dim")
pdf = wide.fill_null("-").to_pandas().set_index("latent_dim")
desired_cols = [
c
for c in ["DeepSAD / LeNet", "DeepSAD / Efficient", "IsoForest", "OCSVM"]
if c in pdf.columns
]
if desired_cols:
pdf = pdf.reindex(columns=desired_cols)
latex = pdf.to_latex(
index=True,
escape=True,
na_rep="-",
multicolumn=True,
multicolumn_format="c",
bold_rows=False,
caption="Inference latency (ms/sample): mean ± std across folds; baselines collapsed across networks and semi-labeling.",
label="tab:inference_latency_compact",
)
return latex
def make_ae_pretraining_runtime_table(df_pre: pl.DataFrame) -> str:
"""
LaTeX table: Autoencoder (pretraining) runtime per latent dim.
Rows: latent_dim
Cols: AE / LeNet, AE / Efficient (mean ± std seconds across folds)
"""
# minimal columns we need
base = df_pre.select(
pl.col("network").cast(pl.Utf8),
pl.col("latent_dim").cast(pl.Int64),
pl.col("fold").cast(pl.Int64),
pl.col("train_time").cast(pl.Float64),
).drop_nulls(subset=["network", "latent_dim", "train_time"])
# Nice display label for network
network_disp = (
pl.when(pl.col("network").str.contains("efficient"))
.then(pl.lit("Efficient"))
.when(pl.col("network").str.contains("LeNet"))
.then(pl.lit("LeNet"))
.otherwise(pl.col("network"))
.alias("network_disp")
)
agg = (
base.with_columns(network_disp)
.group_by(["network_disp", "latent_dim"])
.agg(
n=pl.len(),
train_mean=pl.mean("train_time"),
train_std=pl.std("train_time", ddof=1),
)
.with_columns(
pl.format("AE / {}", pl.col("network_disp")).alias("method"),
pl.struct(["train_mean", "train_std", "n"])
.map_elements(
lambda s: _fmt_mean_std(s["train_mean"], s["train_std"], s["n"]),
return_dtype=pl.Utf8,
)
.alias("train_fmt"),
)
.select("latent_dim", "method", "train_fmt")
.sort(["latent_dim", "method"])
)
wide = agg.pivot(
values="train_fmt",
index=["latent_dim"],
columns="method",
aggregate_function="first",
).sort("latent_dim")
pdf = wide.fill_null("-").to_pandas().set_index("latent_dim")
# Order columns if both exist (method names are built as "AE / <network>" above)
desired = [
c for c in ["AE / LeNet", "AE / Efficient"] if c in pdf.columns
]
if desired:
pdf = pdf.reindex(columns=desired)
latex = pdf.to_latex(
index=True,
escape=True,
na_rep="-",
multicolumn=True,
multicolumn_format="c",
bold_rows=False,
caption="Autoencoder pretraining runtime (seconds): mean ± std across folds.",
label="tab:ae_pretrain_runtimes",
)
return latex
# ----------------------------
# Main
# ----------------------------
def main():
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Main results
df = load_results_dataframe(RESULTS_ROOT, allow_cache=True)
if "config_json" not in df.columns:
df = df.with_columns(pl.lit(None).alias("config_json"))
# AE pretraining results
df_pre = load_pretraining_results_dataframe(RESULTS_ROOT, allow_cache=True)
# Build LaTeX tables
latex_train = make_training_runtime_table(df)
latex_test = make_inference_runtime_table(df)
latex_train_params = make_longform_train_table_with_params(df)
latex_ae = make_ae_pretraining_runtime_table(df_pre)
# Timestamped output dirs
ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
ts_dir = OUTPUT_DIR / "archive" / ts
ts_dir.mkdir(parents=True, exist_ok=True)
# Write files
(ts_dir / "train_runtimes.tex").write_text(latex_train)
(ts_dir / "test_runtimes.tex").write_text(latex_test)
(ts_dir / "train_runtimes_with_params.tex").write_text(latex_train_params)
(ts_dir / "ae_pretraining_runtimes.tex").write_text(latex_ae)
# Save script & mirror latest
script_path = Path(__file__)
shutil.copy2(script_path, ts_dir)
latest = OUTPUT_DIR / "latest"
latest.mkdir(exist_ok=True, parents=True)
for f in ts_dir.iterdir():
if f.is_file():
shutil.copy2(f, latest / f.name)
print(f"Saved LaTeX tables to: {ts_dir}")
print(f"Also updated: {latest}")
if __name__ == "__main__":
main()