Files
mt/tools/load_results.py
2025-09-02 16:30:32 +02:00

761 lines
26 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import json
import pickle
from itertools import product
from pathlib import Path
from typing import Any, Dict, List, Optional
import numpy as np
import polars as pl
# --- configure your intended grid here (use the *canonical* strings used in df) ---
# These are the sweep axes; check_grid_coverage_and_shapes() uses them as defaults.
NETWORKS_EXPECTED: list[str] = ["subter_LeNet", "subter_efficient"]
LATENT_DIMS_EXPECTED: list[int] = [32, 64, 128, 256, 512, 768, 1024]
# (semi_normals, semi_anomalous) labeled-sample count pairs
SEMI_LABELS_EXPECTED: list[tuple[int, int]] = [(0, 0), (50, 10), (500, 100)]
MODELS_EXPECTED: list[str] = ["deepsad", "isoforest", "ocsvm"]
EVALS_EXPECTED: list[str] = ["exp_based", "manual_based"]
# If k-fold is uniform, set it. If None, we infer it *per combo* from df.
EXPECTED_K_FOLD: int | None = None  # e.g., 3
def add_shape_columns(df: pl.DataFrame) -> pl.DataFrame:
    """Append *_len columns with the element counts of the nested fields.

    Null curves/lists stay null (guarded with when/otherwise so struct field
    access is never attempted on a null row).
    """

    def _struct_field_len(col: str, field: str) -> pl.Expr:
        # length of a list field inside a struct column, null-safe
        return (
            pl.when(pl.col(col).is_null())
            .then(None)
            .otherwise(pl.col(col).struct.field(field).list.len())
        )

    def _list_len(col: str) -> pl.Expr:
        # length of a plain list column, null-safe
        return (
            pl.when(pl.col(col).is_null())
            .then(None)
            .otherwise(pl.col(col).list.len())
        )

    return df.with_columns(
        # ROC lens
        roc_fpr_len=_struct_field_len("roc_curve", "fpr"),
        roc_tpr_len=_struct_field_len("roc_curve", "tpr"),
        roc_thr_len=_struct_field_len("roc_curve", "thr"),
        # PRC lens
        prc_prec_len=_struct_field_len("prc_curve", "precision"),
        prc_rec_len=_struct_field_len("prc_curve", "recall"),
        prc_thr_len=_struct_field_len("prc_curve", "thr"),
        # scores lens
        scores_len=_list_len("scores"),
        # deepsad-only arrays (None for others)
        idxs_len=_list_len("sample_indices"),
        labels_len=_list_len("sample_labels"),
        vmask_len=_list_len("valid_mask"),
    )
def check_grid_coverage_and_shapes(
    df: pl.DataFrame,
    networks=NETWORKS_EXPECTED,
    latent_dims=LATENT_DIMS_EXPECTED,
    semi_labels=SEMI_LABELS_EXPECTED,
    models=MODELS_EXPECTED,
    evals=EVALS_EXPECTED,
    expected_k_fold: int | None = EXPECTED_K_FOLD,
):
    """
    Audit `df` against the intended experiment grid and report four things:

      1. missing combos       -- no rows at all for a grid point
      2. incomplete combos    -- some folds absent for a grid point
      3. shape_inconsistent   -- array lengths differ across folds within a combo
      4. cross_model_diffs    -- shapes differ across models at a fixed
                                 (network, latent_dim, semi, eval)

    Prints a human-readable report and returns the raw findings as a dict.

    Fix vs. previous version: length sets can legitimately mix ``None`` and
    ints (e.g. a null curve in one fold); plain ``sorted()`` on such a set
    raises ``TypeError`` in Python 3, so all sorting/comparison below is
    None-aware.
    """
    dfx = add_shape_columns(df)

    def _sorted_vals(values):
        # None is not orderable against int; sort ints first, None last.
        return sorted(values, key=lambda v: (v is None, 0 if v is None else v))

    def _nonnull_key(values):
        # Canonical hashable representation of a set of lengths, ignoring
        # None entries (None-only columns are expected, e.g. the
        # deepsad-only arrays for other models).
        return tuple(sorted(v for v in values if v is not None))

    # helper: get rows for a specific base combo
    def subframe(net, lat, s_norm, s_anom, mdl, ev):
        return dfx.filter(
            (pl.col("network") == net)
            & (pl.col("latent_dim") == lat)
            & (pl.col("semi_normals") == s_norm)
            & (pl.col("semi_anomalous") == s_anom)
            & (pl.col("model") == mdl)
            & (pl.col("eval") == ev)
        )

    missing = []
    incomplete = []  # (combo, expected_folds, present_folds)
    shape_inconsistent = []  # (combo, metric_name, values_by_fold)
    cross_model_diffs = []  # (net, lat, semi, ev, metric_name, shapes_by_model)

    # columns produced by add_shape_columns(), checked per combo
    shape_cols = [
        "scores_len",
        "roc_fpr_len",
        "roc_tpr_len",
        "roc_thr_len",
        "prc_prec_len",
        "prc_rec_len",
        "prc_thr_len",
        "idxs_len",
        "labels_len",
        "vmask_len",
    ]

    # 1) Coverage + within-combo shapes
    for net, lat, (s_norm, s_anom), mdl, ev in product(
        networks, latent_dims, semi_labels, models, evals
    ):
        sf = subframe(net, lat, s_norm, s_anom, mdl, ev).select(
            "fold", "k_fold_num", *shape_cols
        )
        if sf.height == 0:
            missing.append(
                dict(
                    network=net,
                    latent_dim=lat,
                    semi_normals=s_norm,
                    semi_anomalous=s_anom,
                    model=mdl,
                    eval=ev,
                )
            )
            continue
        # folds present vs expected
        folds_present = sorted(sf.get_column("fold").unique().to_list())
        if expected_k_fold is not None:
            kexp = expected_k_fold
        else:
            # infer from rows (take max k_fold_num within this combo)
            kexp = int(sf.get_column("k_fold_num").max())
        all_expected_folds = list(range(kexp))
        if folds_present != all_expected_folds:
            incomplete.append(
                dict(
                    network=net,
                    latent_dim=lat,
                    semi_normals=s_norm,
                    semi_anomalous=s_anom,
                    model=mdl,
                    eval=ev,
                    expected_folds=all_expected_folds,
                    present_folds=folds_present,
                )
            )
        # shape consistency across folds (for this combo):
        # more than one distinct non-null length -> inconsistent
        for colname in shape_cols:
            vals = sf.select(colname).to_series()
            # Allow None-only columns (e.g., deepsad-only fields for other models)
            if len({v for v in vals.to_list() if v is not None}) > 1:
                # store per-fold values to help debug
                per_fold = (
                    sf.select("fold", pl.col(colname))
                    .sort("fold")
                    .to_dict(as_series=False)
                )
                shape_inconsistent.append(
                    dict(
                        network=net,
                        latent_dim=lat,
                        semi_normals=s_norm,
                        semi_anomalous=s_anom,
                        model=mdl,
                        eval=ev,
                        metric=colname,
                        per_fold=per_fold,
                    )
                )

    # 2) Cross-model comparability at fixed (net,lat,semi,eval)
    # We compare shapes that *should* logically match across models:
    #   - scores_len (same number of test samples)
    #   - idxs/labels/vmask (only deepsad fills them; we tolerate None elsewhere)
    # ROC/PRC binning can differ across models; we *report* those differences
    # for awareness.
    base_keys = (
        df.select("network", "latent_dim", "semi_normals", "semi_anomalous", "eval")
        .unique()
        .iter_rows()
    )
    for net, lat, s_norm, s_anom, ev in base_keys:
        rows = (
            dfx.filter(
                (pl.col("network") == net)
                & (pl.col("latent_dim") == lat)
                & (pl.col("semi_normals") == s_norm)
                & (pl.col("semi_anomalous") == s_anom)
                & (pl.col("eval") == ev)
            )
            .group_by("model")
            .agg(
                pl.col("scores_len").unique().alias("scores_len_set"),
                pl.col("idxs_len").unique().alias("idxs_len_set"),
                pl.col("labels_len").unique().alias("labels_len_set"),
                pl.col("vmask_len").unique().alias("vmask_len_set"),
                pl.col("roc_fpr_len").unique().alias("roc_fpr_len_set"),
                pl.col("prc_prec_len").unique().alias("prc_prec_len_set"),
            )
            .to_dict(as_series=False)
        )
        if not rows:
            continue
        mdls = rows["model"]
        # metrics compared across models; None entries ignored for the
        # comparison itself, but shown in the report
        for metric, set_col in (
            ("scores_len", "scores_len_set"),
            ("roc_fpr_len", "roc_fpr_len_set"),  # binning diffs are expected
            ("prc_prec_len", "prc_prec_len_set"),  # binning diffs are expected
        ):
            per_model_sets = [set(x) for x in rows[set_col]]
            if len({_nonnull_key(s) for s in per_model_sets}) > 1:
                cross_model_diffs.append(
                    dict(
                        network=net,
                        latent_dim=lat,
                        semi_normals=s_norm,
                        semi_anomalous=s_anom,
                        eval=ev,
                        metric=metric,
                        by_model={
                            m: _sorted_vals(per_model_sets[i])
                            for i, m in enumerate(mdls)
                        },
                    )
                )

    # --- Print a readable report ---
    print("\n=== GRID COVERAGE ===")
    print(f"Missing combos: {len(missing)}")
    for m in missing[:20]:
        print("  ", m)
    if len(missing) > 20:
        print(f"  ... (+{len(missing) - 20} more)")
    print("\nIncomplete combos (folds missing):", len(incomplete))
    for inc in incomplete[:20]:
        print(
            "  ",
            {
                k: inc[k]
                for k in [
                    "network",
                    "latent_dim",
                    "semi_normals",
                    "semi_anomalous",
                    "model",
                    "eval",
                ]
            },
            "expected",
            inc["expected_folds"],
            "present",
            inc["present_folds"],
        )
    if len(incomplete) > 20:
        print(f"  ... (+{len(incomplete) - 20} more)")
    print("\n=== WITHIN-COMBO SHAPE CONSISTENCY (across folds) ===")
    print(f"Mismatching groups: {len(shape_inconsistent)}")
    for s in shape_inconsistent[:15]:
        hdr = {
            k: s[k]
            for k in [
                "network",
                "latent_dim",
                "semi_normals",
                "semi_anomalous",
                "model",
                "eval",
                "metric",
            ]
        }
        print("  ", hdr, "values:", s["per_fold"])
    if len(shape_inconsistent) > 15:
        print(f"  ... (+{len(shape_inconsistent) - 15} more)")
    print("\n=== CROSS-MODEL COMPARABILITY (by shape) ===")
    print(
        f"Shape differences across models at fixed (net,lat,semi,eval): {len(cross_model_diffs)}"
    )
    for s in cross_model_diffs[:15]:
        hdr = {
            k: s[k]
            for k in [
                "network",
                "latent_dim",
                "semi_normals",
                "semi_anomalous",
                "eval",
                "metric",
            ]
        }
        print("  ", hdr, "by_model:", s["by_model"])
    if len(cross_model_diffs) > 15:
        print(f"  ... (+{len(cross_model_diffs) - 15} more)")
    # Return the raw details if you want to use them programmatically
    return {
        "missing": missing,
        "incomplete": incomplete,
        "shape_inconsistent": shape_inconsistent,
        "cross_model_diffs": cross_model_diffs,
    }
# ------------------------------------------------------------
# Config you can tweak
# ------------------------------------------------------------
# NOTE(review): duplicates MODELS_EXPECTED / EVALS_EXPECTED above — keep them in sync.
MODELS: list[str] = ["deepsad", "isoforest", "ocsvm"]
EVALS: list[str] = ["exp_based", "manual_based"]
# Column schema for the frame built by build_results_frame(); one row per
# (experiment folder, model, fold, eval).
SCHEMA_STATIC: dict[str, Any] = {
    # identifiers / dims
    "network": pl.Utf8,  # e.g. "LeNet", "efficient"
    "latent_dim": pl.Int32,
    "semi_normals": pl.Int32,
    "semi_anomalous": pl.Int32,
    "model": pl.Utf8,  # "deepsad" | "isoforest" | "ocsvm"
    "eval": pl.Utf8,  # "exp_based" | "manual_based"
    "fold": pl.Int32,
    # metrics
    "auc": pl.Float64,
    "ap": pl.Float64,
    # per-sample scores: list of (idx, label, score)
    "scores": pl.List(
        pl.Struct(
            {
                "sample_idx": pl.Int32,  # dataloader idx
                "orig_label": pl.Int8,  # {-1,0,1}
                "score": pl.Float64,  # anomaly score
            }
        )
    ),
    # curves (normalized)
    "roc_curve": pl.Struct(
        {
            "fpr": pl.List(pl.Float64),
            "tpr": pl.List(pl.Float64),
            "thr": pl.List(pl.Float64),
        }
    ),
    "prc_curve": pl.Struct(
        {
            "precision": pl.List(pl.Float64),
            "recall": pl.List(pl.Float64),
            "thr": pl.List(pl.Float64),  # may be len(precision)-1
        }
    ),
    # deepsad-only per-eval arrays (None for other models)
    "sample_indices": pl.List(pl.Int32),
    "sample_labels": pl.List(pl.Int8),
    "valid_mask": pl.List(pl.Boolean),
    # timings / housekeeping
    "train_time": pl.Float64,
    "test_time": pl.Float64,
    "folder": pl.Utf8,
    "k_fold_num": pl.Int32,
}
# ------------------------------------------------------------
# Helpers: curve/scores normalizers (tuples/ndarrays -> dict/list)
# ------------------------------------------------------------
def _tolist(x):
    """Best-effort conversion of *x* to a plain Python list.

    ndarray -> ``.tolist()``; list/tuple -> shallow list copy; None -> None;
    anything else is wrapped as a one-element list.
    """
    if x is None:
        return None
    if isinstance(x, np.ndarray):
        return x.tolist()
    if isinstance(x, (tuple, list)):
        return [*x]
    try:
        # scalar (or any other object): wrap it
        return [x]
    except Exception:
        return None
def normalize_roc(obj: Any) -> Optional[dict]:
    """
    Normalize a ROC curve into ``{"fpr": [...], "tpr": [...], "thr": [...]}``.

    Accepts sklearn-style ``(fpr, tpr[, thr])`` tuples/lists, or dicts with
    ``fpr``/``tpr``/``thr`` (falling back to ``x``/``y``/``thresholds``) keys.
    Returns None when the input is missing, unrecognized, or lacks fpr/tpr.
    """
    if obj is None:
        return None

    def _pick(d: dict, primary: str, fallback: str):
        # Explicit None check instead of `d.get(a) or d.get(b)`: `or`
        # evaluates truthiness, which raises ValueError for multi-element
        # ndarrays and wrongly discards legitimately empty lists.
        val = d.get(primary)
        return val if val is not None else d.get(fallback)

    fpr = tpr = thr = None
    if isinstance(obj, (tuple, list)):
        if len(obj) >= 2:
            fpr, tpr = _tolist(obj[0]), _tolist(obj[1])
        if len(obj) >= 3:
            thr = _tolist(obj[2])
    elif isinstance(obj, dict):
        fpr = _tolist(_pick(obj, "fpr", "x"))
        tpr = _tolist(_pick(obj, "tpr", "y"))
        thr = _tolist(_pick(obj, "thr", "thresholds"))
    else:
        return None
    if fpr is None or tpr is None:
        return None
    return {"fpr": fpr, "tpr": tpr, "thr": thr}
def normalize_prc(obj: Any) -> Optional[dict]:
    """
    Normalize a PRC curve into
    ``{"precision": [...], "recall": [...], "thr": [...]}``.

    Accepts sklearn-style ``(precision, recall[, thr])`` tuples/lists, or
    dicts with ``precision``/``recall``/``thr`` (falling back to
    ``y``/``x``/``thresholds``) keys. Returns None when the input is missing,
    unrecognized, or lacks precision/recall.
    """
    if obj is None:
        return None

    def _pick(d: dict, primary: str, fallback: str):
        # Explicit None check instead of `d.get(a) or d.get(b)`: `or`
        # evaluates truthiness, which raises ValueError for multi-element
        # ndarrays and wrongly discards legitimately empty lists.
        val = d.get(primary)
        return val if val is not None else d.get(fallback)

    precision = recall = thr = None
    if isinstance(obj, (tuple, list)):
        if len(obj) >= 2:
            precision, recall = _tolist(obj[0]), _tolist(obj[1])
        if len(obj) >= 3:
            thr = _tolist(obj[2])
    elif isinstance(obj, dict):
        precision = _tolist(_pick(obj, "precision", "y"))
        recall = _tolist(_pick(obj, "recall", "x"))
        thr = _tolist(_pick(obj, "thr", "thresholds"))
    else:
        return None
    if precision is None or recall is None:
        return None
    return {"precision": precision, "recall": recall, "thr": thr}
def normalize_scores_to_struct(seq) -> Optional[List[dict]]:
    """
    Convert per-sample score records into a uniform list of dicts.

    Input: list of (idx, label, score) tuples (as produced in your test()).
    Output: list of dicts with keys sample_idx, orig_label, score; a bare
    numeric entry becomes a score-only record. Returns None for inputs that
    are not sequence-like.
    """
    if seq is None:
        return None
    if isinstance(seq, np.ndarray):
        seq = seq.tolist()
    if not isinstance(seq, (list, tuple)):
        return None

    def _record(entry) -> dict:
        if isinstance(entry, (list, tuple)) and len(entry) >= 3:
            idx, lab, sc = entry[0], entry[1], entry[2]
            return {
                "sample_idx": int(idx) if idx is not None else None,
                "orig_label": int(lab) if lab is not None else None,
                "score": float(sc) if sc is not None else None,
            }
        # fallback: single numeric -> score-only record
        numeric = isinstance(entry, (int, float, np.integer, np.floating))
        return {
            "sample_idx": None,
            "orig_label": None,
            "score": float(entry) if numeric else None,
        }

    return [_record(item) for item in seq]
def normalize_int_list(a) -> Optional[List[int]]:
    """
    Coerce an array-like of integer values to a plain ``list[int]``.

    Previously this returned ``list(a)`` without casting, so float or numpy
    scalar elements leaked through despite the declared return type; now each
    non-None element is cast with ``int()`` (None elements pass through,
    matching normalize_bool_list's elementwise treatment).
    """
    if a is None:
        return None
    if isinstance(a, np.ndarray):
        a = a.tolist()
    return [None if x is None else int(x) for x in a]
def normalize_bool_list(a) -> Optional[List[bool]]:
    """Coerce an array-like of truthy values to a plain ``list[bool]``
    (None input passes through as None)."""
    if a is None:
        return None
    values = a.tolist() if isinstance(a, np.ndarray) else a
    return list(map(bool, values))
# ------------------------------------------------------------
# Low-level: read one experiment folder
# ------------------------------------------------------------
def read_config(exp_dir: Path) -> dict:
    """Load ``config.json`` from an experiment folder.

    Raises ValueError when the run was not trained as k-fold (the loaders
    below rely on per-fold result files).
    """
    config = json.loads((exp_dir / "config.json").read_text())
    if not config.get("k_fold"):
        raise ValueError(f"{exp_dir.name}: not trained as k-fold")
    return config
def read_pickle(p: Path) -> Any:
    """Deserialize one pickle file and return its payload.

    NOTE: pickle executes arbitrary code on load — only use on trusted
    result files produced by this pipeline.
    """
    return pickle.loads(p.read_bytes())
# ------------------------------------------------------------
# Extractors for each model
# ------------------------------------------------------------
def rows_from_deepsad(data: dict, evals: List[str]) -> Dict[str, dict]:
    """
    Extract per-eval rows for deepsad results.

    deepsad stores results under data['test'][eval], with extra per-eval
    arrays (indices/labels/valid_mask) and AP present. Returns a mapping
    eval-name -> normalized row dict.
    """
    out: Dict[str, dict] = {}
    test = data.get("test", {})
    # timings are shared across evals; read them once
    train_time = data.get("train", {}).get("time")
    test_time = test.get("time")
    for ev in evals:
        evd = test.get(ev)
        if not isinstance(evd, dict):
            continue
        auc = evd.get("auc")
        ap = evd.get("ap")
        out[ev] = {
            "auc": None if auc is None else float(auc),
            "roc": normalize_roc(evd.get("roc")),
            "prc": normalize_prc(evd.get("prc")),
            "ap": None if ap is None else float(ap),
            "scores": normalize_scores_to_struct(evd.get("scores")),
            "sample_indices": normalize_int_list(evd.get("indices")),
            "sample_labels": normalize_int_list(evd.get("labels")),
            "valid_mask": normalize_bool_list(evd.get("valid_mask")),
            "train_time": train_time,
            "test_time": test_time,
        }
    return out
def rows_from_isoforest(data: dict, evals: List[str]) -> Dict[str, dict]:
    """
    Extract per-eval rows for isolation-forest results.

    Keys: test_auc_<eval>, test_roc_<eval>, test_prc_<eval>, test_ap_<eval>,
    test_scores_<eval>. An eval is skipped entirely when its AUC is absent.
    Returns a mapping eval-name -> normalized row dict.
    """
    out: Dict[str, dict] = {}
    for ev in evals:
        auc = data.get(f"test_auc_{ev}")
        if auc is None:
            continue
        ap = data.get(f"test_ap_{ev}")
        out[ev] = {
            "auc": float(auc),
            "roc": normalize_roc(data.get(f"test_roc_{ev}")),
            "prc": normalize_prc(data.get(f"test_prc_{ev}")),
            "ap": None if ap is None else float(ap),
            "scores": normalize_scores_to_struct(data.get(f"test_scores_{ev}")),
            # deepsad-only arrays: always absent for this model
            "sample_indices": None,
            "sample_labels": None,
            "valid_mask": None,
            "train_time": data.get("train_time"),
            "test_time": data.get("test_time"),
        }
    return out
def rows_from_ocsvm_default(data: dict, evals: List[str]) -> Dict[str, dict]:
    """
    Extract per-eval rows for the default OCSVM (the linear variant is
    ignored entirely).

    The default OCSVM pickles use exactly the same key layout as isoforest
    (test_auc_<eval>, test_roc_<eval>, test_prc_<eval>, test_ap_<eval>,
    test_scores_<eval>, train_time, test_time), so delegate to that extractor
    instead of maintaining a byte-for-byte duplicated body.
    """
    return rows_from_isoforest(data, evals)
# ------------------------------------------------------------
# Build the Polars DataFrame
# ------------------------------------------------------------
def build_results_frame(root: Path) -> pl.DataFrame:
    """
    Walk experiment subdirs under `root` and build a typed Polars frame with
    one row per (experiment, model, fold, eval).

    Columns (SCHEMA_STATIC):
      network, latent_dim, semi_normals, semi_anomalous,
      model, eval, fold,
      auc, ap, scores{sample_idx,orig_label,score},
      roc_curve{fpr,tpr,thr}, prc_curve{precision,recall,thr},
      sample_indices, sample_labels, valid_mask,
      train_time, test_time,
      folder, k_fold_num

    Folders with unreadable/malformed configs and unreadable pickles are
    skipped with a warning instead of aborting the whole walk.
    """
    # model name -> pickle extractor; each returns {eval_name: row-dict}
    extractors = {
        "deepsad": rows_from_deepsad,
        "isoforest": rows_from_isoforest,
        "ocsvm": rows_from_ocsvm_default,
    }
    rows: List[dict] = []
    exp_dirs = [p for p in root.iterdir() if p.is_dir()]
    for exp_dir in sorted(exp_dirs):
        try:
            cfg = read_config(exp_dir)
            # Parse inside the try: a config with a missing key or non-int
            # value should skip this folder with a warning, not crash the
            # entire walk (previously int(None) would raise uncaught).
            network = cfg["net_name"]
            latent_dim = int(cfg["latent_space_dim"])
            semi_normals = int(cfg["num_known_normal"])
            semi_anomalous = int(cfg["num_known_outlier"])
            k = int(cfg["k_fold_num"])
        except Exception as e:
            print(f"[warn] skipping {exp_dir.name}: {e}")
            continue
        for model in MODELS:
            extract = extractors.get(model)
            for fold in range(k):
                pkl = exp_dir / f"results_{model}_{fold}.pkl"
                if not pkl.exists():
                    continue
                try:
                    data = read_pickle(pkl)
                except Exception as e:
                    print(f"[warn] failed to read {pkl.name}: {e}")
                    continue
                per_eval = extract(data, EVALS) if extract is not None else {}
                for ev, vals in per_eval.items():
                    rows.append(
                        {
                            "network": network,
                            "latent_dim": latent_dim,
                            "semi_normals": semi_normals,
                            "semi_anomalous": semi_anomalous,
                            "model": model,
                            "eval": ev,
                            "fold": fold,
                            "auc": vals["auc"],
                            "ap": vals["ap"],
                            "scores": vals["scores"],
                            "roc_curve": vals["roc"],
                            "prc_curve": vals["prc"],
                            "sample_indices": vals.get("sample_indices"),
                            "sample_labels": vals.get("sample_labels"),
                            "valid_mask": vals.get("valid_mask"),
                            "train_time": vals["train_time"],
                            "test_time": vals["test_time"],
                            "folder": str(exp_dir),
                            "k_fold_num": k,
                        }
                    )
    # If empty, return a typed empty frame
    if not rows:
        return pl.DataFrame(schema=SCHEMA_STATIC)
    df = pl.DataFrame(rows, schema=SCHEMA_STATIC)
    # Cast to efficient dtypes (categoricals etc.), no extra sanitation
    df = df.with_columns(
        pl.col("network", "model", "eval").cast(pl.Categorical),
        pl.col(
            "latent_dim", "semi_normals", "semi_anomalous", "fold", "k_fold_num"
        ).cast(pl.Int32),
        pl.col("auc", "ap", "train_time", "test_time").cast(pl.Float64),
        # NOTE: no cast on 'scores' here; it's already List(Struct) per schema.
    )
    return df
# ------------------------------------------------------------
# Example “analysis-ready” queries (Polars idioms)
# ------------------------------------------------------------
def demo_queries(df: pl.DataFrame):
    """
    Example "analysis-ready" queries over the results frame.

    Returns (q1, q2, roc_subset):
      q1         -- mean AUC per model for one fixed grid point
      q2         -- fold x latent_dim AUC pivot for deepsad
      roc_subset -- per-fold ROC arrays for one ocsvm grid point

    NOTE(review): the filters below use network names "LeNet"/"efficient",
    while NETWORKS_EXPECTED lists "subter_LeNet"/"subter_efficient" — if the
    frame stores the latter, these demos match zero rows; confirm which
    canonical strings the `network` column actually holds.
    """
    # q1: lazy is fine, then collect
    q1 = (
        df.lazy()
        .filter(
            (pl.col("network") == "LeNet")
            & (pl.col("latent_dim") == 1024)
            & (pl.col("semi_normals") == 0)
            & (pl.col("semi_anomalous") == 0)
            & (pl.col("eval") == "exp_based")
        )
        .group_by(["model"])
        .agg(pl.col("auc").mean().alias("mean_auc"))
        .sort(["mean_auc"], descending=True)
        .collect()
    )
    # q2: do the filtering eagerly, then pivot (LazyFrame has no .pivot)
    base = df.filter(
        (pl.col("model") == "deepsad")
        & (pl.col("eval") == "exp_based")
        & (pl.col("network") == "LeNet")
        & (pl.col("semi_normals") == 0)
        & (pl.col("semi_anomalous") == 0)
    ).select("fold", "latent_dim", "auc")
    # NOTE(review): newer polars renamed pivot's `columns=` parameter to
    # `on=` — confirm against the pinned polars version.
    q2 = base.pivot(
        values="auc",
        index="fold",
        columns="latent_dim",
        aggregate_function="first",  # or "mean" if duplicates exist
    ).sort("fold")
    # roc_subset: eager filter/select, then unpack struct fields into columns
    roc_subset = (
        df.filter(
            (pl.col("model") == "ocsvm")
            & (pl.col("eval") == "manual_based")
            & (pl.col("network") == "efficient")
            & (pl.col("latent_dim") == 1024)
            & (pl.col("semi_normals") == 0)
            & (pl.col("semi_anomalous") == 0)
        )
        .select("fold", "roc_curve")
        .with_columns(
            pl.col("roc_curve").struct.field("fpr").alias("fpr"),
            pl.col("roc_curve").struct.field("tpr").alias("tpr"),
            pl.col("roc_curve").struct.field("thr").alias("thr"),
        )
    )
    return q1, q2, roc_subset
def main(root: str | Path = "/home/fedex/mt/results/done") -> None:
    """Build the results frame from `root`, run the demo queries, and print
    the grid-coverage report.

    `root` was previously hard-coded; it is now a backward-compatible
    parameter (calling main() with no arguments behaves as before).
    """
    df = build_results_frame(Path(root))
    q1, q2, roc_subset = demo_queries(df)
    print(df.shape, df.head())
    # --- run it ---
    report = check_grid_coverage_and_shapes(df)
    print(report)


if __name__ == "__main__":
    main()