From 3d968c305c02a7a616165c9fcbc5f9aa419633ea Mon Sep 17 00:00:00 2001
From: Jan Kowalczyk
Date: Tue, 2 Sep 2025 16:30:32 +0200
Subject: [PATCH] load results into polars df

---
 tools/load_results.py | 760 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 760 insertions(+)
 create mode 100644 tools/load_results.py

diff --git a/tools/load_results.py b/tools/load_results.py
new file mode 100644
index 0000000..39ba3ee
--- /dev/null
+++ b/tools/load_results.py
@@ -0,0 +1,760 @@
+from __future__ import annotations
+
+import json
+import pickle
+from itertools import product
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+import polars as pl
+
+# --- configure your intended grid here (use the *canonical* strings used in df) ---
+NETWORKS_EXPECTED = ["subter_LeNet", "subter_efficient"]
+LATENT_DIMS_EXPECTED = [32, 64, 128, 256, 512, 768, 1024]
+SEMI_LABELS_EXPECTED = [(0, 0), (50, 10), (500, 100)]
+MODELS_EXPECTED = ["deepsad", "isoforest", "ocsvm"]
+EVALS_EXPECTED = ["exp_based", "manual_based"]
+
+# If k-fold is uniform, set it. If None, we infer it *per combo* from df.
+EXPECTED_K_FOLD: int | None = None  # e.g., 3
+
+
+def add_shape_columns(df: pl.DataFrame) -> pl.DataFrame:
+    return df.with_columns(
+        # ROC lens
+        roc_fpr_len=pl.when(pl.col("roc_curve").is_null())
+        .then(None)
+        .otherwise(pl.col("roc_curve").struct.field("fpr").list.len()),
+        roc_tpr_len=pl.when(pl.col("roc_curve").is_null())
+        .then(None)
+        .otherwise(pl.col("roc_curve").struct.field("tpr").list.len()),
+        roc_thr_len=pl.when(pl.col("roc_curve").is_null())
+        .then(None)
+        .otherwise(pl.col("roc_curve").struct.field("thr").list.len()),
+        # PRC lens
+        prc_prec_len=pl.when(pl.col("prc_curve").is_null())
+        .then(None)
+        .otherwise(pl.col("prc_curve").struct.field("precision").list.len()),
+        prc_rec_len=pl.when(pl.col("prc_curve").is_null())
+        .then(None)
+        .otherwise(pl.col("prc_curve").struct.field("recall").list.len()),
+        prc_thr_len=pl.when(pl.col("prc_curve").is_null())
+        .then(None)
+        .otherwise(pl.col("prc_curve").struct.field("thr").list.len()),
+        # scores lens
+        scores_len=pl.when(pl.col("scores").is_null())
+        .then(None)
+        .otherwise(pl.col("scores").list.len()),
+        # deepsad-only arrays (None for others)
+        idxs_len=pl.when(pl.col("sample_indices").is_null())
+        .then(None)
+        .otherwise(pl.col("sample_indices").list.len()),
+        labels_len=pl.when(pl.col("sample_labels").is_null())
+        .then(None)
+        .otherwise(pl.col("sample_labels").list.len()),
+        vmask_len=pl.when(pl.col("valid_mask").is_null())
+        .then(None)
+        .otherwise(pl.col("valid_mask").list.len()),
+    )
+
+
+def check_grid_coverage_and_shapes(
+    df: pl.DataFrame,
+    networks=NETWORKS_EXPECTED,
+    latent_dims=LATENT_DIMS_EXPECTED,
+    semi_labels=SEMI_LABELS_EXPECTED,
+    models=MODELS_EXPECTED,
+    evals=EVALS_EXPECTED,
+    expected_k_fold: int | None = EXPECTED_K_FOLD,
+):
+    dfx = add_shape_columns(df)
+
+    # helper: get rows for a specific base combo
+    def subframe(net, lat, s_norm, s_anom, mdl, ev):
+        return dfx.filter(
+            (pl.col("network") == net)
+            & (pl.col("latent_dim") == lat)
+            & (pl.col("semi_normals") == s_norm)
+            & (pl.col("semi_anomalous") == s_anom)
+            & (pl.col("model") == mdl)
+            & (pl.col("eval") == ev)
+        )
+
+    missing = []
+    incomplete = []  # (combo, expected_folds, present_folds)
+    shape_inconsistent = []  # (combo, metric_name, values_by_fold)
+    cross_model_diffs = []  # (net, lat, semi, ev, metric_name, shapes_by_model)
+
+    # 1) Coverage + within-combo shapes
+    for net, lat, (s_norm, s_anom), mdl, ev in product(
+        networks, latent_dims, semi_labels, models, evals
+    ):
+        sf = subframe(net, lat, s_norm, s_anom, mdl, ev).select(
+            "fold",
+            "k_fold_num",
+            "scores_len",
+            "roc_fpr_len",
+            "roc_tpr_len",
+            "roc_thr_len",
+            "prc_prec_len",
+            "prc_rec_len",
+            "prc_thr_len",
+            "idxs_len",
+            "labels_len",
+            "vmask_len",
+        )
+
+        if sf.height == 0:
+            missing.append(
+                dict(
+                    network=net,
+                    latent_dim=lat,
+                    semi_normals=s_norm,
+                    semi_anomalous=s_anom,
+                    model=mdl,
+                    eval=ev,
+                )
+            )
+            continue
+
+        # folds present vs expected
+        folds_present = sorted(sf.get_column("fold").unique().to_list())
+        if expected_k_fold is not None:
+            kexp = expected_k_fold
+        else:
+            # infer from rows (take max k_fold_num within this combo)
+            kexp = int(sf.get_column("k_fold_num").max())
+        all_expected_folds = list(range(kexp))
+        if folds_present != all_expected_folds:
+            incomplete.append(
+                dict(
+                    network=net,
+                    latent_dim=lat,
+                    semi_normals=s_norm,
+                    semi_anomalous=s_anom,
+                    model=mdl,
+                    eval=ev,
+                    expected_folds=all_expected_folds,
+                    present_folds=folds_present,
+                )
+            )
+
+        # shape consistency across folds (for this combo):
+        # collect distinct values per metric
+        shape_cols = [
+            "scores_len",
+            "roc_fpr_len",
+            "roc_tpr_len",
+            "roc_thr_len",
+            "prc_prec_len",
+            "prc_rec_len",
+            "prc_thr_len",
+            "idxs_len",
+            "labels_len",
+            "vmask_len",
+        ]
+        for colname in shape_cols:
+            vals = sf.select(colname).to_series()
+            # Sort only non-null lengths: sorted() raises on a mixed {None, int}
+            # set, and None-only columns (e.g., deepsad-only fields for other
+            # models) are allowed anyway.
+            uniq = sorted({v for v in vals.to_list() if v is not None})
+            if len(uniq) > 1:
+                # store per-fold values to help debug
+                per_fold = (
+                    sf.select("fold", pl.col(colname))
+                    .sort("fold")
+                    .to_dict(as_series=False)
+                )
+                shape_inconsistent.append(
+                    dict(
+                        network=net,
+                        latent_dim=lat,
+                        semi_normals=s_norm,
+                        semi_anomalous=s_anom,
+                        model=mdl,
+                        eval=ev,
+                        metric=colname,
+                        per_fold=per_fold,
+                    )
+                )
+
+    # 2) Cross-model comparability at fixed (net,lat,semi,eval)
+    #    We compare shapes that *should* logically match across models:
+    #      - scores_len (same number of test samples)
+    #      - idxs/labels/vmask (only deepsad fills them; we tolerate None elsewhere)
+    #    ROC/PRC binning can differ across models; we *report* those differences for awareness.
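+    # A reported diff entry looks like this (hypothetical values, for illustration):
+    #   {"network": "subter_LeNet", "latent_dim": 64,
+    #    "semi_normals": 50, "semi_anomalous": 10, "eval": "exp_based",
+    #    "metric": "scores_len",
+    #    "by_model": {"deepsad": [1200], "isoforest": [1200], "ocsvm": [1187]}}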
+
+    base_keys = (
+        df.select("network", "latent_dim", "semi_normals", "semi_anomalous", "eval")
+        .unique()
+        .iter_rows()
+    )
+    for net, lat, s_norm, s_anom, ev in base_keys:
+        rows = (
+            dfx.filter(
+                (pl.col("network") == net)
+                & (pl.col("latent_dim") == lat)
+                & (pl.col("semi_normals") == s_norm)
+                & (pl.col("semi_anomalous") == s_anom)
+                & (pl.col("eval") == ev)
+            )
+            .group_by("model")
+            .agg(
+                pl.col("scores_len").unique().alias("scores_len_set"),
+                pl.col("idxs_len").unique().alias("idxs_len_set"),
+                pl.col("labels_len").unique().alias("labels_len_set"),
+                pl.col("vmask_len").unique().alias("vmask_len_set"),
+                pl.col("roc_fpr_len").unique().alias("roc_fpr_len_set"),
+                pl.col("prc_prec_len").unique().alias("prc_prec_len_set"),
+            )
+            .to_dict(as_series=False)
+        )
+        # to_dict always returns the keys, so check for an empty "model" column
+        if not rows["model"]:
+            continue
+        # normalize sets
+        mdls = rows["model"]
+        s_sets = [set(x) for x in rows["scores_len_set"]]
+        # compare scores_len across models (ignore None values)
+        s_normed = [tuple(sorted(v for v in s if v is not None)) for s in s_sets]
+        if len(set(s_normed)) > 1:
+            cross_model_diffs.append(
+                dict(
+                    network=net,
+                    latent_dim=lat,
+                    semi_normals=s_norm,
+                    semi_anomalous=s_anom,
+                    eval=ev,
+                    metric="scores_len",
+                    by_model={
+                        m: sorted(v for v in s_sets[i] if v is not None)
+                        for i, m in enumerate(mdls)
+                    },
+                )
+            )
+        # Report ROC/PRC binning diffs (expected); again drop None before sorting
+        roc_sets = [set(x) for x in rows["roc_fpr_len_set"]]
+        if len({tuple(sorted(v for v in ss if v is not None)) for ss in roc_sets}) > 1:
+            cross_model_diffs.append(
+                dict(
+                    network=net,
+                    latent_dim=lat,
+                    semi_normals=s_norm,
+                    semi_anomalous=s_anom,
+                    eval=ev,
+                    metric="roc_fpr_len",
+                    by_model={
+                        m: sorted(v for v in roc_sets[i] if v is not None)
+                        for i, m in enumerate(mdls)
+                    },
+                )
+            )
+        prc_sets = [set(x) for x in rows["prc_prec_len_set"]]
+        if len({tuple(sorted(v for v in ss if v is not None)) for ss in prc_sets}) > 1:
+            cross_model_diffs.append(
+                dict(
+                    network=net,
+                    latent_dim=lat,
+                    semi_normals=s_norm,
+                    semi_anomalous=s_anom,
+                    eval=ev,
+                    metric="prc_prec_len",
+                    by_model={
+                        m: sorted(v for v in prc_sets[i] if v is not None)
+                        for i, m in enumerate(mdls)
+                    },
+                )
+            )
+
+    # --- Print a readable report ---
+    print("\n=== GRID COVERAGE ===")
+    print(f"Missing combos: {len(missing)}")
+    for m in missing[:20]:
+        print("  ", m)
+    if len(missing) > 20:
+        print(f"  ... (+{len(missing) - 20} more)")
+
+    print("\nIncomplete combos (folds missing):", len(incomplete))
+    for inc in incomplete[:20]:
+        print(
+            "  ",
+            {
+                k: inc[k]
+                for k in [
+                    "network",
+                    "latent_dim",
+                    "semi_normals",
+                    "semi_anomalous",
+                    "model",
+                    "eval",
+                ]
+            },
+            "expected",
+            inc["expected_folds"],
+            "present",
+            inc["present_folds"],
+        )
+    if len(incomplete) > 20:
+        print(f"  ... (+{len(incomplete) - 20} more)")
+
+    print("\n=== WITHIN-COMBO SHAPE CONSISTENCY (across folds) ===")
+    print(f"Mismatching groups: {len(shape_inconsistent)}")
+    for s in shape_inconsistent[:15]:
+        hdr = {
+            k: s[k]
+            for k in [
+                "network",
+                "latent_dim",
+                "semi_normals",
+                "semi_anomalous",
+                "model",
+                "eval",
+                "metric",
+            ]
+        }
+        print("  ", hdr, "values:", s["per_fold"])
+    if len(shape_inconsistent) > 15:
+        print(f"  ... (+{len(shape_inconsistent) - 15} more)")
+
+    print("\n=== CROSS-MODEL COMPARABILITY (by shape) ===")
+    print(
+        f"Shape differences across models at fixed (net,lat,semi,eval): {len(cross_model_diffs)}"
+    )
+    for s in cross_model_diffs[:15]:
+        hdr = {
+            k: s[k]
+            for k in [
+                "network",
+                "latent_dim",
+                "semi_normals",
+                "semi_anomalous",
+                "eval",
+                "metric",
+            ]
+        }
+        print("  ", hdr, "by_model:", s["by_model"])
+    if len(cross_model_diffs) > 15:
+        print(f"  ... (+{len(cross_model_diffs) - 15} more)")
+
+    # Return the raw details if you want to use them programmatically
+    return {
+        "missing": missing,
+        "incomplete": incomplete,
+        "shape_inconsistent": shape_inconsistent,
+        "cross_model_diffs": cross_model_diffs,
+    }
+
+
+# ------------------------------------------------------------
+# Config you can tweak
+# ------------------------------------------------------------
+MODELS = ["deepsad", "isoforest", "ocsvm"]
+EVALS = ["exp_based", "manual_based"]
+
+SCHEMA_STATIC = {
+    # identifiers / dims
+    "network": pl.Utf8,  # e.g. "subter_LeNet", "subter_efficient"
+    "latent_dim": pl.Int32,
+    "semi_normals": pl.Int32,
+    "semi_anomalous": pl.Int32,
+    "model": pl.Utf8,  # "deepsad" | "isoforest" | "ocsvm"
+    "eval": pl.Utf8,  # "exp_based" | "manual_based"
+    "fold": pl.Int32,
+    # metrics
+    "auc": pl.Float64,
+    "ap": pl.Float64,
+    # per-sample scores: list of (idx, label, score)
+    "scores": pl.List(
+        pl.Struct(
+            {
+                "sample_idx": pl.Int32,  # dataloader idx
+                "orig_label": pl.Int8,  # {-1,0,1}
+                "score": pl.Float64,  # anomaly score
+            }
+        )
+    ),
+    # curves (normalized)
+    "roc_curve": pl.Struct(
+        {
+            "fpr": pl.List(pl.Float64),
+            "tpr": pl.List(pl.Float64),
+            "thr": pl.List(pl.Float64),
+        }
+    ),
+    "prc_curve": pl.Struct(
+        {
+            "precision": pl.List(pl.Float64),
+            "recall": pl.List(pl.Float64),
+            "thr": pl.List(pl.Float64),  # may be len(precision)-1
+        }
+    ),
+    # deepsad-only per-eval arrays (None for other models)
+    "sample_indices": pl.List(pl.Int32),
+    "sample_labels": pl.List(pl.Int8),
+    "valid_mask": pl.List(pl.Boolean),
+    # timings / housekeeping
+    "train_time": pl.Float64,
+    "test_time": pl.Float64,
+    "folder": pl.Utf8,
+    "k_fold_num": pl.Int32,
+}
+
+
+# ------------------------------------------------------------
+# Helpers: curve/scores normalizers (tuples/ndarrays -> dict/list)
+# ------------------------------------------------------------
+def _tolist(x):
+    if x is None:
+        return None
+    if isinstance(x, np.ndarray):
+        return x.tolist()
+    if isinstance(x, (list, tuple)):
+        return list(x)
+    # best-effort scalar wrap
+    try:
+        return [x]
+    except Exception:
+        return None
+
+
+def normalize_roc(obj: Any) -> Optional[dict]:
+    if obj is None:
+        return None
+    fpr = tpr = thr = None
+    if isinstance(obj, (tuple, list)):
+        if len(obj) >= 2:
+            fpr, tpr = _tolist(obj[0]), _tolist(obj[1])
+        if len(obj) >= 3:
+            thr = _tolist(obj[2])
+    elif isinstance(obj, dict):
+        fpr = _tolist(obj.get("fpr") or obj.get("x"))
+        tpr = _tolist(obj.get("tpr") or obj.get("y"))
+        thr = _tolist(obj.get("thr") or obj.get("thresholds"))
+    else:
+        return None
+    if fpr is None or tpr is None:
+        return None
+    return {"fpr": fpr, "tpr": tpr, "thr": thr}
+
+
+def normalize_prc(obj: Any) -> Optional[dict]:
+    if obj is None:
+        return None
+    precision = recall = thr = None
+    if isinstance(obj, (tuple, list)):
+        if len(obj) >= 2:
+            precision, recall = _tolist(obj[0]), _tolist(obj[1])
+        if len(obj) >= 3:
+            thr = _tolist(obj[2])
+    elif isinstance(obj, dict):
+        precision = _tolist(obj.get("precision") or obj.get("y"))
+        recall = _tolist(obj.get("recall") or obj.get("x"))
+        thr = _tolist(obj.get("thr") or obj.get("thresholds"))
+    else:
+        return None
+    if precision is None or recall is None:
+        return None
+    return {"precision": precision, "recall": recall, "thr": thr}
+
+
+def normalize_scores_to_struct(seq) -> Optional[List[dict]]:
+    """
+    Input: list of (idx, label, score) tuples (as produced in your test()).
+    Output: list of dicts with keys sample_idx, orig_label, score.
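+
+    Example (hypothetical values):
+        [(3, 1, 0.92), (7, -1, 0.15)]
+        -> [{"sample_idx": 3, "orig_label": 1, "score": 0.92},
+            {"sample_idx": 7, "orig_label": -1, "score": 0.15}]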
+    """
+    if seq is None:
+        return None
+    if isinstance(seq, np.ndarray):
+        seq = seq.tolist()
+    if not isinstance(seq, (list, tuple)):
+        return None
+    out: List[dict] = []
+    for item in seq:
+        if isinstance(item, (list, tuple)) and len(item) >= 3:
+            idx, lab, sc = item[0], item[1], item[2]
+            out.append(
+                {
+                    "sample_idx": None if idx is None else int(idx),
+                    "orig_label": None if lab is None else int(lab),
+                    "score": None if sc is None else float(sc),
+                }
+            )
+        else:
+            # fallback: single numeric -> score
+            sc = (
+                float(item)
+                if isinstance(item, (int, float, np.integer, np.floating))
+                else None
+            )
+            out.append({"sample_idx": None, "orig_label": None, "score": sc})
+    return out
+
+
+def normalize_int_list(a) -> Optional[List[int]]:
+    if a is None:
+        return None
+    if isinstance(a, np.ndarray):
+        a = a.tolist()
+    return list(a)
+
+
+def normalize_bool_list(a) -> Optional[List[bool]]:
+    if a is None:
+        return None
+    if isinstance(a, np.ndarray):
+        a = a.tolist()
+    return [bool(x) for x in a]
+
+
+# ------------------------------------------------------------
+# Low-level: read one experiment folder
+# ------------------------------------------------------------
+def read_config(exp_dir: Path) -> dict:
+    cfg = exp_dir / "config.json"
+    with cfg.open("r") as f:
+        c = json.load(f)
+    if not c.get("k_fold"):
+        raise ValueError(f"{exp_dir.name}: not trained as k-fold")
+    return c
+
+
+def read_pickle(p: Path) -> Any:
+    with p.open("rb") as f:
+        return pickle.load(f)
+
+
+# ------------------------------------------------------------
+# Extractors for each model
+# ------------------------------------------------------------
+def rows_from_deepsad(data: dict, evals: List[str]) -> Dict[str, dict]:
+    """
+    deepsad results live under data['test'][eval], with extra per-eval arrays
+    and AP present.
+    """
+    out: Dict[str, dict] = {}
+    test = data.get("test", {})
+    for ev in evals:
+        evd = test.get(ev)
+        if not isinstance(evd, dict):
+            continue
+        out[ev] = {
+            "auc": float(evd["auc"])
+            if "auc" in evd and evd["auc"] is not None
+            else None,
+            "roc": normalize_roc(evd.get("roc")),
+            "prc": normalize_prc(evd.get("prc")),
+            "ap": float(evd["ap"]) if "ap" in evd and evd["ap"] is not None else None,
+            "scores": normalize_scores_to_struct(evd.get("scores")),
+            "sample_indices": normalize_int_list(evd.get("indices")),
+            "sample_labels": normalize_int_list(evd.get("labels")),
+            "valid_mask": normalize_bool_list(evd.get("valid_mask")),
+            "train_time": data.get("train", {}).get("time"),
+            "test_time": test.get("time"),
+        }
+    return out
+
+
+def rows_from_isoforest(data: dict, evals: List[str]) -> Dict[str, dict]:
+    """
+    Keys: test_auc_{eval}, test_roc_{eval}, test_prc_{eval}, test_ap_{eval},
+    test_scores_{eval}.
+    """
+    out: Dict[str, dict] = {}
+    for ev in evals:
+        auc = data.get(f"test_auc_{ev}")
+        if auc is None:
+            continue
+        out[ev] = {
+            "auc": float(auc),
+            "roc": normalize_roc(data.get(f"test_roc_{ev}")),
+            "prc": normalize_prc(data.get(f"test_prc_{ev}")),
+            "ap": float(data.get(f"test_ap_{ev}"))
+            if data.get(f"test_ap_{ev}") is not None
+            else None,
+            "scores": normalize_scores_to_struct(data.get(f"test_scores_{ev}")),
+            "sample_indices": None,
+            "sample_labels": None,
+            "valid_mask": None,
+            "train_time": data.get("train_time"),
+            "test_time": data.get("test_time"),
+        }
+    return out
+
+
+def rows_from_ocsvm_default(data: dict, evals: List[str]) -> Dict[str, dict]:
+    """
+    Default OCSVM only (ignore the linear variant entirely).
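+    Expects the same flat test_*_{eval} key layout as the isoforest results
+    (test_auc_, test_roc_, test_prc_, test_ap_, test_scores_).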
+    """
+    out: Dict[str, dict] = {}
+    for ev in evals:
+        auc = data.get(f"test_auc_{ev}")
+        if auc is None:
+            continue
+        out[ev] = {
+            "auc": float(auc),
+            "roc": normalize_roc(data.get(f"test_roc_{ev}")),
+            "prc": normalize_prc(data.get(f"test_prc_{ev}")),
+            "ap": float(data.get(f"test_ap_{ev}"))
+            if data.get(f"test_ap_{ev}") is not None
+            else None,
+            "scores": normalize_scores_to_struct(data.get(f"test_scores_{ev}")),
+            "sample_indices": None,
+            "sample_labels": None,
+            "valid_mask": None,
+            "train_time": data.get("train_time"),
+            "test_time": data.get("test_time"),
+        }
+    return out
+
+
+# ------------------------------------------------------------
+# Build the Polars DataFrame
+# ------------------------------------------------------------
+def build_results_frame(root: Path) -> pl.DataFrame:
+    """
+    Walks experiment subdirs under `root` and appends one row per
+    (model, fold, eval) result found. Columns (SCHEMA_STATIC):
+        network, latent_dim, semi_normals, semi_anomalous,
+        model, eval, fold,
+        auc, ap, scores{sample_idx,orig_label,score},
+        roc_curve{fpr,tpr,thr}, prc_curve{precision,recall,thr},
+        sample_indices, sample_labels, valid_mask,
+        train_time, test_time,
+        folder, k_fold_num
+    """
+    rows: List[dict] = []
+
+    exp_dirs = [p for p in root.iterdir() if p.is_dir()]
+    for exp_dir in sorted(exp_dirs):
+        try:
+            cfg = read_config(exp_dir)
+        except Exception as e:
+            print(f"[warn] skipping {exp_dir.name}: {e}")
+            continue
+
+        network = cfg.get("net_name")
+        latent_dim = int(cfg.get("latent_space_dim"))
+        semi_normals = int(cfg.get("num_known_normal"))
+        semi_anomalous = int(cfg.get("num_known_outlier"))
+        k = int(cfg.get("k_fold_num"))
+
+        for model in MODELS:
+            for fold in range(k):
+                pkl = exp_dir / f"results_{model}_{fold}.pkl"
+                if not pkl.exists():
+                    continue
+
+                try:
+                    data = read_pickle(pkl)
+                except Exception as e:
+                    print(f"[warn] failed to read {pkl.name}: {e}")
+                    continue
+
+                if model == "deepsad":
+                    per_eval = rows_from_deepsad(data, EVALS)  # eval -> dict
+                elif model == "isoforest":
+                    per_eval = rows_from_isoforest(data, EVALS)  # eval -> dict
+                elif model == "ocsvm":
+                    per_eval = rows_from_ocsvm_default(data, EVALS)  # eval -> dict
+                else:
+                    per_eval = {}
+
+                for ev, vals in per_eval.items():
+                    rows.append(
+                        {
+                            "network": network,
+                            "latent_dim": latent_dim,
+                            "semi_normals": semi_normals,
+                            "semi_anomalous": semi_anomalous,
+                            "model": model,
+                            "eval": ev,
+                            "fold": fold,
+                            "auc": vals["auc"],
+                            "ap": vals["ap"],
+                            "scores": vals["scores"],
+                            "roc_curve": vals["roc"],
+                            "prc_curve": vals["prc"],
+                            "sample_indices": vals.get("sample_indices"),
+                            "sample_labels": vals.get("sample_labels"),
+                            "valid_mask": vals.get("valid_mask"),
+                            "train_time": vals["train_time"],
+                            "test_time": vals["test_time"],
+                            "folder": str(exp_dir),
+                            "k_fold_num": k,
+                        }
+                    )
+
+    # If empty, return a typed empty frame
+    if not rows:
+        return pl.DataFrame(schema=SCHEMA_STATIC)
+
+    df = pl.DataFrame(rows, schema=SCHEMA_STATIC)
+
+    # Cast to efficient dtypes (categoricals etc.) – no extra sanitation
+    df = df.with_columns(
+        pl.col("network", "model", "eval").cast(pl.Categorical),
+        pl.col(
+            "latent_dim", "semi_normals", "semi_anomalous", "fold", "k_fold_num"
+        ).cast(pl.Int32),
+        pl.col("auc", "ap", "train_time", "test_time").cast(pl.Float64),
+        # NOTE: no cast on 'scores' here; it's already List(Struct) per schema.
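+        # Categorical dictionary-encodes the repeated identifier strings,
+        # keeping the frame compact and making group_by/filter on them cheaper.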
+    )
+
+    return df
+
+
+# ------------------------------------------------------------
+# Example “analysis-ready” queries (Polars idioms)
+# ------------------------------------------------------------
+def demo_queries(df: pl.DataFrame):
+    # q1: lazy is fine, then collect
+    q1 = (
+        df.lazy()
+        .filter(
+            (pl.col("network") == "subter_LeNet")
+            & (pl.col("latent_dim") == 1024)
+            & (pl.col("semi_normals") == 0)
+            & (pl.col("semi_anomalous") == 0)
+            & (pl.col("eval") == "exp_based")
+        )
+        .group_by(["model"])
+        .agg(pl.col("auc").mean().alias("mean_auc"))
+        .sort(["mean_auc"], descending=True)
+        .collect()
+    )
+
+    # q2: do the filtering eagerly, then pivot (LazyFrame has no .pivot)
+    base = df.filter(
+        (pl.col("model") == "deepsad")
+        & (pl.col("eval") == "exp_based")
+        & (pl.col("network") == "subter_LeNet")
+        & (pl.col("semi_normals") == 0)
+        & (pl.col("semi_anomalous") == 0)
+    ).select("fold", "latent_dim", "auc")
+    q2 = base.pivot(
+        values="auc",
+        index="fold",
+        on="latent_dim",  # named `columns` before polars 1.0
+        aggregate_function="first",  # or "mean" if duplicates exist
+    ).sort("fold")
+
+    # roc_subset: eager filter/select, then explode struct fields
+    roc_subset = (
+        df.filter(
+            (pl.col("model") == "ocsvm")
+            & (pl.col("eval") == "manual_based")
+            & (pl.col("network") == "subter_efficient")
+            & (pl.col("latent_dim") == 1024)
+            & (pl.col("semi_normals") == 0)
+            & (pl.col("semi_anomalous") == 0)
+        )
+        .select("fold", "roc_curve")
+        .with_columns(
+            pl.col("roc_curve").struct.field("fpr").alias("fpr"),
+            pl.col("roc_curve").struct.field("tpr").alias("tpr"),
+            pl.col("roc_curve").struct.field("thr").alias("thr"),
+        )
+    )
+
+    return q1, q2, roc_subset
+
+
+def main():
+    root = Path("/home/fedex/mt/results/done")
+    df = build_results_frame(root)
+    q1, q2, roc_subset = demo_queries(df)
+    print(df.shape, df.head())
+    # sanity-check grid coverage and result shapes
+    report = check_grid_coverage_and_shapes(df)
+    print(report)
+
+
+if __name__ == "__main__":
+    main()
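+
+# Usage: adjust `root` in main() to your results directory, then run
+#   python tools/load_results.py
+# which prints the frame's shape, a head() preview, and the coverage report.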