from __future__ import annotations import json import pickle from itertools import product from pathlib import Path from typing import Any, Dict, List, Optional import numpy as np import polars as pl # --- configure your intended grid here (use the *canonical* strings used in df) --- NETWORKS_EXPECTED = ["subter_LeNet", "subter_efficient"] LATENT_DIMS_EXPECTED = [32, 64, 128, 256, 512, 768, 1024] SEMI_LABELS_EXPECTED = [(0, 0), (50, 10), (500, 100)] MODELS_EXPECTED = ["deepsad", "isoforest", "ocsvm"] EVALS_EXPECTED = ["exp_based", "manual_based"] # If k-fold is uniform, set it. If None, we infer it *per combo* from df. EXPECTED_K_FOLD: int | None = None # e.g., 3 def add_shape_columns(df: pl.DataFrame) -> pl.DataFrame: return df.with_columns( # ROC lens roc_fpr_len=pl.when(pl.col("roc_curve").is_null()) .then(None) .otherwise(pl.col("roc_curve").struct.field("fpr").list.len()), roc_tpr_len=pl.when(pl.col("roc_curve").is_null()) .then(None) .otherwise(pl.col("roc_curve").struct.field("tpr").list.len()), roc_thr_len=pl.when(pl.col("roc_curve").is_null()) .then(None) .otherwise(pl.col("roc_curve").struct.field("thr").list.len()), # PRC lens prc_prec_len=pl.when(pl.col("prc_curve").is_null()) .then(None) .otherwise(pl.col("prc_curve").struct.field("precision").list.len()), prc_rec_len=pl.when(pl.col("prc_curve").is_null()) .then(None) .otherwise(pl.col("prc_curve").struct.field("recall").list.len()), prc_thr_len=pl.when(pl.col("prc_curve").is_null()) .then(None) .otherwise(pl.col("prc_curve").struct.field("thr").list.len()), # scores lens scores_len=pl.when(pl.col("scores").is_null()) .then(None) .otherwise(pl.col("scores").list.len()), # deepsad-only arrays (None for others) idxs_len=pl.when(pl.col("sample_indices").is_null()) .then(None) .otherwise(pl.col("sample_indices").list.len()), labels_len=pl.when(pl.col("sample_labels").is_null()) .then(None) .otherwise(pl.col("sample_labels").list.len()), vmask_len=pl.when(pl.col("valid_mask").is_null()) .then(None) .otherwise(pl.col("valid_mask").list.len()), ) def check_grid_coverage_and_shapes( df: pl.DataFrame, networks=NETWORKS_EXPECTED, latent_dims=LATENT_DIMS_EXPECTED, semi_labels=SEMI_LABELS_EXPECTED, models=MODELS_EXPECTED, evals=EVALS_EXPECTED, expected_k_fold: int | None = EXPECTED_K_FOLD, ): dfx = add_shape_columns(df) # helper: get rows for a specific base combo def subframe(net, lat, s_norm, s_anom, mdl, ev): return dfx.filter( (pl.col("network") == net) & (pl.col("latent_dim") == lat) & (pl.col("semi_normals") == s_norm) & (pl.col("semi_anomalous") == s_anom) & (pl.col("model") == mdl) & (pl.col("eval") == ev) ) missing = [] incomplete = [] # (combo, expected_folds, present_folds) shape_inconsistent = [] # (combo, metric_name, values_by_fold) cross_model_diffs = [] # (net, lat, semi, ev, metric_name, shapes_by_model) # 1) Coverage + within-combo shapes for net, lat, (s_norm, s_anom), mdl, ev in product( networks, latent_dims, semi_labels, models, evals ): sf = subframe(net, lat, s_norm, s_anom, mdl, ev).select( "fold", "k_fold_num", "scores_len", "roc_fpr_len", "roc_tpr_len", "roc_thr_len", "prc_prec_len", "prc_rec_len", "prc_thr_len", "idxs_len", "labels_len", "vmask_len", ) if sf.height == 0: missing.append( dict( network=net, latent_dim=lat, semi_normals=s_norm, semi_anomalous=s_anom, model=mdl, eval=ev, ) ) continue # folds present vs expected folds_present = sorted(sf.get_column("fold").unique().to_list()) if expected_k_fold is not None: kexp = expected_k_fold else: # infer from rows (take max k_fold_num within this combo) kexp = int(sf.get_column("k_fold_num").max()) all_expected_folds = list(range(kexp)) if folds_present != all_expected_folds: incomplete.append( dict( network=net, latent_dim=lat, semi_normals=s_norm, semi_anomalous=s_anom, model=mdl, eval=ev, expected_folds=all_expected_folds, present_folds=folds_present, ) ) # shape consistency across folds (for this combo) # collect distinct values per metric shape_cols = [ "scores_len", "roc_fpr_len", "roc_tpr_len", "roc_thr_len", "prc_prec_len", "prc_rec_len", "prc_thr_len", "idxs_len", "labels_len", "vmask_len", ] for colname in shape_cols: vals = sf.select(colname).to_series() uniq = sorted({v for v in vals.to_list()}) # Allow None-only columns (e.g., deepsad-only fields for other models) if len([u for u in uniq if u is not None]) > 1: # store per-fold values to help debug per_fold = ( sf.select("fold", pl.col(colname)) .sort("fold") .to_dict(as_series=False) ) shape_inconsistent.append( dict( network=net, latent_dim=lat, semi_normals=s_norm, semi_anomalous=s_anom, model=mdl, eval=ev, metric=colname, per_fold=per_fold, ) ) # 2) Cross-model comparability at fixed (net,lat,semi,eval) # We compare shapes that *should* logically match across models: # - scores_len (same number of test samples) # - idxs/labels/vmask (only deepsad fills them; we tolerate None elsewhere) # ROC/PRC binning can differ across models; we *report* those differences for awareness. base_keys = ( df.select("network", "latent_dim", "semi_normals", "semi_anomalous", "eval") .unique() .iter_rows() ) for net, lat, s_norm, s_anom, ev in base_keys: rows = ( dfx.filter( (pl.col("network") == net) & (pl.col("latent_dim") == lat) & (pl.col("semi_normals") == s_norm) & (pl.col("semi_anomalous") == s_anom) & (pl.col("eval") == ev) ) .group_by("model") .agg( pl.col("scores_len").unique().alias("scores_len_set"), pl.col("idxs_len").unique().alias("idxs_len_set"), pl.col("labels_len").unique().alias("labels_len_set"), pl.col("vmask_len").unique().alias("vmask_len_set"), pl.col("roc_fpr_len").unique().alias("roc_fpr_len_set"), pl.col("prc_prec_len").unique().alias("prc_prec_len_set"), ) .to_dict(as_series=False) ) if not rows: continue # normalize sets mdls = rows["model"] s_sets = [set(x) for x in rows["scores_len_set"]] # compare scores_len across models (ignore None values) s_normed = [tuple(sorted([v for v in s if v is not None])) for s in s_sets] if len(set(s_normed)) > 1: cross_model_diffs.append( dict( network=net, latent_dim=lat, semi_normals=s_norm, semi_anomalous=s_anom, eval=ev, metric="scores_len", by_model={m: sorted(list(s_sets[i])) for i, m in enumerate(mdls)}, ) ) # Report ROC/PRC binning diffs (expected) roc_sets = [set(x) for x in rows["roc_fpr_len_set"]] if len(set(tuple(sorted(ss)) for ss in roc_sets)) > 1: cross_model_diffs.append( dict( network=net, latent_dim=lat, semi_normals=s_norm, semi_anomalous=s_anom, eval=ev, metric="roc_fpr_len", by_model={m: sorted(list(roc_sets[i])) for i, m in enumerate(mdls)}, ) ) prc_sets = [set(x) for x in rows["prc_prec_len_set"]] if len(set(tuple(sorted(ss)) for ss in prc_sets)) > 1: cross_model_diffs.append( dict( network=net, latent_dim=lat, semi_normals=s_norm, semi_anomalous=s_anom, eval=ev, metric="prc_prec_len", by_model={m: sorted(list(prc_sets[i])) for i, m in enumerate(mdls)}, ) ) # --- Print a readable report --- print("\n=== GRID COVERAGE ===") print(f"Missing combos: {len(missing)}") for m in missing[:20]: print(" ", m) if len(missing) > 20: print(f" ... (+{len(missing) - 20} more)") print("\nIncomplete combos (folds missing):", len(incomplete)) for inc in incomplete[:20]: print( " ", { k: inc[k] for k in [ "network", "latent_dim", "semi_normals", "semi_anomalous", "model", "eval", ] }, "expected", inc["expected_folds"], "present", inc["present_folds"], ) if len(incomplete) > 20: print(f" ... (+{len(incomplete) - 20} more)") print("\n=== WITHIN-COMBO SHAPE CONSISTENCY (across folds) ===") print(f"Mismatching groups: {len(shape_inconsistent)}") for s in shape_inconsistent[:15]: hdr = { k: s[k] for k in [ "network", "latent_dim", "semi_normals", "semi_anomalous", "model", "eval", "metric", ] } print(" ", hdr, "values:", s["per_fold"]) if len(shape_inconsistent) > 15: print(f" ... (+{len(shape_inconsistent) - 15} more)") print("\n=== CROSS-MODEL COMPARABILITY (by shape) ===") print( f"Shape differences across models at fixed (net,lat,semi,eval): {len(cross_model_diffs)}" ) for s in cross_model_diffs[:15]: hdr = { k: s[k] for k in [ "network", "latent_dim", "semi_normals", "semi_anomalous", "eval", "metric", ] } print(" ", hdr, "by_model:", s["by_model"]) if len(cross_model_diffs) > 15: print(f" ... (+{len(cross_model_diffs) - 15} more)") # Return the raw details if you want to use them programmatically return { "missing": missing, "incomplete": incomplete, "shape_inconsistent": shape_inconsistent, "cross_model_diffs": cross_model_diffs, } # ------------------------------------------------------------ # Config you can tweak # ------------------------------------------------------------ MODELS = ["deepsad", "isoforest", "ocsvm"] EVALS = ["exp_based", "manual_based"] SCHEMA_STATIC = { # identifiers / dims "network": pl.Utf8, # e.g. "LeNet", "efficient" "latent_dim": pl.Int32, "semi_normals": pl.Int32, "semi_anomalous": pl.Int32, "model": pl.Utf8, # "deepsad" | "isoforest" | "ocsvm" "eval": pl.Utf8, # "exp_based" | "manual_based" "fold": pl.Int32, # metrics "auc": pl.Float64, "ap": pl.Float64, # per-sample scores: list of (idx, label, score) "scores": pl.List( pl.Struct( { "sample_idx": pl.Int32, # dataloader idx "orig_label": pl.Int8, # {-1,0,1} "score": pl.Float64, # anomaly score } ) ), # curves (normalized) "roc_curve": pl.Struct( { "fpr": pl.List(pl.Float64), "tpr": pl.List(pl.Float64), "thr": pl.List(pl.Float64), } ), "prc_curve": pl.Struct( { "precision": pl.List(pl.Float64), "recall": pl.List(pl.Float64), "thr": pl.List(pl.Float64), # may be len(precision)-1 } ), # deepsad-only per-eval arrays (None for other models) "sample_indices": pl.List(pl.Int32), "sample_labels": pl.List(pl.Int8), "valid_mask": pl.List(pl.Boolean), # timings / housekeeping "train_time": pl.Float64, "test_time": pl.Float64, "folder": pl.Utf8, "k_fold_num": pl.Int32, } # ------------------------------------------------------------ # Helpers: curve/scores normalizers (tuples/ndarrays -> dict/list) # ------------------------------------------------------------ def _tolist(x): if x is None: return None if isinstance(x, np.ndarray): return x.tolist() if isinstance(x, (list, tuple)): return list(x) # best-effort scalar wrap try: return [x] except Exception: return None def normalize_roc(obj: Any) -> Optional[dict]: if obj is None: return None fpr = tpr = thr = None if isinstance(obj, (tuple, list)): if len(obj) >= 2: fpr, tpr = _tolist(obj[0]), _tolist(obj[1]) if len(obj) >= 3: thr = _tolist(obj[2]) elif isinstance(obj, dict): fpr = _tolist(obj.get("fpr") or obj.get("x")) tpr = _tolist(obj.get("tpr") or obj.get("y")) thr = _tolist(obj.get("thr") or obj.get("thresholds")) else: return None if fpr is None or tpr is None: return None return {"fpr": fpr, "tpr": tpr, "thr": thr} def normalize_prc(obj: Any) -> Optional[dict]: if obj is None: return None precision = recall = thr = None if isinstance(obj, (tuple, list)): if len(obj) >= 2: precision, recall = _tolist(obj[0]), _tolist(obj[1]) if len(obj) >= 3: thr = _tolist(obj[2]) elif isinstance(obj, dict): precision = _tolist(obj.get("precision") or obj.get("y")) recall = _tolist(obj.get("recall") or obj.get("x")) thr = _tolist(obj.get("thr") or obj.get("thresholds")) else: return None if precision is None or recall is None: return None return {"precision": precision, "recall": recall, "thr": thr} def normalize_scores_to_struct(seq) -> Optional[List[dict]]: """ Input: list of (idx, label, score) tuples (as produced in your test()). Output: list of dicts with keys sample_idx, orig_label, score. """ if seq is None: return None if isinstance(seq, np.ndarray): seq = seq.tolist() if not isinstance(seq, (list, tuple)): return None out: List[dict] = [] for item in seq: if isinstance(item, (list, tuple)) and len(item) >= 3: idx, lab, sc = item[0], item[1], item[2] out.append( { "sample_idx": None if idx is None else int(idx), "orig_label": None if lab is None else int(lab), "score": None if sc is None else float(sc), } ) else: # fallback: single numeric -> score sc = ( float(item) if isinstance(item, (int, float, np.integer, np.floating)) else None ) out.append({"sample_idx": None, "orig_label": None, "score": sc}) return out def normalize_int_list(a) -> Optional[List[int]]: if a is None: return None if isinstance(a, np.ndarray): a = a.tolist() return list(a) def normalize_bool_list(a) -> Optional[List[bool]]: if a is None: return None if isinstance(a, np.ndarray): a = a.tolist() return [bool(x) for x in a] # ------------------------------------------------------------ # Low-level: read one experiment folder # ------------------------------------------------------------ def read_config(exp_dir: Path) -> dict: cfg = exp_dir / "config.json" with cfg.open("r") as f: c = json.load(f) if not c.get("k_fold"): raise ValueError(f"{exp_dir.name}: not trained as k-fold") return c def read_pickle(p: Path) -> Any: with p.open("rb") as f: return pickle.load(f) # ------------------------------------------------------------ # Extractors for each model # ------------------------------------------------------------ def rows_from_deepsad(data: dict, evals: List[str]) -> Dict[str, dict]: """ deepsad under data['test'][eval], with extra per-eval arrays and AP present. """ out: Dict[str, dict] = {} test = data.get("test", {}) for ev in evals: evd = test.get(ev) if not isinstance(evd, dict): continue out[ev] = { "auc": float(evd["auc"]) if "auc" in evd and evd["auc"] is not None else None, "roc": normalize_roc(evd.get("roc")), "prc": normalize_prc(evd.get("prc")), "ap": float(evd["ap"]) if "ap" in evd and evd["ap"] is not None else None, "scores": normalize_scores_to_struct(evd.get("scores")), "sample_indices": normalize_int_list(evd.get("indices")), "sample_labels": normalize_int_list(evd.get("labels")), "valid_mask": normalize_bool_list(evd.get("valid_mask")), "train_time": data.get("train", {}).get("time"), "test_time": test.get("time"), } return out def rows_from_isoforest(data: dict, evals: List[str]) -> Dict[str, dict]: """ Keys: test_auc_, test_roc_, test_prc_, test_ap_, test_scores_. """ out: Dict[str, dict] = {} for ev in evals: auc = data.get(f"test_auc_{ev}") if auc is None: continue out[ev] = { "auc": float(auc), "roc": normalize_roc(data.get(f"test_roc_{ev}")), "prc": normalize_prc(data.get(f"test_prc_{ev}")), "ap": float(data.get(f"test_ap_{ev}")) if data.get(f"test_ap_{ev}") is not None else None, "scores": normalize_scores_to_struct(data.get(f"test_scores_{ev}")), "sample_indices": None, "sample_labels": None, "valid_mask": None, "train_time": data.get("train_time"), "test_time": data.get("test_time"), } return out def rows_from_ocsvm_default(data: dict, evals: List[str]) -> Dict[str, dict]: """ Default OCSVM only (ignore linear variant entirely). """ out: Dict[str, dict] = {} for ev in evals: auc = data.get(f"test_auc_{ev}") if auc is None: continue out[ev] = { "auc": float(auc), "roc": normalize_roc(data.get(f"test_roc_{ev}")), "prc": normalize_prc(data.get(f"test_prc_{ev}")), "ap": float(data.get(f"test_ap_{ev}")) if data.get(f"test_ap_{ev}") is not None else None, "scores": normalize_scores_to_struct(data.get(f"test_scores_{ev}")), "sample_indices": None, "sample_labels": None, "valid_mask": None, "train_time": data.get("train_time"), "test_time": data.get("test_time"), } return out # ------------------------------------------------------------ # Build the Polars DataFrame # ------------------------------------------------------------ def build_results_frame(root: Path) -> pl.DataFrame: """ Walks experiment subdirs under `root`. For each (model, fold) it adds rows: Columns (SCHEMA_STATIC): network, latent_dim, semi_normals, semi_anomalous, model, eval, fold, auc, ap, scores{sample_idx,orig_label,score}, roc_curve{fpr,tpr,thr}, prc_curve{precision,recall,thr}, sample_indices, sample_labels, valid_mask, train_time, test_time, folder, k_fold_num """ rows: List[dict] = [] exp_dirs = [p for p in root.iterdir() if p.is_dir()] for exp_dir in sorted(exp_dirs): try: cfg = read_config(exp_dir) except Exception as e: print(f"[warn] skipping {exp_dir.name}: {e}") continue network = cfg.get("net_name") latent_dim = int(cfg.get("latent_space_dim")) semi_normals = int(cfg.get("num_known_normal")) semi_anomalous = int(cfg.get("num_known_outlier")) k = int(cfg.get("k_fold_num")) for model in MODELS: for fold in range(k): pkl = exp_dir / f"results_{model}_{fold}.pkl" if not pkl.exists(): continue try: data = read_pickle(pkl) except Exception as e: print(f"[warn] failed to read {pkl.name}: {e}") continue if model == "deepsad": per_eval = rows_from_deepsad(data, EVALS) # eval -> dict elif model == "isoforest": per_eval = rows_from_isoforest(data, EVALS) # eval -> dict elif model == "ocsvm": per_eval = rows_from_ocsvm_default(data, EVALS) # eval -> dict else: per_eval = {} for ev, vals in per_eval.items(): rows.append( { "network": network, "latent_dim": latent_dim, "semi_normals": semi_normals, "semi_anomalous": semi_anomalous, "model": model, "eval": ev, "fold": fold, "auc": vals["auc"], "ap": vals["ap"], "scores": vals["scores"], "roc_curve": vals["roc"], "prc_curve": vals["prc"], "sample_indices": vals.get("sample_indices"), "sample_labels": vals.get("sample_labels"), "valid_mask": vals.get("valid_mask"), "train_time": vals["train_time"], "test_time": vals["test_time"], "folder": str(exp_dir), "k_fold_num": k, } ) # If empty, return a typed empty frame if not rows: return pl.DataFrame(schema=SCHEMA_STATIC) df = pl.DataFrame(rows, schema=SCHEMA_STATIC) # Cast to efficient dtypes (categoricals etc.) – no extra sanitation df = df.with_columns( pl.col("network", "model", "eval").cast(pl.Categorical), pl.col( "latent_dim", "semi_normals", "semi_anomalous", "fold", "k_fold_num" ).cast(pl.Int32), pl.col("auc", "ap", "train_time", "test_time").cast(pl.Float64), # NOTE: no cast on 'scores' here; it's already List(Struct) per schema. ) return df # ------------------------------------------------------------ # Example “analysis-ready” queries (Polars idioms) # ------------------------------------------------------------ def demo_queries(df: pl.DataFrame): # q1: lazy is fine, then collect q1 = ( df.lazy() .filter( (pl.col("network") == "LeNet") & (pl.col("latent_dim") == 1024) & (pl.col("semi_normals") == 0) & (pl.col("semi_anomalous") == 0) & (pl.col("eval") == "exp_based") ) .group_by(["model"]) .agg(pl.col("auc").mean().alias("mean_auc")) .sort(["mean_auc"], descending=True) .collect() ) # q2: do the filtering eagerly, then pivot (LazyFrame has no .pivot) base = df.filter( (pl.col("model") == "deepsad") & (pl.col("eval") == "exp_based") & (pl.col("network") == "LeNet") & (pl.col("semi_normals") == 0) & (pl.col("semi_anomalous") == 0) ).select("fold", "latent_dim", "auc") q2 = base.pivot( values="auc", index="fold", columns="latent_dim", aggregate_function="first", # or "mean" if duplicates exist ).sort("fold") # roc_subset: eager filter/select, then explode struct fields roc_subset = ( df.filter( (pl.col("model") == "ocsvm") & (pl.col("eval") == "manual_based") & (pl.col("network") == "efficient") & (pl.col("latent_dim") == 1024) & (pl.col("semi_normals") == 0) & (pl.col("semi_anomalous") == 0) ) .select("fold", "roc_curve") .with_columns( pl.col("roc_curve").struct.field("fpr").alias("fpr"), pl.col("roc_curve").struct.field("tpr").alias("tpr"), pl.col("roc_curve").struct.field("thr").alias("thr"), ) ) return q1, q2, roc_subset def main(): root = Path("/home/fedex/mt/results/done") df = build_results_frame(root) q1, q2, roc_subset = demo_queries(df) print(df.shape, df.head()) # --- run it --- report = check_grid_coverage_and_shapes(df) print(report) if __name__ == "__main__": main()