data loading and plotting for results wip

2025-09-03 14:55:54 +02:00
parent 3d968c305c
commit ed80faf1e2
16 changed files with 2732 additions and 952 deletions
--- a/tools/load_results.py
+++ b/tools/load_results.py
@@ -2,338 +2,12 @@ from __future__ import annotations

 import json
 import pickle
-from itertools import product
 from pathlib import Path
 from typing import Any, Dict, List, Optional

 import numpy as np
 import polars as pl

-# --- configure your intended grid here (use the *canonical* strings used in df) ---
-NETWORKS_EXPECTED = ["subter_LeNet", "subter_efficient"]
-LATENT_DIMS_EXPECTED = [32, 64, 128, 256, 512, 768, 1024]
-SEMI_LABELS_EXPECTED = [(0, 0), (50, 10), (500, 100)]
-MODELS_EXPECTED = ["deepsad", "isoforest", "ocsvm"]
-EVALS_EXPECTED = ["exp_based", "manual_based"]
-
-# If k-fold is uniform, set it. If None, we infer it *per combo* from df.
-EXPECTED_K_FOLD: int | None = None  # e.g., 3
-
-
-def add_shape_columns(df: pl.DataFrame) -> pl.DataFrame:
-    return df.with_columns(
-        # ROC lens
-        roc_fpr_len=pl.when(pl.col("roc_curve").is_null())
-        .then(None)
-        .otherwise(pl.col("roc_curve").struct.field("fpr").list.len()),
-        roc_tpr_len=pl.when(pl.col("roc_curve").is_null())
-        .then(None)
-        .otherwise(pl.col("roc_curve").struct.field("tpr").list.len()),
-        roc_thr_len=pl.when(pl.col("roc_curve").is_null())
-        .then(None)
-        .otherwise(pl.col("roc_curve").struct.field("thr").list.len()),
-        # PRC lens
-        prc_prec_len=pl.when(pl.col("prc_curve").is_null())
-        .then(None)
-        .otherwise(pl.col("prc_curve").struct.field("precision").list.len()),
-        prc_rec_len=pl.when(pl.col("prc_curve").is_null())
-        .then(None)
-        .otherwise(pl.col("prc_curve").struct.field("recall").list.len()),
-        prc_thr_len=pl.when(pl.col("prc_curve").is_null())
-        .then(None)
-        .otherwise(pl.col("prc_curve").struct.field("thr").list.len()),
-        # scores lens
-        scores_len=pl.when(pl.col("scores").is_null())
-        .then(None)
-        .otherwise(pl.col("scores").list.len()),
-        # deepsad-only arrays (None for others)
-        idxs_len=pl.when(pl.col("sample_indices").is_null())
-        .then(None)
-        .otherwise(pl.col("sample_indices").list.len()),
-        labels_len=pl.when(pl.col("sample_labels").is_null())
-        .then(None)
-        .otherwise(pl.col("sample_labels").list.len()),
-        vmask_len=pl.when(pl.col("valid_mask").is_null())
-        .then(None)
-        .otherwise(pl.col("valid_mask").list.len()),
-    )
-
-
-def check_grid_coverage_and_shapes(
-    df: pl.DataFrame,
-    networks=NETWORKS_EXPECTED,
-    latent_dims=LATENT_DIMS_EXPECTED,
-    semi_labels=SEMI_LABELS_EXPECTED,
-    models=MODELS_EXPECTED,
-    evals=EVALS_EXPECTED,
-    expected_k_fold: int | None = EXPECTED_K_FOLD,
-):
-    dfx = add_shape_columns(df)
-
-    # helper: get rows for a specific base combo
-    def subframe(net, lat, s_norm, s_anom, mdl, ev):
-        return dfx.filter(
-            (pl.col("network") == net)
-            & (pl.col("latent_dim") == lat)
-            & (pl.col("semi_normals") == s_norm)
-            & (pl.col("semi_anomalous") == s_anom)
-            & (pl.col("model") == mdl)
-            & (pl.col("eval") == ev)
-        )
-
-    missing = []
-    incomplete = []  # (combo, expected_folds, present_folds)
-    shape_inconsistent = []  # (combo, metric_name, values_by_fold)
-    cross_model_diffs = []  # (net, lat, semi, ev, metric_name, shapes_by_model)
-
-    # 1) Coverage + within-combo shapes
-    for net, lat, (s_norm, s_anom), mdl, ev in product(
-        networks, latent_dims, semi_labels, models, evals
-    ):
-        sf = subframe(net, lat, s_norm, s_anom, mdl, ev).select(
-            "fold",
-            "k_fold_num",
-            "scores_len",
-            "roc_fpr_len",
-            "roc_tpr_len",
-            "roc_thr_len",
-            "prc_prec_len",
-            "prc_rec_len",
-            "prc_thr_len",
-            "idxs_len",
-            "labels_len",
-            "vmask_len",
-        )
-
-        if sf.height == 0:
-            missing.append(
-                dict(
-                    network=net,
-                    latent_dim=lat,
-                    semi_normals=s_norm,
-                    semi_anomalous=s_anom,
-                    model=mdl,
-                    eval=ev,
-                )
-            )
-            continue
-
-        # folds present vs expected
-        folds_present = sorted(sf.get_column("fold").unique().to_list())
-        if expected_k_fold is not None:
-            kexp = expected_k_fold
-        else:
-            # infer from rows (take max k_fold_num within this combo)
-            kexp = int(sf.get_column("k_fold_num").max())
-        all_expected_folds = list(range(kexp))
-        if folds_present != all_expected_folds:
-            incomplete.append(
-                dict(
-                    network=net,
-                    latent_dim=lat,
-                    semi_normals=s_norm,
-                    semi_anomalous=s_anom,
-                    model=mdl,
-                    eval=ev,
-                    expected_folds=all_expected_folds,
-                    present_folds=folds_present,
-                )
-            )
-
-        # shape consistency across folds (for this combo)
-        # collect distinct values per metric
-        shape_cols = [
-            "scores_len",
-            "roc_fpr_len",
-            "roc_tpr_len",
-            "roc_thr_len",
-            "prc_prec_len",
-            "prc_rec_len",
-            "prc_thr_len",
-            "idxs_len",
-            "labels_len",
-            "vmask_len",
-        ]
-        for colname in shape_cols:
-            vals = sf.select(colname).to_series()
-            uniq = sorted({v for v in vals.to_list()})
-            # Allow None-only columns (e.g., deepsad-only fields for other models)
-            if len([u for u in uniq if u is not None]) > 1:
-                # store per-fold values to help debug
-                per_fold = (
-                    sf.select("fold", pl.col(colname))
-                    .sort("fold")
-                    .to_dict(as_series=False)
-                )
-                shape_inconsistent.append(
-                    dict(
-                        network=net,
-                        latent_dim=lat,
-                        semi_normals=s_norm,
-                        semi_anomalous=s_anom,
-                        model=mdl,
-                        eval=ev,
-                        metric=colname,
-                        per_fold=per_fold,
-                    )
-                )
-
-    # 2) Cross-model comparability at fixed (net,lat,semi,eval)
-    # We compare shapes that *should* logically match across models:
-    #   - scores_len (same number of test samples)
-    #   - idxs/labels/vmask (only deepsad fills them; we tolerate None elsewhere)
-    # ROC/PRC binning can differ across models; we *report* those differences for awareness.
-    base_keys = (
-        df.select("network", "latent_dim", "semi_normals", "semi_anomalous", "eval")
-        .unique()
-        .iter_rows()
-    )
-    for net, lat, s_norm, s_anom, ev in base_keys:
-        rows = (
-            dfx.filter(
-                (pl.col("network") == net)
-                & (pl.col("latent_dim") == lat)
-                & (pl.col("semi_normals") == s_norm)
-                & (pl.col("semi_anomalous") == s_anom)
-                & (pl.col("eval") == ev)
-            )
-            .group_by("model")
-            .agg(
-                pl.col("scores_len").unique().alias("scores_len_set"),
-                pl.col("idxs_len").unique().alias("idxs_len_set"),
-                pl.col("labels_len").unique().alias("labels_len_set"),
-                pl.col("vmask_len").unique().alias("vmask_len_set"),
-                pl.col("roc_fpr_len").unique().alias("roc_fpr_len_set"),
-                pl.col("prc_prec_len").unique().alias("prc_prec_len_set"),
-            )
-            .to_dict(as_series=False)
-        )
-        if not rows:
-            continue
-        # normalize sets
-        mdls = rows["model"]
-        s_sets = [set(x) for x in rows["scores_len_set"]]
-        # compare scores_len across models (ignore None values)
-        s_normed = [tuple(sorted([v for v in s if v is not None])) for s in s_sets]
-        if len(set(s_normed)) > 1:
-            cross_model_diffs.append(
-                dict(
-                    network=net,
-                    latent_dim=lat,
-                    semi_normals=s_norm,
-                    semi_anomalous=s_anom,
-                    eval=ev,
-                    metric="scores_len",
-                    by_model={m: sorted(list(s_sets[i])) for i, m in enumerate(mdls)},
-                )
-            )
-        # Report ROC/PRC binning diffs (expected)
-        roc_sets = [set(x) for x in rows["roc_fpr_len_set"]]
-        if len(set(tuple(sorted(ss)) for ss in roc_sets)) > 1:
-            cross_model_diffs.append(
-                dict(
-                    network=net,
-                    latent_dim=lat,
-                    semi_normals=s_norm,
-                    semi_anomalous=s_anom,
-                    eval=ev,
-                    metric="roc_fpr_len",
-                    by_model={m: sorted(list(roc_sets[i])) for i, m in enumerate(mdls)},
-                )
-            )
-        prc_sets = [set(x) for x in rows["prc_prec_len_set"]]
-        if len(set(tuple(sorted(ss)) for ss in prc_sets)) > 1:
-            cross_model_diffs.append(
-                dict(
-                    network=net,
-                    latent_dim=lat,
-                    semi_normals=s_norm,
-                    semi_anomalous=s_anom,
-                    eval=ev,
-                    metric="prc_prec_len",
-                    by_model={m: sorted(list(prc_sets[i])) for i, m in enumerate(mdls)},
-                )
-            )
-
-    # --- Print a readable report ---
-    print("\n=== GRID COVERAGE ===")
-    print(f"Missing combos: {len(missing)}")
-    for m in missing[:20]:
-        print("  ", m)
-    if len(missing) > 20:
-        print(f"  ... (+{len(missing) - 20} more)")
-
-    print("\nIncomplete combos (folds missing):", len(incomplete))
-    for inc in incomplete[:20]:
-        print(
-            "  ",
-            {
-                k: inc[k]
-                for k in [
-                    "network",
-                    "latent_dim",
-                    "semi_normals",
-                    "semi_anomalous",
-                    "model",
-                    "eval",
-                ]
-            },
-            "expected",
-            inc["expected_folds"],
-            "present",
-            inc["present_folds"],
-        )
-    if len(incomplete) > 20:
-        print(f"  ... (+{len(incomplete) - 20} more)")
-
-    print("\n=== WITHIN-COMBO SHAPE CONSISTENCY (across folds) ===")
-    print(f"Mismatching groups: {len(shape_inconsistent)}")
-    for s in shape_inconsistent[:15]:
-        hdr = {
-            k: s[k]
-            for k in [
-                "network",
-                "latent_dim",
-                "semi_normals",
-                "semi_anomalous",
-                "model",
-                "eval",
-                "metric",
-            ]
-        }
-        print("  ", hdr, "values:", s["per_fold"])
-    if len(shape_inconsistent) > 15:
-        print(f"  ... (+{len(shape_inconsistent) - 15} more)")
-
-    print("\n=== CROSS-MODEL COMPARABILITY (by shape) ===")
-    print(
-        f"Shape differences across models at fixed (net,lat,semi,eval): {len(cross_model_diffs)}"
-    )
-    for s in cross_model_diffs[:15]:
-        hdr = {
-            k: s[k]
-            for k in [
-                "network",
-                "latent_dim",
-                "semi_normals",
-                "semi_anomalous",
-                "eval",
-                "metric",
-            ]
-        }
-        print("  ", hdr, "by_model:", s["by_model"])
-    if len(cross_model_diffs) > 15:
-        print(f"  ... (+{len(cross_model_diffs) - 15} more)")
-
-    # Return the raw details if you want to use them programmatically
-    return {
-        "missing": missing,
-        "incomplete": incomplete,
-        "shape_inconsistent": shape_inconsistent,
-        "cross_model_diffs": cross_model_diffs,
-    }
-
-
 # ------------------------------------------------------------
 # Config you can tweak
 # ------------------------------------------------------------
@@ -386,6 +60,37 @@ SCHEMA_STATIC = {
    "test_time": pl.Float64,
    "folder": pl.Utf8,
    "k_fold_num": pl.Int32,
+    "config_json": pl.Utf8,  # full config.json as string (for reference)
+}
+
+# Pretraining-only (AE) schema — lighter defaults:
+# integer arrays are Int32 lists and scores are Float32 to keep the frame small.
+PRETRAIN_SCHEMA = {
+    # identifiers / dims
+    "network": pl.Utf8,  # e.g. "LeNet", "efficient"
+    "latent_dim": pl.Int32,
+    "semi_normals": pl.Int32,
+    "semi_anomalous": pl.Int32,
+    "model": pl.Utf8,  # always "ae"
+    "fold": pl.Int32,
+    "split": pl.Utf8,  # "train" | "test"
+    # timings and optimization
+    "time": pl.Float64,
+    "loss": pl.Float64,
+    # per-sample arrays (as lists)
+    "indices": pl.List(pl.Int32),
+    "labels_exp_based": pl.List(pl.Int32),
+    "labels_manual_based": pl.List(pl.Int32),
+    "semi_targets": pl.List(pl.Int32),
+    "file_ids": pl.List(pl.Int32),
+    "frame_ids": pl.List(pl.Int32),
+    "scores": pl.List(pl.Float32),  # <— use Float32 to match source and save space
+    # file id -> name mapping from the result dict
+    "file_names": pl.List(pl.Struct({"file_id": pl.Int32, "name": pl.Utf8})),
+    # housekeeping
+    "folder": pl.Utf8,
+    "k_fold_num": pl.Int32,
+    "config_json": pl.Utf8,  # full config.json as string (for reference)
 }


@@ -406,6 +111,33 @@ def _tolist(x):
        return None


+def normalize_float_list(a) -> Optional[List[Optional[float]]]:
+    """Convert `a` (list or ndarray) to a list of floats; None input/entries stay None."""
+    if a is None:
+        return None
+    values = a.tolist() if isinstance(a, np.ndarray) else a
+    return [None if x is None else float(x) for x in values]
+
+
+def normalize_file_names(d) -> Optional[List[dict]]:
+    """
+    Convert the 'file_names' dict (keys like numpy.int64 -> str) to a
+    list[ {file_id:int, name:str} ], sorted by file_id. Non-dict input gives
+    None; entries whose key cannot be cast to int are skipped.
+    """
+    if not isinstance(d, dict):
+        return None
+    out: List[dict] = []
+    for k, v in d.items():
+        try:
+            file_id = int(k)
+        except (TypeError, ValueError, OverflowError):
+            continue  # best-effort: drop keys that are not integer-like
+        out.append({"file_id": file_id, "name": str(v)})
+    out.sort(key=lambda x: x["file_id"])
+    return out
+
+
 def normalize_roc(obj: Any) -> Optional[dict]:
    if obj is None:
        return None
@@ -597,7 +329,7 @@ def rows_from_ocsvm_default(data: dict, evals: List[str]) -> Dict[str, dict]:
 # ------------------------------------------------------------
 # Build the Polars DataFrame
 # ------------------------------------------------------------
-def build_results_frame(root: Path) -> pl.DataFrame:
+def load_results_dataframe(root: Path, allow_cache: bool = True) -> pl.DataFrame:
    """
    Walks experiment subdirs under `root`. For each (model, fold) it adds rows:
    Columns (SCHEMA_STATIC):
@@ -609,12 +341,23 @@ def build_results_frame(root: Path) -> pl.DataFrame:
      train_time, test_time,
      folder, k_fold_num
    """
+    if allow_cache:
+        cache = root / "results_cache.parquet"
+        if cache.exists():
+            try:
+                df = pl.read_parquet(cache)
+                print(f"[info] loaded cached results frame from {cache}")
+                return df
+            except Exception as e:
+                print(f"[warn] failed to load cache {cache}: {e}")
+
    rows: List[dict] = []

    exp_dirs = [p for p in root.iterdir() if p.is_dir()]
    for exp_dir in sorted(exp_dirs):
        try:
            cfg = read_config(exp_dir)
+            cfg_json = json.dumps(cfg, sort_keys=True)
        except Exception as e:
            print(f"[warn] skipping {exp_dir.name}: {e}")
            continue
@@ -668,6 +411,7 @@ def build_results_frame(root: Path) -> pl.DataFrame:
                            "test_time": vals["test_time"],
                            "folder": str(exp_dir),
                            "k_fold_num": k,
+                            "config_json": cfg_json,
                        }
                    )

@@ -687,73 +431,166 @@ def build_results_frame(root: Path) -> pl.DataFrame:
        # NOTE: no cast on 'scores' here; it's already List(Struct) per schema.
    )

+    if allow_cache:
+        try:
+            df.write_parquet(cache)
+            print(f"[info] cached results frame to {cache}")
+        except Exception as e:
+            print(f"[warn] failed to write cache {cache}: {e}")
+
    return df


-# ------------------------------------------------------------
-# Example “analysis-ready” queries (Polars idioms)
-# ------------------------------------------------------------
-def demo_queries(df: pl.DataFrame):
-    # q1: lazy is fine, then collect
-    q1 = (
-        df.lazy()
-        .filter(
-            (pl.col("network") == "LeNet")
-            & (pl.col("latent_dim") == 1024)
-            & (pl.col("semi_normals") == 0)
-            & (pl.col("semi_anomalous") == 0)
-            & (pl.col("eval") == "exp_based")
-        )
-        .group_by(["model"])
-        .agg(pl.col("auc").mean().alias("mean_auc"))
-        .sort(["mean_auc"], descending=True)
-        .collect()
+def load_pretraining_results_dataframe(
+    root: Path,
+    allow_cache: bool = True,
+    include_train: bool = False,  # <— default: store only TEST to keep cache tiny
+    keep_file_names: bool = False,  # <— drop file_names by default; they’re repeated
+    parquet_compression: str = "zstd",
+    parquet_compression_level: int = 7,  # <— stronger compression than default
+) -> pl.DataFrame:
+    """
+    Loads only AE pretraining results: files named `results_ae_<fold>.pkl`.
+    Produces one row per (experiment, fold, split). By default we:
+      - include only the TEST split (include_train=False)
+      - store scores as Float32
+      - drop the repeated file_names mapping to save space
+      - write Parquet with zstd(level=7)
+
+    With allow_cache=True the frame is read from / written to a Parquet file
+    under `root`. The file name encodes include_train and keep_file_names so a
+    cache built with other options is never returned for this call; the default
+    options keep the original name `pretraining_results_cache.parquet`.
+    Experiment dirs whose config cannot be read, and pickles that fail to load,
+    are skipped with a `[warn]` message.
+    """
+    if allow_cache:
+        # Cache contents depend on these two options, so they select the file.
+        variant = ("_train" if include_train else "") + (
+            "_names" if keep_file_names else ""
+        )
+        cache = root / f"pretraining_results_cache{variant}.parquet"
+        if cache.exists():
+            try:
+                df = pl.read_parquet(cache)
+                print(f"[info] loaded cached pretraining frame from {cache}")
+                return df
+            except Exception as e:
+                print(f"[warn] failed to load pretraining cache {cache}: {e}")
+
+    rows: List[dict] = []
+
+    exp_dirs = [p for p in root.iterdir() if p.is_dir()]
+    for exp_dir in sorted(exp_dirs):
+        try:
+            cfg = read_config(exp_dir)
+            cfg_json = json.dumps(cfg, sort_keys=True)
+        except Exception as e:
+            print(f"[warn] skipping {exp_dir.name} (pretraining): {e}")
+            continue
+
+        network = cfg.get("net_name")
+        latent_dim = int(cfg.get("latent_space_dim"))
+        semi_normals = int(cfg.get("num_known_normal"))
+        semi_anomalous = int(cfg.get("num_known_outlier"))
+        k = int(cfg.get("k_fold_num"))
+
+        # Only test split by default (include_train=False)
+        splits = ("train", "test") if include_train else ("test",)
+
+        for fold in range(k):
+            pkl = exp_dir / f"results_ae_{fold}.pkl"
+            if not pkl.exists():
+                continue
+
+            try:
+                data = read_pickle(pkl)  # expected: {"train": {...}, "test": {...}}
+            except Exception as e:
+                print(f"[warn] failed to read {pkl.name}: {e}")
+                continue
+
+            for split in splits:
+                splitd = data.get(split)
+                if not isinstance(splitd, dict):
+                    continue
+
+                rows.append(
+                    {
+                        "network": network,
+                        "latent_dim": latent_dim,
+                        "semi_normals": semi_normals,
+                        "semi_anomalous": semi_anomalous,
+                        "model": "ae",
+                        "fold": fold,
+                        "split": split,
+                        "time": float(splitd.get("time"))
+                        if splitd.get("time") is not None
+                        else None,
+                        "loss": float(splitd.get("loss"))
+                        if splitd.get("loss") is not None
+                        else None,
+                        # ints as Int32, scores as Float32 to save space
+                        "indices": normalize_int_list(splitd.get("indices")),
+                        "labels_exp_based": normalize_int_list(
+                            splitd.get("labels_exp_based")
+                        ),
+                        "labels_manual_based": normalize_int_list(
+                            splitd.get("labels_manual_based")
+                        ),
+                        "semi_targets": normalize_int_list(splitd.get("semi_targets")),
+                        "file_ids": normalize_int_list(splitd.get("file_ids")),
+                        "frame_ids": normalize_int_list(splitd.get("frame_ids")),
+                        "scores": normalize_float_list(splitd.get("scores")),
+                        "file_names": normalize_file_names(splitd.get("file_names"))
+                        if keep_file_names
+                        else None,
+                        "folder": str(exp_dir),
+                        "k_fold_num": k,
+                        "config_json": cfg_json,
+                    }
+                )
+
+    if not rows:
+        return pl.DataFrame(schema=PRETRAIN_SCHEMA)
+
+    df = pl.DataFrame(rows, schema=PRETRAIN_SCHEMA)
+
+    # Cast/optimize a bit (categoricals, ints, floats)
+    df = df.with_columns(
+        pl.col("network", "model", "split").cast(pl.Categorical),
+        pl.col(
+            "latent_dim", "semi_normals", "semi_anomalous", "fold", "k_fold_num"
+        ).cast(pl.Int32),
+        pl.col("time", "loss").cast(pl.Float64),
+        pl.col("scores").cast(pl.List(pl.Float32)),  # ensure downcast took
     )

-    # q2: do the filtering eagerly, then pivot (LazyFrame has no .pivot)
-    base = df.filter(
-        (pl.col("model") == "deepsad")
-        & (pl.col("eval") == "exp_based")
-        & (pl.col("network") == "LeNet")
-        & (pl.col("semi_normals") == 0)
-        & (pl.col("semi_anomalous") == 0)
-    ).select("fold", "latent_dim", "auc")
-    q2 = base.pivot(
-        values="auc",
-        index="fold",
-        columns="latent_dim",
-        aggregate_function="first",  # or "mean" if duplicates exist
-    ).sort("fold")
+    if allow_cache:
+        try:
+            # `cache` was chosen at the top of the function (same allow_cache guard)
+            df.write_parquet(
+                cache,
+                compression=parquet_compression,
+                compression_level=parquet_compression_level,
+                statistics=True,
+            )
+            print(
+                f"[info] cached pretraining frame to {cache} "
+                f"({parquet_compression}, level={parquet_compression_level})"
+            )
+        except Exception as e:
+            print(f"[warn] failed to write pretraining cache {cache}: {e}")

-    # roc_subset: eager filter/select, then explode struct fields
-    roc_subset = (
-        df.filter(
-            (pl.col("model") == "ocsvm")
-            & (pl.col("eval") == "manual_based")
-            & (pl.col("network") == "efficient")
-            & (pl.col("latent_dim") == 1024)
-            & (pl.col("semi_normals") == 0)
-            & (pl.col("semi_anomalous") == 0)
-        )
-        .select("fold", "roc_curve")
-        .with_columns(
-            pl.col("roc_curve").struct.field("fpr").alias("fpr"),
-            pl.col("roc_curve").struct.field("tpr").alias("tpr"),
-            pl.col("roc_curve").struct.field("thr").alias("thr"),
-        )
-    )
-
-    return q1, q2, roc_subset
+    return df


 def main():
     root = Path("/home/fedex/mt/results/done")
-    df = build_results_frame(root)
-    q1, q2, roc_subset = demo_queries(df)
+    df = load_results_dataframe(root, allow_cache=True)  # reads/writes root/results_cache.parquet
     print(df.shape, df.head())
-    # --- run it ---
-    report = check_grid_coverage_and_shapes(df)
-    print(report)
+    # AE pretraining results come from the same root but use a separate cache file.
+    df_pre = load_pretraining_results_dataframe(root, allow_cache=True)
+    print("pretraining:", df_pre.shape, df_pre.head())


 if __name__ == "__main__":