data loading and plotting for results wip

2025-09-03 14:55:54 +02:00
parent 3d968c305c
commit ed80faf1e2
16 changed files with 2732 additions and 952 deletions
--- a/tools/plot_scripts/load_results.py
+++ b/tools/plot_scripts/load_results.py
@@ -0,0 +1,597 @@
+from __future__ import annotations
+
+import json
+import pickle
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+import polars as pl
+
+# ------------------------------------------------------------
+# Config you can tweak
+# ------------------------------------------------------------
+MODELS = ["deepsad", "isoforest", "ocsvm"]
+EVALS = ["exp_based", "manual_based"]
+
+SCHEMA_STATIC = {
+    # identifiers / dims
+    "network": pl.Utf8,  # e.g. "LeNet", "efficient"
+    "latent_dim": pl.Int32,
+    "semi_normals": pl.Int32,
+    "semi_anomalous": pl.Int32,
+    "model": pl.Utf8,  # "deepsad" | "isoforest" | "ocsvm"
+    "eval": pl.Utf8,  # "exp_based" | "manual_based"
+    "fold": pl.Int32,
+    # metrics
+    "auc": pl.Float64,
+    "ap": pl.Float64,
+    # per-sample scores: list of (idx, label, score)
+    "scores": pl.List(
+        pl.Struct(
+            {
+                "sample_idx": pl.Int32,  # dataloader idx
+                "orig_label": pl.Int8,  # {-1,0,1}
+                "score": pl.Float64,  # anomaly score
+            }
+        )
+    ),
+    # curves (normalized)
+    "roc_curve": pl.Struct(
+        {
+            "fpr": pl.List(pl.Float64),
+            "tpr": pl.List(pl.Float64),
+            "thr": pl.List(pl.Float64),
+        }
+    ),
+    "prc_curve": pl.Struct(
+        {
+            "precision": pl.List(pl.Float64),
+            "recall": pl.List(pl.Float64),
+            "thr": pl.List(pl.Float64),  # may be len(precision)-1
+        }
+    ),
+    # deepsad-only per-eval arrays (None for other models)
+    "sample_indices": pl.List(pl.Int32),
+    "sample_labels": pl.List(pl.Int8),
+    "valid_mask": pl.List(pl.Boolean),
+    # timings / housekeeping
+    "train_time": pl.Float64,
+    "test_time": pl.Float64,
+    "folder": pl.Utf8,
+    "k_fold_num": pl.Int32,
+    "config_json": pl.Utf8,  # full config.json as string (for reference)
+}
+
+# Pretraining-only (AE) schema
+# Pretraining-only (AE) schema — lighter defaults
+PRETRAIN_SCHEMA = {
+    # identifiers / dims
+    "network": pl.Utf8,  # e.g. "LeNet", "efficient"
+    "latent_dim": pl.Int32,
+    "semi_normals": pl.Int32,
+    "semi_anomalous": pl.Int32,
+    "model": pl.Utf8,  # always "ae"
+    "fold": pl.Int32,
+    "split": pl.Utf8,  # "train" | "test"
+    # timings and optimization
+    "time": pl.Float64,
+    "loss": pl.Float64,
+    # per-sample arrays (as lists)
+    "indices": pl.List(pl.Int32),
+    "labels_exp_based": pl.List(pl.Int32),
+    "labels_manual_based": pl.List(pl.Int32),
+    "semi_targets": pl.List(pl.Int32),
+    "file_ids": pl.List(pl.Int32),
+    "frame_ids": pl.List(pl.Int32),
+    "scores": pl.List(pl.Float32),  # <— use Float32 to match source and save space
+    # file id -> name mapping from the result dict
+    "file_names": pl.List(pl.Struct({"file_id": pl.Int32, "name": pl.Utf8})),
+    # housekeeping
+    "folder": pl.Utf8,
+    "k_fold_num": pl.Int32,
+    "config_json": pl.Utf8,  # full config.json as string (for reference)
+}
+
+
+# ------------------------------------------------------------
+# Helpers: curve/scores normalizers (tuples/ndarrays -> dict/list)
+# ------------------------------------------------------------
+def _tolist(x):
+    if x is None:
+        return None
+    if isinstance(x, np.ndarray):
+        return x.tolist()
+    if isinstance(x, (list, tuple)):
+        return list(x)
+    # best-effort scalar wrap
+    try:
+        return [x]
+    except Exception:
+        return None
+
+
+def normalize_float_list(a) -> Optional[List[float]]:
+    if a is None:
+        return None
+    if isinstance(a, np.ndarray):
+        a = a.tolist()
+    return [None if x is None else float(x) for x in a]
+
+
+def normalize_file_names(d) -> Optional[List[dict]]:
+    """
+    Convert the 'file_names' dict (keys like numpy.int64 -> str) to a
+    list[ {file_id:int, name:str} ], sorted by file_id.
+    """
+    if not isinstance(d, dict):
+        return None
+    out: List[dict] = []
+    for k, v in d.items():
+        try:
+            file_id = int(k)
+        except Exception:
+            # keys are printed as np.int64 in the structure; best-effort cast
+            continue
+        out.append({"file_id": file_id, "name": str(v)})
+    out.sort(key=lambda x: x["file_id"])
+    return out
+
+
+def normalize_roc(obj: Any) -> Optional[dict]:
+    if obj is None:
+        return None
+    fpr = tpr = thr = None
+    if isinstance(obj, (tuple, list)):
+        if len(obj) >= 2:
+            fpr, tpr = _tolist(obj[0]), _tolist(obj[1])
+        if len(obj) >= 3:
+            thr = _tolist(obj[2])
+    elif isinstance(obj, dict):
+        fpr = _tolist(obj.get("fpr") or obj.get("x"))
+        tpr = _tolist(obj.get("tpr") or obj.get("y"))
+        thr = _tolist(obj.get("thr") or obj.get("thresholds"))
+    else:
+        return None
+    if fpr is None or tpr is None:
+        return None
+    return {"fpr": fpr, "tpr": tpr, "thr": thr}
+
+
+def normalize_prc(obj: Any) -> Optional[dict]:
+    if obj is None:
+        return None
+    precision = recall = thr = None
+    if isinstance(obj, (tuple, list)):
+        if len(obj) >= 2:
+            precision, recall = _tolist(obj[0]), _tolist(obj[1])
+        if len(obj) >= 3:
+            thr = _tolist(obj[2])
+    elif isinstance(obj, dict):
+        precision = _tolist(obj.get("precision") or obj.get("y"))
+        recall = _tolist(obj.get("recall") or obj.get("x"))
+        thr = _tolist(obj.get("thr") or obj.get("thresholds"))
+    else:
+        return None
+    if precision is None or recall is None:
+        return None
+    return {"precision": precision, "recall": recall, "thr": thr}
+
+
+def normalize_scores_to_struct(seq) -> Optional[List[dict]]:
+    """
+    Input: list of (idx, label, score) tuples (as produced in your test()).
+    Output: list of dicts with keys sample_idx, orig_label, score.
+    """
+    if seq is None:
+        return None
+    if isinstance(seq, np.ndarray):
+        seq = seq.tolist()
+    if not isinstance(seq, (list, tuple)):
+        return None
+    out: List[dict] = []
+    for item in seq:
+        if isinstance(item, (list, tuple)) and len(item) >= 3:
+            idx, lab, sc = item[0], item[1], item[2]
+            out.append(
+                {
+                    "sample_idx": None if idx is None else int(idx),
+                    "orig_label": None if lab is None else int(lab),
+                    "score": None if sc is None else float(sc),
+                }
+            )
+        else:
+            # fallback: single numeric -> score
+            sc = (
+                float(item)
+                if isinstance(item, (int, float, np.integer, np.floating))
+                else None
+            )
+            out.append({"sample_idx": None, "orig_label": None, "score": sc})
+    return out
+
+
+def normalize_int_list(a) -> Optional[List[int]]:
+    if a is None:
+        return None
+    if isinstance(a, np.ndarray):
+        a = a.tolist()
+    return list(a)
+
+
+def normalize_bool_list(a) -> Optional[List[bool]]:
+    if a is None:
+        return None
+    if isinstance(a, np.ndarray):
+        a = a.tolist()
+    return [bool(x) for x in a]
+
+
+# ------------------------------------------------------------
+# Low-level: read one experiment folder
+# ------------------------------------------------------------
+def read_config(exp_dir: Path) -> dict:
+    cfg = exp_dir / "config.json"
+    with cfg.open("r") as f:
+        c = json.load(f)
+    if not c.get("k_fold"):
+        raise ValueError(f"{exp_dir.name}: not trained as k-fold")
+    return c
+
+
+def read_pickle(p: Path) -> Any:
+    with p.open("rb") as f:
+        return pickle.load(f)
+
+
+# ------------------------------------------------------------
+# Extractors for each model
+# ------------------------------------------------------------
+def rows_from_deepsad(data: dict, evals: List[str]) -> Dict[str, dict]:
+    """
+    deepsad under data['test'][eval], with extra per-eval arrays and AP present.
+    """
+    out: Dict[str, dict] = {}
+    test = data.get("test", {})
+    for ev in evals:
+        evd = test.get(ev)
+        if not isinstance(evd, dict):
+            continue
+        out[ev] = {
+            "auc": float(evd["auc"])
+            if "auc" in evd and evd["auc"] is not None
+            else None,
+            "roc": normalize_roc(evd.get("roc")),
+            "prc": normalize_prc(evd.get("prc")),
+            "ap": float(evd["ap"]) if "ap" in evd and evd["ap"] is not None else None,
+            "scores": normalize_scores_to_struct(evd.get("scores")),
+            "sample_indices": normalize_int_list(evd.get("indices")),
+            "sample_labels": normalize_int_list(evd.get("labels")),
+            "valid_mask": normalize_bool_list(evd.get("valid_mask")),
+            "train_time": data.get("train", {}).get("time"),
+            "test_time": test.get("time"),
+        }
+    return out
+
+
+def rows_from_isoforest(data: dict, evals: List[str]) -> Dict[str, dict]:
+    """
+    Keys: test_auc_<eval>, test_roc_<eval>, test_prc_<eval>, test_ap_<eval>, test_scores_<eval>.
+    """
+    out: Dict[str, dict] = {}
+    for ev in evals:
+        auc = data.get(f"test_auc_{ev}")
+        if auc is None:
+            continue
+        out[ev] = {
+            "auc": float(auc),
+            "roc": normalize_roc(data.get(f"test_roc_{ev}")),
+            "prc": normalize_prc(data.get(f"test_prc_{ev}")),
+            "ap": float(data.get(f"test_ap_{ev}"))
+            if data.get(f"test_ap_{ev}") is not None
+            else None,
+            "scores": normalize_scores_to_struct(data.get(f"test_scores_{ev}")),
+            "sample_indices": None,
+            "sample_labels": None,
+            "valid_mask": None,
+            "train_time": data.get("train_time"),
+            "test_time": data.get("test_time"),
+        }
+    return out
+
+
+def rows_from_ocsvm_default(data: dict, evals: List[str]) -> Dict[str, dict]:
+    """
+    Default OCSVM only (ignore linear variant entirely).
+    """
+    out: Dict[str, dict] = {}
+    for ev in evals:
+        auc = data.get(f"test_auc_{ev}")
+        if auc is None:
+            continue
+        out[ev] = {
+            "auc": float(auc),
+            "roc": normalize_roc(data.get(f"test_roc_{ev}")),
+            "prc": normalize_prc(data.get(f"test_prc_{ev}")),
+            "ap": float(data.get(f"test_ap_{ev}"))
+            if data.get(f"test_ap_{ev}") is not None
+            else None,
+            "scores": normalize_scores_to_struct(data.get(f"test_scores_{ev}")),
+            "sample_indices": None,
+            "sample_labels": None,
+            "valid_mask": None,
+            "train_time": data.get("train_time"),
+            "test_time": data.get("test_time"),
+        }
+    return out
+
+
+# ------------------------------------------------------------
+# Build the Polars DataFrame
+# ------------------------------------------------------------
+def load_results_dataframe(root: Path, allow_cache: bool = True) -> pl.DataFrame:
+    """
+    Walks experiment subdirs under `root`. For each (model, fold) it adds rows:
+    Columns (SCHEMA_STATIC):
+      network, latent_dim, semi_normals, semi_anomalous,
+      model, eval, fold,
+      auc, ap, scores{sample_idx,orig_label,score},
+      roc_curve{fpr,tpr,thr}, prc_curve{precision,recall,thr},
+      sample_indices, sample_labels, valid_mask,
+      train_time, test_time,
+      folder, k_fold_num
+    """
+    if allow_cache:
+        cache = root / "results_cache.parquet"
+        if cache.exists():
+            try:
+                df = pl.read_parquet(cache)
+                print(f"[info] loaded cached results frame from {cache}")
+                return df
+            except Exception as e:
+                print(f"[warn] failed to load cache {cache}: {e}")
+
+    rows: List[dict] = []
+
+    exp_dirs = [p for p in root.iterdir() if p.is_dir()]
+    for exp_dir in sorted(exp_dirs):
+        try:
+            cfg = read_config(exp_dir)
+            cfg_json = json.dumps(cfg, sort_keys=True)
+        except Exception as e:
+            print(f"[warn] skipping {exp_dir.name}: {e}")
+            continue
+
+        network = cfg.get("net_name")
+        latent_dim = int(cfg.get("latent_space_dim"))
+        semi_normals = int(cfg.get("num_known_normal"))
+        semi_anomalous = int(cfg.get("num_known_outlier"))
+        k = int(cfg.get("k_fold_num"))
+
+        for model in MODELS:
+            for fold in range(k):
+                pkl = exp_dir / f"results_{model}_{fold}.pkl"
+                if not pkl.exists():
+                    continue
+
+                try:
+                    data = read_pickle(pkl)
+                except Exception as e:
+                    print(f"[warn] failed to read {pkl.name}: {e}")
+                    continue
+
+                if model == "deepsad":
+                    per_eval = rows_from_deepsad(data, EVALS)  # eval -> dict
+                elif model == "isoforest":
+                    per_eval = rows_from_isoforest(data, EVALS)  # eval -> dict
+                elif model == "ocsvm":
+                    per_eval = rows_from_ocsvm_default(data, EVALS)  # eval -> dict
+                else:
+                    per_eval = {}
+
+                for ev, vals in per_eval.items():
+                    rows.append(
+                        {
+                            "network": network,
+                            "latent_dim": latent_dim,
+                            "semi_normals": semi_normals,
+                            "semi_anomalous": semi_anomalous,
+                            "model": model,
+                            "eval": ev,
+                            "fold": fold,
+                            "auc": vals["auc"],
+                            "ap": vals["ap"],
+                            "scores": vals["scores"],
+                            "roc_curve": vals["roc"],
+                            "prc_curve": vals["prc"],
+                            "sample_indices": vals.get("sample_indices"),
+                            "sample_labels": vals.get("sample_labels"),
+                            "valid_mask": vals.get("valid_mask"),
+                            "train_time": vals["train_time"],
+                            "test_time": vals["test_time"],
+                            "folder": str(exp_dir),
+                            "k_fold_num": k,
+                            "config_json": cfg_json,
+                        }
+                    )
+
+    # If empty, return a typed empty frame
+    if not rows:
+        return pl.DataFrame(schema=SCHEMA_STATIC)
+
+    df = pl.DataFrame(rows, schema=SCHEMA_STATIC)
+
+    # Cast to efficient dtypes (categoricals etc.) – no extra sanitation
+    df = df.with_columns(
+        pl.col("network", "model", "eval").cast(pl.Categorical),
+        pl.col(
+            "latent_dim", "semi_normals", "semi_anomalous", "fold", "k_fold_num"
+        ).cast(pl.Int32),
+        pl.col("auc", "ap", "train_time", "test_time").cast(pl.Float64),
+        # NOTE: no cast on 'scores' here; it's already List(Struct) per schema.
+    )
+
+    if allow_cache:
+        try:
+            df.write_parquet(cache)
+            print(f"[info] cached results frame to {cache}")
+        except Exception as e:
+            print(f"[warn] failed to write cache {cache}: {e}")
+
+    return df
+
+
+def load_pretraining_results_dataframe(
+    root: Path,
+    allow_cache: bool = True,
+    include_train: bool = False,  # <— default: store only TEST to keep cache tiny
+    keep_file_names: bool = False,  # <— drop file_names by default; they’re repeated
+    parquet_compression: str = "zstd",
+    parquet_compression_level: int = 7,  # <— stronger compression than default
+) -> pl.DataFrame:
+    """
+    Loads only AE pretraining results: files named `results_ae_<fold>.pkl`.
+    Produces one row per (experiment, fold, split). By default we:
+      - include only the TEST split (include_train=False)
+      - store scores as Float32
+      - drop the repeated file_names mapping to save space
+      - write Parquet with zstd(level=7)
+    """
+    if allow_cache:
+        cache = root / "pretraining_results_cache.parquet"
+        if cache.exists():
+            try:
+                df = pl.read_parquet(cache)
+                print(f"[info] loaded cached pretraining frame from {cache}")
+                return df
+            except Exception as e:
+                print(f"[warn] failed to load pretraining cache {cache}: {e}")
+
+    rows: List[dict] = []
+
+    exp_dirs = [p for p in root.iterdir() if p.is_dir()]
+    for exp_dir in sorted(exp_dirs):
+        try:
+            cfg = read_config(exp_dir)
+            cfg_json = json.dumps(cfg, sort_keys=True)
+        except Exception as e:
+            print(f"[warn] skipping {exp_dir.name} (pretraining): {e}")
+            continue
+
+        network = cfg.get("net_name")
+        latent_dim = int(cfg.get("latent_space_dim"))
+        semi_normals = int(cfg.get("num_known_normal"))
+        semi_anomalous = int(cfg.get("num_known_outlier"))
+        k = int(cfg.get("k_fold_num"))
+
+        # Only test split by default (include_train=False)
+        splits = ("train", "test") if include_train else ("test",)
+
+        for fold in range(k):
+            pkl = exp_dir / f"results_ae_{fold}.pkl"
+            if not pkl.exists():
+                continue
+
+            try:
+                data = read_pickle(pkl)  # expected: {"train": {...}, "test": {...}}
+            except Exception as e:
+                print(f"[warn] failed to read {pkl.name}: {e}")
+                continue
+
+            for split in splits:
+                splitd = data.get(split)
+                if not isinstance(splitd, dict):
+                    continue
+
+                rows.append(
+                    {
+                        "network": network,
+                        "latent_dim": latent_dim,
+                        "semi_normals": semi_normals,
+                        "semi_anomalous": semi_anomalous,
+                        "model": "ae",
+                        "fold": fold,
+                        "split": split,
+                        "time": float(splitd.get("time"))
+                        if splitd.get("time") is not None
+                        else None,
+                        "loss": float(splitd.get("loss"))
+                        if splitd.get("loss") is not None
+                        else None,
+                        # ints as Int32, scores as Float32 to save space
+                        "indices": normalize_int_list(splitd.get("indices")),
+                        "labels_exp_based": normalize_int_list(
+                            splitd.get("labels_exp_based")
+                        ),
+                        "labels_manual_based": normalize_int_list(
+                            splitd.get("labels_manual_based")
+                        ),
+                        "semi_targets": normalize_int_list(splitd.get("semi_targets")),
+                        "file_ids": normalize_int_list(splitd.get("file_ids")),
+                        "frame_ids": normalize_int_list(splitd.get("frame_ids")),
+                        "scores": (
+                            None
+                            if splitd.get("scores") is None
+                            else [
+                                float(x)
+                                for x in (
+                                    splitd["scores"].tolist()
+                                    if isinstance(splitd["scores"], np.ndarray)
+                                    else splitd["scores"]
+                                )
+                            ]
+                        ),
+                        "file_names": normalize_file_names(splitd.get("file_names"))
+                        if keep_file_names
+                        else None,
+                        "folder": str(exp_dir),
+                        "k_fold_num": k,
+                        "config_json": cfg_json,
+                    }
+                )
+
+    if not rows:
+        return pl.DataFrame(schema=PRETRAIN_SCHEMA)
+
+    df = pl.DataFrame(rows, schema=PRETRAIN_SCHEMA)
+
+    # Cast/optimize a bit (categoricals, ints, floats)
+    df = df.with_columns(
+        pl.col("network", "model", "split").cast(pl.Categorical),
+        pl.col(
+            "latent_dim", "semi_normals", "semi_anomalous", "fold", "k_fold_num"
+        ).cast(pl.Int32),
+        pl.col("time", "loss").cast(pl.Float64),
+        pl.col("scores").cast(pl.List(pl.Float32)),  # ensure downcast took
+    )
+
+    if allow_cache:
+        try:
+            cache = root / "pretraining_results_cache.parquet"
+            df.write_parquet(
+                cache,
+                compression=parquet_compression,
+                compression_level=parquet_compression_level,
+                statistics=True,
+            )
+            print(
+                f"[info] cached pretraining frame to {cache} "
+                f"({parquet_compression}, level={parquet_compression_level})"
+            )
+        except Exception as e:
+            print(f"[warn] failed to write pretraining cache {cache}: {e}")
+
+    return df
+
+
+def main():
+    root = Path("/home/fedex/mt/results/done")
+    df = load_results_dataframe(root, allow_cache=True)
+    print(df.shape, df.head())
+
+    df_pre = load_pretraining_results_dataframe(root, allow_cache=True)
+    print("pretraining:", df_pre.shape, df_pre.head())
+
+
+if __name__ == "__main__":
+    main()