wip inference

2025-09-15 11:21:30 +02:00
parent e4b298cf06
commit e7624d2786
8 changed files with 1027 additions and 35 deletions
--- a/tools/plot_scripts/load_results.py
+++ b/tools/plot_scripts/load_results.py
@@ -96,6 +96,21 @@ PRETRAIN_SCHEMA = {
    "config_json": pl.Utf8,  # full config.json as string (for reference)
 }

+# Column schema of the frame built by load_inference_results_dataframe():
+# one row per score file, i.e. per (folder, experiment, model) combination.
+# The loader casts "experiment", "network" and "model" to pl.Categorical
+# after constructing the frame with this schema.
+SCHEMA_INFERENCE = {
+    # identifiers / dims
+    # experiment: the {experiment} part of "{experiment}_{model}_scores.npy"
+    "experiment": pl.Utf8,  # e.g. "2_static_no_artifacts_illuminated_2023-01-23-001"
+    "network": pl.Utf8,  # config "net_name", e.g. "LeNet", "efficient"
+    "latent_dim": pl.Int32,  # config "latent_space_dim"
+    "semi_normals": pl.Int32,  # config "num_known_normal"
+    "semi_anomalous": pl.Int32,  # config "num_known_outlier"
+    "model": pl.Utf8,  # "deepsad" | "isoforest" | "ocsvm"
+    # metrics
+    # scores: content of the .npy score file as a flat list of floats
+    "scores": pl.List(pl.Float64),
+    # housekeeping (this schema carries no timing columns)
+    "folder": pl.Utf8,  # str() of the scanned folder that holds config.json
+    "config_json": pl.Utf8,  # full config.json as string (for reference)
+}
+

 # ------------------------------------------------------------
 # Helpers: curve/scores normalizers (tuples/ndarrays -> dict/list)
@@ -233,11 +248,11 @@ def normalize_bool_list(a) -> Optional[List[bool]]:
 # ------------------------------------------------------------
 # Low-level: read one experiment folder
 # ------------------------------------------------------------
-def read_config(exp_dir: Path) -> dict:
+def read_config(exp_dir: Path, k_fold_required: bool = True) -> dict:
+    """Load and return the parsed ``config.json`` of one experiment folder.
+
+    Args:
+        exp_dir: Folder that contains ``config.json``.
+        k_fold_required: When True (default), reject configs whose ``k_fold``
+            entry is missing or falsy. Pass False for folders that were not
+            trained as k-fold, as the inference loader does.
+
+    Returns:
+        The decoded JSON content of ``config.json``.
+
+    Raises:
+        ValueError: If ``k_fold_required`` is set and ``k_fold`` is not truthy.
+            Errors from opening or parsing the file propagate unchanged.
+    """
    cfg = exp_dir / "config.json"
    with cfg.open("r") as f:
        c = json.load(f)
-    if not c.get("k_fold"):
+    if k_fold_required and not c.get("k_fold"):
        raise ValueError(f"{exp_dir.name}: not trained as k-fold")
    return c

@@ -589,7 +604,129 @@ def load_pretraining_results_dataframe(
    return df


+def _experiment_from_stem(stem: str, models: List[str]) -> str:
+    """Return the experiment part of a ``{experiment}_{model}_scores`` stem."""
+    # Match against the requested model names first so that a model name
+    # containing "_" is split off correctly; the longest name wins.
+    for model in sorted(models, key=len, reverse=True):
+        suffix = f"_{model}_scores"
+        if stem.endswith(suffix):
+            return stem[: -len(suffix)]
+    # File of a model that was not requested: treat the last two "_" fields
+    # as "{model}_scores", which is what the naming scheme implies.
+    return stem.rsplit("_", 2)[0]
+
+
+def load_inference_results_dataframe(
+    root: Path,
+    allow_cache: bool = True,
+    models: List[str] = MODELS,
+) -> pl.DataFrame:
+    """Load inference scores from all experiment folders below ``root``.
+
+    Every sub-directory of ``root`` is expected to hold a ``config.json`` and
+    an ``inference/`` directory with ``{experiment}_{model}_scores.npy``
+    files. One row is produced per score file found for the requested
+    models. Folders or files that cannot be read are reported with a
+    ``[warn]`` line and skipped.
+
+    Args:
+        root: Path to root directory containing experiment folders.
+        allow_cache: When True, return ``root/inference_results_cache.parquet``
+            if it can be read, and write it after a fresh scan.
+        models: Model names whose score files are collected.
+            NOTE: a cache hit returns the cached frame as-is, regardless of
+            this argument; delete the cache file after changing ``models``.
+
+    Returns:
+        pl.DataFrame: Frame with the columns of ``SCHEMA_INFERENCE``, where
+        ``experiment``, ``network`` and ``model`` are cast to Categorical.
+        An empty frame typed with ``SCHEMA_INFERENCE`` if nothing was found.
+    """
+    # Bound unconditionally so the read and write paths share one definition.
+    cache = root / "inference_results_cache.parquet"
+
+    if allow_cache and cache.exists():
+        try:
+            df = pl.read_parquet(cache)
+        except Exception as e:
+            # A broken cache is not fatal: fall through to a fresh scan.
+            print(f"[warn] failed to load inference cache {cache}: {e}")
+        else:
+            print(f"[info] loaded cached inference frame from {cache}")
+            return df
+
+    rows: List[dict] = []
+
+    for exp_dir in sorted(p for p in root.iterdir() if p.is_dir()):
+        try:
+            # Inference folders need not be k-fold runs.
+            cfg = read_config(exp_dir, k_fold_required=False)
+            cfg_json = json.dumps(cfg, sort_keys=True)
+
+            # Index the required keys so a missing one is reported by name
+            # in the "skipping" warning instead of as int(None).
+            network = cfg.get("net_name")
+            latent_dim = int(cfg["latent_space_dim"])
+            semi_normals = int(cfg["num_known_normal"])
+            semi_anomalous = int(cfg["num_known_outlier"])
+
+            inference_dir = exp_dir / "inference"
+            if not inference_dir.exists():
+                print(f"[warn] no inference directory for {exp_dir.name}")
+                continue
+
+            score_files = list(inference_dir.glob("*_scores.npy"))
+            if not score_files:
+                print(f"[warn] no score files in {inference_dir}")
+                continue
+
+            # Unique experiment names present in this folder's score files
+            # (file name format: {experiment}_{model}_scores.npy).
+            experiments = {
+                _experiment_from_stem(score_file.stem, models)
+                for score_file in score_files
+            }
+
+            for experiment in sorted(experiments):
+                for model in models:
+                    score_file = inference_dir / f"{experiment}_{model}_scores.npy"
+                    if not score_file.exists():
+                        print(f"[warn] missing score file for {experiment}, {model}")
+                        continue
+
+                    try:
+                        # Flatten and force float64 so the value always fits
+                        # the List(Float64) column; a nested or non-float
+                        # list would otherwise fail frame construction and
+                        # abort the whole load.
+                        scores = np.asarray(
+                            np.load(score_file), dtype=np.float64
+                        ).ravel()
+                    except Exception as e:
+                        print(
+                            f"[warn] failed to load scores for {experiment}, {model}: {e}"
+                        )
+                        continue
+
+                    rows.append(
+                        {
+                            "experiment": experiment,
+                            "network": network,
+                            "latent_dim": latent_dim,
+                            "semi_normals": semi_normals,
+                            "semi_anomalous": semi_anomalous,
+                            "model": model,
+                            "scores": scores.tolist(),
+                            "folder": str(exp_dir),
+                            "config_json": cfg_json,
+                        }
+                    )
+
+        except Exception as e:
+            # Best effort: one unreadable folder must not stop the scan.
+            print(f"[warn] skipping {exp_dir.name}: {e}")
+
+    # If empty, return a typed empty frame (and do not write a cache).
+    if not rows:
+        return pl.DataFrame(schema=SCHEMA_INFERENCE)
+
+    # The integer columns already get Int32 from the schema; only the
+    # low-cardinality string columns are converted.
+    df = pl.DataFrame(rows, schema=SCHEMA_INFERENCE).with_columns(
+        pl.col("experiment", "network", "model").cast(pl.Categorical)
+    )
+
+    if allow_cache:
+        try:
+            df.write_parquet(cache)
+            print(f"[info] cached inference frame to {cache}")
+        except Exception as e:
+            print(f"[warn] failed to write cache {cache}: {e}")
+
+    return df
+
+
 def main():
+    inference_root = Path("/home/fedex/mt/results/inference/copy")
+    df_inference = load_inference_results_dataframe(inference_root, allow_cache=True)
+
+    exit(0)
+
    root = Path("/home/fedex/mt/results/copy")
    df1 = load_results_dataframe(root, allow_cache=True)
    exit(0)