data loading and plotting for results wip
This commit is contained in:
@@ -2,338 +2,12 @@ from __future__ import annotations
|
||||
|
||||
import json
|
||||
import pickle
|
||||
from itertools import product
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
# --- configure your intended grid here (use the *canonical* strings used in df) ---
# These lists are the default arguments of check_grid_coverage_and_shapes, which
# iterates over their cartesian product and compares each value against the
# corresponding df column with `==`.
NETWORKS_EXPECTED = ["subter_LeNet", "subter_efficient"]
LATENT_DIMS_EXPECTED = [32, 64, 128, 256, 512, 768, 1024]
# Each entry is a (semi_normals, semi_anomalous) pair.
SEMI_LABELS_EXPECTED = [(0, 0), (50, 10), (500, 100)]
MODELS_EXPECTED = ["deepsad", "isoforest", "ocsvm"]
EVALS_EXPECTED = ["exp_based", "manual_based"]

# If k-fold is uniform, set it. If None, we infer it *per combo* from df
# (the largest k_fold_num found among that combo's rows).
EXPECTED_K_FOLD: int | None = None  # e.g., 3
|
||||
|
||||
|
||||
def add_shape_columns(df: pl.DataFrame) -> pl.DataFrame:
    """Return `df` with one `*_len` column per list-valued result column.

    Each new column holds the list length of its source, or null when the
    source column itself is null. ROC/PRC lengths are read from the named
    fields of the `roc_curve` / `prc_curve` struct columns.
    """

    def guarded_len(source: str, field: Optional[str] = None) -> pl.Expr:
        # Null source -> null length; otherwise the length of the list
        # (or of the struct field's list when `field` is given).
        column = pl.col(source)
        values = column if field is None else column.struct.field(field)
        return pl.when(column.is_null()).then(None).otherwise(values.list.len())

    length_exprs = {
        # ROC lens
        "roc_fpr_len": guarded_len("roc_curve", "fpr"),
        "roc_tpr_len": guarded_len("roc_curve", "tpr"),
        "roc_thr_len": guarded_len("roc_curve", "thr"),
        # PRC lens
        "prc_prec_len": guarded_len("prc_curve", "precision"),
        "prc_rec_len": guarded_len("prc_curve", "recall"),
        "prc_thr_len": guarded_len("prc_curve", "thr"),
        # scores lens
        "scores_len": guarded_len("scores"),
        # deepsad-only arrays (None for others)
        "idxs_len": guarded_len("sample_indices"),
        "labels_len": guarded_len("sample_labels"),
        "vmask_len": guarded_len("valid_mask"),
    }
    return df.with_columns(**length_exprs)
|
||||
|
||||
|
||||
# Length columns (produced by add_shape_columns) checked for consistency across folds.
_SHAPE_COLS = (
    "scores_len",
    "roc_fpr_len",
    "roc_tpr_len",
    "roc_thr_len",
    "prc_prec_len",
    "prc_rec_len",
    "prc_thr_len",
    "idxs_len",
    "labels_len",
    "vmask_len",
)

# Keys identifying one (network, latent, semi-labels, model, eval) combo in the report dicts.
_COMBO_KEYS = (
    "network",
    "latent_dim",
    "semi_normals",
    "semi_anomalous",
    "model",
    "eval",
)


def _sorted_nulls_first(values):
    """Sort `values`, placing None first instead of comparing None with numbers."""
    return sorted(values, key=lambda v: (v is not None, 0 if v is None else v))


def _rows_matching(frame, **equals):
    """Return the rows of `frame` where every named column equals the given value."""
    mask = pl.lit(True)
    for name, value in equals.items():
        mask = mask & (pl.col(name) == value)
    return frame.filter(mask)


def _print_overflow(total, shown):
    """Print the '(+N more)' trailer when a report section was truncated."""
    if total > shown:
        print(f" ... (+{total - shown} more)")


def check_grid_coverage_and_shapes(
    df: pl.DataFrame,
    networks=NETWORKS_EXPECTED,
    latent_dims=LATENT_DIMS_EXPECTED,
    semi_labels=SEMI_LABELS_EXPECTED,
    models=MODELS_EXPECTED,
    evals=EVALS_EXPECTED,
    expected_k_fold: int | None = EXPECTED_K_FOLD,
):
    """Check that `df` covers the expected experiment grid with consistent shapes.

    Three checks are run and printed as a report:

    1. Coverage: every combination of the given networks, latent dims,
       (semi_normals, semi_anomalous) pairs, models and evals has rows, and
       the folds present are exactly ``0..k-1``.
    2. Within-combo shapes: each length column has a single non-null value
       across the folds of one combo.
    3. Cross-model shapes: at fixed (network, latent_dim, semi labels, eval),
       `scores_len` (nulls ignored), `roc_fpr_len` and `prc_prec_len` are
       compared between models. ROC/PRC differences are reported for
       awareness only; binning can legitimately differ across models.

    Args:
        df: Results frame; must contain the columns add_shape_columns reads
            plus "network", "latent_dim", "semi_normals", "semi_anomalous",
            "model", "eval", "fold" and "k_fold_num".
        networks, latent_dims, semi_labels, models, evals: Expected grid axes.
        expected_k_fold: Number of folds per combo. When None it is inferred
            per combo as the largest `k_fold_num` in that combo's rows (0 if
            all are null, which flags the combo as incomplete).

    Returns:
        dict with the raw findings under "missing", "incomplete",
        "shape_inconsistent" and "cross_model_diffs" (each a list of dicts).
    """
    dfx = add_shape_columns(df)

    missing = []
    incomplete = []  # (combo, expected_folds, present_folds)
    shape_inconsistent = []  # (combo, metric_name, values_by_fold)
    cross_model_diffs = []  # (net, lat, semi, ev, metric_name, shapes_by_model)

    # 1) Coverage + within-combo shapes
    for net, lat, (s_norm, s_anom), mdl, ev in product(
        networks, latent_dims, semi_labels, models, evals
    ):
        combo = dict(
            network=net,
            latent_dim=lat,
            semi_normals=s_norm,
            semi_anomalous=s_anom,
            model=mdl,
            eval=ev,
        )
        sf = _rows_matching(dfx, **combo).select("fold", "k_fold_num", *_SHAPE_COLS)

        if sf.height == 0:
            missing.append(combo)
            continue

        # folds present vs expected; a null fold sorts first and is reported
        # as a mismatch instead of crashing the sort.
        folds_present = _sorted_nulls_first(sf.get_column("fold").unique().to_list())
        if expected_k_fold is not None:
            kexp = expected_k_fold
        else:
            # infer from rows (take max k_fold_num within this combo)
            k_max = sf.get_column("k_fold_num").max()
            kexp = int(k_max) if k_max is not None else 0
        all_expected_folds = list(range(kexp))
        if folds_present != all_expected_folds:
            incomplete.append(
                {
                    **combo,
                    "expected_folds": all_expected_folds,
                    "present_folds": folds_present,
                }
            )

        # shape consistency across folds (for this combo)
        for colname in _SHAPE_COLS:
            # Nulls are allowed (e.g., deepsad-only fields for other models);
            # only more than one distinct non-null length is a mismatch.
            distinct = {
                v for v in sf.get_column(colname).to_list() if v is not None
            }
            if len(distinct) > 1:
                # store per-fold values to help debug
                per_fold = (
                    sf.select("fold", pl.col(colname))
                    .sort("fold")
                    .to_dict(as_series=False)
                )
                shape_inconsistent.append(
                    {**combo, "metric": colname, "per_fold": per_fold}
                )

    # 2) Cross-model comparability at fixed (net,lat,semi,eval)
    # scores_len should match across models (same number of test samples).
    # idxs/labels/vmask are filled by deepsad only, so they are not compared.
    # ROC/PRC binning can differ across models; those diffs are reported for awareness.
    # (metric, ignore_nulls_when_comparing)
    compared_metrics = (
        ("scores_len", True),
        ("roc_fpr_len", False),
        ("prc_prec_len", False),
    )
    base_keys = (
        df.select("network", "latent_dim", "semi_normals", "semi_anomalous", "eval")
        .unique(maintain_order=True)  # deterministic report order
        .iter_rows()
    )
    for net, lat, s_norm, s_anom, ev in base_keys:
        base = dict(
            network=net,
            latent_dim=lat,
            semi_normals=s_norm,
            semi_anomalous=s_anom,
            eval=ev,
        )
        by_model = (
            _rows_matching(dfx, **base)
            .group_by("model", maintain_order=True)
            .agg(
                *[
                    pl.col(metric).unique().alias(f"{metric}_set")
                    for metric, _ in compared_metrics
                ]
            )
            .to_dict(as_series=False)
        )
        mdls = by_model["model"]
        if not mdls:
            continue

        for metric, ignore_nulls in compared_metrics:
            sets = [set(x) for x in by_model[f"{metric}_set"]]
            # Compare as frozensets: same equality as sorted tuples, but
            # never orders None against an int.
            if ignore_nulls:
                signatures = {
                    frozenset(v for v in s if v is not None) for s in sets
                }
            else:
                signatures = {frozenset(s) for s in sets}
            if len(signatures) > 1:
                cross_model_diffs.append(
                    {
                        **base,
                        "metric": metric,
                        "by_model": {
                            m: _sorted_nulls_first(s) for m, s in zip(mdls, sets)
                        },
                    }
                )

    # --- Print a readable report ---
    print("\n=== GRID COVERAGE ===")
    print(f"Missing combos: {len(missing)}")
    for m in missing[:20]:
        print(" ", m)
    _print_overflow(len(missing), 20)

    print("\nIncomplete combos (folds missing):", len(incomplete))
    for inc in incomplete[:20]:
        print(
            " ",
            {k: inc[k] for k in _COMBO_KEYS},
            "expected",
            inc["expected_folds"],
            "present",
            inc["present_folds"],
        )
    _print_overflow(len(incomplete), 20)

    print("\n=== WITHIN-COMBO SHAPE CONSISTENCY (across folds) ===")
    print(f"Mismatching groups: {len(shape_inconsistent)}")
    for s in shape_inconsistent[:15]:
        hdr = {k: s[k] for k in (*_COMBO_KEYS, "metric")}
        print(" ", hdr, "values:", s["per_fold"])
    _print_overflow(len(shape_inconsistent), 15)

    print("\n=== CROSS-MODEL COMPARABILITY (by shape) ===")
    print(
        f"Shape differences across models at fixed (net,lat,semi,eval): {len(cross_model_diffs)}"
    )
    cross_keys = (
        "network",
        "latent_dim",
        "semi_normals",
        "semi_anomalous",
        "eval",
        "metric",
    )
    for s in cross_model_diffs[:15]:
        hdr = {k: s[k] for k in cross_keys}
        print(" ", hdr, "by_model:", s["by_model"])
    _print_overflow(len(cross_model_diffs), 15)

    # Return the raw details if you want to use them programmatically
    return {
        "missing": missing,
        "incomplete": incomplete,
        "shape_inconsistent": shape_inconsistent,
        "cross_model_diffs": cross_model_diffs,
    }
|
||||
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Config you can tweak
|
||||
# ------------------------------------------------------------
|
||||
@@ -386,6 +60,37 @@ SCHEMA_STATIC = {
|
||||
"test_time": pl.Float64,
|
||||
"folder": pl.Utf8,
|
||||
"k_fold_num": pl.Int32,
|
||||
"config_json": pl.Utf8, # full config.json as string (for reference)
|
||||
}
|
||||
|
||||
# Pretraining-only (AE) schema — lighter defaults.
# Column name -> polars dtype for the AE pretraining results frame; passed as
# `schema=` when that frame is constructed, so key order is the column order.
PRETRAIN_SCHEMA = {
    # identifiers / dims
    "network": pl.Utf8,  # e.g. "LeNet", "efficient"
    "latent_dim": pl.Int32,
    "semi_normals": pl.Int32,
    "semi_anomalous": pl.Int32,
    "model": pl.Utf8,  # always "ae"
    "fold": pl.Int32,
    "split": pl.Utf8,  # "train" | "test"
    # timings and optimization
    "time": pl.Float64,
    "loss": pl.Float64,
    # per-sample arrays (as lists)
    "indices": pl.List(pl.Int32),
    "labels_exp_based": pl.List(pl.Int32),
    "labels_manual_based": pl.List(pl.Int32),
    "semi_targets": pl.List(pl.Int32),
    "file_ids": pl.List(pl.Int32),
    "frame_ids": pl.List(pl.Int32),
    "scores": pl.List(pl.Float32),  # Float32 to match source and save space
    # file id -> name mapping from the result dict
    "file_names": pl.List(pl.Struct({"file_id": pl.Int32, "name": pl.Utf8})),
    # housekeeping
    "folder": pl.Utf8,
    "k_fold_num": pl.Int32,
    "config_json": pl.Utf8,  # full config.json as string (for reference)
}
|
||||
|
||||
|
||||
@@ -406,6 +111,33 @@ def _tolist(x):
|
||||
return None
|
||||
|
||||
|
||||
def normalize_float_list(a) -> Optional[List[float]]:
    """Convert a sequence or numpy array to a list of Python floats.

    Returns None when `a` is None. None elements are kept as None; every
    other element is passed through `float()`.
    """
    if a is None:
        return None
    # ndarray -> plain Python list first so elements are native scalars
    values = a.tolist() if isinstance(a, np.ndarray) else a
    converted = []
    for item in values:
        converted.append(item if item is None else float(item))
    return converted
|
||||
|
||||
|
||||
def normalize_file_names(d) -> Optional[List[dict]]:
    """
    Turn a 'file_names' mapping (id-like key -> name) into a list of
    {"file_id": int, "name": str} records ordered by file_id.

    Returns None when `d` is not a dict. Keys that cannot be cast with
    `int()` are skipped (best-effort; keys may be numpy integers).
    """
    if not isinstance(d, dict):
        return None

    records: List[dict] = []
    for raw_id, raw_name in d.items():
        try:
            numeric_id = int(raw_id)
        except Exception:
            # best-effort cast: drop entries whose key is not int-like
            continue
        records.append({"file_id": numeric_id, "name": str(raw_name)})

    return sorted(records, key=lambda rec: rec["file_id"])
|
||||
|
||||
|
||||
def normalize_roc(obj: Any) -> Optional[dict]:
|
||||
if obj is None:
|
||||
return None
|
||||
@@ -597,7 +329,7 @@ def rows_from_ocsvm_default(data: dict, evals: List[str]) -> Dict[str, dict]:
|
||||
# ------------------------------------------------------------
|
||||
# Build the Polars DataFrame
|
||||
# ------------------------------------------------------------
|
||||
def build_results_frame(root: Path) -> pl.DataFrame:
|
||||
def load_results_dataframe(root: Path, allow_cache: bool = True) -> pl.DataFrame:
|
||||
"""
|
||||
Walks experiment subdirs under `root`. For each (model, fold) it adds rows:
|
||||
Columns (SCHEMA_STATIC):
|
||||
@@ -609,12 +341,23 @@ def build_results_frame(root: Path) -> pl.DataFrame:
|
||||
train_time, test_time,
|
||||
folder, k_fold_num
|
||||
"""
|
||||
if allow_cache:
|
||||
cache = root / "results_cache.parquet"
|
||||
if cache.exists():
|
||||
try:
|
||||
df = pl.read_parquet(cache)
|
||||
print(f"[info] loaded cached results frame from {cache}")
|
||||
return df
|
||||
except Exception as e:
|
||||
print(f"[warn] failed to load cache {cache}: {e}")
|
||||
|
||||
rows: List[dict] = []
|
||||
|
||||
exp_dirs = [p for p in root.iterdir() if p.is_dir()]
|
||||
for exp_dir in sorted(exp_dirs):
|
||||
try:
|
||||
cfg = read_config(exp_dir)
|
||||
cfg_json = json.dumps(cfg, sort_keys=True)
|
||||
except Exception as e:
|
||||
print(f"[warn] skipping {exp_dir.name}: {e}")
|
||||
continue
|
||||
@@ -668,6 +411,7 @@ def build_results_frame(root: Path) -> pl.DataFrame:
|
||||
"test_time": vals["test_time"],
|
||||
"folder": str(exp_dir),
|
||||
"k_fold_num": k,
|
||||
"config_json": cfg_json,
|
||||
}
|
||||
)
|
||||
|
||||
@@ -687,73 +431,166 @@ def build_results_frame(root: Path) -> pl.DataFrame:
|
||||
# NOTE: no cast on 'scores' here; it's already List(Struct) per schema.
|
||||
)
|
||||
|
||||
if allow_cache:
|
||||
try:
|
||||
df.write_parquet(cache)
|
||||
print(f"[info] cached results frame to {cache}")
|
||||
except Exception as e:
|
||||
print(f"[warn] failed to write cache {cache}: {e}")
|
||||
|
||||
return df
|
||||
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Example “analysis-ready” queries (Polars idioms)
|
||||
# ------------------------------------------------------------
|
||||
def demo_queries(df: pl.DataFrame):
    """Run three example queries against the results frame.

    Returns:
        (q1, q2, roc_subset):
        q1 is the mean AUC per model for one fixed configuration, sorted
        descending; q2 is the deepsad AUC pivoted to fold x latent_dim;
        roc_subset is the per-fold ROC curve of one ocsvm configuration with
        the struct fields exposed as `fpr`, `tpr` and `thr` columns.
    """
    # q1: lazy is fine, then collect
    q1 = (
        df.lazy()
        .filter(
            (pl.col("network") == "LeNet")
            & (pl.col("latent_dim") == 1024)
            & (pl.col("semi_normals") == 0)
            & (pl.col("semi_anomalous") == 0)
            & (pl.col("eval") == "exp_based")
        )
        .group_by(["model"])
        .agg(pl.col("auc").mean().alias("mean_auc"))
        .sort(["mean_auc"], descending=True)
        .collect()
    )

    # q2: do the filtering eagerly, then pivot (LazyFrame has no .pivot)
    base = df.filter(
        (pl.col("model") == "deepsad")
        & (pl.col("eval") == "exp_based")
        & (pl.col("network") == "LeNet")
        & (pl.col("semi_normals") == 0)
        & (pl.col("semi_anomalous") == 0)
    ).select("fold", "latent_dim", "auc")
    q2 = base.pivot(
        values="auc",
        index="fold",
        columns="latent_dim",
        aggregate_function="first",  # or "mean" if duplicates exist
    ).sort("fold")

    # roc_subset: eager filter/select, then expose the struct fields as columns
    roc_subset = (
        df.filter(
            (pl.col("model") == "ocsvm")
            & (pl.col("eval") == "manual_based")
            & (pl.col("network") == "efficient")
            & (pl.col("latent_dim") == 1024)
            & (pl.col("semi_normals") == 0)
            & (pl.col("semi_anomalous") == 0)
        )
        .select("fold", "roc_curve")
        .with_columns(
            pl.col("roc_curve").struct.field("fpr").alias("fpr"),
            pl.col("roc_curve").struct.field("tpr").alias("tpr"),
            pl.col("roc_curve").struct.field("thr").alias("thr"),
        )
    )

    return q1, q2, roc_subset


def load_pretraining_results_dataframe(
    root: Path,
    allow_cache: bool = True,
    include_train: bool = False,  # default: store only TEST to keep cache tiny
    keep_file_names: bool = False,  # drop file_names by default; they're repeated
    parquet_compression: str = "zstd",
    parquet_compression_level: int = 7,  # stronger compression than default
) -> pl.DataFrame:
    """
    Loads only AE pretraining results: files named `results_ae_<fold>.pkl`.

    Produces one row per (experiment, fold, split). By default we:
      - include only the TEST split (include_train=False)
      - store scores as Float32
      - drop the repeated file_names mapping to save space
      - write Parquet with zstd(level=7)

    Experiment directories whose config cannot be read, or lacks a usable
    `latent_space_dim` / `num_known_normal` / `num_known_outlier` /
    `k_fold_num`, are skipped with a warning. Unreadable pickles and missing
    folds/splits are skipped as well.

    NOTE: the cache at `root / "pretraining_results_cache.parquet"` is
    returned as-is when present; it is not keyed on `include_train` or
    `keep_file_names`, so delete it (or pass allow_cache=False) after
    changing those options.

    Returns:
        Frame with the PRETRAIN_SCHEMA columns (empty, same schema, when no
        rows were found).
    """
    cache = root / "pretraining_results_cache.parquet"

    if allow_cache and cache.exists():
        try:
            df = pl.read_parquet(cache)
            print(f"[info] loaded cached pretraining frame from {cache}")
            return df
        except Exception as e:
            print(f"[warn] failed to load pretraining cache {cache}: {e}")

    def opt_float(value):
        # Scalar -> float, keeping a missing value as None.
        return None if value is None else float(value)

    # Only test split by default (include_train=False)
    splits = ("train", "test") if include_train else ("test",)

    rows: List[dict] = []
    for exp_dir in sorted(p for p in root.iterdir() if p.is_dir()):
        try:
            cfg = read_config(exp_dir)
            cfg_json = json.dumps(cfg, sort_keys=True)
        except Exception as e:
            print(f"[warn] skipping {exp_dir.name} (pretraining): {e}")
            continue

        try:
            network = cfg.get("net_name")
            latent_dim = int(cfg.get("latent_space_dim"))
            semi_normals = int(cfg.get("num_known_normal"))
            semi_anomalous = int(cfg.get("num_known_outlier"))
            k = int(cfg.get("k_fold_num"))
        except (TypeError, ValueError, AttributeError) as e:
            # A missing/malformed config field must not abort the whole load.
            print(f"[warn] skipping {exp_dir.name} (pretraining): {e}")
            continue

        for fold in range(k):
            pkl = exp_dir / f"results_ae_{fold}.pkl"
            if not pkl.exists():
                continue

            try:
                data = read_pickle(pkl)  # expected: {"train": {...}, "test": {...}}
            except Exception as e:
                print(f"[warn] failed to read {pkl.name}: {e}")
                continue

            for split in splits:
                splitd = data.get(split)
                if not isinstance(splitd, dict):
                    continue

                rows.append(
                    {
                        "network": network,
                        "latent_dim": latent_dim,
                        "semi_normals": semi_normals,
                        "semi_anomalous": semi_anomalous,
                        "model": "ae",
                        "fold": fold,
                        "split": split,
                        "time": opt_float(splitd.get("time")),
                        "loss": opt_float(splitd.get("loss")),
                        # ints as Int32, scores as Float32 to save space
                        "indices": normalize_int_list(splitd.get("indices")),
                        "labels_exp_based": normalize_int_list(
                            splitd.get("labels_exp_based")
                        ),
                        "labels_manual_based": normalize_int_list(
                            splitd.get("labels_manual_based")
                        ),
                        "semi_targets": normalize_int_list(splitd.get("semi_targets")),
                        "file_ids": normalize_int_list(splitd.get("file_ids")),
                        "frame_ids": normalize_int_list(splitd.get("frame_ids")),
                        # shared helper: handles ndarray input and None elements
                        "scores": normalize_float_list(splitd.get("scores")),
                        "file_names": normalize_file_names(splitd.get("file_names"))
                        if keep_file_names
                        else None,
                        "folder": str(exp_dir),
                        "k_fold_num": k,
                        "config_json": cfg_json,
                    }
                )

    if not rows:
        return pl.DataFrame(schema=PRETRAIN_SCHEMA)

    df = pl.DataFrame(rows, schema=PRETRAIN_SCHEMA)

    # Cast/optimize a bit (categoricals, ints, floats)
    df = df.with_columns(
        pl.col("network", "model", "split").cast(pl.Categorical),
        pl.col(
            "latent_dim", "semi_normals", "semi_anomalous", "fold", "k_fold_num"
        ).cast(pl.Int32),
        pl.col("time", "loss").cast(pl.Float64),
        pl.col("scores").cast(pl.List(pl.Float32)),  # ensure downcast took
    )

    if allow_cache:
        try:
            df.write_parquet(
                cache,
                compression=parquet_compression,
                compression_level=parquet_compression_level,
                statistics=True,
            )
            print(
                f"[info] cached pretraining frame to {cache} "
                f"({parquet_compression}, level={parquet_compression_level})"
            )
        except Exception as e:
            print(f"[warn] failed to write pretraining cache {cache}: {e}")

    return df
|
||||
|
||||
|
||||
def main():
|
||||
root = Path("/home/fedex/mt/results/done")
|
||||
df = build_results_frame(root)
|
||||
q1, q2, roc_subset = demo_queries(df)
|
||||
df = load_results_dataframe(root, allow_cache=True)
|
||||
print(df.shape, df.head())
|
||||
# --- run it ---
|
||||
report = check_grid_coverage_and_shapes(df)
|
||||
print(report)
|
||||
|
||||
df_pre = load_pretraining_results_dataframe(root, allow_cache=True)
|
||||
print("pretraining:", df_pre.shape, df_pre.head())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user