Jan Kowalczyk
2025-09-10 19:41:00 +02:00
parent ef0c36eed5
commit cf15d5501e
17 changed files with 1198 additions and 720 deletions


@@ -3,10 +3,12 @@ from __future__ import annotations
import json
import pickle
from pathlib import Path
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
import polars as pl
from diff_df import recursive_diff_frames
from polars.testing import assert_frame_equal
# ------------------------------------------------------------
# Config you can tweak
@@ -75,7 +77,8 @@ PRETRAIN_SCHEMA = {
"fold": pl.Int32,
"split": pl.Utf8, # "train" | "test"
# timings and optimization
"time": pl.Float64,
"train_time": pl.Float64,
"test_time": pl.Float64,
"loss": pl.Float64,
# per-sample arrays (as lists)
"indices": pl.List(pl.Int32),
@@ -247,6 +250,14 @@ def read_pickle(p: Path) -> Any:
# ------------------------------------------------------------
# Extractors for each model
# ------------------------------------------------------------
# debug-only counters: roc/prc curve lengths per (label_method, eval_method)
counting = {
(label_method, eval_method): []
for label_method in ["exp_based", "manual_based"]
for eval_method in ["roc", "prc"]
}
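# Illustrative helper (hypothetical, not commit code): after a full load,
# flag any (label_method, eval_method) pair whose curve lengths disagree
# across experiments.
def summarize_counting() -> None:
    for (label_method, eval_method), lengths in counting.items():
        uniq = sorted(set(lengths))
        if len(uniq) > 1:
            print(f"[warn] {label_method}/{eval_method}: lengths vary: {uniq}")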
def rows_from_deepsad(data: dict, evals: List[str]) -> Dict[str, dict]:
"""
deepsad results live under data['test'][eval], with extra per-eval arrays and AP present.
@@ -257,6 +268,8 @@ def rows_from_deepsad(data: dict, evals: List[str]) -> Dict[str, dict]:
evd = test.get(ev)
if not isinstance(evd, dict):
continue
counting[(ev, "roc")].append(len(evd["roc"][0]))
counting[(ev, "prc")].append(len(evd["prc"][0]))
out[ev] = {
"auc": float(evd["auc"])
if "auc" in evd and evd["auc"] is not None
@@ -444,7 +457,6 @@ def load_results_dataframe(root: Path, allow_cache: bool = True) -> pl.DataFrame
def load_pretraining_results_dataframe(
root: Path,
allow_cache: bool = True,
include_train: bool = False, # <— default: store only TEST to keep cache tiny
keep_file_names: bool = False, # <— drop file_names by default; they're repeated
parquet_compression: str = "zstd",
parquet_compression_level: int = 7, # <— stronger compression than default
@@ -484,9 +496,6 @@ def load_pretraining_results_dataframe(
semi_anomalous = int(cfg.get("num_known_outlier"))
k = int(cfg.get("k_fold_num"))
# Only test split by default (include_train=False)
splits = ("train", "test") if include_train else ("test",)
for fold in range(k):
pkl = exp_dir / f"results_ae_{fold}.pkl"
if not pkl.exists():
@@ -498,57 +507,53 @@ def load_pretraining_results_dataframe(
print(f"[warn] failed to read {pkl.name}: {e}")
continue
for split in splits:
splitd = data.get(split)
if not isinstance(splitd, dict):
continue
# record the train split's wall time, then keep only the test split
train_time = data.get("train", {}).get("time")
data = data.get("test", {})
rows.append(
{
"network": network,
"latent_dim": latent_dim,
"semi_normals": semi_normals,
"semi_anomalous": semi_anomalous,
"model": "ae",
"fold": fold,
"split": split,
"time": float(splitd.get("time"))
if splitd.get("time") is not None
else None,
"loss": float(splitd.get("loss"))
if splitd.get("loss") is not None
else None,
# ints as Int32, scores as Float32 to save space
"indices": normalize_int_list(splitd.get("indices")),
"labels_exp_based": normalize_int_list(
splitd.get("labels_exp_based")
),
"labels_manual_based": normalize_int_list(
splitd.get("labels_manual_based")
),
"semi_targets": normalize_int_list(splitd.get("semi_targets")),
"file_ids": normalize_int_list(splitd.get("file_ids")),
"frame_ids": normalize_int_list(splitd.get("frame_ids")),
"scores": (
None
if splitd.get("scores") is None
else [
float(x)
for x in (
splitd["scores"].tolist()
if isinstance(splitd["scores"], np.ndarray)
else splitd["scores"]
)
]
),
"file_names": normalize_file_names(splitd.get("file_names"))
if keep_file_names
else None,
"folder": str(exp_dir),
"k_fold_num": k,
"config_json": cfg_json,
}
)
rows.append(
{
"network": network,
"latent_dim": latent_dim,
"semi_normals": semi_normals,
"semi_anomalous": semi_anomalous,
"model": "ae",
"fold": fold,
"train_time": train_time,
"test_time": data.get("time"),
"loss": float(data.get("loss"))
if data.get("loss") is not None
else None,
# ints as Int32, scores as Float32 to save space
"indices": normalize_int_list(data.get("indices")),
"labels_exp_based": normalize_int_list(
data.get("labels_exp_based")
),
"labels_manual_based": normalize_int_list(
data.get("labels_manual_based")
),
"semi_targets": normalize_int_list(data.get("semi_targets")),
"file_ids": normalize_int_list(data.get("file_ids")),
"frame_ids": normalize_int_list(data.get("frame_ids")),
"scores": (
None
if data.get("scores") is None
else [
float(x)
for x in (
data["scores"].tolist()
if isinstance(data["scores"], np.ndarray)
else data["scores"]
)
]
),
"file_names": normalize_file_names(data.get("file_names"))
if keep_file_names
else None,
"folder": str(exp_dir),
"k_fold_num": k,
"config_json": cfg_json,
}
)
if not rows:
return pl.DataFrame(schema=PRETRAIN_SCHEMA)
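# Hypothetical helper equivalent to the inline scores conversion above
# (ndarray-or-list to list[float]); the loader inlines the expression, this
# is only illustrative.
def scores_to_list(scores: Any) -> Optional[List[float]]:
    if scores is None:
        return None
    seq = scores.tolist() if isinstance(scores, np.ndarray) else scores
    return [float(x) for x in seq]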
@@ -561,7 +566,7 @@ def load_pretraining_results_dataframe(
pl.col(
"latent_dim", "semi_normals", "semi_anomalous", "fold", "k_fold_num"
).cast(pl.Int32),
pl.col("time", "loss").cast(pl.Float64),
pl.col("test_time", "train_time", "loss").cast(pl.Float64),
pl.col("scores").cast(pl.List(pl.Float32)), # ensure downcast took
)
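# Presumed cache-write path for the compression knobs above (polars'
# write_parquet does accept both parameters; the helper name and cache path
# are assumptions):
def write_cache(df: pl.DataFrame, cache_path: Path,
                compression: str = "zstd", level: int = 7) -> None:
    df.write_parquet(cache_path, compression=compression, compression_level=level)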
@@ -585,12 +590,53 @@ def load_pretraining_results_dataframe(
def main():
root = Path("/home/fedex/mt/results/done")
df = load_results_dataframe(root, allow_cache=True)
print(df.shape, df.head())
root = Path("/home/fedex/mt/results/copy")
df1 = load_results_dataframe(root, allow_cache=True)
exit(0)  # debug short-circuit: the comparison code below is currently unreachable
df_pre = load_pretraining_results_dataframe(root, allow_cache=True)
print("pretraining:", df_pre.shape, df_pre.head())
retest_root = Path("/home/fedex/mt/results/copy/retest_nodrop")
df2 = load_results_dataframe(retest_root, allow_cache=False).drop("folder")
# cheap sanity checks first: exact shape and column sets give clearer failure messages
assert df1.shape == df2.shape, f"Shape differs: {df1.shape} vs {df2.shape}"
assert set(df1.columns) == set(df2.columns), (
f"Column sets differ: {df1.columns} vs {df2.columns}"
)
# sort columns alphabetically so column-order differences are ignored; float tolerances come later
df1_sorted = df1.select(sorted(df1.columns))
df2_sorted = df2.select(sorted(df2.columns))
# Optionally pre-align/sort both frames by a stable key before diffing.
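# One hypothetical stable key (column names are assumptions, not checked
# against load_results_dataframe's actual schema):
# key = ["network", "latent_dim", "fold"]
# df1_sorted = df1_sorted.sort(key)
# df2_sorted = df2_sorted.sort(key)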
summary, leaves = recursive_diff_frames(
df1,
df2,
ignore=["timestamp"], # columns to ignore
float_atol=0.1, # absolute tolerance for floats
float_rtol=0.0, # relative tolerance for floats
max_rows_per_column=20, # limit expansion per column
max_leafs_per_row=200, # cap leaves per row
)
pl.Config.set_fmt_table_cell_list_len(100)
pl.Config.set_tbl_rows(100)
print(summary) # which columns differ & how many rows
print(leaves) # exact nested paths + scalar diffs
# check_exact=False lets us use atol/rtol for floats
assert_frame_equal(
df1_sorted,
df2_sorted,
check_exact=False,
atol=0.1, # absolute tolerance for floats
rtol=0.0, # relative tolerance (set nonzero for percentage-based comparison)
check_dtypes=True, # set False if you only care about values
)
print("DataFrames match within tolerance ✅")
# df_pre = load_pretraining_results_dataframe(root, allow_cache=True)
# print("pretraining:", df_pre.shape, df_pre.head())
if __name__ == "__main__":