update
This commit is contained in:
@@ -3,10 +3,12 @@ from __future__ import annotations
|
||||
import json
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
from diff_df import recursive_diff_frames
|
||||
from polars.testing import assert_frame_equal
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Config you can tweak
|
||||
@@ -75,7 +77,8 @@ PRETRAIN_SCHEMA = {
|
||||
"fold": pl.Int32,
|
||||
"split": pl.Utf8, # "train" | "test"
|
||||
# timings and optimization
|
||||
"time": pl.Float64,
|
||||
"train_time": pl.Float64,
|
||||
"test_time": pl.Float64,
|
||||
"loss": pl.Float64,
|
||||
# per-sample arrays (as lists)
|
||||
"indices": pl.List(pl.Int32),
|
||||
@@ -247,6 +250,14 @@ def read_pickle(p: Path) -> Any:
|
||||
# ------------------------------------------------------------
|
||||
# Extractors for each model
|
||||
# ------------------------------------------------------------
|
||||
|
||||
counting = {
|
||||
(label_method, eval_method): []
|
||||
for label_method in ["exp_based", "manual_based"]
|
||||
for eval_method in ["roc", "prc"]
|
||||
}
|
||||
|
||||
|
||||
def rows_from_deepsad(data: dict, evals: List[str]) -> Dict[str, dict]:
|
||||
"""
|
||||
deepsad under data['test'][eval], with extra per-eval arrays and AP present.
|
||||
@@ -257,6 +268,8 @@ def rows_from_deepsad(data: dict, evals: List[str]) -> Dict[str, dict]:
|
||||
evd = test.get(ev)
|
||||
if not isinstance(evd, dict):
|
||||
continue
|
||||
counting[(ev, "roc")].append(len(evd["roc"][0]))
|
||||
counting[(ev, "prc")].append(len(evd["prc"][0]))
|
||||
out[ev] = {
|
||||
"auc": float(evd["auc"])
|
||||
if "auc" in evd and evd["auc"] is not None
|
||||
@@ -444,7 +457,6 @@ def load_results_dataframe(root: Path, allow_cache: bool = True) -> pl.DataFrame
|
||||
def load_pretraining_results_dataframe(
|
||||
root: Path,
|
||||
allow_cache: bool = True,
|
||||
include_train: bool = False, # <— default: store only TEST to keep cache tiny
|
||||
keep_file_names: bool = False, # <— drop file_names by default; they’re repeated
|
||||
parquet_compression: str = "zstd",
|
||||
parquet_compression_level: int = 7, # <— stronger compression than default
|
||||
@@ -484,9 +496,6 @@ def load_pretraining_results_dataframe(
|
||||
semi_anomalous = int(cfg.get("num_known_outlier"))
|
||||
k = int(cfg.get("k_fold_num"))
|
||||
|
||||
# Only test split by default (include_train=False)
|
||||
splits = ("train", "test") if include_train else ("test",)
|
||||
|
||||
for fold in range(k):
|
||||
pkl = exp_dir / f"results_ae_{fold}.pkl"
|
||||
if not pkl.exists():
|
||||
@@ -498,57 +507,53 @@ def load_pretraining_results_dataframe(
|
||||
print(f"[warn] failed to read {pkl.name}: {e}")
|
||||
continue
|
||||
|
||||
for split in splits:
|
||||
splitd = data.get(split)
|
||||
if not isinstance(splitd, dict):
|
||||
continue
|
||||
train_time = data.get("train", {}).get("time")
|
||||
data = data.get("test", {})
|
||||
|
||||
rows.append(
|
||||
{
|
||||
"network": network,
|
||||
"latent_dim": latent_dim,
|
||||
"semi_normals": semi_normals,
|
||||
"semi_anomalous": semi_anomalous,
|
||||
"model": "ae",
|
||||
"fold": fold,
|
||||
"split": split,
|
||||
"time": float(splitd.get("time"))
|
||||
if splitd.get("time") is not None
|
||||
else None,
|
||||
"loss": float(splitd.get("loss"))
|
||||
if splitd.get("loss") is not None
|
||||
else None,
|
||||
# ints as Int32, scores as Float32 to save space
|
||||
"indices": normalize_int_list(splitd.get("indices")),
|
||||
"labels_exp_based": normalize_int_list(
|
||||
splitd.get("labels_exp_based")
|
||||
),
|
||||
"labels_manual_based": normalize_int_list(
|
||||
splitd.get("labels_manual_based")
|
||||
),
|
||||
"semi_targets": normalize_int_list(splitd.get("semi_targets")),
|
||||
"file_ids": normalize_int_list(splitd.get("file_ids")),
|
||||
"frame_ids": normalize_int_list(splitd.get("frame_ids")),
|
||||
"scores": (
|
||||
None
|
||||
if splitd.get("scores") is None
|
||||
else [
|
||||
float(x)
|
||||
for x in (
|
||||
splitd["scores"].tolist()
|
||||
if isinstance(splitd["scores"], np.ndarray)
|
||||
else splitd["scores"]
|
||||
)
|
||||
]
|
||||
),
|
||||
"file_names": normalize_file_names(splitd.get("file_names"))
|
||||
if keep_file_names
|
||||
else None,
|
||||
"folder": str(exp_dir),
|
||||
"k_fold_num": k,
|
||||
"config_json": cfg_json,
|
||||
}
|
||||
)
|
||||
rows.append(
|
||||
{
|
||||
"network": network,
|
||||
"latent_dim": latent_dim,
|
||||
"semi_normals": semi_normals,
|
||||
"semi_anomalous": semi_anomalous,
|
||||
"model": "ae",
|
||||
"fold": fold,
|
||||
"train_time": train_time,
|
||||
"test_time": data.get("time"),
|
||||
"loss": float(data.get("loss"))
|
||||
if data.get("loss") is not None
|
||||
else None,
|
||||
# ints as Int32, scores as Float32 to save space
|
||||
"indices": normalize_int_list(data.get("indices")),
|
||||
"labels_exp_based": normalize_int_list(
|
||||
data.get("labels_exp_based")
|
||||
),
|
||||
"labels_manual_based": normalize_int_list(
|
||||
data.get("labels_manual_based")
|
||||
),
|
||||
"semi_targets": normalize_int_list(data.get("semi_targets")),
|
||||
"file_ids": normalize_int_list(data.get("file_ids")),
|
||||
"frame_ids": normalize_int_list(data.get("frame_ids")),
|
||||
"scores": (
|
||||
None
|
||||
if data.get("scores") is None
|
||||
else [
|
||||
float(x)
|
||||
for x in (
|
||||
data["scores"].tolist()
|
||||
if isinstance(data["scores"], np.ndarray)
|
||||
else data["scores"]
|
||||
)
|
||||
]
|
||||
),
|
||||
"file_names": normalize_file_names(data.get("file_names"))
|
||||
if keep_file_names
|
||||
else None,
|
||||
"folder": str(exp_dir),
|
||||
"k_fold_num": k,
|
||||
"config_json": cfg_json,
|
||||
}
|
||||
)
|
||||
|
||||
if not rows:
|
||||
return pl.DataFrame(schema=PRETRAIN_SCHEMA)
|
||||
@@ -561,7 +566,7 @@ def load_pretraining_results_dataframe(
|
||||
pl.col(
|
||||
"latent_dim", "semi_normals", "semi_anomalous", "fold", "k_fold_num"
|
||||
).cast(pl.Int32),
|
||||
pl.col("time", "loss").cast(pl.Float64),
|
||||
pl.col("test_time", "train_time", "loss").cast(pl.Float64),
|
||||
pl.col("scores").cast(pl.List(pl.Float32)), # ensure downcast took
|
||||
)
|
||||
|
||||
@@ -585,12 +590,53 @@ def load_pretraining_results_dataframe(
|
||||
|
||||
|
||||
def main():
|
||||
root = Path("/home/fedex/mt/results/done")
|
||||
df = load_results_dataframe(root, allow_cache=True)
|
||||
print(df.shape, df.head())
|
||||
root = Path("/home/fedex/mt/results/copy")
|
||||
df1 = load_results_dataframe(root, allow_cache=True)
|
||||
exit(0)
|
||||
|
||||
df_pre = load_pretraining_results_dataframe(root, allow_cache=True)
|
||||
print("pretraining:", df_pre.shape, df_pre.head())
|
||||
retest_root = Path("/home/fedex/mt/results/copy/retest_nodrop")
|
||||
df2 = load_results_dataframe(retest_root, allow_cache=False).drop("folder")
|
||||
|
||||
# exact schema & shape first (optional but helpful messages)
|
||||
assert df1.shape == df2.shape, f"Shape differs: {df1.shape} vs {df2.shape}"
|
||||
assert set(df1.columns) == set(df2.columns), (
|
||||
f"Column sets differ: {df1.columns} vs {df2.columns}"
|
||||
)
|
||||
|
||||
# allow small float diffs, ignore column order differences if you want
|
||||
df1_sorted = df1.select(sorted(df1.columns))
|
||||
df2_sorted = df2.select(sorted(df2.columns))
|
||||
|
||||
# Optionally pre-align/sort both frames by a stable key before diffing.
|
||||
summary, leaves = recursive_diff_frames(
|
||||
df1,
|
||||
df2,
|
||||
ignore=["timestamp"], # columns to ignore
|
||||
float_atol=0.1, # absolute tolerance for floats
|
||||
float_rtol=0.0, # relative tolerance for floats
|
||||
max_rows_per_column=20, # limit expansion per column
|
||||
max_leafs_per_row=200, # cap leaves per row
|
||||
)
|
||||
|
||||
pl.Config.set_fmt_table_cell_list_len(100)
|
||||
pl.Config.set_tbl_rows(100)
|
||||
|
||||
print(summary) # which columns differ & how many rows
|
||||
print(leaves) # exact nested paths + scalar diffs
|
||||
|
||||
# check_exact=False lets us use atol/rtol for floats
|
||||
assert_frame_equal(
|
||||
df1_sorted,
|
||||
df2_sorted,
|
||||
check_exact=False,
|
||||
atol=0.1, # absolute tolerance for floats
|
||||
rtol=0.0, # relative tolerance (set if you want % based)
|
||||
check_dtypes=True, # set False if you only care about values
|
||||
)
|
||||
print("DataFrames match within tolerance ✅")
|
||||
|
||||
# df_pre = load_pretraining_results_dataframe(root, allow_cache=True)
|
||||
# print("pretraining:", df_pre.shape, df_pre.head())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user