wip inference

This commit is contained in:
Jan Kowalczyk
2025-09-15 11:21:30 +02:00
parent e4b298cf06
commit e7624d2786
8 changed files with 1027 additions and 35 deletions

View File

@@ -96,6 +96,21 @@ PRETRAIN_SCHEMA = {
"config_json": pl.Utf8, # full config.json as string (for reference)
}
# Column layout of the frame built by load_inference_results_dataframe():
# one row per (experiment, model) score file found in an "inference" folder.
SCHEMA_INFERENCE = {
    # identifiers / dims
    "experiment": pl.Utf8,  # e.g. "2_static_no_artifacts_illuminated_2023-01-23-001"
    "network": pl.Utf8,  # config "net_name", e.g. "LeNet", "efficient"
    "latent_dim": pl.Int32,  # config "latent_space_dim"
    "semi_normals": pl.Int32,  # config "num_known_normal"
    "semi_anomalous": pl.Int32,  # config "num_known_outlier"
    "model": pl.Utf8,  # "deepsad" | "isoforest" | "ocsvm"
    # raw model output
    "scores": pl.List(pl.Float64),  # contents of {experiment}_{model}_scores.npy as a list
    # housekeeping (provenance of the row)
    "folder": pl.Utf8,  # path of the folder the row was read from
    "config_json": pl.Utf8,  # full config.json as string (for reference)
}
# ------------------------------------------------------------
# Helpers: curve/scores normalizers (tuples/ndarrays -> dict/list)
@@ -233,11 +248,11 @@ def normalize_bool_list(a) -> Optional[List[bool]]:
# ------------------------------------------------------------
# Low-level: read one experiment folder
# ------------------------------------------------------------
def read_config(exp_dir: Path, k_fold_required: bool = True) -> dict:
    """Load and return the ``config.json`` of one experiment folder.

    Args:
        exp_dir: Experiment folder that contains ``config.json``.
        k_fold_required: When True (the default), reject configs whose
            ``k_fold`` entry is missing or falsy. Pass False for folders
            that are not expected to come from a k-fold training run.

    Returns:
        The parsed configuration as a dict.

    Raises:
        ValueError: If ``k_fold_required`` is True and the config was not
            produced by a k-fold run.
        OSError: If ``config.json`` cannot be opened (e.g. it is missing).
        json.JSONDecodeError: If the file is not valid JSON.
    """
    cfg_path = exp_dir / "config.json"
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with cfg_path.open("r", encoding="utf-8") as f:
        config = json.load(f)
    if k_fold_required and not config.get("k_fold"):
        raise ValueError(f"{exp_dir.name}: not trained as k-fold")
    return config
@@ -589,7 +604,129 @@ def load_pretraining_results_dataframe(
return df
def _iter_inference_rows(exp_dir: Path, models: List[str]):
    """Yield one result row (dict keyed like SCHEMA_INFERENCE) per score file.

    Reads ``exp_dir/config.json`` and then every
    ``exp_dir/inference/{experiment}_{model}_scores.npy`` for the requested
    models. Missing folders/files and unreadable score files are reported
    with a ``[warn]`` message and skipped; config errors propagate to the
    caller.
    """
    # Inference folders are not required to come from a k-fold run.
    cfg = read_config(exp_dir, k_fold_required=False)
    cfg_json = json.dumps(cfg, sort_keys=True)

    network = cfg.get("net_name")
    # A missing key makes int(None) raise TypeError; the caller reports the
    # error and skips the whole folder.
    latent_dim = int(cfg.get("latent_space_dim"))
    semi_normals = int(cfg.get("num_known_normal"))
    semi_anomalous = int(cfg.get("num_known_outlier"))

    inference_dir = exp_dir / "inference"
    if not inference_dir.exists():
        print(f"[warn] no inference directory for {exp_dir.name}")
        return

    score_files = list(inference_dir.glob("*_scores.npy"))
    if not score_files:
        print(f"[warn] no score files in {inference_dir}")
        return

    # File name format: {experiment}_{model}_scores.npy. Splitting off the
    # last two "_" fields only recovers the experiment name when the model
    # name itself contains no underscore.
    experiments = {f.stem.rsplit("_", 2)[0] for f in score_files}

    for experiment in sorted(experiments):
        for model in models:
            score_file = inference_dir / f"{experiment}_{model}_scores.npy"
            if not score_file.exists():
                print(f"[warn] missing score file for {experiment}, {model}")
                continue
            try:
                scores = np.load(score_file)
                row = {
                    "experiment": experiment,
                    "network": network,
                    "latent_dim": latent_dim,
                    "semi_normals": semi_normals,
                    "semi_anomalous": semi_anomalous,
                    "model": model,
                    "scores": scores.tolist(),
                    "folder": str(exp_dir),
                    "config_json": cfg_json,
                }
            except Exception as e:
                print(f"[warn] failed to load scores for {experiment}, {model}: {e}")
                continue
            yield row


def load_inference_results_dataframe(
    root: Path,
    allow_cache: bool = True,
    models: List[str] = MODELS,
) -> pl.DataFrame:
    """Load inference results from experiment folders.

    Args:
        root: Path to root directory containing experiment folders
        allow_cache: Whether to use/create the cache file
            ``root/inference_results_cache.parquet``. An existing cache is
            returned as-is, regardless of ``models``.
        models: List of models to look for scores

    Returns:
        pl.DataFrame: One row per (experiment, model) score file, with the
        columns of SCHEMA_INFERENCE; ``experiment``, ``network`` and
        ``model`` are cast to Categorical. The dtypes are the same whether
        or not any rows were found.
    """
    cache = root / "inference_results_cache.parquet"
    if allow_cache and cache.exists():
        try:
            df = pl.read_parquet(cache)
            print(f"[info] loaded cached inference frame from {cache}")
            return df
        except Exception as e:
            # Best effort: an unreadable cache falls through to a rebuild.
            print(f"[warn] failed to load inference cache {cache}: {e}")

    rows: List[dict] = []
    for exp_dir in sorted(p for p in root.iterdir() if p.is_dir()):
        try:
            # Append one by one so rows produced before a failure are kept.
            for row in _iter_inference_rows(exp_dir, models):
                rows.append(row)
        except Exception as e:
            print(f"[warn] skipping {exp_dir.name}: {e}")

    if rows:
        df = pl.DataFrame(rows, schema=SCHEMA_INFERENCE)
    else:
        df = pl.DataFrame(schema=SCHEMA_INFERENCE)

    # Apply the same dtype optimisation to empty and non-empty results so
    # callers always see one schema. The integer columns are already Int32
    # through SCHEMA_INFERENCE and need no further cast.
    df = df.with_columns(
        pl.col("experiment", "network", "model").cast(pl.Categorical),
    )

    # An empty result is not cached, so a later call rescans the folders.
    if allow_cache and rows:
        try:
            df.write_parquet(cache)
            print(f"[info] cached inference frame to {cache}")
        except Exception as e:
            print(f"[warn] failed to write cache {cache}: {e}")
    return df
def main():
inference_root = Path("/home/fedex/mt/results/inference/copy")
df_inference = load_inference_results_dataframe(inference_root, allow_cache=True)
exit(0)
root = Path("/home/fedex/mt/results/copy")
df1 = load_results_dataframe(root, allow_cache=True)
exit(0)