#!/usr/bin/env python3
from __future__ import annotations

import json
import shutil
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Optional

import numpy as np
import pandas as pd
import polars as pl

from load_results import (
    load_pretraining_results_dataframe,
    load_results_dataframe,
)


# ----------------------------
# Config
# ----------------------------
RESULTS_ROOT = Path("/home/fedex/mt/results/done")  # folder with experiment subdirs
OUTPUT_DIR = Path("/home/fedex/mt/plots/setup_runtime_tables")  # where the .tex files go

# To prefer a specific network label for baselines in column names, set this to a
# substring to detect (e.g. "efficient"). If None, the network name is kept as-is.
BASELINE_NETWORK_HINT: Optional[str] = None  # e.g., "efficient" or None
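
# The generated .tex files land in OUTPUT_DIR/archive/<timestamp>/ and are mirrored
# to OUTPUT_DIR/latest/, so a LaTeX document can \input a stable path, e.g.
# (illustrative path; adjust to your document layout):
#   \input{plots/setup_runtime_tables/latest/train_runtimes.tex}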


# ----------------------------
# Helpers
# ----------------------------
def _net_label_for_display(net: str | None) -> str:
    s = (net or "").lower()
    if "effic" in s:
        return "Efficient"
    if "lenet" in s:
        return "LeNet"
    return net or ""
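

# Examples (substring match on the lowercased name):
#   _net_label_for_display("my_efficient_net") -> "Efficient"
#   _net_label_for_display("lenet")            -> "LeNet"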


def _fmt_mean_std_n(
    mean: float | None, std: float | None, n: int | None, unit: str = ""
) -> str:
    if mean is None or (isinstance(mean, float) and (np.isnan(mean) or np.isinf(mean))):
        return "-"
    base = f"{mean:.2f}"
    if std is not None and not (
        isinstance(std, float) and (np.isnan(std) or np.isinf(std))
    ):
        base = f"{base} ± {std:.2f}"
    if unit:
        base = f"{base} {unit}"
    if n is not None and n > 0:
        base = f"{base} (n={n})"
    return base
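

# Example: _fmt_mean_std_n(12.3456, 0.789, 5, "s") -> "12.35 ± 0.79 s (n=5)"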


def _fmt_pair(n: int, m: int) -> str:
    """Format a pair of counts as 'n/m' (e.g. semi-labeled normals/anomalies)."""
    return f"{n}/{m}"


def _fmt_mean_std(mean: float | None, std: float | None, n: int | None) -> str:
    if mean is None or (isinstance(mean, float) and (np.isnan(mean) or np.isinf(mean))):
        return "-"
    if std is None or (isinstance(std, float) and (np.isnan(std) or np.isinf(std))):
        return f"{mean:.2f}"
    if n is None or n < 1:
        return f"{mean:.2f} ± {std:.2f}"
    return f"{mean:.2f} ± {std:.2f} (n={n})"
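

# Examples: _fmt_mean_std(12.3456, 0.789, 5) -> "12.35 ± 0.79 (n=5)"
#           _fmt_mean_std(12.3456, None, 5)  -> "12.35"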


def _parse_cfg(cfg_json: Optional[str]) -> Dict[str, Any]:
    """Best-effort parse of a JSON config string; returns {} on any failure."""
    if not cfg_json:
        return {}
    try:
        return json.loads(cfg_json)
    except Exception:
        return {}


def _key_params(model: str, cfg: Dict[str, Any]) -> str:
    """Compact, model-specific parameter string for the table."""
    if model == "deepsad":
        bs = cfg.get("batch_size")
        ne = cfg.get("n_epochs")
        lr = cfg.get("lr")
        wd = cfg.get("weight_decay")
        return f"bs={bs}, epochs={ne}, lr={lr}, wd={wd}"
    if model == "isoforest":
        est = cfg.get("isoforest_n_estimators")
        ms = cfg.get("isoforest_max_samples")
        cont = cfg.get("isoforest_contamination")
        return f"n_estimators={est}, max_samples={ms}, cont={cont}"
    if model == "ocsvm":
        ker = cfg.get("ocsvm_kernel")
        nu = cfg.get("ocsvm_nu")
        return f"kernel={ker}, nu={nu}"
    return "-"


def _method_col_name(model: str, network: str) -> str:
    """
    Column heading for pivot tables:
    - deepsad carries the network (e.g., 'DeepSAD / LeNet')
    - baselines carry their own model name; optionally annotate the network
    """
    label = model.lower()
    if label == "deepsad":
        return f"DeepSAD / {network}"
    # Baselines: optionally simplify/standardize the network name
    if (
        BASELINE_NETWORK_HINT
        and BASELINE_NETWORK_HINT.lower() not in (network or "").lower()
    ):
        # Collapse baseline duplicates to a single name when the network does not match the hint
        return model.capitalize()
    # Otherwise, keep the network variant explicit
    return f"{model.capitalize()} / {network}"
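

# Examples (with BASELINE_NETWORK_HINT = None, every method keeps its network):
#   _method_col_name("deepsad", "LeNet")   -> "DeepSAD / LeNet"
#   _method_col_name("isoforest", "LeNet") -> "Isoforest / LeNet"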


def _prepare_per_fold_metrics(df: pl.DataFrame) -> pl.DataFrame:
    """
    Returns one row per (folder, model, fold) with:
    - train_time, test_time
    - n_test (len(scores))
    - n_epochs (from config_json; DeepSAD only)
    - latency_ms = 1000 * test_time / n_test
    - time_per_epoch = train_time / n_epochs (DeepSAD only)
    """
    base = (
        df.select(
            "folder",
            "network",
            "model",
            "latent_dim",
            "semi_normals",
            "semi_anomalous",
            "fold",
            "train_time",
            "test_time",
            "scores",
            "config_json",
        )
        .with_columns(
            n_test=pl.col("scores").list.len(),
            n_epochs=pl.col("config_json")
            .str.json_path_match("$.n_epochs")
            .cast(pl.Int64),
        )
        .drop("scores")
    )

    # De-duplicate across evals
    uniq = base.unique(subset=["folder", "model", "fold"])

    # Derived metrics
    uniq = uniq.with_columns(
        latency_ms=pl.when((pl.col("test_time") > 0) & (pl.col("n_test") > 0))
        .then(1000.0 * pl.col("test_time") / pl.col("n_test"))
        .otherwise(None)
        .cast(pl.Float64),
        time_per_epoch=pl.when(
            (pl.col("model") == "deepsad") & (pl.col("n_epochs") > 0)
        )
        .then(pl.col("train_time") / pl.col("n_epochs"))
        .otherwise(None)
        .cast(pl.Float64),
        network_disp=pl.col("network")
        .cast(pl.Utf8)
        .map_elements(_net_label_for_display, return_dtype=pl.Utf8),
    )
    return uniq
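

# Worked example of the derived metrics above: with test_time = 2.5 s and
# n_test = 1000 scored samples, latency_ms = 1000 * 2.5 / 1000 = 2.5 ms/sample;
# with train_time = 600 s over n_epochs = 50, time_per_epoch = 12 s.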


def _prepare_aggregates(df: pl.DataFrame) -> pl.DataFrame:
    """
    Deduplicate across evals, then aggregate times across folds for each
    (network, model, latent_dim, semi_normals, semi_anomalous).
    """
    # Keep only the columns we need
    base = df.select(
        "folder",
        "network",
        "model",
        "latent_dim",
        "semi_normals",
        "semi_anomalous",
        "fold",
        "train_time",
        "test_time",
        "config_json",
    )

    # Drop duplicates across evals: the same (folder, model, fold) should have identical timings
    uniq = base.unique(subset=["folder", "model", "fold"]).with_columns(
        # Keep network as a plain Utf8 string (display relabeling happens elsewhere)
        pl.col("network").cast(pl.Utf8)
    )

    # Group across folds
    agg = (
        uniq.group_by(
            ["network", "model", "latent_dim", "semi_normals", "semi_anomalous"]
        )
        .agg(
            pl.len().alias("n_folds"),
            pl.col("train_time").mean().alias("train_mean"),
            pl.col("train_time").std(ddof=1).alias("train_std"),
            pl.col("test_time").mean().alias("test_mean"),
            pl.col("test_time").std(ddof=1).alias("test_std"),
            pl.col("config_json").first().alias("config_json"),  # one exemplar cfg per group
        )
        .sort(["semi_normals", "semi_anomalous", "latent_dim", "network", "model"])
    )
    return agg
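

# Each aggregated row now describes one (network, model, latent_dim, semi_normals,
# semi_anomalous) setup: n_folds, train_mean/train_std, test_mean/test_std, and one
# exemplar config_json.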


def make_training_runtime_table(df: pl.DataFrame) -> str:
    """
    Returns a LaTeX table (string) for TRAIN runtimes: mean ± std (seconds) across folds.
    Rows: Semi (N/O), Latent Dim
    Columns: one per method (DeepSAD/LeNet, DeepSAD/Efficient, IsoForest[/net], OCSVM[/net])
    """
    agg = _prepare_aggregates(df)

    # Prepare display strings and column keys
    tbl = agg.with_columns(
        pl.format("{}/{}", pl.col("semi_normals"), pl.col("semi_anomalous")).alias(
            "semi"
        ),
        pl.col("model").cast(pl.Utf8),
        pl.col("network").cast(pl.Utf8),
        pl.col("latent_dim").cast(pl.Int64),
        # return_dtype keeps the map_elements output typed as Utf8
        pl.struct(["train_mean", "train_std", "n_folds"])
        .map_elements(
            lambda s: _fmt_mean_std(s["train_mean"], s["train_std"], s["n_folds"]),
            return_dtype=pl.Utf8,
        )
        .alias("train_fmt"),
        pl.struct(["model", "network"])
        .map_elements(
            lambda s: _method_col_name(s["model"], s["network"]),
            return_dtype=pl.Utf8,
        )
        .alias("method"),
    ).select("semi", "latent_dim", "method", "train_fmt")

    # Pivot to wide form: one cell per (semi, latent_dim, method)
    wide = tbl.pivot(
        values="train_fmt",
        index=["semi", "latent_dim"],
        columns="method",
        aggregate_function="first",
    ).sort(["semi", "latent_dim"])

    # Fill missing cells with '-' and export
    pdf = wide.fill_null("-").to_pandas()
    pdf.index = pd.MultiIndex.from_frame(pdf[["semi", "latent_dim"]])
    pdf = pdf.drop(columns=["semi", "latent_dim"])
    latex = pdf.to_latex(
        index=True,
        escape=True,
        na_rep="-",
        multicolumn=True,
        multicolumn_format="c",
        bold_rows=False,
        caption="Training runtime (seconds): mean ± std across folds (n in parentheses).",
        label="tab:train_runtimes",
    )
    return latex
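

# Illustrative sketch of the resulting wide table (cell strings from _fmt_mean_std):
#   semi   latent_dim   DeepSAD / <net>        Isoforest / <net>   Ocsvm / <net>
#   "0/0"  32           "12.34 ± 0.56 (n=5)"   ...                 ...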


def make_inference_runtime_table(df: pl.DataFrame) -> str:
    """
    Returns a LaTeX table (string) for TEST/INFERENCE runtimes: mean ± std (seconds) across folds.
    Same layout as the training table.
    """
    agg = _prepare_aggregates(df)

    tbl = agg.with_columns(
        pl.format("{}/{}", pl.col("semi_normals"), pl.col("semi_anomalous")).alias(
            "semi"
        ),
        pl.col("model").cast(pl.Utf8),
        pl.col("network").cast(pl.Utf8),
        pl.col("latent_dim").cast(pl.Int64),
        pl.struct(["test_mean", "test_std", "n_folds"])
        .map_elements(
            lambda s: _fmt_mean_std(s["test_mean"], s["test_std"], s["n_folds"]),
            return_dtype=pl.Utf8,
        )
        .alias("test_fmt"),
        pl.struct(["model", "network"])
        .map_elements(
            lambda s: _method_col_name(s["model"], s["network"]),
            return_dtype=pl.Utf8,
        )
        .alias("method"),
    ).select("semi", "latent_dim", "method", "test_fmt")

    wide = tbl.pivot(
        values="test_fmt",
        index=["semi", "latent_dim"],
        columns="method",
        aggregate_function="first",
    ).sort(["semi", "latent_dim"])

    pdf = wide.fill_null("-").to_pandas()
    pdf.index = pd.MultiIndex.from_frame(pdf[["semi", "latent_dim"]])
    pdf = pdf.drop(columns=["semi", "latent_dim"])
    latex = pdf.to_latex(
        index=True,
        escape=True,
        na_rep="-",
        multicolumn=True,
        multicolumn_format="c",
        bold_rows=False,
        caption="Inference/Test runtime (seconds): mean ± std across folds (n in parentheses).",
        label="tab:test_runtimes",
    )
    return latex


def make_longform_train_table_with_params(df: pl.DataFrame) -> str:
    """
    (Optional) Long-form table that includes a 'Params' column extracted from config_json.
    Useful if you want to show per-model settings alongside the runtimes.
    """
    agg = _prepare_aggregates(df)
    # Build a readable params column from the JSON config
    long = (
        agg.with_columns(
            pl.format("{}/{}", pl.col("semi_normals"), pl.col("semi_anomalous")).alias(
                "semi"
            ),
            pl.col("latent_dim").cast(pl.Int64),
            pl.struct(["model", "config_json"])
            .map_elements(
                lambda s: _key_params(s["model"], _parse_cfg(s["config_json"])),
                return_dtype=pl.Utf8,
            )
            .alias("params"),
            pl.struct(["train_mean", "train_std", "n_folds"])
            .map_elements(
                lambda s: _fmt_mean_std(s["train_mean"], s["train_std"], s["n_folds"]),
                return_dtype=pl.Utf8,
            )
            .alias("train_time_fmt"),
        )
        .select(
            "network",
            "model",
            "latent_dim",
            "semi",
            "params",
            "train_time_fmt",
        )
        .sort(["semi", "latent_dim", "network", "model"])
    )

    pdf = long.to_pandas()
    pdf.rename(
        columns={
            "network": "Network",
            "model": "Method",
            "latent_dim": "Latent Dim",
            "semi": "Semi (N/O)",
            "params": "Params",
            "train_time_fmt": "Train time [s] (mean ± std)",
        },
        inplace=True,
    )
    latex = pdf.to_latex(
        index=False,
        escape=True,
        longtable=False,
        caption="Training runtime with key parameters.",
        label="tab:train_runtimes_params",
    )
    return latex


def make_training_runtime_table_compact(df: pl.DataFrame) -> str:
    per_fold = _prepare_per_fold_metrics(df)

    # DeepSAD: keep LeNet vs Efficient, collapse semis
    ds = (
        per_fold.filter(pl.col("model") == "deepsad")
        .group_by(["model", "network_disp", "latent_dim"])
        .agg(
            n=pl.len(),
            train_mean=pl.mean("train_time"),
            train_std=pl.std("train_time", ddof=1),
            tpe_mean=pl.mean("time_per_epoch"),
            tpe_std=pl.std("time_per_epoch", ddof=1),
        )
        .with_columns(
            method=pl.format("DeepSAD / {}", pl.col("network_disp")),
        )
    )

    # Baselines: collapse networks & semis; only vary by latent_dim
    bl = (
        per_fold.filter(pl.col("model").is_in(["isoforest", "ocsvm"]))
        .group_by(["model", "latent_dim"])
        .agg(
            n=pl.len(),
            train_mean=pl.mean("train_time"),
            train_std=pl.std("train_time", ddof=1),
        )
        .with_columns(
            method=pl.when(pl.col("model") == "isoforest")
            .then(pl.lit("IsoForest"))
            .when(pl.col("model") == "ocsvm")
            .then(pl.lit("OCSVM"))
            .otherwise(pl.lit("Baseline"))
        )
    )

    # --- Standardize schemas before concat ---
    ds_std = ds.select(
        pl.col("latent_dim").cast(pl.Int64),
        pl.col("method").cast(pl.Utf8),
        pl.col("train_mean").cast(pl.Float64),
        pl.col("train_std").cast(pl.Float64),
        pl.col("tpe_mean").cast(pl.Float64),
        pl.col("tpe_std").cast(pl.Float64),
        pl.col("n").cast(pl.Int64),
    )

    bl_std = bl.select(
        pl.col("latent_dim").cast(pl.Int64),
        pl.col("method").cast(pl.Utf8),
        pl.col("train_mean").cast(pl.Float64),
        pl.col("train_std").cast(pl.Float64),
        pl.lit(None, dtype=pl.Float64).alias("tpe_mean"),
        pl.lit(None, dtype=pl.Float64).alias("tpe_std"),
        pl.col("n").cast(pl.Int64),
    )

    agg = pl.concat([ds_std, bl_std], how="vertical")

    # Format cell: total [s]; DeepSAD also appends (italic) per-epoch time
    def _fmt_train_cell(s: dict) -> str:
        total = _fmt_mean_std_n(s["train_mean"], s["train_std"], s["n"], "s")
        if s.get("tpe_mean") is None or (
            isinstance(s.get("tpe_mean"), float) and np.isnan(s["tpe_mean"])
        ):
            return total
        tpe = _fmt_mean_std_n(s["tpe_mean"], s["tpe_std"], None, "s/epoch")
        return f"{total} (\\textit{{{tpe}}})"
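
    # Example cell: "123.45 ± 6.78 s (n=25) (\textit{2.47 ± 0.14 s/epoch})"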

    tbl = agg.with_columns(
        pl.struct(["train_mean", "train_std", "tpe_mean", "tpe_std", "n"])
        .map_elements(_fmt_train_cell, return_dtype=pl.Utf8)
        .alias("train_fmt"),
    ).select("latent_dim", "method", "train_fmt")

    # Pivot and order columns nicely
    wide = tbl.pivot(
        values="train_fmt",
        index=["latent_dim"],
        columns="method",
        aggregate_function="first",
    ).sort("latent_dim")

    pdf = wide.fill_null("-").to_pandas().set_index("latent_dim")
    pdf.index.name = "Latent Dim"  # avoid a raw underscore in the unescaped LaTeX output
    desired_cols = [
        c
        for c in ["DeepSAD / LeNet", "DeepSAD / Efficient", "IsoForest", "OCSVM"]
        if c in pdf.columns
    ]
    if desired_cols:
        pdf = pdf.reindex(columns=desired_cols)

    latex = pdf.to_latex(
        index=True,
        escape=False,  # cells embed LaTeX markup (\textit{...}); escaping would mangle it
        na_rep="-",
        multicolumn=True,
        multicolumn_format="c",
        bold_rows=False,
        caption="Training runtime: total seconds (mean ± std). DeepSAD cells also show \\textit{seconds per epoch} in parentheses.",
        label="tab:train_runtimes_compact",
    )
    return latex


def make_inference_latency_table_compact(df: pl.DataFrame) -> str:
    per_fold = _prepare_per_fold_metrics(df)

    # DeepSAD: keep networks; collapse semis
    ds = (
        per_fold.filter(pl.col("model") == "deepsad")
        .group_by(["model", "network_disp", "latent_dim"])
        .agg(
            n=pl.len(),
            lat_mean=pl.mean("latency_ms"),
            lat_std=pl.std("latency_ms", ddof=1),
        )
        .with_columns(
            method=pl.format("DeepSAD / {}", pl.col("network_disp")),
        )
    )

    # Baselines: collapse networks & semis
    bl = (
        per_fold.filter(pl.col("model").is_in(["isoforest", "ocsvm"]))
        .group_by(["model", "latent_dim"])
        .agg(
            n=pl.len(),
            lat_mean=pl.mean("latency_ms"),
            lat_std=pl.std("latency_ms", ddof=1),
        )
        .with_columns(
            method=pl.when(pl.col("model") == "isoforest")
            .then(pl.lit("IsoForest"))
            .when(pl.col("model") == "ocsvm")
            .then(pl.lit("OCSVM"))
            .otherwise(pl.lit("Baseline"))
        )
    )

    # --- Standardize schemas before concat ---
    ds_std = ds.select(
        pl.col("latent_dim").cast(pl.Int64),
        pl.col("method").cast(pl.Utf8),
        pl.col("lat_mean").cast(pl.Float64),
        pl.col("lat_std").cast(pl.Float64),
        pl.col("n").cast(pl.Int64),
    )

    bl_std = bl.select(
        pl.col("latent_dim").cast(pl.Int64),
        pl.col("method").cast(pl.Utf8),
        pl.col("lat_mean").cast(pl.Float64),
        pl.col("lat_std").cast(pl.Float64),
        pl.col("n").cast(pl.Int64),
    )

    agg = pl.concat([ds_std, bl_std], how="vertical")

    def _fmt_lat_cell(s: dict) -> str:
        return _fmt_mean_std_n(s["lat_mean"], s["lat_std"], s["n"], "ms")

    tbl = agg.with_columns(
        pl.struct(["lat_mean", "lat_std", "n"])
        .map_elements(_fmt_lat_cell, return_dtype=pl.Utf8)
        .alias("lat_fmt"),
    ).select("latent_dim", "method", "lat_fmt")

    wide = tbl.pivot(
        values="lat_fmt",
        index=["latent_dim"],
        columns="method",
        aggregate_function="first",
    ).sort("latent_dim")

    pdf = wide.fill_null("-").to_pandas().set_index("latent_dim")
    desired_cols = [
        c
        for c in ["DeepSAD / LeNet", "DeepSAD / Efficient", "IsoForest", "OCSVM"]
        if c in pdf.columns
    ]
    if desired_cols:
        pdf = pdf.reindex(columns=desired_cols)

    latex = pdf.to_latex(
        index=True,
        escape=True,
        na_rep="-",
        multicolumn=True,
        multicolumn_format="c",
        bold_rows=False,
        caption="Inference latency (ms/sample): mean ± std across folds; baselines collapsed across networks and semi-labeling.",
        label="tab:inference_latency_compact",
    )
    return latex


def make_ae_pretraining_runtime_table(df_pre: pl.DataFrame) -> str:
    """
    LaTeX table: Autoencoder (pretraining) runtime per latent dim.
    Rows: latent_dim
    Cols: AE / LeNet, AE / Efficient (mean ± std seconds across folds)
    """
    # Minimal columns we need
    base = df_pre.select(
        pl.col("network").cast(pl.Utf8),
        pl.col("latent_dim").cast(pl.Int64),
        pl.col("fold").cast(pl.Int64),
        pl.col("train_time").cast(pl.Float64),
    ).drop_nulls(subset=["network", "latent_dim", "train_time"])

    # Short display label for the network (case-insensitive, mirroring _net_label_for_display)
    network_disp = (
        pl.when(pl.col("network").str.to_lowercase().str.contains("effic"))
        .then(pl.lit("Efficient"))
        .when(pl.col("network").str.to_lowercase().str.contains("lenet"))
        .then(pl.lit("LeNet"))
        .otherwise(pl.col("network"))
        .alias("network_disp")
    )

    agg = (
        base.with_columns(network_disp)
        .group_by(["network_disp", "latent_dim"])
        .agg(
            n=pl.len(),
            train_mean=pl.mean("train_time"),
            train_std=pl.std("train_time", ddof=1),
        )
        .with_columns(
            pl.format("AE / {}", pl.col("network_disp")).alias("method"),
            pl.struct(["train_mean", "train_std", "n"])
            .map_elements(
                lambda s: _fmt_mean_std(s["train_mean"], s["train_std"], s["n"]),
                return_dtype=pl.Utf8,
            )
            .alias("train_fmt"),
        )
        .select("latent_dim", "method", "train_fmt")
        .sort(["latent_dim", "method"])
    )

    wide = agg.pivot(
        values="train_fmt",
        index=["latent_dim"],
        columns="method",
        aggregate_function="first",
    ).sort("latent_dim")

    pdf = wide.fill_null("-").to_pandas().set_index("latent_dim")

    # Order columns if both exist; names must match the 'AE / {network}' methods above
    desired = [c for c in ["AE / LeNet", "AE / Efficient"] if c in pdf.columns]
    if desired:
        pdf = pdf.reindex(columns=desired)

    latex = pdf.to_latex(
        index=True,
        escape=True,
        na_rep="-",
        multicolumn=True,
        multicolumn_format="c",
        bold_rows=False,
        caption="Autoencoder pretraining runtime (seconds): mean ± std across folds.",
        label="tab:ae_pretrain_runtimes",
    )
    return latex


# ----------------------------
# Main
# ----------------------------
def main():
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Main results
    df = load_results_dataframe(RESULTS_ROOT, allow_cache=True)
    if "config_json" not in df.columns:
        df = df.with_columns(pl.lit(None).alias("config_json"))

    # AE pretraining results
    df_pre = load_pretraining_results_dataframe(RESULTS_ROOT, allow_cache=True)

    # Build LaTeX tables
    latex_train = make_training_runtime_table(df)
    latex_test = make_inference_runtime_table(df)
    latex_train_params = make_longform_train_table_with_params(df)
    latex_ae = make_ae_pretraining_runtime_table(df_pre)

    # Timestamped output dir
    ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    ts_dir = OUTPUT_DIR / "archive" / ts
    ts_dir.mkdir(parents=True, exist_ok=True)

    # Write files
    (ts_dir / "train_runtimes.tex").write_text(latex_train)
    (ts_dir / "test_runtimes.tex").write_text(latex_test)
    (ts_dir / "train_runtimes_with_params.tex").write_text(latex_train_params)
    (ts_dir / "ae_pretraining_runtimes.tex").write_text(latex_ae)

    # Save a copy of this script alongside the outputs & mirror everything to 'latest'
    script_path = Path(__file__)
    shutil.copy2(script_path, ts_dir)

    latest = OUTPUT_DIR / "latest"
    latest.mkdir(exist_ok=True, parents=True)
    for f in ts_dir.iterdir():
        if f.is_file():
            shutil.copy2(f, latest / f.name)

    print(f"Saved LaTeX tables to: {ts_dir}")
    print(f"Also updated: {latest}")


if __name__ == "__main__":
    main()