tools, lockfile, deps

Jan Kowalczyk
2025-08-13 14:17:12 +02:00
parent cd4dc583e8
commit ef311d862e
17 changed files with 4325 additions and 0 deletions

tools/gridsearch_new.py (new file, 295 lines)

@@ -0,0 +1,295 @@
#!/usr/bin/env python3
"""
Grid-search launcher (2-GPU, fixed per-GPU batch sizes)
• one job at a time per GPU
• watchdog heartbeats + error pushes
"""
import datetime
import itertools
import queue
import signal
import shutil
import subprocess
import sys
import threading
import time
from os import environ
from pathlib import Path
import GPUtil
import requests
# ─────────────── Watchdog / host configuration ────────────────
WATCHDOG_ROOT = "https://mt.the-k.at"
BASIC_AUTH = ("jan", "jruuI3GZ@9Rnt7")
HEARTBEAT_EVERY = 60
HOSTNAME = "gpu-node-01"
OUT_FOLDER = Path("./grid_search")
DONE_FOLDER = OUT_FOLDER / "done"
FAILED_FOLDER = OUT_FOLDER / "failed"
PING_URL, ERR_URL = f"{WATCHDOG_ROOT}/ping", f"{WATCHDOG_ROOT}/error"
HEADERS = {"Content-Type": "application/json"}
# ───────────────────── Hyper-parameter grid ─────────────────────
PARAM_GRID = {
"network_name": ["subter_LeNet", "subter_efficient"],
"latent_space_dim": [32, 64, 128, 256, 512, 768, 1024],
"semi_label": [(0, 0), (50, 10), (500, 100)], # (normal, anomaly)
}
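# Full grid: 2 networks × 7 latent dims × 3 label budgets = 42 configurations.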
# ────────────────────── FIXED PARAMETERS ──────────────────────
BATCH_SIZE = 128 # fixed batch size per GPU
EPOCHS_AE = 50 # autoencoder epochs
EPOCHS_DEEPSAD = 120 # Deep SAD epochs
CMD_TEMPLATE = (
"python src/main.py train subter {network_name} {exp_folder} "
"~/jan/data --k_fold True --k_fold_num 5 "
"--num_known_normal {num_known_normal} --num_known_outlier {num_known_anomaly} "
"--lr 0.0001 --n_epochs {deepsad_epochs} --lr_milestone 50 --batch_size {batch_size} "
"--weight_decay 0.5e-6 --latent_space_dim {latent_space_dim} "
"--pretrain True --ae_lr 0.0001 --ae_n_epochs {ae_epochs} --ae_batch_size {batch_size} "
"--ae_weight_decay 0.5e-3 --normal_class 0 --known_outlier_class 1 "
"--n_known_outlier_classes 1 --seed 3 --device cuda:0"
)
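# --device cuda:0 is used for every job on purpose: run_job sets CUDA_VISIBLE_DEVICES
# per worker, so cuda:0 inside the subprocess maps to whichever GPU that worker exposes.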
# ──────────────── global bookkeeping / queues ──────────────────
STOP_EVT = threading.Event()
JOB_TABLE = [] # [{ …, status }]
# ─────────────────────── helper functions ──────────────────────
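# post_json is fire-and-forget: watchdog failures are only logged to stderr,
# so an unreachable monitoring endpoint never interrupts a running grid search.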
def post_json(url, payload):
try:
requests.post(
url, json=payload, headers=HEADERS, auth=BASIC_AUTH, timeout=5
).raise_for_status()
except Exception as exc:
print(f"[warn] POST {url} failed: {exc}", file=sys.stderr)
def gpu_stats():
stats = []
for g in GPUtil.getGPUs():
stats.append(
{
"id": g.id,
"util": int(g.load * 100),
"mem": {"used": int(g.memoryUsed), "total": int(g.memoryTotal)},
}
)
return stats
def summarise_jobs():
out = {"pending": 0, "running": 0, "ok": 0, "fail": 0}
for j in JOB_TABLE:
out[j["status"]] += 1
return out
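# Heartbeat loop: STOP_EVT.wait() returns True once the stop event is set (ending the
# loop); otherwise it wakes every HEARTBEAT_EVERY seconds and posts a payload shaped
# like the following (values are illustrative):
#   {"host": "gpu-node-01",
#    "gpu": [{"id": 0, "util": 87, "mem": {"used": 10240, "total": 24576}}],
#    "status": {"pending": 30, "running": 2, "ok": 10, "fail": 0}}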
def heartbeater():
while not STOP_EVT.wait(HEARTBEAT_EVERY):
post_json(
PING_URL, {"host": HOSTNAME, "gpu": gpu_stats(), "status": summarise_jobs()}
)
# ───────────────────────── worker logic ────────────────────────
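# Each worker thread owns one GPU and pulls jobs from its own queue; a None sentinel
# tells it to exit. A job is only started inside the allowed night window
# (wait_until_allowed_hours blocks until then).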
def worker(device: str, q: "queue.Queue[dict]"):
while True:
job = q.get()
if job is None:
break # sentinel for shutdown
wait_until_allowed_hours()
run_job(job, device)
q.task_done()
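# Experiment folders encode the full configuration, e.g.
# "subter_LeNet_latent128_n50_a10" for network=subter_LeNet, latent dim 128, labels (50, 10).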
def exp_folder_name(params):
num_norm, num_anom = params["semi_label"]
return (
f"{params['network_name']}_"
f"latent{params['latent_space_dim']}_"
f"n{num_norm}_a{num_anom}"
)
def move_exp_folder(exp_folder: Path, target_folder: Path):
    """Move the experiment folder into the target folder, replacing any stale copy."""
    destination_folder = target_folder / exp_folder.name
    if destination_folder.exists():
        # Path.unlink() cannot remove directories; drop a stale copy with rmtree.
        shutil.rmtree(destination_folder)
    target_folder.mkdir(parents=True, exist_ok=True)
    exp_folder.rename(destination_folder)
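# Resume support: a configuration whose folder already sits under DONE_FOLDER is
# treated as finished, so the launcher can be restarted without redoing completed runs.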
def already_done_set():
    if not DONE_FOLDER.exists():
        return set()
    return {f.name for f in DONE_FOLDER.iterdir() if f.is_dir()}
def run_job(job: dict, device: str):
num_norm, num_anom = job["params"]["semi_label"]
exp_folder = job["exp_folder"]
exp_folder.mkdir(parents=True, exist_ok=True)
cmd = CMD_TEMPLATE.format(
network_name=job["params"]["network_name"],
exp_folder=exp_folder.as_posix(),
num_known_normal=num_norm,
num_known_anomaly=num_anom,
latent_space_dim=job["params"]["latent_space_dim"],
batch_size=BATCH_SIZE,
ae_epochs=EPOCHS_AE,
deepsad_epochs=EPOCHS_DEEPSAD,
)
job.update({"device": device, "status": "running", "start": time.time()})
post_json(
PING_URL, {"host": HOSTNAME, "msg": f"starting {exp_folder.name} on {device}"}
)
print(f"EXEC [{device}] {cmd}", flush=True)
try:
env = environ.copy()
env["CUDA_VISIBLE_DEVICES"] = device.split(":")[1] # "0" or "1"
res = subprocess.run(
cmd, shell=True, capture_output=True, text=True, env=env, check=False
)
job.update({"end": time.time(), "returncode": res.returncode})
job["status"] = "ok" if res.returncode == 0 else "fail"
if res.returncode == 0:
move_exp_folder(exp_folder, DONE_FOLDER)
else:
move_exp_folder(exp_folder, FAILED_FOLDER)
            tail = (res.stderr or "no-stderr")[-500:]
            post_json(
                ERR_URL,
                {
                    "host": HOSTNAME,
                    "msg": f"rc {res.returncode} {exp_folder.name}\n{tail}",
                },
            )
            print(
                f"ERRR [{device}] rc {res.returncode} {exp_folder.name}\n{tail}",
                flush=True,
            )
except Exception as exc:
job.update({"end": time.time(), "status": "fail", "returncode": -999})
move_exp_folder(exp_folder, FAILED_FOLDER)
post_json(
ERR_URL,
{"host": HOSTNAME, "msg": f'exception "{exc}" in {exp_folder.name}'},
)
# ────────────────────── graceful shutdown ──────────────────────
def shutdown(reason):
post_json(ERR_URL, {"host": HOSTNAME, "msg": f"run stopped: {reason}"})
STOP_EVT.set()
sys.exit(0)
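# Register handlers for SIGINT (Ctrl-C) and SIGTERM; the sig=sig default argument
# binds the current signal value so each lambda reports the right name.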
for sig in (signal.SIGINT, signal.SIGTERM):
signal.signal(sig, lambda s, f, sig=sig: shutdown(signal.Signals(sig).name))
# ─────────────────────────── main() ────────────────────────────
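# Jobs may only *start* between 22:00 and 04:00; the window spans midnight, which is
# why the check below is "hour >= 22 or hour < 4". A job started before 04:00 may
# keep running past the end of the window.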
def wait_until_allowed_hours():
"""Sleep until the current time is between 22:00 and 04:00."""
while True:
now = datetime.datetime.now()
hour = now.hour
# Allowed window: 22:00 (22) to 04:00 (4)
if hour >= 22 or hour < 4:
break
        # We are between 04:00 and 22:00, so the next allowed start is 22:00 today.
        next_allowed = now.replace(hour=22, minute=0, second=0, microsecond=0)
        wait_seconds = (next_allowed - now).total_seconds()
        print(f"[info] Waiting until 22:00 to start new jobs ({int(wait_seconds / 60)} min)...")
        time.sleep(min(wait_seconds, 60 * 10))  # sleep in chunks of at most 10 minutes
def main():
threading.Thread(target=heartbeater, daemon=True).start()
# two worker queues, one per GPU
q0, q1 = queue.Queue(), queue.Queue()
threading.Thread(target=worker, args=("cuda:0", q0), daemon=True).start()
threading.Thread(target=worker, args=("cuda:1", q1), daemon=True).start()
# build full parameter combinations as jobs
keys = list(PARAM_GRID)
jobs = []
for combo in itertools.product(*PARAM_GRID.values()):
params = dict(zip(keys, combo))
exp_folder = OUT_FOLDER / exp_folder_name(params)
job = {
"params": params,
"exp_folder": exp_folder,
"status": "pending",
"start": None,
"end": None,
"returncode": None,
}
jobs.append(job)
# Check which jobs are already done and mark them
done_exps = already_done_set()
#print(f"Already done jobs found: {done_exps}")
for job in jobs:
if job["exp_folder"].name in done_exps:
job["status"] = "ok"
# Print summary of job statuses before starting
n_done = sum(1 for job in jobs if job["status"] == "ok")
n_pending = sum(1 for job in jobs if job["status"] != "ok")
print(f"[info] {n_done} jobs already done, {n_pending} jobs pending.")
# Add all jobs to global JOB_TABLE for bookkeeping
JOB_TABLE.extend(jobs)
# Only enqueue jobs that are not already done
for job in jobs:
if job["status"] == "ok":
continue
# Hardcode device assignment
if job["params"]["network_name"] == "subter_LeNet":
q1.put(job) # cuda:1
elif job["params"]["network_name"] == "subter_efficient":
q0.put(job) # cuda:0
else:
# fallback: load-balance
(q0 if q0.qsize() <= q1.qsize() else q1).put(job)
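    # queue.join() blocks until task_done() has been called for every enqueued job;
    # the None sentinels afterwards let both workers exit their loops.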
q0.join()
q1.join() # wait for all jobs to drain
q0.put(None)
q1.put(None) # terminate workers
STOP_EVT.set()
post_json(
PING_URL,
{
"host": HOSTNAME,
"msg": "all jobs done",
"gpu": gpu_stats(),
"status": summarise_jobs(),
},
)
print("[info] grid search completed")
if __name__ == "__main__":
main()