# File: mt/Deep-SAD-PyTorch/src/baselines/kde.py
# Snapshot: 2024-06-28 11:36:46 +02:00
# Size: 195 lines, 6.8 KiB (Python)

import json
import logging
import time
import torch
import numpy as np
from torch.utils.data import DataLoader
from sklearn.neighbors import KernelDensity
from sklearn.metrics import roc_auc_score
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import GridSearchCV
from base.base_dataset import BaseADDataset
from networks.main import build_autoencoder
class KDE(object):
    """Kernel Density Estimation baseline built on scikit-learn's ``KernelDensity``.

    The model is fit on the flattened training samples (or, in hybrid mode, on
    the flattened output of a pretrained autoencoder's encoder). Test scores
    are the negated ``score_samples`` values of the fitted estimator and are
    evaluated with ROC AUC against the test labels.
    """

    def __init__(self, hybrid=False, kernel="gaussian", n_jobs=-1, seed=None, **kwargs):
        """Init Kernel Density Estimation instance.

        Args:
            hybrid: If True, features are taken from ``self.ae_net.encoder``
                instead of the raw inputs; ``load_ae`` must be called first.
            kernel: Kernel name forwarded to ``KernelDensity``.
            n_jobs: Parallel jobs used by the bandwidth grid search.
            seed: Stored on the instance; not used by the methods of this class.
            **kwargs: Extra keyword arguments forwarded to ``KernelDensity``.
        """
        self.kernel = kernel
        self.n_jobs = n_jobs
        self.seed = seed

        self.model = KernelDensity(kernel=kernel, **kwargs)
        self.bandwidth = self.model.bandwidth

        self.hybrid = hybrid
        self.ae_net = None  # autoencoder network for the case of a hybrid model

        self.results = {
            "train_time": None,
            "test_time": None,
            "test_auc": None,
            "test_scores": None,
        }

    def _new_estimator(self, **overrides):
        """Build an unfitted KernelDensity that keeps the current estimator's parameters."""
        # Start from the parameters of the existing estimator so that extra
        # constructor kwargs (metric, algorithm, ...) are not silently dropped.
        params = self.model.get_params()
        params["kernel"] = self.kernel
        params.update(overrides)
        return KernelDensity(**params)

    def _collect_features(self, loader, device, with_labels=False):
        """Stack every batch yielded by ``loader`` into one 2-D feature matrix.

        Args:
            loader: Iterable of 4-tuples ``(inputs, labels, _, idx)`` of tensors.
            device: Device the inputs (and, in hybrid mode, the autoencoder)
                are moved to before feature extraction.
            with_labels: If True, also gather the labels and sample indices.

        Returns:
            Tuple ``(X, labels, idxs)``: ``X`` is a NumPy array with one
            flattened row per sample; ``labels`` and ``idxs`` are lists of
            ints (empty when ``with_labels`` is False).

        Raises:
            RuntimeError: In hybrid mode when no autoencoder has been loaded.
            ValueError: If the loader yields no batches.
        """
        if self.hybrid:
            if self.ae_net is None:
                raise RuntimeError(
                    "Hybrid KDE requires an autoencoder; call load_ae() first."
                )
            # The weights are loaded on CPU in load_ae; keep them on the same
            # device as the inputs to avoid a device mismatch.
            self.ae_net.to(device)

        features = []
        labels = []
        idxs = []
        # Only forward passes are needed, so no autograd graph is recorded.
        with torch.no_grad():
            for inputs, label_batch, _, idx in loader:
                inputs = inputs.to(device)
                if self.hybrid:
                    # in hybrid approach, take code representation of AE as features
                    inputs = self.ae_net.encoder(inputs)
                # Flatten everything but the batch dimension.
                flat = inputs.reshape(inputs.size(0), -1)
                features.append(flat.detach().cpu().numpy())
                if with_labels:
                    labels += label_batch.cpu().numpy().astype(np.int64).tolist()
                    idxs += idx.cpu().numpy().astype(np.int64).tolist()

        if not features:
            raise ValueError("The data loader did not yield any samples.")
        return np.concatenate(features), labels, idxs

    def train(
        self,
        dataset: "BaseADDataset",
        device: str = "cpu",
        n_jobs_dataloader: int = 0,
        bandwidth_GridSearchCV: bool = True,
    ):
        """Trains the Kernel Density Estimation model on the training data.

        Args:
            dataset: Dataset object; its ``train_set`` attribute is read.
            device: Device used for feature extraction.
            n_jobs_dataloader: Number of DataLoader worker processes.
            bandwidth_GridSearchCV: If True, select the bandwidth by 5-fold
                grid search; otherwise fit ``self.model`` directly (after
                re-initialising it when the kernel is "exponential").
        """
        logger = logging.getLogger()

        # Keep the last (possibly smaller) batch: the estimator is fit on the
        # whole training set at once, not by mini-batch optimization.
        train_loader = DataLoader(
            dataset=dataset.train_set,
            batch_size=128,
            shuffle=True,
            num_workers=n_jobs_dataloader,
            drop_last=False,
        )

        # Get data from loader
        X, _, _ = self._collect_features(train_loader, device)

        # Training
        logger.info("Starting training...")
        start_time = time.time()

        if bandwidth_GridSearchCV:
            # use grid search cross-validation to select bandwidth
            logger.info("Using GridSearchCV for bandwidth selection...")
            params = {"bandwidth": np.logspace(0.5, 5, num=10, base=2)}
            hyper_kde = GridSearchCV(
                self._new_estimator(),
                params,
                n_jobs=self.n_jobs,
                cv=5,
                verbose=0,
            )
            hyper_kde.fit(X)
            # With the default refit=True the best estimator is already fitted on X.
            self.bandwidth = hyper_kde.best_estimator_.bandwidth
            logger.info("Best bandwidth: {:.8f}".format(self.bandwidth))
            self.model = hyper_kde.best_estimator_
        else:
            # if exponential kernel, re-initialize kde with bandwidth minimizing the numerical error
            if self.kernel == "exponential":
                self.bandwidth = float(np.max(pairwise_distances(X)) ** 2)
                self.model = self._new_estimator(bandwidth=self.bandwidth)
            self.model.fit(X)

        train_time = time.time() - start_time
        self.results["train_time"] = train_time

        logger.info("Training Time: {:.3f}s".format(self.results["train_time"]))
        logger.info("Finished training.")

    def test(
        self, dataset: "BaseADDataset", device: str = "cpu", n_jobs_dataloader: int = 0
    ):
        """Tests the Kernel Density Estimation model on the test data.

        Fills ``self.results`` with ``test_time``, ``test_scores`` (a list of
        ``(idx, label, score)`` triples) and ``test_auc``.

        Args:
            dataset: Dataset object; the second loader returned by its
                ``loaders`` method is used as the test loader.
            device: Device used for feature extraction.
            n_jobs_dataloader: Number of DataLoader worker processes.
        """
        logger = logging.getLogger()

        _, test_loader = dataset.loaders(batch_size=128, num_workers=n_jobs_dataloader)

        # Get data from loader
        X, labels, idxs = self._collect_features(test_loader, device, with_labels=True)

        # Testing
        logger.info("Starting testing...")
        start_time = time.time()
        # Negate so that a lower estimated density gives a higher score.
        scores = (-1.0) * self.model.score_samples(X)
        self.results["test_time"] = time.time() - start_time
        scores = scores.flatten()

        # Save triples of (idx, label, score) in a list
        self.results["test_scores"] = list(zip(idxs, labels, scores.tolist()))

        # Compute AUC
        self.results["test_auc"] = float(roc_auc_score(np.array(labels), scores))

        # Log results
        logger.info("Test AUC: {:.2f}%".format(100.0 * self.results["test_auc"]))
        logger.info("Test Time: {:.3f}s".format(self.results["test_time"]))
        logger.info("Finished testing.")

    def load_ae(self, dataset_name, model_path):
        """Load pretrained autoencoder from model_path for feature extraction in a hybrid KDE model.

        Args:
            dataset_name: Dataset name used to pick the network architecture.
            model_path: Path to a checkpoint containing an ``"ae_net_dict"`` entry.
        """
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from trusted sources.
        model_dict = torch.load(model_path, map_location="cpu")
        ae_net_dict = model_dict["ae_net_dict"]

        if dataset_name in ["mnist", "fmnist", "cifar10"]:
            net_name = dataset_name + "_LeNet"
        else:
            net_name = dataset_name + "_mlp"

        if self.ae_net is None:
            self.ae_net = build_autoencoder(net_name)

        # update keys (since there was a change in network definition):
        # entries are matched by position, popping from the front and
        # re-inserting at the back under the network's current key name.
        ae_keys = list(self.ae_net.state_dict().keys())
        for i in range(len(ae_net_dict)):
            _, value = ae_net_dict.popitem(False)
            ae_net_dict[ae_keys[i]] = value

        self.ae_net.load_state_dict(ae_net_dict)
        self.ae_net.eval()

    def save_model(self, export_path):
        """Save KDE model to export_path.

        Not implemented: this method currently does nothing.
        """
        pass

    def load_model(self, import_path, device: str = "cpu"):
        """Load KDE model from import_path.

        Not implemented: this method currently does nothing.
        """
        pass

    def save_results(self, export_json):
        """Save results dict to a JSON-file."""
        with open(export_json, "w") as fp:
            json.dump(self.results, fp)