Full upload so as not to lose anything important
@@ -1,10 +1,12 @@
 import json
+import pickle
+
 import torch

 from base.base_dataset import BaseADDataset
-from networks.main import build_network, build_autoencoder
-from optim.DeepSAD_trainer import DeepSADTrainer
+from networks.main import build_autoencoder, build_network
 from optim.ae_trainer import AETrainer
+from optim.DeepSAD_trainer import DeepSADTrainer


 class DeepSAD(object):
@@ -65,6 +67,7 @@ class DeepSAD(object):
         weight_decay: float = 1e-6,
         device: str = "cuda",
         n_jobs_dataloader: int = 0,
+        k_fold_idx: int = None,
     ):
         """Trains the Deep SAD model on the training data."""

@@ -82,7 +85,7 @@ class DeepSAD(object):
             n_jobs_dataloader=n_jobs_dataloader,
         )
         # Get the model
-        self.net = self.trainer.train(dataset, self.net)
+        self.net = self.trainer.train(dataset, self.net, k_fold_idx=k_fold_idx)
         self.results["train_time"] = self.trainer.train_time
         self.c = self.trainer.c.cpu().data.numpy().tolist()  # get as list

@@ -99,7 +102,11 @@ class DeepSAD(object):
         return self.trainer.infer(dataset, self.net)

     def test(
-        self, dataset: BaseADDataset, device: str = "cuda", n_jobs_dataloader: int = 0
+        self,
+        dataset: BaseADDataset,
+        device: str = "cuda",
+        n_jobs_dataloader: int = 0,
+        k_fold_idx: int = None,
     ):
         """Tests the Deep SAD model on the test data."""

@@ -108,10 +115,13 @@ class DeepSAD(object):
             self.c, self.eta, device=device, n_jobs_dataloader=n_jobs_dataloader
         )

-        self.trainer.test(dataset, self.net)
+        self.trainer.test(dataset, self.net, k_fold_idx=k_fold_idx)

         # Get results
         self.results["test_auc"] = self.trainer.test_auc
+        self.results["test_roc"] = self.trainer.test_roc
+        self.results["test_prc"] = self.trainer.test_prc
+        self.results["test_ap"] = self.trainer.test_ap
         self.results["test_time"] = self.trainer.test_time
         self.results["test_scores"] = self.trainer.test_scores

@@ -126,6 +136,7 @@ class DeepSAD(object):
         weight_decay: float = 1e-6,
         device: str = "cuda",
         n_jobs_dataloader: int = 0,
+        k_fold_idx: int = None,
     ):
         """Pretrains the weights for the Deep SAD network phi via autoencoder."""

@@ -144,13 +155,13 @@ class DeepSAD(object):
             device=device,
             n_jobs_dataloader=n_jobs_dataloader,
         )
-        self.ae_net = self.ae_trainer.train(dataset, self.ae_net)
+        self.ae_net = self.ae_trainer.train(dataset, self.ae_net, k_fold_idx=k_fold_idx)

         # Get train results
         self.ae_results["train_time"] = self.ae_trainer.train_time

         # Test
-        self.ae_trainer.test(dataset, self.ae_net)
+        self.ae_trainer.test(dataset, self.ae_net, k_fold_idx=k_fold_idx)

         # Get test results
         self.ae_results["test_auc"] = self.ae_trainer.test_auc
@@ -197,10 +208,11 @@ class DeepSAD(object):
             self.ae_net = build_autoencoder(self.net_name)
             self.ae_net.load_state_dict(model_dict["ae_net_dict"])

-    def save_results(self, export_json):
+    def save_results(self, export_pkl):
         """Save results dict to a JSON-file."""
-        with open(export_json, "w") as fp:
-            json.dump(self.results, fp)
+        with open(export_pkl, "wb") as fp:
+            # json.dump(self.results, fp)
+            pickle.dump(self.results, fp)

     def save_ae_results(self, export_json):
         """Save autoencoder results dict to a JSON-file."""
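A minimal usage sketch (editor-added, not part of the commit) of the per-fold API introduced above, assuming a dataset object built with k_fold=True so that fold-aware loaders exist; the network name and epoch settings are illustrative:

# Hedged sketch: drive DeepSAD once per fold via the new k_fold_idx argument.
from DeepSAD import DeepSAD

k_fold_num = 5
for fold_idx in range(k_fold_num):
    deep_sad = DeepSAD(eta=1.0)
    deep_sad.set_network("subter_LeNet")  # illustrative net name
    deep_sad.train(dataset, device="cuda", k_fold_idx=fold_idx)
    deep_sad.test(dataset, device="cuda", k_fold_idx=fold_idx)
    deep_sad.save_results(export_pkl=f"results_fold_{fold_idx}.pkl")  # pickled, as changed above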
@@ -1,12 +1,16 @@
+from sklearn.model_selection import KFold
+from torch.utils.data import ConcatDataset, DataLoader, Subset
+
 from .base_dataset import BaseADDataset
-from torch.utils.data import DataLoader


 class TorchvisionDataset(BaseADDataset):
     """TorchvisionDataset class for datasets already implemented in torchvision.datasets."""

-    def __init__(self, root: str):
+    def __init__(self, root: str, k_fold_number: int = 5):
         super().__init__(root)
+        self.k_fold_number = k_fold_number
+        self.fold_indices = None

     def loaders(
         self,
@@ -50,3 +54,43 @@ class TorchvisionDataset(BaseADDataset):
             else None
         )
         return train_loader, test_loader, inference_loader
+
+    def loaders_k_fold(
+        self,
+        fold_idx: int,
+        batch_size: int,
+        shuffle_train=True,
+        shuffle_test=False,
+        num_workers: int = 0,
+    ) -> (DataLoader, DataLoader):
+        if self.fold_indices is None:
+            # Define the K-fold Cross Validator
+            kfold = KFold(n_splits=self.k_fold_number, shuffle=False)
+            self.fold_indices = []
+            # Generate indices for each fold and store them in a list
+            for train_indices, val_indices in kfold.split(self.data_set):
+                self.fold_indices.append((train_indices, val_indices))
+
+        train_loader = (
+            DataLoader(
+                dataset=Subset(self.data_set, self.fold_indices[fold_idx][0]),
+                batch_size=batch_size,
+                shuffle=shuffle_train,
+                num_workers=num_workers,
+                drop_last=True,
+            )
+            if self.data_set is not None
+            else None
+        )
+        test_loader = (
+            DataLoader(
+                dataset=Subset(self.data_set, self.fold_indices[fold_idx][1]),
+                batch_size=batch_size,
+                shuffle=shuffle_test,
+                num_workers=num_workers,
+                drop_last=False,
+            )
+            if self.data_set is not None
+            else None
+        )
+        return train_loader, test_loader
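A short sketch (editor-added) of how loaders_k_fold() is meant to be consumed; the concrete dataset subclass and its constructor arguments are illustrative:

# Hedged sketch: iterate the folds of a TorchvisionDataset subclass that sets self.data_set.
dataset = SubTer_Dataset(root="data/subter", k_fold=True)  # illustrative subclass/arguments
for fold_idx in range(dataset.k_fold_number):
    train_loader, val_loader = dataset.loaders_k_fold(
        fold_idx=fold_idx, batch_size=128, num_workers=0
    )
    # fold_indices is built once with KFold(shuffle=False), so every call reuses the same
    # deterministic partition of self.data_set; only drop_last differs between the loaders.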
@@ -1,12 +1,17 @@
|
||||
import json
|
||||
import logging
|
||||
import pickle
|
||||
import time
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
from torch.utils.data import DataLoader
|
||||
import numpy as np
|
||||
import torch
|
||||
from sklearn.ensemble import IsolationForest
|
||||
from sklearn.metrics import roc_auc_score
|
||||
from sklearn.metrics import (
|
||||
average_precision_score,
|
||||
precision_recall_curve,
|
||||
roc_auc_score,
|
||||
roc_curve,
|
||||
)
|
||||
|
||||
from base.base_dataset import BaseADDataset
|
||||
from networks.main import build_autoencoder
|
||||
|
||||
@@ -22,7 +27,7 @@ class IsoForest(object):
|
||||
contamination=0.1,
|
||||
n_jobs=-1,
|
||||
seed=None,
|
||||
**kwargs
|
||||
**kwargs,
|
||||
):
|
||||
"""Init Isolation Forest instance."""
|
||||
self.n_estimators = n_estimators
|
||||
@@ -37,7 +42,7 @@ class IsoForest(object):
|
||||
contamination=contamination,
|
||||
n_jobs=n_jobs,
|
||||
random_state=seed,
|
||||
**kwargs
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.hybrid = hybrid
|
||||
@@ -47,28 +52,44 @@ class IsoForest(object):
|
||||
"train_time": None,
|
||||
"test_time": None,
|
||||
"test_auc": None,
|
||||
"test_roc": None,
|
||||
"test_scores": None,
|
||||
}
|
||||
|
||||
def train(
|
||||
self, dataset: BaseADDataset, device: str = "cpu", n_jobs_dataloader: int = 0
|
||||
self,
|
||||
dataset: BaseADDataset,
|
||||
device: str = "cpu",
|
||||
n_jobs_dataloader: int = 0,
|
||||
k_fold_idx: int = None,
|
||||
):
|
||||
"""Trains the Isolation Forest model on the training data."""
|
||||
logger = logging.getLogger()
|
||||
|
||||
# do not drop last batch for non-SGD optimization shallow_ssad
|
||||
train_loader = DataLoader(
|
||||
dataset=dataset.train_set,
|
||||
batch_size=128,
|
||||
shuffle=True,
|
||||
num_workers=n_jobs_dataloader,
|
||||
drop_last=False,
|
||||
)
|
||||
# drop_last necessary?
|
||||
# train_loader = DataLoader(
|
||||
# dataset=dataset.train_set,
|
||||
# batch_size=128,
|
||||
# shuffle=True,
|
||||
# num_workers=n_jobs_dataloader,
|
||||
# drop_last=False,
|
||||
# )
|
||||
|
||||
if k_fold_idx is not None:
|
||||
train_loader, _ = dataset.loaders_k_fold(
|
||||
fold_idx=k_fold_idx,
|
||||
batch_size=128,
|
||||
num_workers=n_jobs_dataloader,
|
||||
)
|
||||
else:
|
||||
train_loader, _, _ = dataset.loaders(
|
||||
batch_size=128, num_workers=n_jobs_dataloader
|
||||
)
|
||||
# Get data from loader
|
||||
X = ()
|
||||
for data in train_loader:
|
||||
inputs, _, _, _ = data
|
||||
inputs, _, _, _, _ = data
|
||||
inputs = inputs.to(device)
|
||||
if self.hybrid:
|
||||
inputs = self.ae_net.encoder(
|
||||
@@ -91,14 +112,25 @@ class IsoForest(object):
|
||||
logger.info("Finished training.")
|
||||
|
||||
def test(
|
||||
self, dataset: BaseADDataset, device: str = "cpu", n_jobs_dataloader: int = 0
|
||||
self,
|
||||
dataset: BaseADDataset,
|
||||
device: str = "cpu",
|
||||
n_jobs_dataloader: int = 0,
|
||||
k_fold_idx: int = None,
|
||||
):
|
||||
"""Tests the Isolation Forest model on the test data."""
|
||||
logger = logging.getLogger()
|
||||
|
||||
_, test_loader, _ = dataset.loaders(
|
||||
batch_size=128, num_workers=n_jobs_dataloader
|
||||
)
|
||||
if k_fold_idx is not None:
|
||||
_, test_loader = dataset.loaders_k_fold(
|
||||
fold_idx=k_fold_idx,
|
||||
batch_size=128,
|
||||
num_workers=n_jobs_dataloader,
|
||||
)
|
||||
else:
|
||||
_, test_loader, _ = dataset.loaders(
|
||||
batch_size=128, num_workers=n_jobs_dataloader
|
||||
)
|
||||
|
||||
# Get data from loader
|
||||
idx_label_score = []
|
||||
@@ -106,7 +138,7 @@ class IsoForest(object):
|
||||
idxs = []
|
||||
labels = []
|
||||
for data in test_loader:
|
||||
inputs, label_batch, _, idx = data
|
||||
inputs, label_batch, _, idx, _ = data
|
||||
inputs, label_batch, idx = (
|
||||
inputs.to(device),
|
||||
label_batch.to(device),
|
||||
@@ -140,6 +172,9 @@ class IsoForest(object):
|
||||
labels = np.array(labels)
|
||||
scores = np.array(scores)
|
||||
self.results["test_auc"] = roc_auc_score(labels, scores)
|
||||
self.results["test_roc"] = roc_curve(labels, scores)
|
||||
self.results["test_prc"] = precision_recall_curve(labels, scores)
|
||||
self.results["test_ap"] = average_precision_score(labels, scores)
|
||||
|
||||
# Log results
|
||||
logger.info("Test AUC: {:.2f}%".format(100.0 * self.results["test_auc"]))
|
||||
@@ -178,7 +213,8 @@ class IsoForest(object):
|
||||
"""Load Isolation Forest model from import_path."""
|
||||
pass
|
||||
|
||||
def save_results(self, export_json):
|
||||
def save_results(self, export_pkl):
|
||||
"""Save results dict to a JSON-file."""
|
||||
with open(export_json, "w") as fp:
|
||||
json.dump(self.results, fp)
|
||||
with open(export_pkl, "wb") as fp:
|
||||
# json.dump(self.results, fp)
|
||||
pickle.dump(self.results, fp)
|
||||
|
||||
@@ -1,12 +1,18 @@
|
||||
import json
|
||||
import logging
|
||||
import pickle
|
||||
import time
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
from torch.utils.data import DataLoader
|
||||
from sklearn.svm import OneClassSVM
|
||||
from sklearn.metrics import roc_auc_score
|
||||
import numpy as np
|
||||
import torch
|
||||
from sklearn.metrics import (
|
||||
average_precision_score,
|
||||
precision_recall_curve,
|
||||
roc_auc_score,
|
||||
roc_curve,
|
||||
)
|
||||
from thundersvm import OneClassSVM
|
||||
|
||||
from base.base_dataset import BaseADDataset
|
||||
from networks.main import build_autoencoder
|
||||
|
||||
@@ -21,7 +27,7 @@ class OCSVM(object):
|
||||
self.rho = None
|
||||
self.gamma = None
|
||||
|
||||
self.model = OneClassSVM(kernel=kernel, nu=nu)
|
||||
self.model = OneClassSVM(kernel=kernel, nu=nu, verbose=True, max_mem_size=4048)
|
||||
|
||||
self.hybrid = hybrid
|
||||
self.ae_net = None # autoencoder network for the case of a hybrid model
|
||||
@@ -40,24 +46,31 @@ class OCSVM(object):
|
||||
}
|
||||
|
||||
def train(
|
||||
self, dataset: BaseADDataset, device: str = "cpu", n_jobs_dataloader: int = 0
|
||||
self,
|
||||
dataset: BaseADDataset,
|
||||
device: str = "cpu",
|
||||
n_jobs_dataloader: int = 0,
|
||||
k_fold_idx: int = None,
|
||||
batch_size: int = 32,
|
||||
):
|
||||
"""Trains the OC-SVM model on the training data."""
|
||||
logger = logging.getLogger()
|
||||
|
||||
# do not drop last batch for non-SGD optimization shallow_ssad
|
||||
train_loader = DataLoader(
|
||||
dataset=dataset.train_set,
|
||||
batch_size=128,
|
||||
shuffle=True,
|
||||
num_workers=n_jobs_dataloader,
|
||||
drop_last=False,
|
||||
)
|
||||
if k_fold_idx is not None:
|
||||
train_loader, _ = dataset.loaders_k_fold(
|
||||
fold_idx=k_fold_idx,
|
||||
batch_size=batch_size,
|
||||
num_workers=n_jobs_dataloader,
|
||||
)
|
||||
else:
|
||||
train_loader, _, _ = dataset.loaders(
|
||||
batch_size=batch_size, num_workers=n_jobs_dataloader
|
||||
)
|
||||
|
||||
# Get data from loader
|
||||
X = ()
|
||||
for data in train_loader:
|
||||
inputs, _, _, _ = data
|
||||
inputs, _, _, _, _ = data
|
||||
inputs = inputs.to(device)
|
||||
if self.hybrid:
|
||||
inputs = self.ae_net.encoder(
|
||||
@@ -77,14 +90,21 @@ class OCSVM(object):
|
||||
best_auc = 0.0
|
||||
|
||||
# Sample hold-out set from test set
|
||||
_, test_loader, _ = dataset.loaders(
|
||||
batch_size=128, num_workers=n_jobs_dataloader
|
||||
)
|
||||
if k_fold_idx is not None:
|
||||
_, test_loader = dataset.loaders_k_fold(
|
||||
fold_idx=k_fold_idx,
|
||||
batch_size=batch_size,
|
||||
num_workers=n_jobs_dataloader,
|
||||
)
|
||||
else:
|
||||
_, test_loader, _ = dataset.loaders(
|
||||
batch_size=batch_size, num_workers=n_jobs_dataloader
|
||||
)
|
||||
|
||||
X_test = ()
|
||||
labels = []
|
||||
for data in test_loader:
|
||||
inputs, label_batch, _, _ = data
|
||||
inputs, label_batch, _, _, _ = data
|
||||
inputs, label_batch = inputs.to(device), label_batch.to(device)
|
||||
if self.hybrid:
|
||||
inputs = self.ae_net.encoder(
|
||||
@@ -102,8 +122,9 @@ class OCSVM(object):
|
||||
np.sum(labels == 1),
|
||||
)
|
||||
n_val = int(0.1 * n_test)
|
||||
n_val_normal, n_val_outlier = int(n_val * (n_normal / n_test)), int(
|
||||
n_val * (n_outlier / n_test)
|
||||
n_val_normal, n_val_outlier = (
|
||||
int(n_val * (n_normal / n_test)),
|
||||
int(n_val * (n_outlier / n_test)),
|
||||
)
|
||||
perm = np.random.permutation(n_test)
|
||||
X_val = np.concatenate(
|
||||
@@ -116,9 +137,14 @@ class OCSVM(object):
|
||||
|
||||
i = 1
|
||||
for gamma in gammas:
|
||||
|
||||
# Model candidate
|
||||
model = OneClassSVM(kernel=self.kernel, nu=self.nu, gamma=gamma)
|
||||
model = OneClassSVM(
|
||||
kernel=self.kernel,
|
||||
nu=self.nu,
|
||||
gamma=gamma,
|
||||
verbose=True,
|
||||
max_mem_size=4048,
|
||||
)
|
||||
|
||||
# Train
|
||||
start_time = time.time()
|
||||
@@ -147,7 +173,9 @@ class OCSVM(object):
|
||||
|
||||
# If hybrid, also train a model with linear kernel
|
||||
if self.hybrid:
|
||||
self.linear_model = OneClassSVM(kernel="linear", nu=self.nu)
|
||||
self.linear_model = OneClassSVM(
|
||||
kernel="linear", nu=self.nu, max_mem_size=4048
|
||||
)
|
||||
start_time = time.time()
|
||||
self.linear_model.fit(X)
|
||||
train_time = time.time() - start_time
|
||||
@@ -160,14 +188,26 @@ class OCSVM(object):
|
||||
logger.info("Finished training.")
|
||||
|
||||
def test(
|
||||
self, dataset: BaseADDataset, device: str = "cpu", n_jobs_dataloader: int = 0
|
||||
self,
|
||||
dataset: BaseADDataset,
|
||||
device: str = "cpu",
|
||||
n_jobs_dataloader: int = 0,
|
||||
k_fold_idx: int = None,
|
||||
batch_size: int = 32,
|
||||
):
|
||||
"""Tests the OC-SVM model on the test data."""
|
||||
logger = logging.getLogger()
|
||||
|
||||
_, test_loader, _ = dataset.loaders(
|
||||
batch_size=128, num_workers=n_jobs_dataloader
|
||||
)
|
||||
if k_fold_idx is not None:
|
||||
_, test_loader = dataset.loaders_k_fold(
|
||||
fold_idx=k_fold_idx,
|
||||
batch_size=batch_size,
|
||||
num_workers=n_jobs_dataloader,
|
||||
)
|
||||
else:
|
||||
_, test_loader, _ = dataset.loaders(
|
||||
batch_size=batch_size, num_workers=n_jobs_dataloader
|
||||
)
|
||||
|
||||
# Get data from loader
|
||||
idx_label_score = []
|
||||
@@ -175,7 +215,7 @@ class OCSVM(object):
|
||||
idxs = []
|
||||
labels = []
|
||||
for data in test_loader:
|
||||
inputs, label_batch, _, idx = data
|
||||
inputs, label_batch, _, idx, _ = data
|
||||
inputs, label_batch, idx = (
|
||||
inputs.to(device),
|
||||
label_batch.to(device),
|
||||
@@ -212,6 +252,9 @@ class OCSVM(object):
|
||||
labels = np.array(labels)
|
||||
scores = np.array(scores)
|
||||
self.results["test_auc"] = roc_auc_score(labels, scores)
|
||||
self.results["test_roc"] = roc_curve(labels, scores)
|
||||
self.results["test_prc"] = precision_recall_curve(labels, scores)
|
||||
self.results["test_ap"] = average_precision_score(labels, scores)
|
||||
|
||||
# If hybrid, also test model with linear kernel
|
||||
if self.hybrid:
|
||||
@@ -268,7 +311,7 @@ class OCSVM(object):
|
||||
"""Load OC-SVM model from import_path."""
|
||||
pass
|
||||
|
||||
def save_results(self, export_json):
|
||||
"""Save results dict to a JSON-file."""
|
||||
with open(export_json, "w") as fp:
|
||||
json.dump(self.results, fp)
|
||||
def save_results(self, export_pkl):
|
||||
with open(export_pkl, "wb") as fp:
|
||||
# json.dump(self.results, fp)
|
||||
pickle.dump(self.results, fp)
|
||||
|
||||
Deep-SAD-PyTorch/src/datasets/esmerasplit.py (new file, 264 lines)
@@ -0,0 +1,264 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Callable, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torchvision.transforms as transforms
|
||||
from PIL import Image
|
||||
from torch.utils.data import Subset
|
||||
from torchvision.datasets import VisionDataset
|
||||
|
||||
from base.torchvision_dataset import TorchvisionDataset
|
||||
|
||||
from .preprocessing import create_semisupervised_setting
|
||||
|
||||
|
||||
class EsmeraSplit_Dataset(TorchvisionDataset):
|
||||
def __init__(
|
||||
self,
|
||||
root: str,
|
||||
ratio_known_normal: float = 0.0,
|
||||
ratio_known_outlier: float = 0.0,
|
||||
ratio_pollution: float = 0.0,
|
||||
inference: bool = False,
|
||||
):
|
||||
super().__init__(root)
|
||||
|
||||
# Define normal and outlier classes
|
||||
self.n_classes = 2 # 0: normal, 1: outlier
|
||||
self.normal_classes = tuple([0])
|
||||
self.outlier_classes = tuple([1])
|
||||
self.inference_set = None
|
||||
|
||||
# MNIST preprocessing: feature scaling to [0, 1]
|
||||
# FIXME understand mnist feature scaling and check if it or other preprocessing is necessary for elpv
|
||||
transform = transforms.ToTensor()
|
||||
target_transform = transforms.Lambda(lambda x: int(x in self.outlier_classes))
|
||||
|
||||
if inference:
|
||||
self.inference_set = EsmeraSplitInference(
|
||||
root=self.root,
|
||||
transform=transform,
|
||||
)
|
||||
else:
|
||||
# Get train set
|
||||
train_set = EsmeraSplitTraining(
|
||||
root=self.root,
|
||||
transform=transform,
|
||||
target_transform=target_transform,
|
||||
train=True,
|
||||
)
|
||||
|
||||
# Create semi-supervised setting
|
||||
idx, _, semi_targets = create_semisupervised_setting(
|
||||
train_set.targets.cpu().data.numpy(),
|
||||
self.normal_classes,
|
||||
self.outlier_classes,
|
||||
self.outlier_classes,
|
||||
ratio_known_normal,
|
||||
ratio_known_outlier,
|
||||
ratio_pollution,
|
||||
)
|
||||
train_set.semi_targets[idx] = torch.tensor(
|
||||
np.array(semi_targets, dtype=np.int8)
|
||||
) # set respective semi-supervised labels
|
||||
|
||||
# Subset train_set to semi-supervised setup
|
||||
self.train_set = Subset(train_set, idx)
|
||||
|
||||
# Get test set
|
||||
self.test_set = EsmeraSplitTraining(
|
||||
root=self.root,
|
||||
train=False,
|
||||
transform=transform,
|
||||
target_transform=target_transform,
|
||||
)
|
||||
|
||||
|
||||
def split_array_into_subarrays(array, split_height, split_width):
|
||||
original_shape = array.shape
|
||||
height, width = original_shape[-2], original_shape[-1]
|
||||
assert height % split_height == 0, "The height is not divisible by the split_height"
|
||||
assert width % split_width == 0, "The width is not divisible by the split_width"
|
||||
num_splits_height = height // split_height
|
||||
num_splits_width = width // split_width
|
||||
reshaped_array = array.reshape(
|
||||
-1, num_splits_height, split_height, num_splits_width, split_width
|
||||
)
|
||||
transposed_array = reshaped_array.transpose(0, 1, 3, 2, 4)
|
||||
final_array = transposed_array.reshape(-1, split_height, split_width)
|
||||
return final_array
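A quick worked example (editor-added) of split_array_into_subarrays: ten 32x512 frames tiled into 16x256 patches give 10 * (32/16) * (512/256) = 40 patches:

import numpy as np

frames = np.zeros((10, 32, 512), dtype=np.float32)  # illustrative input shape
tiles = split_array_into_subarrays(frames, split_height=16, split_width=256)
assert tiles.shape == (40, 16, 256)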
|
||||
|
||||
|
||||
class EsmeraSplitTraining(VisionDataset):
|
||||
def __init__(
|
||||
self,
|
||||
root: str,
|
||||
transforms: Optional[Callable] = None,
|
||||
transform: Optional[Callable] = None,
|
||||
target_transform: Optional[Callable] = None,
|
||||
train=False,
|
||||
split=0.7,
|
||||
seed=0,
|
||||
height=16,
|
||||
width=256,
|
||||
):
|
||||
super(EsmeraSplitTraining, self).__init__(
|
||||
root, transforms, transform, target_transform
|
||||
)
|
||||
|
||||
experiments_data = []
|
||||
experiments_targets = []
|
||||
validation_files = []
|
||||
experiment_files = []
|
||||
|
||||
logger = logging.getLogger()
|
||||
|
||||
for experiment_file in Path(root).iterdir():
|
||||
if experiment_file.is_dir() and experiment_file.name == "validation":
|
||||
for validation_file in experiment_file.iterdir():
|
||||
if validation_file.suffix != ".npy":
|
||||
continue
|
||||
validation_files.append(experiment_file)
|
||||
if experiment_file.suffix != ".npy":
|
||||
continue
|
||||
experiment_files.append(experiment_file)
|
||||
experiment_data = np.load(experiment_file)
|
||||
|
||||
if (
|
||||
experiment_data.shape[1] % height != 0
|
||||
or experiment_data.shape[2] % width != 0
|
||||
):
|
||||
logger.error(
|
||||
f"Experiment {experiment_file.name} has shape {experiment_data.shape} which is not divisible by {height}x{width}"
|
||||
)
|
||||
experiment_data = split_array_into_subarrays(experiment_data, height, width)
|
||||
|
||||
# experiment_data = np.lib.format.open_memmap(experiment_file, mode='r+')
|
||||
experiment_targets = (
|
||||
np.ones(experiment_data.shape[0], dtype=np.int8)
|
||||
if "smoke" in experiment_file.name
|
||||
else np.zeros(experiment_data.shape[0], dtype=np.int8)
|
||||
)
|
||||
experiments_data.append(experiment_data)
|
||||
experiments_targets.append(experiment_targets)
|
||||
|
||||
filtered_validation_files = []
|
||||
for validation_file in validation_files:
|
||||
validation_file_name = validation_file.name
|
||||
file_exists_in_experiments = any(
|
||||
experiment_file.name == validation_file_name
|
||||
for experiment_file in experiment_files
|
||||
)
|
||||
if not file_exists_in_experiments:
|
||||
filtered_validation_files.append(validation_file)
|
||||
validation_files = filtered_validation_files
|
||||
|
||||
logger.info(
|
||||
f"Train/Test experiments: {[experiment_file.name for experiment_file in experiment_files]}"
|
||||
)
|
||||
logger.info(
|
||||
f"Validation experiments: {[validation_file.name for validation_file in validation_files]}"
|
||||
)
|
||||
|
||||
lidar_projections = np.concatenate(experiments_data)
|
||||
smoke_presence = np.concatenate(experiments_targets)
|
||||
|
||||
np.random.seed(seed)
|
||||
|
||||
shuffled_indices = np.random.permutation(lidar_projections.shape[0])
|
||||
shuffled_lidar_projections = lidar_projections[shuffled_indices]
|
||||
shuffled_smoke_presence = smoke_presence[shuffled_indices]
|
||||
|
||||
split_idx = int(split * shuffled_lidar_projections.shape[0])
|
||||
|
||||
if train:
|
||||
self.data = shuffled_lidar_projections[:split_idx]
|
||||
self.targets = shuffled_smoke_presence[:split_idx]
|
||||
|
||||
else:
|
||||
self.data = shuffled_lidar_projections[split_idx:]
|
||||
self.targets = shuffled_smoke_presence[split_idx:]
|
||||
|
||||
self.data = np.nan_to_num(self.data)
|
||||
|
||||
self.data = torch.tensor(self.data)
|
||||
self.targets = torch.tensor(self.targets, dtype=torch.int8)
|
||||
|
||||
self.semi_targets = torch.zeros_like(self.targets, dtype=torch.int8)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
def __getitem__(self, index):
|
||||
"""Override the original method of the MNIST class.
|
||||
Args:
|
||||
index (int): Index
|
||||
|
||||
Returns:
|
||||
tuple: (image, target, semi_target, index)
|
||||
"""
|
||||
img, target, semi_target = (
|
||||
self.data[index],
|
||||
int(self.targets[index]),
|
||||
int(self.semi_targets[index]),
|
||||
)
|
||||
|
||||
# doing this so that it is consistent with all other datasets
|
||||
# to return a PIL Image
|
||||
img = Image.fromarray(img.numpy(), mode="F")
|
||||
|
||||
if self.transform is not None:
|
||||
img = self.transform(img)
|
||||
|
||||
if self.target_transform is not None:
|
||||
target = self.target_transform(target)
|
||||
|
||||
return img, target, semi_target, index
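A brief sketch (editor-added) of the per-item pipeline above, assuming float32 frames; note that transforms.ToTensor() only rescales uint8 inputs by 1/255, so float-mode ("F") frames keep their raw values and simply gain a channel dimension:

import numpy as np
import torchvision.transforms as transforms
from PIL import Image

frame = np.random.rand(16, 256).astype(np.float32)  # illustrative 16x256 frame
tensor = transforms.ToTensor()(Image.fromarray(frame, mode="F"))
assert tensor.shape == (1, 16, 256)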
|
||||
|
||||
|
||||
class EsmeraSplitInference(VisionDataset):
|
||||
def __init__(
|
||||
self,
|
||||
root: str,
|
||||
transforms: Optional[Callable] = None,
|
||||
transform: Optional[Callable] = None,
|
||||
):
|
||||
super(EsmeraSplitInference, self).__init__(root, transforms, transform)
|
||||
logger = logging.getLogger()
|
||||
|
||||
self.experiment_file_path = Path(root)
|
||||
|
||||
if not self.experiment_file_path.is_file():
|
||||
logger.error(
|
||||
"For inference the data path has to be a single experiment file!"
|
||||
)
|
||||
raise Exception("Inference data is not a loadable file!")
|
||||
|
||||
self.data = np.load(self.experiment_file_path)
|
||||
self.data = split_array_into_subarrays(self.data, 16, 256)
|
||||
self.data = np.nan_to_num(self.data)
|
||||
self.data = torch.tensor(self.data)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
def __getitem__(self, index):
|
||||
"""Override the original method of the MNIST class.
|
||||
Args:
|
||||
index (int): Index
|
||||
|
||||
Returns:
|
||||
tuple: (image, index)
|
||||
"""
|
||||
img = self.data[index]
|
||||
|
||||
# doing this so that it is consistent with all other datasets
|
||||
# to return a PIL Image
|
||||
img = Image.fromarray(img.numpy(), mode="F")
|
||||
|
||||
if self.transform is not None:
|
||||
img = self.transform(img)
|
||||
|
||||
return img, index
|
||||
@@ -18,6 +18,9 @@ def load_dataset(
|
||||
ratio_pollution: float = 0.0,
|
||||
random_state=None,
|
||||
inference: bool = False,
|
||||
k_fold: bool = False,
|
||||
num_known_normal: int = 0,
|
||||
num_known_outlier: int = 0,
|
||||
):
|
||||
"""Loads the dataset."""
|
||||
|
||||
@@ -46,6 +49,9 @@ def load_dataset(
|
||||
ratio_known_outlier=ratio_known_outlier,
|
||||
ratio_pollution=ratio_pollution,
|
||||
inference=inference,
|
||||
k_fold=k_fold,
|
||||
num_known_normal=num_known_normal,
|
||||
num_known_outlier=num_known_outlier,
|
||||
)
|
||||
|
||||
if dataset_name == "subtersplit":
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import json
|
||||
import logging
|
||||
import random
|
||||
from pathlib import Path
|
||||
@@ -6,12 +7,13 @@ from typing import Callable, Optional
|
||||
import numpy as np
|
||||
import torch
|
||||
import torchvision.transforms as transforms
|
||||
from base.torchvision_dataset import TorchvisionDataset
|
||||
from PIL import Image
|
||||
from torch.utils.data import Subset
|
||||
from torch.utils.data.dataset import ConcatDataset
|
||||
from torchvision.datasets import VisionDataset
|
||||
|
||||
from base.torchvision_dataset import TorchvisionDataset
|
||||
|
||||
from .preprocessing import create_semisupervised_setting
|
||||
|
||||
|
||||
@@ -23,8 +25,22 @@ class SubTer_Dataset(TorchvisionDataset):
|
||||
ratio_known_outlier: float = 0.0,
|
||||
ratio_pollution: float = 0.0,
|
||||
inference: bool = False,
|
||||
k_fold: bool = False,
|
||||
num_known_normal: int = 0,
|
||||
num_known_outlier: int = 0,
|
||||
only_use_given_semi_targets_for_evaluation: bool = True,
|
||||
):
|
||||
super().__init__(root)
|
||||
if Path(root).is_dir():
|
||||
with open(Path(root) / "semi_targets.json", "r") as f:
|
||||
data = json.load(f)
|
||||
semi_targets_given = {
|
||||
item["filename"]: (
|
||||
item["semi_target_begin_frame"],
|
||||
item["semi_target_end_frame"],
|
||||
)
|
||||
for item in data["files"]
|
||||
}
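For clarity, a hedged sketch (editor-added; values illustrative, field names taken from the parsing above) of the semi_targets.json layout this block expects:

import json

example = {
    "files": [
        {
            "filename": "experiment_03_smoke.npy",  # illustrative file name
            "semi_target_begin_frame": 120,
            "semi_target_end_frame": 480,
        }
    ]
}
with open("semi_targets.json", "w") as fp:
    json.dump(example, fp, indent=2)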
|
||||
|
||||
# Define normal and outlier classes
|
||||
self.n_classes = 2 # 0: normal, 1: outlier
|
||||
@@ -43,38 +59,146 @@ class SubTer_Dataset(TorchvisionDataset):
|
||||
transform=transform,
|
||||
)
|
||||
else:
|
||||
# Get train set
|
||||
train_set = SubTerTraining(
|
||||
root=self.root,
|
||||
transform=transform,
|
||||
target_transform=target_transform,
|
||||
train=True,
|
||||
)
|
||||
if k_fold:
|
||||
# Get train set
|
||||
data_set = SubTerTraining(
|
||||
root=self.root,
|
||||
transform=transform,
|
||||
target_transform=target_transform,
|
||||
train=True,
|
||||
split=1,
|
||||
semi_targets_given=semi_targets_given,
|
||||
)
|
||||
|
||||
# Create semi-supervised setting
|
||||
idx, _, semi_targets = create_semisupervised_setting(
|
||||
train_set.targets.cpu().data.numpy(),
|
||||
self.normal_classes,
|
||||
self.outlier_classes,
|
||||
self.outlier_classes,
|
||||
ratio_known_normal,
|
||||
ratio_known_outlier,
|
||||
ratio_pollution,
|
||||
)
|
||||
train_set.semi_targets[idx] = torch.tensor(
|
||||
np.array(semi_targets, dtype=np.int8)
|
||||
) # set respective semi-supervised labels
|
||||
np.random.seed(0)
|
||||
semi_targets = data_set.semi_targets.numpy()
|
||||
|
||||
# Subset train_set to semi-supervised setup
|
||||
self.train_set = Subset(train_set, idx)
|
||||
# Find indices where semi_targets is -1 (abnormal) or 1 (normal)
|
||||
normal_indices = np.where(semi_targets == 1)[0]
|
||||
abnormal_indices = np.where(semi_targets == -1)[0]
|
||||
|
||||
# Get test set
|
||||
self.test_set = SubTerTraining(
|
||||
root=self.root,
|
||||
train=False,
|
||||
transform=transform,
|
||||
target_transform=target_transform,
|
||||
)
|
||||
# Randomly select the specified number of indices to keep for each category
|
||||
if len(normal_indices) > num_known_normal:
|
||||
keep_normal_indices = np.random.choice(
|
||||
normal_indices, size=num_known_normal, replace=False
|
||||
)
|
||||
else:
|
||||
keep_normal_indices = (
|
||||
normal_indices # Keep all if there are fewer than required
|
||||
)
|
||||
|
||||
if len(abnormal_indices) > num_known_outlier:
|
||||
keep_abnormal_indices = np.random.choice(
|
||||
abnormal_indices, size=num_known_outlier, replace=False
|
||||
)
|
||||
else:
|
||||
keep_abnormal_indices = (
|
||||
abnormal_indices # Keep all if there are fewer than required
|
||||
)
|
||||
|
||||
# Set all values to 0, then restore only the selected -1 and 1 values
|
||||
semi_targets[(semi_targets == 1) | (semi_targets == -1)] = 0
|
||||
semi_targets[keep_normal_indices] = 1
|
||||
semi_targets[keep_abnormal_indices] = -1
|
||||
data_set.semi_targets = torch.tensor(semi_targets, dtype=torch.int8)
|
||||
|
||||
self.data_set = data_set
|
||||
|
||||
# # Create semi-supervised setting
|
||||
# idx, _, semi_targets = create_semisupervised_setting(
|
||||
# data_set.targets.cpu().data.numpy(),
|
||||
# self.normal_classes,
|
||||
# self.outlier_classes,
|
||||
# self.outlier_classes,
|
||||
# ratio_known_normal,
|
||||
# ratio_known_outlier,
|
||||
# ratio_pollution,
|
||||
# )
|
||||
# data_set.semi_targets[idx] = torch.tensor(
|
||||
# np.array(semi_targets, dtype=np.int8)
|
||||
# ) # set respective semi-supervised labels
|
||||
|
||||
# # Subset data_set to semi-supervised setup
|
||||
# self.data_set = Subset(data_set, idx)
|
||||
else:
|
||||
# Get train set
|
||||
if only_use_given_semi_targets_for_evaluation:
|
||||
pass
|
||||
train_set = SubTerTrainingSelective(
|
||||
root=self.root,
|
||||
transform=transform,
|
||||
target_transform=target_transform,
|
||||
train=True,
|
||||
num_known_outlier=num_known_outlier,
|
||||
semi_targets_given=semi_targets_given,
|
||||
)
|
||||
|
||||
np.random.seed(0)
|
||||
semi_targets = train_set.semi_targets.numpy()
|
||||
|
||||
# Find indices where semi_targets is -1 (abnormal) or 1 (normal)
|
||||
normal_indices = np.where(semi_targets == 1)[0]
|
||||
|
||||
# Randomly select the specified number of indices to keep for each category
|
||||
if len(normal_indices) > num_known_normal:
|
||||
keep_normal_indices = np.random.choice(
|
||||
normal_indices, size=num_known_normal, replace=False
|
||||
)
|
||||
else:
|
||||
keep_normal_indices = (
|
||||
normal_indices # Keep all if there are fewer than required
|
||||
)
|
||||
|
||||
# Set all values to 0, then restore only the selected -1 and 1 values
|
||||
semi_targets[semi_targets == 1] = 0
|
||||
semi_targets[keep_normal_indices] = 1
|
||||
train_set.semi_targets = torch.tensor(
|
||||
semi_targets, dtype=torch.int8
|
||||
)
|
||||
|
||||
self.train_set = train_set
|
||||
self.test_set = SubTerTrainingSelective(
|
||||
root=self.root,
|
||||
transform=transform,
|
||||
target_transform=target_transform,
|
||||
num_known_outlier=num_known_outlier,
|
||||
train=False,
|
||||
semi_targets_given=semi_targets_given,
|
||||
)
|
||||
else:
|
||||
train_set = SubTerTraining(
|
||||
root=self.root,
|
||||
transform=transform,
|
||||
target_transform=target_transform,
|
||||
train=True,
|
||||
semi_targets_given=semi_targets_given,
|
||||
)
|
||||
|
||||
# Create semi-supervised setting
|
||||
idx, _, semi_targets = create_semisupervised_setting(
|
||||
train_set.targets.cpu().data.numpy(),
|
||||
self.normal_classes,
|
||||
self.outlier_classes,
|
||||
self.outlier_classes,
|
||||
ratio_known_normal,
|
||||
ratio_known_outlier,
|
||||
ratio_pollution,
|
||||
)
|
||||
train_set.semi_targets[idx] = torch.tensor(
|
||||
np.array(semi_targets, dtype=np.int8)
|
||||
) # set respective semi-supervised labels
|
||||
|
||||
# Subset train_set to semi-supervised setup
|
||||
self.train_set = Subset(train_set, idx)
|
||||
|
||||
# Get test set
|
||||
self.test_set = SubTerTraining(
|
||||
root=self.root,
|
||||
train=False,
|
||||
transform=transform,
|
||||
target_transform=target_transform,
|
||||
semi_targets_given=semi_targets_given,
|
||||
)
|
||||
|
||||
|
||||
class SubTerTraining(VisionDataset):
|
||||
@@ -87,6 +211,8 @@ class SubTerTraining(VisionDataset):
|
||||
train=False,
|
||||
split=0.7,
|
||||
seed=0,
|
||||
semi_targets_given=None,
|
||||
only_use_given_semi_targets_for_evaluation=False,
|
||||
):
|
||||
super(SubTerTraining, self).__init__(
|
||||
root, transforms, transform, target_transform
|
||||
@@ -94,73 +220,120 @@ class SubTerTraining(VisionDataset):
|
||||
|
||||
experiments_data = []
|
||||
experiments_targets = []
|
||||
validation_files = []
|
||||
experiments_semi_targets = []
|
||||
# validation_files = []
|
||||
experiment_files = []
|
||||
experiment_frame_ids = []
|
||||
experiment_file_ids = []
|
||||
file_names = {}
|
||||
|
||||
for experiment_file in Path(root).iterdir():
|
||||
if experiment_file.is_dir() and experiment_file.name == "validation":
|
||||
for validation_file in experiment_file.iterdir():
|
||||
if validation_file.suffix != ".npy":
|
||||
continue
|
||||
validation_files.append(experiment_file)
|
||||
for file_idx, experiment_file in enumerate(sorted(Path(root).iterdir())):
|
||||
# if experiment_file.is_dir() and experiment_file.name == "validation":
|
||||
# for validation_file in experiment_file.iterdir():
|
||||
# if validation_file.suffix != ".npy":
|
||||
# continue
|
||||
# validation_files.append(experiment_file)
|
||||
if experiment_file.suffix != ".npy":
|
||||
continue
|
||||
file_names[file_idx] = experiment_file.name
|
||||
experiment_files.append(experiment_file)
|
||||
experiment_data = np.load(experiment_file)
|
||||
# experiment_data = np.lib.format.open_memmap(experiment_file, mode='r+')
|
||||
experiment_targets = (
|
||||
np.ones(experiment_data.shape[0], dtype=np.int8)
|
||||
if "smoke" in experiment_file.name
|
||||
else np.zeros(experiment_data.shape[0], dtype=np.int8)
|
||||
)
|
||||
# experiment_data = np.lib.format.open_memmap(experiment_file, mode='r+')
|
||||
experiment_semi_targets = np.zeros(experiment_data.shape[0], dtype=np.int8)
|
||||
if "smoke" not in experiment_file.name:
|
||||
experiment_semi_targets = np.ones(
|
||||
experiment_data.shape[0], dtype=np.int8
|
||||
)
|
||||
else:
|
||||
if semi_targets_given:
|
||||
if experiment_file.name in semi_targets_given:
|
||||
semi_target_begin_frame, semi_target_end_frame = (
|
||||
semi_targets_given[experiment_file.name]
|
||||
)
|
||||
experiment_semi_targets[
|
||||
semi_target_begin_frame:semi_target_end_frame
|
||||
] = -1
|
||||
else:
|
||||
experiment_semi_targets = (
|
||||
np.ones(experiment_data.shape[0], dtype=np.int8) * -1
|
||||
)
|
||||
|
||||
experiment_file_ids.append(
|
||||
np.full(experiment_data.shape[0], file_idx, dtype=np.int8)
|
||||
)
|
||||
experiment_frame_ids.append(
|
||||
np.arange(experiment_data.shape[0], dtype=np.int32)
|
||||
)
|
||||
experiments_data.append(experiment_data)
|
||||
experiments_targets.append(experiment_targets)
|
||||
experiments_semi_targets.append(experiment_semi_targets)
|
||||
|
||||
filtered_validation_files = []
|
||||
for validation_file in validation_files:
|
||||
validation_file_name = validation_file.name
|
||||
file_exists_in_experiments = any(
|
||||
experiment_file.name == validation_file_name
|
||||
for experiment_file in experiment_files
|
||||
)
|
||||
if not file_exists_in_experiments:
|
||||
filtered_validation_files.append(validation_file)
|
||||
validation_files = filtered_validation_files
|
||||
# filtered_validation_files = []
|
||||
# for validation_file in validation_files:
|
||||
# validation_file_name = validation_file.name
|
||||
# file_exists_in_experiments = any(
|
||||
# experiment_file.name == validation_file_name
|
||||
# for experiment_file in experiment_files
|
||||
# )
|
||||
# if not file_exists_in_experiments:
|
||||
# filtered_validation_files.append(validation_file)
|
||||
# validation_files = filtered_validation_files
|
||||
|
||||
logger = logging.getLogger()
|
||||
|
||||
logger.info(
|
||||
f"Train/Test experiments: {[experiment_file.name for experiment_file in experiment_files]}"
|
||||
)
|
||||
logger.info(
|
||||
f"Validation experiments: {[validation_file.name for validation_file in validation_files]}"
|
||||
)
|
||||
# logger.info(
|
||||
# f"Validation experiments: {[validation_file.name for validation_file in validation_files]}"
|
||||
# )
|
||||
|
||||
lidar_projections = np.concatenate(experiments_data)
|
||||
smoke_presence = np.concatenate(experiments_targets)
|
||||
semi_targets = np.concatenate(experiments_semi_targets)
|
||||
file_ids = np.concatenate(experiment_file_ids)
|
||||
frame_ids = np.concatenate(experiment_frame_ids)
|
||||
self.file_names = file_names
|
||||
|
||||
np.random.seed(seed)
|
||||
|
||||
shuffled_indices = np.random.permutation(lidar_projections.shape[0])
|
||||
shuffled_lidar_projections = lidar_projections[shuffled_indices]
|
||||
shuffled_smoke_presence = smoke_presence[shuffled_indices]
|
||||
shuffled_file_ids = file_ids[shuffled_indices]
|
||||
shuffled_frame_ids = frame_ids[shuffled_indices]
|
||||
shuffled_semis = semi_targets[shuffled_indices]
|
||||
|
||||
split_idx = int(split * shuffled_lidar_projections.shape[0])
|
||||
|
||||
if train:
|
||||
self.data = shuffled_lidar_projections[:split_idx]
|
||||
self.targets = shuffled_smoke_presence[:split_idx]
|
||||
semi_targets = shuffled_semis[:split_idx]
|
||||
self.shuffled_file_ids = shuffled_file_ids[:split_idx]
|
||||
self.shuffled_frame_ids = shuffled_frame_ids[:split_idx]
|
||||
|
||||
else:
|
||||
self.data = shuffled_lidar_projections[split_idx:]
|
||||
self.targets = shuffled_smoke_presence[split_idx:]
|
||||
semi_targets = shuffled_semis[split_idx:]
|
||||
self.shuffled_file_ids = shuffled_file_ids[split_idx:]
|
||||
self.shuffled_frame_ids = shuffled_frame_ids[split_idx:]
|
||||
|
||||
self.data = np.nan_to_num(self.data)
|
||||
|
||||
self.data = torch.tensor(self.data)
|
||||
self.targets = torch.tensor(self.targets, dtype=torch.int8)
|
||||
|
||||
self.semi_targets = torch.zeros_like(self.targets, dtype=torch.int8)
|
||||
if semi_targets_given is not None:
|
||||
self.semi_targets = torch.tensor(semi_targets, dtype=torch.int8)
|
||||
else:
|
||||
self.semi_targets = torch.zeros_like(self.targets, dtype=torch.int8)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
@@ -173,10 +346,12 @@ class SubTerTraining(VisionDataset):
|
||||
Returns:
|
||||
tuple: (image, target, semi_target, index)
|
||||
"""
|
||||
img, target, semi_target = (
|
||||
img, target, semi_target, file_id, frame_id = (
|
||||
self.data[index],
|
||||
int(self.targets[index]),
|
||||
int(self.semi_targets[index]),
|
||||
int(self.shuffled_file_ids[index]),
|
||||
int(self.shuffled_frame_ids[index]),
|
||||
)
|
||||
|
||||
# doing this so that it is consistent with all other datasets
|
||||
@@ -189,7 +364,10 @@ class SubTerTraining(VisionDataset):
|
||||
if self.target_transform is not None:
|
||||
target = self.target_transform(target)
|
||||
|
||||
return img, target, semi_target, index
|
||||
return img, target, semi_target, index, (file_id, frame_id)
|
||||
|
||||
def get_file_name_from_idx(self, idx: int):
|
||||
return self.file_names[idx]
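Because __getitem__ now returns an extra (file_id, frame_id) element, every batch unpacks into five values; this is why the baseline trainers above switched from `inputs, _, _, _ = data` to `inputs, _, _, _, _ = data`. A hedged usage sketch (editor-added):

from torch.utils.data import DataLoader

loader = DataLoader(train_set, batch_size=128, shuffle=True)  # train_set: a SubTerTraining instance
for inputs, targets, semi_targets, idx, (file_ids, frame_ids) in loader:
    # map the first sample of the batch back to its source experiment file
    name = train_set.get_file_name_from_idx(int(file_ids[0]))
    break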
|
||||
|
||||
|
||||
class SubTerInference(VisionDataset):
|
||||
@@ -235,3 +413,191 @@ class SubTerInference(VisionDataset):
|
||||
img = self.transform(img)
|
||||
|
||||
return img, index
|
||||
|
||||
|
||||
class SubTerTrainingSelective(VisionDataset):
|
||||
def __init__(
|
||||
self,
|
||||
root: str,
|
||||
transforms: Optional[Callable] = None,
|
||||
transform: Optional[Callable] = None,
|
||||
target_transform: Optional[Callable] = None,
|
||||
train=False,
|
||||
num_known_outlier=0,
|
||||
seed=0,
|
||||
semi_targets_given=None,
|
||||
ratio_test_normal_to_anomalous=3,
|
||||
):
|
||||
super(SubTerTrainingSelective, self).__init__(
|
||||
root, transforms, transform, target_transform
|
||||
)
|
||||
|
||||
logger = logging.getLogger()
|
||||
|
||||
if semi_targets_given is None:
|
||||
raise ValueError(
|
||||
"semi_targets_given must be provided for selective training"
|
||||
)
|
||||
|
||||
experiments_data = []
|
||||
experiments_targets = []
|
||||
experiments_semi_targets = []
|
||||
# validation_files = []
|
||||
experiment_files = []
|
||||
experiment_frame_ids = []
|
||||
experiment_file_ids = []
|
||||
file_names = {}
|
||||
|
||||
for file_idx, experiment_file in enumerate(sorted(Path(root).iterdir())):
|
||||
if experiment_file.suffix != ".npy":
|
||||
continue
|
||||
|
||||
file_names[file_idx] = experiment_file.name
|
||||
experiment_files.append(experiment_file)
|
||||
experiment_data = np.load(experiment_file)
|
||||
|
||||
experiment_targets = (
|
||||
np.ones(experiment_data.shape[0], dtype=np.int8)
|
||||
if "smoke" in experiment_file.name
|
||||
else np.zeros(experiment_data.shape[0], dtype=np.int8)
|
||||
)
|
||||
|
||||
experiment_semi_targets = np.zeros(experiment_data.shape[0], dtype=np.int8)
|
||||
if "smoke" not in experiment_file.name:
|
||||
experiment_semi_targets = np.ones(
|
||||
experiment_data.shape[0], dtype=np.int8
|
||||
)
|
||||
elif experiment_file.name in semi_targets_given:
|
||||
semi_target_begin_frame, semi_target_end_frame = semi_targets_given[
|
||||
experiment_file.name
|
||||
]
|
||||
experiment_semi_targets[
|
||||
semi_target_begin_frame:semi_target_end_frame
|
||||
] = -1
|
||||
else:
|
||||
raise ValueError(
|
||||
"smoke experiment not in given semi_targets. required for selective training"
|
||||
)
|
||||
|
||||
experiment_file_ids.append(
|
||||
np.full(experiment_data.shape[0], file_idx, dtype=np.int8)
|
||||
)
|
||||
experiment_frame_ids.append(
|
||||
np.arange(experiment_data.shape[0], dtype=np.int32)
|
||||
)
|
||||
experiments_data.append(experiment_data)
|
||||
experiments_targets.append(experiment_targets)
|
||||
experiments_semi_targets.append(experiment_semi_targets)
|
||||
|
||||
logger.info(
|
||||
f"Train/Test experiments: {[experiment_file.name for experiment_file in experiment_files]}"
|
||||
)
|
||||
|
||||
lidar_projections = np.concatenate(experiments_data)
|
||||
smoke_presence = np.concatenate(experiments_targets)
|
||||
semi_targets = np.concatenate(experiments_semi_targets)
|
||||
file_ids = np.concatenate(experiment_file_ids)
|
||||
frame_ids = np.concatenate(experiment_frame_ids)
|
||||
self.file_names = file_names
|
||||
|
||||
np.random.seed(seed)
|
||||
|
||||
shuffled_indices = np.random.permutation(lidar_projections.shape[0])
|
||||
shuffled_lidar_projections = lidar_projections[shuffled_indices]
|
||||
shuffled_smoke_presence = smoke_presence[shuffled_indices]
|
||||
shuffled_file_ids = file_ids[shuffled_indices]
|
||||
shuffled_frame_ids = frame_ids[shuffled_indices]
|
||||
shuffled_semis = semi_targets[shuffled_indices]
|
||||
|
||||
# check if there are enough known normal and known outlier samples
|
||||
outlier_indices = np.where(shuffled_semis == -1)[0]
|
||||
normal_indices = np.where(shuffled_semis == 1)[0]
|
||||
|
||||
if len(outlier_indices) < num_known_outlier:
|
||||
raise ValueError(
|
||||
f"Not enough known outliers in dataset. Required: {num_known_outlier}, Found: {len(outlier_indices)}"
|
||||
)
|
||||
|
||||
# randomly select known normal and outlier samples
|
||||
keep_outlier_indices = np.random.choice(
|
||||
outlier_indices, size=num_known_outlier, replace=False
|
||||
)
|
||||
|
||||
# put outliers that are not kept into test set and the same number of normal samples aside for testing
|
||||
test_outlier_indices = np.setdiff1d(outlier_indices, keep_outlier_indices)
|
||||
num_test_outliers = len(test_outlier_indices)
|
||||
test_normal_indices = np.random.choice(
|
||||
normal_indices,
|
||||
size=num_test_outliers * ratio_test_normal_to_anomalous,
|
||||
replace=False,
|
||||
)
|
||||
|
||||
# combine test indices
|
||||
test_indices = np.concatenate([test_outlier_indices, test_normal_indices])
|
||||
|
||||
# training indices are the rest
|
||||
train_indices = np.setdiff1d(np.arange(len(shuffled_semis)), test_indices)
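A numeric illustration (editor-added; counts are made up) of the split computed above:

num_outliers_total = 50               # frames labelled -1 in shuffled_semis
num_known_outlier = 10                # kept as labelled training outliers
ratio_test_normal_to_anomalous = 3
num_test_outliers = num_outliers_total - num_known_outlier             # 40 outliers go to the test set
num_test_normals = num_test_outliers * ratio_test_normal_to_anomalous  # plus 3 * 40 = 120 normals
# every remaining frame, including the 10 known outliers, stays in the training set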
|
||||
|
||||
if train:
|
||||
self.data = shuffled_lidar_projections[train_indices]
|
||||
self.targets = shuffled_smoke_presence[train_indices]
|
||||
semi_targets = shuffled_semis[train_indices]
|
||||
self.shuffled_file_ids = shuffled_file_ids[train_indices]
|
||||
self.shuffled_frame_ids = shuffled_frame_ids[train_indices]
|
||||
|
||||
else:
|
||||
self.data = shuffled_lidar_projections[test_indices]
|
||||
self.targets = shuffled_smoke_presence[test_indices]
|
||||
semi_targets = shuffled_semis[test_indices]
|
||||
self.shuffled_file_ids = shuffled_file_ids[test_indices]
|
||||
self.shuffled_frame_ids = shuffled_frame_ids[test_indices]
|
||||
|
||||
self.data = np.nan_to_num(self.data)
|
||||
|
||||
self.data = torch.tensor(self.data)
|
||||
self.targets = torch.tensor(self.targets, dtype=torch.int8)
|
||||
self.semi_targets = torch.tensor(semi_targets, dtype=torch.int8)
|
||||
|
||||
# log some stats to ensure the data is loaded correctly
|
||||
if train:
|
||||
logger.info(
|
||||
f"Training set: {len(self.data)} samples, {sum(self.semi_targets == -1)} semi-supervised samples"
|
||||
)
|
||||
else:
|
||||
logger.info(
|
||||
f"Test set: {len(self.data)} samples, {sum(self.semi_targets == -1)} semi-supervised samples"
|
||||
)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
def __getitem__(self, index):
|
||||
"""Override the original method of the MNIST class.
|
||||
Args:
|
||||
index (int): Index
|
||||
|
||||
Returns:
|
||||
tuple: (image, target, semi_target, index)
|
||||
"""
|
||||
img, target, semi_target, file_id, frame_id = (
|
||||
self.data[index],
|
||||
int(self.targets[index]),
|
||||
int(self.semi_targets[index]),
|
||||
int(self.shuffled_file_ids[index]),
|
||||
int(self.shuffled_frame_ids[index]),
|
||||
)
|
||||
|
||||
# doing this so that it is consistent with all other datasets
|
||||
# to return a PIL Image
|
||||
img = Image.fromarray(img.numpy(), mode="F")
|
||||
|
||||
if self.transform is not None:
|
||||
img = self.transform(img)
|
||||
|
||||
if self.target_transform is not None:
|
||||
target = self.target_transform(target)
|
||||
|
||||
return img, target, semi_target, index, (file_id, frame_id)
|
||||
|
||||
def get_file_name_from_idx(self, idx: int):
|
||||
return self.file_names[idx]
|
||||
|
||||
@@ -5,6 +5,9 @@ from pathlib import Path
|
||||
import click
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from baselines.isoforest import IsoForest
|
||||
from baselines.ocsvm import OCSVM
|
||||
from datasets.main import load_dataset
|
||||
from DeepSAD import DeepSAD
|
||||
from utils.config import Config
|
||||
@@ -64,6 +67,30 @@ from utils.visualization.plot_images_grid import plot_images_grid
|
||||
)
|
||||
@click.argument("xp_path", type=click.Path(exists=True))
|
||||
@click.argument("data_path", type=click.Path(exists=True))
|
||||
@click.option(
|
||||
"--k_fold",
|
||||
type=bool,
|
||||
default=False,
|
||||
help="Use k-fold cross-validation for training (default: False).",
|
||||
)
|
||||
@click.option(
|
||||
"--k_fold_num",
|
||||
type=int,
|
||||
default=5,
|
||||
help="Number of folds for k-fold cross-validation (default: 5).",
|
||||
)
|
||||
@click.option(
|
||||
"--num_known_normal",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Number of max known normal samples (semi-supervised-setting) (default: 0).",
|
||||
)
|
||||
@click.option(
|
||||
"--num_known_outlier",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Number of max known outlier samples (semi-supervised-setting) (default: 0).",
|
||||
)
|
||||
@click.option(
|
||||
"--load_config",
|
||||
type=click.Path(exists=True),
|
||||
@@ -214,12 +241,52 @@ from utils.visualization.plot_images_grid import plot_images_grid
|
||||
"If 1, outlier class as specified in --known_outlier_class option."
|
||||
"If > 1, the specified number of outlier classes will be sampled at random.",
|
||||
)
|
||||
@click.option(
|
||||
"--ocsvm_kernel",
|
||||
type=click.Choice(["rbf", "linear", "poly"]),
|
||||
default="rbf",
|
||||
help="Kernel for the OC-SVM",
|
||||
)
|
||||
@click.option(
|
||||
"--ocsvm_nu",
|
||||
type=float,
|
||||
default=0.1,
|
||||
help="OC-SVM hyperparameter nu (must be 0 < nu <= 1).",
|
||||
)
|
||||
@click.option(
|
||||
"--isoforest_n_estimators",
|
||||
type=int,
|
||||
default=100,
|
||||
help="Set the number of base estimators in the ensemble (default: 100).",
|
||||
)
|
||||
@click.option(
|
||||
"--isoforest_max_samples",
|
||||
type=int,
|
||||
default=256,
|
||||
help="Set the number of samples drawn to train each base estimator (default: 256).",
|
||||
)
|
||||
@click.option(
|
||||
"--isoforest_contamination",
|
||||
type=float,
|
||||
default=0.1,
|
||||
help="Expected fraction of anomalies in the training set. (default: 0.1).",
|
||||
)
|
||||
@click.option(
|
||||
"--isoforest_n_jobs_model",
|
||||
type=int,
|
||||
default=-1,
|
||||
help="Number of jobs for model training.",
|
||||
)
|
||||
def main(
|
||||
action,
|
||||
dataset_name,
|
||||
net_name,
|
||||
xp_path,
|
||||
data_path,
|
||||
k_fold,
|
||||
k_fold_num,
|
||||
num_known_normal,
|
||||
num_known_outlier,
|
||||
load_config,
|
||||
load_model,
|
||||
eta,
|
||||
@@ -246,6 +313,12 @@ def main(
|
||||
normal_class,
|
||||
known_outlier_class,
|
||||
n_known_outlier_classes,
|
||||
ocsvm_kernel,
|
||||
ocsvm_nu,
|
||||
isoforest_n_estimators,
|
||||
isoforest_max_samples,
|
||||
isoforest_contamination,
|
||||
isoforest_n_jobs_model,
|
||||
):
|
||||
"""
|
||||
Deep SAD, a method for deep semi-supervised anomaly detection.
|
||||
@@ -318,6 +391,7 @@ def main(
|
||||
|
||||
if action == "train":
|
||||
# Load data
|
||||
# TODO: pass num of folds
|
||||
dataset = load_dataset(
|
||||
dataset_name,
|
||||
data_path,
|
||||
@@ -328,135 +402,297 @@ def main(
|
||||
ratio_known_outlier,
|
||||
ratio_pollution,
|
||||
random_state=np.random.RandomState(cfg.settings["seed"]),
|
||||
k_fold=k_fold,
|
||||
num_known_normal=num_known_normal,
|
||||
num_known_outlier=num_known_outlier,
|
||||
)
|
||||
# Log random sample of known anomaly classes if more than 1 class
|
||||
if n_known_outlier_classes > 1:
|
||||
logger.info("Known anomaly classes: %s" % (dataset.known_outlier_classes,))
|
||||
|
||||
# Initialize DeepSAD model and set neural network phi
|
||||
deepSAD = DeepSAD(cfg.settings["eta"])
|
||||
deepSAD.set_network(net_name)
|
||||
train_passes = range(k_fold_num) if k_fold else [None]
|
||||
|
||||
# If specified, load Deep SAD model (center c, network weights, and possibly autoencoder weights)
|
||||
if load_model:
|
||||
deepSAD.load_model(model_path=load_model, load_ae=True, map_location=device)
|
||||
logger.info("Loading model from %s." % load_model)
|
||||
train_isoforest = True
|
||||
train_ocsvm = False
|
||||
train_deepsad = True
|
||||
|
||||
logger.info("Pretraining: %s" % pretrain)
|
||||
if pretrain:
|
||||
# Log pretraining details
|
||||
logger.info("Pretraining optimizer: %s" % cfg.settings["ae_optimizer_name"])
|
||||
logger.info("Pretraining learning rate: %g" % cfg.settings["ae_lr"])
|
||||
logger.info("Pretraining epochs: %d" % cfg.settings["ae_n_epochs"])
|
||||
for fold_idx in train_passes:
|
||||
if fold_idx is None:
|
||||
logger.info("Single training without k-fold")
|
||||
else:
|
||||
logger.info(f"Fold {fold_idx + 1}/{k_fold_num}")
|
||||
|
||||
# Initialize OC-SVM model
|
||||
if train_ocsvm:
|
||||
ocsvm = OCSVM(kernel=ocsvm_kernel, nu=ocsvm_nu, hybrid=False)
|
||||
|
||||
# Initialize Isolation Forest model
|
||||
if train_isoforest:
|
||||
Isoforest = IsoForest(
|
||||
hybrid=False,
|
||||
n_estimators=isoforest_n_estimators,
|
||||
max_samples=isoforest_max_samples,
|
||||
contamination=isoforest_contamination,
|
||||
n_jobs=isoforest_n_jobs_model,
|
||||
seed=seed,
|
||||
)
|
||||
|
||||
# Initialize DeepSAD model and set neural network phi
|
||||
if train_deepsad:
|
||||
deepSAD = DeepSAD(cfg.settings["eta"])
|
||||
deepSAD.set_network(net_name)
|
||||
|
||||
# If specified, load Deep SAD model (center c, network weights, and possibly autoencoder weights)
|
||||
if train_deepsad and load_model:
|
||||
deepSAD.load_model(
|
||||
model_path=load_model, load_ae=True, map_location=device
|
||||
)
|
||||
logger.info("Loading model from %s." % load_model)
|
||||
|
||||
logger.info("Pretraining: %s" % pretrain)
|
||||
if train_deepsad and pretrain:
|
||||
# Log pretraining details
|
||||
logger.info(
|
||||
"Pretraining optimizer: %s" % cfg.settings["ae_optimizer_name"]
|
||||
)
|
||||
logger.info("Pretraining learning rate: %g" % cfg.settings["ae_lr"])
|
||||
logger.info("Pretraining epochs: %d" % cfg.settings["ae_n_epochs"])
|
||||
logger.info(
|
||||
"Pretraining learning rate scheduler milestones: %s"
|
||||
% (cfg.settings["ae_lr_milestone"],)
|
||||
)
|
||||
logger.info(
|
||||
"Pretraining batch size: %d" % cfg.settings["ae_batch_size"]
|
||||
)
|
||||
logger.info(
|
||||
"Pretraining weight decay: %g" % cfg.settings["ae_weight_decay"]
|
||||
)
|
||||
|
||||
# Pretrain model on dataset (via autoencoder)
|
||||
deepSAD.pretrain(
|
||||
dataset,
|
||||
optimizer_name=cfg.settings["ae_optimizer_name"],
|
||||
lr=cfg.settings["ae_lr"],
|
||||
n_epochs=cfg.settings["ae_n_epochs"],
|
||||
lr_milestones=cfg.settings["ae_lr_milestone"],
|
||||
batch_size=cfg.settings["ae_batch_size"],
|
||||
weight_decay=cfg.settings["ae_weight_decay"],
|
||||
device=device,
|
||||
n_jobs_dataloader=n_jobs_dataloader,
|
||||
k_fold_idx=fold_idx,
|
||||
)
|
||||
|
||||
# Save pretraining results
|
||||
if fold_idx is None:
|
||||
deepSAD.save_ae_results(export_json=xp_path + "/ae_results.json")
|
||||
else:
|
||||
deepSAD.save_ae_results(
|
||||
export_json=xp_path + f"/ae_results_{fold_idx}.json"
|
||||
)
|
||||
|
||||
# Log training details
|
||||
logger.info("Training optimizer: %s" % cfg.settings["optimizer_name"])
|
||||
logger.info("Training learning rate: %g" % cfg.settings["lr"])
|
||||
logger.info("Training epochs: %d" % cfg.settings["n_epochs"])
|
||||
logger.info(
|
||||
"Pretraining learning rate scheduler milestones: %s"
|
||||
% (cfg.settings["ae_lr_milestone"],)
|
||||
)
|
||||
logger.info("Pretraining batch size: %d" % cfg.settings["ae_batch_size"])
|
||||
logger.info(
|
||||
"Pretraining weight decay: %g" % cfg.settings["ae_weight_decay"]
|
||||
"Training learning rate scheduler milestones: %s"
|
||||
% (cfg.settings["lr_milestone"],)
|
||||
)
|
||||
logger.info("Training batch size: %d" % cfg.settings["batch_size"])
|
||||
logger.info("Training weight decay: %g" % cfg.settings["weight_decay"])
|
||||
|
||||
# Pretrain model on dataset (via autoencoder)
deepSAD.pretrain(
    dataset,
    optimizer_name=cfg.settings["ae_optimizer_name"],
    lr=cfg.settings["ae_lr"],
    n_epochs=cfg.settings["ae_n_epochs"],
    lr_milestones=cfg.settings["ae_lr_milestone"],
    batch_size=cfg.settings["ae_batch_size"],
    weight_decay=cfg.settings["ae_weight_decay"],
    device=device,
    n_jobs_dataloader=n_jobs_dataloader,
)

# Save pretraining results
deepSAD.save_ae_results(export_json=xp_path + "/ae_results.json")

# Log training details
logger.info("Training optimizer: %s" % cfg.settings["optimizer_name"])
logger.info("Training learning rate: %g" % cfg.settings["lr"])
logger.info("Training epochs: %d" % cfg.settings["n_epochs"])
logger.info(
    "Training learning rate scheduler milestones: %s"
    % (cfg.settings["lr_milestone"],)
)
logger.info("Training batch size: %d" % cfg.settings["batch_size"])
logger.info("Training weight decay: %g" % cfg.settings["weight_decay"])

# Train model on dataset
deepSAD.train(
    dataset,
    optimizer_name=cfg.settings["optimizer_name"],
    lr=cfg.settings["lr"],
    n_epochs=cfg.settings["n_epochs"],
    lr_milestones=cfg.settings["lr_milestone"],
    batch_size=cfg.settings["batch_size"],
    weight_decay=cfg.settings["weight_decay"],
    device=device,
    n_jobs_dataloader=n_jobs_dataloader,
)

# Test model
deepSAD.test(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader)

# Save results, model, and configuration
deepSAD.save_results(export_json=xp_path + "/results.json")
deepSAD.save_model(export_model=xp_path + "/model.tar")
cfg.save_config(export_json=xp_path + "/config.json")

# Plot most anomalous and most normal test samples
indices, labels, scores = zip(*deepSAD.results["test_scores"])
indices, labels, scores = np.array(indices), np.array(labels), np.array(scores)
idx_all_sorted = indices[np.argsort(scores)]  # from lowest to highest score
idx_normal_sorted = indices[labels == 0][
    np.argsort(scores[labels == 0])
]  # from lowest to highest score

if dataset_name in ("mnist", "fmnist", "cifar10", "elpv"):
    if dataset_name in ("mnist", "fmnist", "elpv"):
        X_all_low = dataset.test_set.data[idx_all_sorted[:32], ...].unsqueeze(1)
        X_all_high = dataset.test_set.data[idx_all_sorted[-32:], ...].unsqueeze(
            1
        )
        X_normal_low = dataset.test_set.data[
            idx_normal_sorted[:32], ...
        ].unsqueeze(1)
        X_normal_high = dataset.test_set.data[
            idx_normal_sorted[-32:], ...
        ].unsqueeze(1)

    if dataset_name == "cifar10":
        X_all_low = torch.tensor(
            np.transpose(
                dataset.test_set.data[idx_all_sorted[:32], ...], (0, 3, 1, 2)
            )
        )
        X_all_high = torch.tensor(
            np.transpose(
                dataset.test_set.data[idx_all_sorted[-32:], ...], (0, 3, 1, 2)
            )
        )
        X_normal_low = torch.tensor(
            np.transpose(
                dataset.test_set.data[idx_normal_sorted[:32], ...], (0, 3, 1, 2)
            )
        )
        X_normal_high = torch.tensor(
            np.transpose(
                dataset.test_set.data[idx_normal_sorted[-32:], ...],
                (0, 3, 1, 2),
            )
        )

    plot_images_grid(X_all_low, export_img=xp_path + "/all_low", padding=2)
    plot_images_grid(X_all_high, export_img=xp_path + "/all_high", padding=2)
    plot_images_grid(
        X_normal_low, export_img=xp_path + "/normals_low", padding=2
    )
    plot_images_grid(
        X_normal_high, export_img=xp_path + "/normals_high", padding=2
    )

# Train model on dataset
if train_deepsad:
    deepSAD.train(
        dataset,
        optimizer_name=cfg.settings["optimizer_name"],
        lr=cfg.settings["lr"],
        n_epochs=cfg.settings["n_epochs"],
        lr_milestones=cfg.settings["lr_milestone"],
        batch_size=cfg.settings["batch_size"],
        weight_decay=cfg.settings["weight_decay"],
        device=device,
        n_jobs_dataloader=n_jobs_dataloader,
        k_fold_idx=fold_idx,
    )

# Train model on dataset
if train_ocsvm:
    ocsvm.train(
        dataset,
        device=device,
        n_jobs_dataloader=n_jobs_dataloader,
        k_fold_idx=fold_idx,
        batch_size=8,
    )

# Train model on dataset
if train_isoforest:
    Isoforest.train(
        dataset,
        device=device,
        n_jobs_dataloader=n_jobs_dataloader,
        k_fold_idx=fold_idx,
    )

# Test model
if train_deepsad:
    deepSAD.test(
        dataset,
        device=device,
        n_jobs_dataloader=n_jobs_dataloader,
        k_fold_idx=fold_idx,
    )

# Test model
if train_ocsvm:
    ocsvm.test(
        dataset,
        device=device,
        n_jobs_dataloader=n_jobs_dataloader,
        k_fold_idx=fold_idx,
        batch_size=8,
    )

# Test model
if train_isoforest:
    Isoforest.test(
        dataset,
        device=device,
        n_jobs_dataloader=n_jobs_dataloader,
        k_fold_idx=fold_idx,
    )

# Save results, model, and configuration
if fold_idx is None:
    if train_deepsad:
        deepSAD.save_results(export_pkl=xp_path + "/results.pkl")
        deepSAD.save_model(export_model=xp_path + "/model.tar")
    if train_ocsvm:
        ocsvm.save_results(export_pkl=xp_path + "/results_ocsvm.pkl")
    if train_isoforest:
        Isoforest.save_results(
            export_pkl=xp_path + "/results_isoforest.pkl"
        )
else:
    if train_deepsad:
        deepSAD.save_results(
            export_pkl=xp_path + f"/results_{fold_idx}.pkl"
        )
        deepSAD.save_model(export_model=xp_path + f"/model_{fold_idx}.tar")
    if train_ocsvm:
        ocsvm.save_results(
            export_pkl=xp_path + f"/results_ocsvm_{fold_idx}.pkl"
        )
    if train_isoforest:
        Isoforest.save_results(
            export_pkl=xp_path + f"/results_isoforest_{fold_idx}.pkl"
        )
cfg.save_config(export_json=xp_path + "/config.json")
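
# A minimal sketch (not part of the commit) of how the per-fold pickle files written
# above could be aggregated after a k-fold run. The file names follow the code above;
# the "test_auc" key mirrors the trainer attribute, and k_fold_num / the experiment
# path are assumed values.
import pickle

import numpy as np

k_fold_num = 5  # assumed number of folds
xp_path = "log/experiment"  # hypothetical experiment directory

fold_aucs = []
for i in range(k_fold_num):
    with open(xp_path + f"/results_{i}.pkl", "rb") as fp:
        results = pickle.load(fp)
    fold_aucs.append(results["test_auc"])  # assumed key

print(f"AUC per fold: {fold_aucs}")
print(f"Mean AUC over {k_fold_num} folds: {np.mean(fold_aucs):.4f}")
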
# Plot most anomalous and most normal test samples
if train_deepsad:
    indices, labels, scores = zip(*deepSAD.results["test_scores"])
    indices, labels, scores = (
        np.array(indices),
        np.array(labels),
        np.array(scores),
    )
    idx_all_sorted = indices[
        np.argsort(scores)
    ]  # from lowest to highest score
    idx_normal_sorted = indices[labels == 0][
        np.argsort(scores[labels == 0])
    ]  # from lowest to highest score

    if dataset_name in (
        "mnist",
        "fmnist",
        "cifar10",
        "elpv",
    ):
        if dataset_name in (
            "mnist",
            "fmnist",
            "elpv",
        ):
            X_all_low = dataset.test_set.data[
                idx_all_sorted[:32], ...
            ].unsqueeze(1)
            X_all_high = dataset.test_set.data[
                idx_all_sorted[-32:], ...
            ].unsqueeze(1)
            X_normal_low = dataset.test_set.data[
                idx_normal_sorted[:32], ...
            ].unsqueeze(1)
            X_normal_high = dataset.test_set.data[
                idx_normal_sorted[-32:], ...
            ].unsqueeze(1)

        if dataset_name == "cifar10":
            X_all_low = torch.tensor(
                np.transpose(
                    dataset.test_set.data[idx_all_sorted[:32], ...],
                    (0, 3, 1, 2),
                )
            )
            X_all_high = torch.tensor(
                np.transpose(
                    dataset.test_set.data[idx_all_sorted[-32:], ...],
                    (0, 3, 1, 2),
                )
            )
            X_normal_low = torch.tensor(
                np.transpose(
                    dataset.test_set.data[idx_normal_sorted[:32], ...],
                    (0, 3, 1, 2),
                )
            )
            X_normal_high = torch.tensor(
                np.transpose(
                    dataset.test_set.data[idx_normal_sorted[-32:], ...],
                    (0, 3, 1, 2),
                )
            )

        if fold_idx is None:
            plot_images_grid(
                X_all_low, export_img=xp_path + "/all_low", padding=2
            )
            plot_images_grid(
                X_all_high, export_img=xp_path + "/all_high", padding=2
            )
            plot_images_grid(
                X_normal_low, export_img=xp_path + "/normals_low", padding=2
            )
            plot_images_grid(
                X_normal_high,
                export_img=xp_path + "/normals_high",
                padding=2,
            )
        else:
            plot_images_grid(
                X_all_low,
                export_img=xp_path + f"/all_low_{fold_idx}",
                padding=2,
            )
            plot_images_grid(
                X_all_high,
                export_img=xp_path + f"/all_high_{fold_idx}",
                padding=2,
            )
            plot_images_grid(
                X_normal_low,
                export_img=xp_path + f"/normals_low_{fold_idx}",
                padding=2,
            )
            plot_images_grid(
                X_normal_high,
                export_img=xp_path + f"/normals_high_{fold_idx}",
                padding=2,
            )

elif action == "infer":
    dataset = load_dataset(
        dataset_name,
@@ -488,14 +724,23 @@ def main(
    deepSAD.load_model(model_path=load_model, load_ae=True, map_location=device)
    logger.info("Loading model from %s." % load_model)

    inference_results = deepSAD.inference(
    inference_results, all_outputs = deepSAD.inference(
        dataset, device=device, n_jobs_dataloader=n_jobs_dataloader
    )
    inference_results_path = (
        Path(xp_path) / "inference" / Path(dataset.root).with_suffix(".npy").stem
        Path(xp_path)
        / "inference"
        / Path(Path(dataset.root).stem).with_suffix(".npy")
    )
    inference_outputs_path = (
        Path(xp_path)
        / "inference"
        / Path(Path(dataset.root).stem + "_outputs").with_suffix(".npy")
    )

    inference_results_path.parent.mkdir(parents=True, exist_ok=True)
    np.save(inference_results_path, inference_results, fix_imports=False)
    np.save(inference_outputs_path, all_outputs, fix_imports=False)

    logger.info(
        f"Inference: median={np.median(inference_results)} mean={np.mean(inference_results)} min={inference_results.min()} max={inference_results.max()}"
    )

@@ -1,18 +1,23 @@
from base.base_trainer import BaseTrainer
from base.base_dataset import BaseADDataset
from base.base_net import BaseNet
from torch.utils.data.dataloader import DataLoader
from sklearn.metrics import roc_auc_score

import logging
import time

import numpy as np
import torch
import torch.optim as optim
import numpy as np
from sklearn.metrics import (
    average_precision_score,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)
from torch.utils.data.dataloader import DataLoader

from base.base_dataset import BaseADDataset
from base.base_net import BaseNet
from base.base_trainer import BaseTrainer


class DeepSADTrainer(BaseTrainer):

    def __init__(
        self,
        c,
@@ -50,13 +55,22 @@ class DeepSADTrainer(BaseTrainer):
        self.test_time = None
        self.test_scores = None

    def train(self, dataset: BaseADDataset, net: BaseNet):
    def train(
        self, dataset: BaseADDataset, net: BaseNet, k_fold_idx: int = None
    ) -> BaseNet:
        logger = logging.getLogger()

        # Get train data loader
        train_loader, _, _ = dataset.loaders(
            batch_size=self.batch_size, num_workers=self.n_jobs_dataloader
        )
        if k_fold_idx is not None:
            train_loader, _ = dataset.loaders_k_fold(
                fold_idx=k_fold_idx,
                batch_size=self.batch_size,
                num_workers=self.n_jobs_dataloader,
            )
        else:
            train_loader, _, _ = dataset.loaders(
                batch_size=self.batch_size, num_workers=self.n_jobs_dataloader
            )
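
# Note: `loaders_k_fold` is not shown in this diff. A minimal sketch of what such a
# helper on BaseADDataset could look like, assuming an sklearn KFold split over the
# training set; names, defaults, and return signature here are guesses, not the
# repository's actual implementation.
import numpy as np
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, Subset


def loaders_k_fold(self, fold_idx, batch_size, num_workers=0, n_splits=5):
    # Split the training-set indices into n_splits folds (fixed seed assumed so the
    # same split is reproduced across the pretrain/train/test calls above).
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    splits = list(kf.split(np.arange(len(self.train_set))))
    train_idx, val_idx = splits[fold_idx]

    train_loader = DataLoader(
        Subset(self.train_set, train_idx),
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
    )
    val_loader = DataLoader(
        Subset(self.train_set, val_idx),
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
    )
    # Matches the two-value unpacking used above: (train loader, held-out fold loader).
    return train_loader, val_loader
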

        # Set device for network
        net = net.to(self.device)
@@ -82,14 +96,14 @@ class DeepSADTrainer(BaseTrainer):
        start_time = time.time()
        net.train()
        for epoch in range(self.n_epochs):

            epoch_loss = 0.0
            n_batches = 0
            epoch_start_time = time.time()
            for data in train_loader:
                inputs, _, semi_targets, _ = data
                inputs, semi_targets = inputs.to(self.device), semi_targets.to(
                    self.device
                inputs, _, semi_targets, _, _ = data
                inputs, semi_targets = (
                    inputs.to(self.device),
                    semi_targets.to(self.device),
                )

                # Zero the network parameter gradients
@@ -145,6 +159,7 @@ class DeepSADTrainer(BaseTrainer):
        logger.info("Starting inference...")
        n_batches = 0
        start_time = time.time()
        all_outputs = np.zeros((len(inference_loader.dataset), 1024), dtype=np.float32)
        scores = []
        net.eval()
        with torch.no_grad():
@@ -155,6 +170,10 @@ class DeepSADTrainer(BaseTrainer):
                idx = idx.to(self.device)

                outputs = net(inputs)
                all_idx = n_batches * self.batch_size
                all_outputs[all_idx : all_idx + len(inputs)] = (
                    outputs.cpu().data.numpy()
                )
                dist = torch.sum((outputs - self.c) ** 2, dim=1)
                scores += dist.cpu().data.numpy().tolist()

@@ -166,15 +185,22 @@ class DeepSADTrainer(BaseTrainer):
        logger.info("Inference Time: {:.3f}s".format(self.inference_time))
        logger.info("Finished inference.")

        return np.array(scores)
        return np.array(scores), all_outputs

    def test(self, dataset: BaseADDataset, net: BaseNet):
    def test(self, dataset: BaseADDataset, net: BaseNet, k_fold_idx: int = None):
        logger = logging.getLogger()

        # Get test data loader
        _, test_loader, _ = dataset.loaders(
            batch_size=self.batch_size, num_workers=self.n_jobs_dataloader
        )
        if k_fold_idx is not None:
            _, test_loader = dataset.loaders_k_fold(
                fold_idx=k_fold_idx,
                batch_size=self.batch_size,
                num_workers=self.n_jobs_dataloader,
            )
        else:
            _, test_loader, _ = dataset.loaders(
                batch_size=self.batch_size, num_workers=self.n_jobs_dataloader
            )

        # Set device for network
        net = net.to(self.device)
@@ -188,7 +214,7 @@ class DeepSADTrainer(BaseTrainer):
        net.eval()
        with torch.no_grad():
            for data in test_loader:
                inputs, labels, semi_targets, idx = data
                inputs, labels, semi_targets, idx, _ = data

                inputs = inputs.to(self.device)
                labels = labels.to(self.device)
@@ -225,6 +251,9 @@ class DeepSADTrainer(BaseTrainer):
        labels = np.array(labels)
        scores = np.array(scores)
        self.test_auc = roc_auc_score(labels, scores)
        self.test_roc = roc_curve(labels, scores)
        self.test_prc = precision_recall_curve(labels, scores)
        self.test_ap = average_precision_score(labels, scores)
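
# The three attributes added above follow sklearn's return conventions: roc_curve
# gives (fpr, tpr, thresholds) and precision_recall_curve gives
# (precision, recall, thresholds). A self-contained sketch with toy labels/scores:
import numpy as np
from sklearn.metrics import (
    average_precision_score,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)

labels = np.array([0, 0, 0, 1, 1])
scores = np.array([0.10, 0.40, 0.35, 0.80, 0.70])

test_auc = roc_auc_score(labels, scores)
fpr, tpr, roc_thresholds = roc_curve(labels, scores)  # what ends up in test_roc
precision, recall, prc_thresholds = precision_recall_curve(labels, scores)  # test_prc
test_ap = average_precision_score(labels, scores)
print(f"AUC={test_auc:.3f} AP={test_ap:.3f}")
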

        # Log results
        logger.info("Test Loss: {:.6f}".format(epoch_loss / n_batches))
@@ -241,7 +270,7 @@ class DeepSADTrainer(BaseTrainer):
        with torch.no_grad():
            for data in train_loader:
                # get the inputs of the batch
                inputs, _, _, _ = data
                inputs, _, _, _, _ = data
                inputs = inputs.to(self.device)
                outputs = net(inputs)
                n_samples += outputs.shape[0]

@@ -1,18 +1,18 @@
from base.base_trainer import BaseTrainer
from base.base_dataset import BaseADDataset
from base.base_net import BaseNet
from sklearn.metrics import roc_auc_score

import logging
import time

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.metrics import roc_auc_score

from base.base_dataset import BaseADDataset
from base.base_net import BaseNet
from base.base_trainer import BaseTrainer


class AETrainer(BaseTrainer):

    def __init__(
        self,
        optimizer_name: str = "adam",
@@ -40,13 +40,20 @@ class AETrainer(BaseTrainer):
        self.test_auc = None
        self.test_time = None

    def train(self, dataset: BaseADDataset, ae_net: BaseNet):
    def train(self, dataset: BaseADDataset, ae_net: BaseNet, k_fold_idx: int = None):
        logger = logging.getLogger()

        # Get train data loader
        train_loader, _, _ = dataset.loaders(
            batch_size=self.batch_size, num_workers=self.n_jobs_dataloader
        )
        if k_fold_idx is not None:
            train_loader, _ = dataset.loaders_k_fold(
                fold_idx=k_fold_idx,
                batch_size=self.batch_size,
                num_workers=self.n_jobs_dataloader,
            )
        else:
            train_loader, _, _ = dataset.loaders(
                batch_size=self.batch_size, num_workers=self.n_jobs_dataloader
            )

        # Set loss
        criterion = nn.MSELoss(reduction="none")
@@ -69,14 +76,23 @@ class AETrainer(BaseTrainer):
        logger.info("Starting pretraining...")
        start_time = time.time()
        ae_net.train()
        for epoch in range(self.n_epochs):

        all_training_data = []
        for epoch in range(self.n_epochs):
            epoch_loss = 0.0
            n_batches = 0
            epoch_start_time = time.time()
            for data in train_loader:
                inputs, _, _, _ = data
                inputs, _, _, _, file_frame_ids = data
                inputs = inputs.to(self.device)
                all_training_data.append(
                    np.dstack(
                        (
                            file_frame_ids[0].detach().cpu().numpy(),
                            file_frame_ids[1].detach().cpu().numpy(),
                        )
                    )
                )

                # Zero the network parameter gradients
                optimizer.zero_grad()
@@ -107,17 +123,31 @@ class AETrainer(BaseTrainer):

        self.train_time = time.time() - start_time
        logger.info("Pretraining Time: {:.3f}s".format(self.train_time))

        all_training_data = np.concatenate([x.squeeze() for x in all_training_data])

        sorted_training_data = all_training_data[
            np.lexsort((all_training_data[:, 1], all_training_data[:, 0]))
        ]
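
# np.lexsort treats its *last* key as the primary one, so the indexing above orders
# the collected (file_id, frame_id) pairs by file first, frame second. A toy example:
import numpy as np

pairs = np.array([[2, 0], [1, 5], [1, 2], [2, 1]])  # rows of (file_id, frame_id)
order = np.lexsort((pairs[:, 1], pairs[:, 0]))      # primary key: column 0
print(pairs[order])
# [[1 2]
#  [1 5]
#  [2 0]
#  [2 1]]
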

        logger.info("Finished pretraining.")

        return ae_net

    def test(self, dataset: BaseADDataset, ae_net: BaseNet):
    def test(self, dataset: BaseADDataset, ae_net: BaseNet, k_fold_idx: int = None):
        logger = logging.getLogger()

        # Get test data loader
        _, test_loader, _ = dataset.loaders(
            batch_size=self.batch_size, num_workers=self.n_jobs_dataloader
        )
        if k_fold_idx is not None:
            _, test_loader = dataset.loaders_k_fold(
                fold_idx=k_fold_idx,
                batch_size=self.batch_size,
                num_workers=self.n_jobs_dataloader,
            )
        else:
            _, test_loader, _ = dataset.loaders(
                batch_size=self.batch_size, num_workers=self.n_jobs_dataloader
            )

        # Set loss
        criterion = nn.MSELoss(reduction="none")
@@ -133,15 +163,25 @@ class AETrainer(BaseTrainer):
        start_time = time.time()
        idx_label_score = []
        ae_net.eval()
        all_training_data = []
        with torch.no_grad():
            for data in test_loader:
                inputs, labels, _, idx = data
                inputs, labels, _, idx, file_frame_ids = data
                inputs, labels, idx = (
                    inputs.to(self.device),
                    labels.to(self.device),
                    idx.to(self.device),
                )

                all_training_data.append(
                    np.dstack(
                        (
                            file_frame_ids[0].detach().cpu().numpy(),
                            file_frame_ids[1].detach().cpu().numpy(),
                        )
                    )
                )

                rec = ae_net(inputs)
                rec_loss = criterion(rec, inputs)
                scores = torch.mean(rec_loss, dim=tuple(range(1, rec.dim())))
@@ -161,6 +201,12 @@ class AETrainer(BaseTrainer):

        self.test_time = time.time() - start_time

        all_training_data = np.concatenate([x.squeeze() for x in all_training_data])

        sorted_training_data = all_training_data[
            np.lexsort((all_training_data[:, 1], all_training_data[:, 0]))
        ]

        # Compute AUC
        _, labels, scores = zip(*idx_label_score)
        labels = np.array(labels)
Block a user