Compare commits

..

3 Commits

Author SHA1 Message Date
Jan Kowalczyk
8f36bd2e07 new complete auc table 2025-09-17 11:43:38 +02:00
Jan Kowalczyk
936d2ecb6e correct auc table scrip 2025-09-17 11:43:26 +02:00
Jan Kowalczyk
95867bde7a table plot 2025-09-17 11:07:07 +02:00
3 changed files with 603 additions and 11 deletions

Binary file not shown.

View File

@@ -77,6 +77,16 @@
\newcolumntype{Y}{>{\centering\arraybackslash}X} \newcolumntype{Y}{>{\centering\arraybackslash}X}
% Define a slanted column type
\newcolumntype{R}[1]{>{\raggedleft\arraybackslash}p{#1}}
\newcommand{\rotheader}[1]{\rotatebox{90}{\parbox{2cm}{\centering #1}}}
% Full-width labeling row for regime blocks
\newcommand{\regimerow}[1]{%
\addlinespace[2pt]%
\multicolumn{9}{l}{\textbf{#1}}\\
\addlinespace[2pt]%
}
\DeclareRobustCommand{\threadtodo}[4]{% \DeclareRobustCommand{\threadtodo}[4]{%
\todo[inline, \todo[inline,
% \todo[disable, % \todo[disable,
@@ -1425,19 +1435,278 @@ Inference latency per sample is presented in Table~\ref{tab:inference_latency_co
Together, these results provide a comprehensive overview of the computational requirements of our experimental setup. They show that while our deep semi-supervised approach is significantly more demanding during training than classical baselines, it remains highly efficient at inference, which is the decisive factor for deployment in time-critical domains such as rescue robotics. Together, these results provide a comprehensive overview of the computational requirements of our experimental setup. They show that while our deep semi-supervised approach is significantly more demanding during training than classical baselines, it remains highly efficient at inference, which is the decisive factor for deployment in time-critical domains such as rescue robotics.
\newchapter{results_discussion}{Results and Discussion} \newchapter{results_discussion}{Results and Discussion}
\newsection{results}{Results}
% \threadtodo
% {give overview about hardware setup and how long things take to train}
% {we know what we trained but not how long that takes}
% {table of hardware and of how long different trainings took}
% {experiment setup understood $\rightarrow$ what were the experiments' results}
\todo{}
\newsection{hyperparameter_analysis}{Hyperparameter Analysis} \threadtodo
\todo[inline]{result for different amounts of labeled data} {Introduce the structure and scope of the results chapter}
{The reader knows the experiments from the previous chapter, but not the outcomes}
{State that we will first analyze autoencoder results, then anomaly detection performance, and finally inference experiments}
{Clear roadmap $\rightarrow$ prepares reader for detailed sections}
% --- Section: Autoencoder Pretraining Results ---
\section{Autoencoder Pretraining Results}
\threadtodo
{Present autoencoder reconstruction performance across architectures and latent sizes}
{Important because latent size and architecture determine representation quality, which may affect DeepSAD later}
{Show reconstruction losses over latent dimensions, compare Efficient vs LeNet}
{Understanding representation capacity $\rightarrow$ motivates analyzing if AE results transfer to DeepSAD}
%\fig{ae_loss_overall}{figures/ae_loss_overall.png}{Reconstruction loss across latent dimensions for LeNet-inspired and Efficient architectures.}
\threadtodo
{Analyze anomaly reconstruction performance specifically}
{Critical because degraded inputs may reconstruct differently, showing whether networks capture degradation structure}
{Show reconstruction losses on anomalous-only data subset}
{This analysis $\rightarrow$ motivates testing whether better AE reconstructions imply better anomaly detection}
%\fig{ae_loss_degraded}{figures/ae_loss_degraded.png}{Reconstruction loss on degraded-only subsets.}
% --- Section: DeepSAD Training Results ---
\section{DeepSAD Detection Performance}
\begin{table}[t]
\centering
\caption{ROC AUC (mean \textpm std) across 5 folds for \texttt{experiment-based evaluation}, semi-labeling regime: 0 normal samples, 0 anomalous samples.}
\label{tab:auc_exp_based_semi_0_0}
\begin{tabularx}{\textwidth}{cYYYY}
\toprule
\textbf{Latent Dim.} & \textbf{DeepSAD (LeNet)} & \textbf{DeepSAD (Efficient)} & \textbf{IsolationForest} & \textbf{OC\text{-}SVM} \\
\midrule
32 & \textbf{0.801 \textpm 0.019} & 0.791 \textpm 0.011 & 0.717 \textpm 0.006 & 0.752 \textpm 0.045 \\
64 & 0.776 \textpm 0.009 & \textbf{0.786 \textpm 0.012} & 0.718 \textpm 0.010 & 0.742 \textpm 0.018 \\
128 & \textbf{0.784 \textpm 0.024} & 0.784 \textpm 0.017 & 0.719 \textpm 0.017 & 0.775 \textpm 0.009 \\
256 & 0.762 \textpm 0.028 & 0.772 \textpm 0.016 & 0.712 \textpm 0.006 & \textbf{0.793 \textpm 0.022} \\
512 & 0.759 \textpm 0.020 & 0.784 \textpm 0.021 & 0.712 \textpm 0.007 & \textbf{0.804 \textpm 0.027} \\
768 & 0.749 \textpm 0.041 & 0.754 \textpm 0.024 & 0.713 \textpm 0.011 & \textbf{0.812 \textpm 0.023} \\
1024 & 0.757 \textpm 0.020 & 0.750 \textpm 0.017 & 0.716 \textpm 0.012 & \textbf{0.821 \textpm 0.019} \\
\bottomrule
\end{tabularx}
\end{table}
\begin{table}[t]
\centering
\caption{ROC AUC (mean \textpm std) across 5 folds for \texttt{experiment-based evaluation}, semi-labeling regime: 50 normal samples, 10 anomalous samples.}
\label{tab:auc_exp_based_semi_50_10}
\begin{tabularx}{\textwidth}{cYYYY}
\toprule
\textbf{Latent Dim.} & \textbf{DeepSAD (LeNet)} & \textbf{DeepSAD (Efficient)} & \textbf{IsolationForest} & \textbf{OC\text{-}SVM} \\
\midrule
32 & 0.741 \textpm 0.013 & 0.747 \textpm 0.015 & 0.717 \textpm 0.006 & \textbf{0.752 \textpm 0.045} \\
64 & \textbf{0.757 \textpm 0.011} & 0.750 \textpm 0.017 & 0.718 \textpm 0.010 & 0.742 \textpm 0.018 \\
128 & 0.746 \textpm 0.019 & 0.751 \textpm 0.016 & 0.719 \textpm 0.017 & \textbf{0.775 \textpm 0.009} \\
256 & 0.746 \textpm 0.015 & 0.750 \textpm 0.015 & 0.712 \textpm 0.006 & \textbf{0.793 \textpm 0.022} \\
512 & 0.760 \textpm 0.057 & 0.763 \textpm 0.027 & 0.712 \textpm 0.007 & \textbf{0.804 \textpm 0.027} \\
768 & 0.749 \textpm 0.016 & 0.747 \textpm 0.036 & 0.713 \textpm 0.011 & \textbf{0.812 \textpm 0.023} \\
1024 & 0.748 \textpm 0.021 & 0.732 \textpm 0.015 & 0.716 \textpm 0.012 & \textbf{0.821 \textpm 0.019} \\
\bottomrule
\end{tabularx}
\end{table}
\begin{table}[t]
\centering
\caption{ROC AUC (mean \textpm std) across 5 folds for \texttt{experiment-based evaluation}, semi-labeling regime: 500 normal samples, 100 anomalous samples.}
\label{tab:auc_exp_based_semi_500_100}
\begin{tabularx}{\textwidth}{cYYYY}
\toprule
\textbf{Latent Dim.} & \textbf{DeepSAD (LeNet)} & \textbf{DeepSAD (Efficient)} & \textbf{IsolationForest} & \textbf{OC\text{-}SVM} \\
\midrule
32 & 0.765 \textpm 0.005 & \textbf{0.775 \textpm 0.010} & 0.717 \textpm 0.006 & 0.752 \textpm 0.045 \\
64 & 0.754 \textpm 0.013 & \textbf{0.773 \textpm 0.020} & 0.718 \textpm 0.010 & 0.742 \textpm 0.018 \\
128 & 0.758 \textpm 0.009 & 0.769 \textpm 0.014 & 0.719 \textpm 0.017 & \textbf{0.775 \textpm 0.009} \\
256 & 0.749 \textpm 0.016 & 0.768 \textpm 0.021 & 0.712 \textpm 0.006 & \textbf{0.793 \textpm 0.022} \\
512 & 0.766 \textpm 0.043 & 0.770 \textpm 0.026 & 0.712 \textpm 0.007 & \textbf{0.804 \textpm 0.027} \\
768 & 0.746 \textpm 0.016 & 0.750 \textpm 0.027 & 0.713 \textpm 0.011 & \textbf{0.812 \textpm 0.023} \\
1024 & 0.743 \textpm 0.023 & 0.739 \textpm 0.016 & 0.716 \textpm 0.012 & \textbf{0.821 \textpm 0.019} \\
\bottomrule
\end{tabularx}
\end{table}
\begin{table}[t]
\centering
\caption{ROC AUC (mean \textpm std) across 5 folds for \texttt{handlabeling-based evaluation}, semi-labeling regime: 0 normal samples, 0 anomalous samples.}
\label{tab:auc_manual_based_semi_0_0}
\begin{tabularx}{\textwidth}{cYYYY}
\toprule
\textbf{Latent Dim.} & \textbf{DeepSAD (LeNet)} & \textbf{DeepSAD (Efficient)} & \textbf{IsolationForest} & \textbf{OC\text{-}SVM} \\
\midrule
32 & \textbf{1.000 \textpm 0.000} & \textbf{1.000 \textpm 0.000} & 0.921 \textpm 0.010 & 0.917 \textpm 0.014 \\
64 & 1.000 \textpm 0.000 & \textbf{1.000 \textpm 0.000} & 0.917 \textpm 0.007 & 0.931 \textpm 0.023 \\
128 & \textbf{1.000 \textpm 0.000} & \textbf{1.000 \textpm 0.000} & 0.921 \textpm 0.008 & 0.967 \textpm 0.029 \\
256 & 1.000 \textpm 0.000 & \textbf{1.000 \textpm 0.000} & 0.918 \textpm 0.009 & 0.966 \textpm 0.016 \\
512 & 1.000 \textpm 0.000 & \textbf{1.000 \textpm 0.000} & 0.920 \textpm 0.010 & 0.949 \textpm 0.021 \\
768 & 1.000 \textpm 0.000 & \textbf{1.000 \textpm 0.000} & 0.923 \textpm 0.007 & 0.960 \textpm 0.024 \\
1024 & 1.000 \textpm 0.000 & \textbf{1.000 \textpm 0.000} & 0.919 \textpm 0.005 & 0.956 \textpm 0.011 \\
\bottomrule
\end{tabularx}
\end{table}
\begin{table}[t]
\centering
\caption{ROC AUC (mean \textpm std) across 5 folds for \texttt{handlabeling-based evaluation}, semi-labeling regime: 50 normal samples, 10 anomalous samples.}
\label{tab:auc_manual_based_semi_50_10}
\begin{tabularx}{\textwidth}{cYYYY}
\toprule
\textbf{Latent Dim.} & \textbf{DeepSAD (LeNet)} & \textbf{DeepSAD (Efficient)} & \textbf{IsolationForest} & \textbf{OC\text{-}SVM} \\
\midrule
32 & 0.990 \textpm 0.019 & \textbf{0.998 \textpm 0.001} & 0.921 \textpm 0.010 & 0.917 \textpm 0.014 \\
64 & 0.998 \textpm 0.003 & \textbf{0.999 \textpm 0.000} & 0.917 \textpm 0.007 & 0.931 \textpm 0.023 \\
128 & 0.991 \textpm 0.018 & \textbf{0.999 \textpm 0.000} & 0.921 \textpm 0.008 & 0.967 \textpm 0.029 \\
256 & 0.999 \textpm 0.002 & \textbf{0.999 \textpm 0.001} & 0.918 \textpm 0.009 & 0.966 \textpm 0.016 \\
512 & 0.972 \textpm 0.060 & \textbf{0.999 \textpm 0.001} & 0.920 \textpm 0.010 & 0.949 \textpm 0.021 \\
768 & \textbf{1.000 \textpm 0.000} & 0.998 \textpm 0.001 & 0.923 \textpm 0.007 & 0.960 \textpm 0.024 \\
1024 & \textbf{0.999 \textpm 0.001} & 0.998 \textpm 0.001 & 0.919 \textpm 0.005 & 0.956 \textpm 0.011 \\
\bottomrule
\end{tabularx}
\end{table}
\begin{table}[t]
\centering
\caption{ROC AUC (mean \textpm std) across 5 folds for \texttt{handlabeling-based evaluation}, semi-labeling regime: 500 normal samples, 100 anomalous samples.}
\label{tab:auc_manual_based_semi_500_100}
\begin{tabularx}{\textwidth}{cYYYY}
\toprule
\textbf{Latent Dim.} & \textbf{DeepSAD (LeNet)} & \textbf{DeepSAD (Efficient)} & \textbf{IsolationForest} & \textbf{OC\text{-}SVM} \\
\midrule
32 & \textbf{1.000 \textpm 0.000} & 1.000 \textpm 0.000 & 0.921 \textpm 0.010 & 0.917 \textpm 0.014 \\
64 & 1.000 \textpm 0.000 & \textbf{1.000 \textpm 0.000} & 0.917 \textpm 0.007 & 0.931 \textpm 0.023 \\
128 & 1.000 \textpm 0.000 & \textbf{1.000 \textpm 0.000} & 0.921 \textpm 0.008 & 0.967 \textpm 0.029 \\
256 & 0.999 \textpm 0.001 & \textbf{1.000 \textpm 0.000} & 0.918 \textpm 0.009 & 0.966 \textpm 0.016 \\
512 & 0.989 \textpm 0.025 & \textbf{1.000 \textpm 0.000} & 0.920 \textpm 0.010 & 0.949 \textpm 0.021 \\
768 & 1.000 \textpm 0.000 & \textbf{1.000 \textpm 0.000} & 0.923 \textpm 0.007 & 0.960 \textpm 0.024 \\
1024 & 1.000 \textpm 0.000 & \textbf{1.000 \textpm 0.000} & 0.919 \textpm 0.005 & 0.956 \textpm 0.011 \\
\bottomrule
\end{tabularx}
\end{table}
\threadtodo
{Introduce DeepSAD anomaly detection results compared to baselines}
{Core part of evaluation: shows if DeepSAD provides benefit beyond standard methods}
{Explain ROC/PRC as evaluation metrics, show curves for all latent sizes, unsupervised case}
{Results here $\rightarrow$ baseline comparison and semi-supervised effects}
\begin{table}[t]
\centering
\setlength{\tabcolsep}{4pt}
\renewcommand{\arraystretch}{1.2}
\begin{tabularx}{\textwidth}{c*{4}{Y}|*{4}{Y}}
\toprule
& \multicolumn{4}{c}{Experiment-based eval.} & \multicolumn{4}{c}{Handlabeled eval.} \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9}
Latent Dim. & \rotheader{DeepSAD \\(LeNet)} & \rotheader{DeepSAD\\(Efficient)} & \rotheader{IsoForest} & \rotheader{OC-SVM} & \rotheader{DeepSAD\\(LeNet)} & \rotheader{DeepSAD\\(Efficient)} & \rotheader{IsoForest} & \rotheader{OC-SVM} \\
\midrule
\multicolumn{9}{l}{\textbf{Labeling regime: }\(\mathbf{0/0}\) \textit{(normal/anomalous samples labeled)}} \\
\addlinespace[2pt]
32 & \textbf{0.801} & 0.791 & 0.717 & 0.752 & \textbf{1.000} & \textbf{1.000} & 0.921 & 0.917 \\
64 & 0.776 & \textbf{0.786} & 0.718 & 0.742 & \textbf{1.000} & \textbf{1.000} & 0.917 & 0.931 \\
128 & \textbf{0.784} & \textbf{0.784} & 0.719 & 0.775 & \textbf{1.000} & \textbf{1.000} & 0.921 & 0.967 \\
256 & 0.762 & 0.772 & 0.712 & \textbf{0.793} & \textbf{1.000} & \textbf{1.000} & 0.918 & 0.966 \\
512 & 0.759 & 0.784 & 0.712 & \textbf{0.804} & \textbf{1.000} & \textbf{1.000} & 0.920 & 0.949 \\
768 & 0.749 & 0.754 & 0.713 & \textbf{0.812} & \textbf{1.000} & \textbf{1.000} & 0.923 & 0.960 \\
1024 & 0.757 & 0.750 & 0.716 & \textbf{0.821} & \textbf{1.000} & \textbf{1.000} & 0.919 & 0.956 \\
\midrule
\multicolumn{9}{l}{\textbf{Labeling regime: }\(\mathbf{50/10}\) \textit{(normal/anomalous samples labeled)}} \\
\addlinespace[2pt]
32 & 0.741 & 0.747 & 0.717 & \textbf{0.752} & 0.990 & \textbf{0.998} & 0.921 & 0.917 \\
64 & \textbf{0.757} & 0.750 & 0.718 & 0.742 & 0.998 & \textbf{0.999} & 0.917 & 0.931 \\
128 & 0.746 & 0.751 & 0.719 & \textbf{0.775} & 0.991 & \textbf{0.999} & 0.921 & 0.967 \\
256 & 0.746 & 0.750 & 0.712 & \textbf{0.793} & \textbf{0.999} & \textbf{0.999} & 0.918 & 0.966 \\
512 & 0.760 & 0.763 & 0.712 & \textbf{0.804} & 0.972 & \textbf{0.999} & 0.920 & 0.949 \\
768 & 0.749 & 0.747 & 0.713 & \textbf{0.812} & \textbf{1.000} & 0.998 & 0.923 & 0.960 \\
1024 & 0.748 & 0.732 & 0.716 & \textbf{0.821} & \textbf{0.999} & 0.998 & 0.919 & 0.956 \\
\midrule
\multicolumn{9}{l}{\textbf{Labeling regime: }\(\mathbf{500/100}\) \textit{(normal/anomalous samples labeled)}} \\
\addlinespace[2pt]
32 & 0.765 & \textbf{0.775} & 0.717 & 0.752 & \textbf{1.000} & \textbf{1.000} & 0.921 & 0.917 \\
64 & 0.754 & \textbf{0.773} & 0.718 & 0.742 & \textbf{1.000} & \textbf{1.000} & 0.917 & 0.931 \\
128 & 0.758 & 0.769 & 0.719 & \textbf{0.775} & \textbf{1.000} & \textbf{1.000} & 0.921 & 0.967 \\
256 & 0.749 & 0.768 & 0.712 & \textbf{0.793} & 0.999 & \textbf{1.000} & 0.918 & 0.966 \\
512 & 0.766 & 0.770 & 0.712 & \textbf{0.804} & 0.989 & \textbf{1.000} & 0.920 & 0.949 \\
768 & 0.746 & 0.750 & 0.713 & \textbf{0.812} & \textbf{1.000} & \textbf{1.000} & 0.923 & 0.960 \\
1024 & 0.743 & 0.739 & 0.716 & \textbf{0.821} & \textbf{1.000} & \textbf{1.000} & 0.919 & 0.956 \\
\bottomrule
\end{tabularx}
\caption{AUC means across 5 folds for both evaluations, grouped by labeling regime. Maximum observed standard deviation across all cells (not shown in table): 0.060.}
\end{table}
%\fig{roc_prc_unsup}{figures/roc_prc_unsup.png}{ROC and PRC curves for DeepSAD, Isolation Forest, and OCSVM (unsupervised, all latent dimensions).}
\threadtodo
{Interpret unsupervised results across architectures and baselines}
{Important to establish the baseline performance levels}
{Compare AUCs: Isolation Forest weakest, OCSVM moderate (uses encoder), DeepSAD best}
{Sets expectation for whether supervision improves or harms performance}
\threadtodo
{Present semi-supervised regimes and their effects}
{Semi-supervision is central to DeepSAD; must show how labels change outcomes}
{Show ROC/PRC plots for selected latent sizes under different labeling regimes}
{This leads $\rightarrow$ analysis of why few labels harmed but many labels improved}
%\fig{roc_prc_semi}{figures/roc_prc_semi.png}{ROC and PRC curves for selected latent sizes under different semi-supervised regimes.}
\threadtodo
{Discuss surprising supervision dynamics}
{Reader expects supervision to always help; but results show nuance}
{Interpret why few labels overfit, many labels help, unsupervised sometimes best}
{This discussion $\rightarrow$ motivates looking at model behavior over time via inference}
% --- Section: Inference Experiments ---
\section{Inference on Held-Out Experiments}
\threadtodo
{Introduce inference evaluation on unseen experiments}
{This tests real-world usefulness: continuous scan-level degradation quantification}
{Explain setup: EMA-smoothed z-scores compared against heuristic degradation indicators}
{From static metrics $\rightarrow$ to temporal behavior analysis}
%\fig{inference_indicators}{figures/inference_indicators.png}{Example inference traces: EMA-smoothed anomaly scores compared to missing-point percentage and near-sensor returns.}
\threadtodo
{Analyze correlation of anomaly scores with degradation indicators}
{Important because it shows methods behave as intended even without perfect ground truth}
{Discuss qualitative similarity, emphasize scores as degradation proxies}
{Sets stage $\rightarrow$ for clean vs degraded comparison}
\threadtodo
{Compare anomaly score dynamics between clean and degraded experiments}
{Tests whether scores separate normal vs degraded traversals reliably}
{Show normalized z-score plots using clean-experiment parameters}
{Final confirmation $\rightarrow$ methods are meaningful for degradation quantification}
%\fig{inference_clean_vs_smoke}{figures/inference_clean_vs_smoke.png}{Normalized anomaly scores for a clean vs degraded experiment. Clear amplitude separation is visible.}
% --- Section: Results Summary ---
\section{Summary of Results}
\threadtodo
{Summarize main findings across all results}
{Reader should leave with a compact understanding of what was learned}
{State that Efficient autoencoder reconstructs better, DeepSAD beats baselines, semi-supervision shows tradeoffs, inference confirms degradation quantification works}
{Clear closure $\rightarrow$ prepares transition to discussion, limitations, and future work}
% \todo[inline]{introductory paragraph results}
% \todo[inline]{autoencoder results, compare lenet to efficient, shows that efficient is better and especially at lower latent dims, interesting to see in future exps if autencoder results appear to transfer to deepsad training results, therefore not a single latent dim in later exps, but rather all so it can be compared. also interesting to see if efficient better than lenet since reconstruction loss is better for efficient}
%
% \todo[inline]{we already have results graphs loss over latent dims with both lenet and effficient arch in plot, we also have overall plot as well as one for evaluation only with degraded data (anomalies) to see how good the networks are in reconstructing anomalies, not only normal data, plots enough or table with results necessary?}
%
% \todo[inline]{transition to main training results, should we show ROC/PRC comparisons of methods first or should we first show inference as score over time for one (during training left out) experiment?}
%
% \todo[inline]{main training compare roc/prc of 7 latent dimensionalities with 0 normal 0 anomalous semi regime, plot with 7 subplots, both deepsad better than baselines in both labeling regimes (experiment based and subjective hand-labeled evaluations). as expected isoforest worst since its simplest, ocsvm better since it profits from pre-trained encoder which should be good at dim reduction while maximizing retained information, efficient and lenet have similar results, although efficient has less variance between folds which could either mean its more effective at finding patterns (due to maybe more channels, better receptive field, etc) or it could mean it overfits more readily to data? not sure tbh and I don't think we can interpret from these limited evaluations, but better evaluation not possible without good ground truth}
%
% \todo[inline]{main training compare roc/prc of semi-regimes from 2 or 3 latent dimensionalities, show that unsupervised was best, then heavily semi-supervised then a few labeled samples in last position, why was this? maybe the few labeled examples create overfit already and lot of them improve overfit but are worse than generalized unsupervised version?}
%
% \todo[inline]{inference results showing the general workings of the methods on two experiments (one degraded, one normal - so no smoke) which were left out during training of these methods. inference plots of which 2 kinds exist: one that compares the smoothed z-score of the methods (to reduce noise in plots with EMA, which is not dependent on future data, so could be used in realtime and reacts way faster than moving averages and z-score is used since the analog output values from the different methods have different signs and magnitudes) with two statistical values we discussed in data section, namely missing percentage of points per lidar scan and erroneous near-sensor returns which have to be early returns per scan. these show that all methods have comparative qualities to these statistics, although the should not be taken as a ground truth, just as an indicator showing that generally the intended use case appears to be fulfilled by all methods (which was to interpret the anomaly score as a degradtaion quantification of each individual scan)}
% \todo[inline]{the second kind of inference plots shows the difference between scores produced on normal (non-degraded) experiment data vs scores produced on anomalous (degraded) data by normalizing the timeline of two experiments of which one contains no smoke and one has been degraded with artificial smoke. this has been achieved by using the z-score parameters of the clean data scores on both the clean experiment scores and the degraded experiment scores to show that there is a large difference between the amplitudes of these methods' scores for the two types of experiments}
%
% \todo[inline]{anything else for results or simply transition to conclusion and future work?}
%
%
% \newsection{hyperparameter_analysis}{Hyperparameter Analysis}
% \todo[inline]{result for different amounts of labeled data}
\newchapter{conclusion_future_work}{Conclusion and Future Work} \newchapter{conclusion_future_work}{Conclusion and Future Work}
\newsection{conclusion}{Conclusion} \newsection{conclusion}{Conclusion}

View File

@@ -0,0 +1,323 @@
from __future__ import annotations

import math
import shutil
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path

import polars as pl

# CHANGE THIS IMPORT IF YOUR LOADER MODULE IS NAMED DIFFERENTLY
from load_results import load_results_dataframe
# ----------------------------
# Config
# ----------------------------
ROOT = Path("/home/fedex/mt/results/copy") # experiments root you pass to the loader
# All generated .tex tables (and archived script copies) land under this directory.
OUTPUT_DIR = Path("/home/fedex/mt/plots/results_latent_space_tables")
# Semi-labeling regimes (semi_normals, semi_anomalous) in display order
SEMI_LABELING_REGIMES: list[tuple[int, int]] = [(0, 0), (50, 10), (500, 100)]
# Both evals are shown side-by-side in one table
# (left table group = "exp_based", right table group = "manual_based").
EVALS_BOTH: tuple[str, str] = ("exp_based", "manual_based")
# Row order (latent dims)
LATENT_DIMS: list[int] = [32, 64, 128, 256, 512, 768, 1024]
# Column order (method shown to the user)
# We split DeepSAD into the two network backbones, like your plots.
METHOD_COLUMNS = [
    ("deepsad", "LeNet"), # DeepSAD (LeNet)
    ("deepsad", "Efficient"), # DeepSAD (Efficient)
    ("isoforest", "Efficient"), # IsolationForest (Efficient baseline)
    ("ocsvm", "Efficient"), # OC-SVM (Efficient baseline)
]
# Formatting
DECIMALS = 3 # cells look like 1.000 or 0.928 (3 decimals)
# ----------------------------
# Helpers
# ----------------------------
def _with_net_label(df: pl.DataFrame) -> pl.DataFrame:
    """Add a canonical 'net_label' column like the plotting script (LeNet/Efficient/fallback)."""
    network = pl.col("network").cast(pl.Utf8)
    lowered = network.str.to_lowercase()
    # Case-insensitive substring match; anything unrecognized keeps its raw name.
    label = (
        pl.when(lowered.str.contains("lenet"))
        .then(pl.lit("LeNet"))
        .when(lowered.str.contains("efficient"))
        .then(pl.lit("Efficient"))
        .otherwise(network)
        .alias("net_label")
    )
    return df.with_columns(label)
def _filter_base(df: pl.DataFrame) -> pl.DataFrame:
    """Restrict to valid dims/models and needed columns (no eval/regime filtering here)."""
    keep = (
        pl.col("latent_dim").is_in(LATENT_DIMS)
        & pl.col("model").is_in(["deepsad", "isoforest", "ocsvm"])
        & pl.col("eval").is_in(list(EVALS_BOTH))
    )
    # Only the columns the aggregation step actually consumes.
    needed = [
        "model",
        "net_label",
        "latent_dim",
        "fold",
        "auc",
        "eval",
        "semi_normals",
        "semi_anomalous",
    ]
    return df.filter(keep).select(needed)
@dataclass(frozen=True)
class Cell:
    """Aggregated AUC statistics for one table cell (across CV folds)."""

    # Mean AUC across folds; None when the (eval, dim, model, regime) combo is missing.
    mean: float | None
    # Std of AUC across folds; None when missing (may also be NaN — TODO confirm
    # polars' std() behavior for single-fold groups).
    std: float | None
def _compute_cells(df: pl.DataFrame) -> dict[tuple[str, int, str, str, int, int], Cell]:
    """
    Compute per-(eval, latent_dim, model, net_label, semi_normals, semi_anomalous)
    mean/std for AUC across folds.
    """
    if df.is_empty():
        return {}
    # For baselines (isoforest/ocsvm) constrain to Efficient backbone
    baseline_mask = (
        pl.when(pl.col("model").is_in(["isoforest", "ocsvm"]))
        .then(pl.col("net_label") == "Efficient")
        .otherwise(True)
    )
    grouped = (
        df.filter(baseline_mask)
        .group_by(
            [
                "eval",
                "latent_dim",
                "model",
                "net_label",
                "semi_normals",
                "semi_anomalous",
            ]
        )
        .agg(
            pl.col("auc").mean().alias("mean_auc"),
            pl.col("auc").std().alias("std_auc"),
        )
    )
    # Re-key the aggregate rows into a flat lookup dict of Cell records.
    return {
        (
            str(row["eval"]),
            int(row["latent_dim"]),
            str(row["model"]),
            str(row["net_label"]),
            int(row["semi_normals"]),
            int(row["semi_anomalous"]),
        ): Cell(mean=row.get("mean_auc"), std=row.get("std_auc"))
        for row in grouped.to_dicts()
    }
def _fmt_mean(mean: float | None) -> str:
return "--" if (mean is None or not (mean == mean)) else f"{mean:.{DECIMALS}f}"
def _bold_best_mask_display(values: list[float | None], decimals: int) -> list[bool]:
"""
Bolding mask based on *displayed* precision. Any entries that round (via f-string)
to the maximum at 'decimals' places are bolded (ties bolded).
"""
def disp(v: float | None) -> float | None:
if v is None or not (v == v):
return None
return float(f"{v:.{decimals}f}")
rounded = [disp(v) for v in values]
finite = [v for v in rounded if v is not None]
if not finite:
return [False] * len(values)
maxv = max(finite)
return [(v is not None and v == maxv) for v in rounded]
def _build_single_table(
    cells: dict[tuple[str, int, str, str, int, int], Cell],
    *,
    semi_labeling_regimes: list[tuple[int, int]],
) -> tuple[str, float | None]:
    """
    Build the LaTeX table string with grouped headers and regime blocks.

    Args:
        cells: Aggregated AUC cells keyed by
            (eval, latent_dim, model, net_label, semi_normals, semi_anomalous).
        semi_labeling_regimes: (semi_normals, semi_anomalous) pairs in the
            display order of the regime blocks.

    Returns:
        (latex, max_std_overall), where max_std_overall is the largest finite
        std observed across all cells (None if none was seen).
    """
    # Rotated header labels (90° slanted); reused for both eval groups.
    header_cols = [
        r"\rotheader{DeepSAD\\(LeNet)}",
        r"\rotheader{DeepSAD\\(Efficient)}",
        r"\rotheader{IsoForest}",
        r"\rotheader{OC-SVM}",
    ]

    # Track max std across all cells (reported only in the caption).
    max_std: float | None = None

    def push_std(std_val: float | None) -> None:
        # Ignore None/NaN; keep the running maximum of finite stds.
        nonlocal max_std
        if std_val is None or not (std_val == std_val):
            return
        if max_std is None or std_val > max_std:
            max_std = std_val

    def collect_group(
        eval_type: str, dim: int, semi_n: int, semi_a: int
    ) -> tuple[list[float | None], list[str]]:
        # Gather means and formatted cell strings for one eval group, in
        # METHOD_COLUMNS order. (Replaces the previously duplicated
        # left-group/right-group assembly code.) Also feeds the std tracker.
        means: list[float | None] = []
        cell_strs: list[str] = []
        for model, net in METHOD_COLUMNS:
            cell = cells.get((eval_type, dim, model, net, semi_n, semi_a), Cell(None, None))
            means.append(cell.mean)
            cell_strs.append(_fmt_mean(cell.mean))
            push_std(cell.std)
        return means, cell_strs

    def embolden(cell_strs: list[str], mask: list[bool]) -> list[str]:
        # Wrap best-in-group values in \textbf; "--" (missing) is never bolded.
        return [
            (r"\textbf{" + s + "}") if (do_bold and s != "--") else s
            for s, do_bold in zip(cell_strs, mask)
        ]

    lines: list[str] = []
    # Table preamble / structure
    lines.append(r"\begin{table}[t]")
    lines.append(r"\centering")
    lines.append(r"\setlength{\tabcolsep}{4pt}")
    lines.append(r"\renewcommand{\arraystretch}{1.2}")
    # Vertical rule between the two groups for data/header rows:
    lines.append(r"\begin{tabularx}{\textwidth}{c*{4}{Y}|*{4}{Y}}")
    lines.append(r"\toprule")
    lines.append(
        r" & \multicolumn{4}{c}{Experiment-based eval.} & \multicolumn{4}{c}{Handlabeled eval.} \\"
    )
    lines.append(r"\cmidrule(lr){2-5} \cmidrule(lr){6-9}")
    lines.append(
        r"Latent Dim. & "
        + " & ".join(header_cols)
        + " & "
        + " & ".join(header_cols)
        + r" \\"
    )
    lines.append(r"\midrule")

    # Iterate regimes and rows
    for idx, (semi_n, semi_a) in enumerate(semi_labeling_regimes):
        # Regime label row (multicolumn suppresses the vertical bar in this row)
        lines.append(
            rf"\multicolumn{{9}}{{l}}{{\textbf{{Labeling regime: }}\(\mathbf{{{semi_n}/{semi_a}}}\) "
            rf"\textit{{(normal/anomalous samples labeled)}}}} \\"
        )
        lines.append(r"\addlinespace[2pt]")
        for dim in LATENT_DIMS:
            # Left group: exp_based; right group: manual_based.
            means_left, strs_left = collect_group(EVALS_BOTH[0], dim, semi_n, semi_a)
            means_right, strs_right = collect_group(EVALS_BOTH[1], dim, semi_n, semi_a)

            # Bolding per group based on displayed precision
            pretty_left = embolden(strs_left, _bold_best_mask_display(means_left, DECIMALS))
            pretty_right = embolden(strs_right, _bold_best_mask_display(means_right, DECIMALS))

            # Join with the vertical bar between groups automatically handled by column spec
            lines.append(
                f"{dim} & "
                + " & ".join(pretty_left)
                + " & "
                + " & ".join(pretty_right)
                + r" \\"
            )
        # Separator between regime blocks (but not after the last one)
        if idx < len(semi_labeling_regimes) - 1:
            lines.append(r"\midrule")

    lines.append(r"\bottomrule")
    lines.append(r"\end{tabularx}")

    # Caption with max std (not shown in table)
    max_std_str = "n/a" if max_std is None else f"{max_std:.{DECIMALS}f}"
    lines.append(
        rf"\caption{{AUC means across 5 folds for both evaluations, grouped by labeling regime. "
        rf"Maximum observed standard deviation across all cells (not shown in table): {max_std_str}.}}"
    )
    lines.append(r"\end{table}")
    return "\n".join(lines), max_std
def main():
    """Load results, render the combined AUC table, and archive the outputs."""
    # Load full results DF (cache behavior handled by the loader).
    frame = load_results_dataframe(ROOT, allow_cache=True)
    frame = _filter_base(_with_net_label(frame))

    # Prepare output directories: a timestamped archive plus a "latest" mirror.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    archive_dir = OUTPUT_DIR / "archive"
    archive_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    ts_dir = archive_dir / stamp
    ts_dir.mkdir(parents=True, exist_ok=True)

    # Aggregate fold-level AUCs into per-cell mean/std, then build the table.
    table_cells = _compute_cells(frame)
    table_tex, _max_std = _build_single_table(
        table_cells, semi_labeling_regimes=SEMI_LABELING_REGIMES
    )

    out_name = "auc_table_all_evals_all_regimes.tex"
    (ts_dir / out_name).write_text(table_tex, encoding="utf-8")

    # Preserve the exact script version that produced these outputs.
    this_script = Path(__file__)
    shutil.copy2(this_script, ts_dir / this_script.name)

    # Refresh the "latest" mirror: remove stale files, copy the fresh ones.
    latest = OUTPUT_DIR / "latest"
    latest.mkdir(exist_ok=True, parents=True)
    for stale in latest.iterdir():
        if stale.is_file():
            stale.unlink()
    for produced in ts_dir.iterdir():
        if produced.is_file():
            shutil.copy2(produced, latest / produced.name)

    print(f"Saved table to: {ts_dir}")
    print(f"Also updated: {latest}")
    print(f" - {out_name}")
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()