aspell start
BIN thesis/Main.pdf
Binary file not shown.
@@ -65,6 +65,7 @@
% \draftcopyName{ENTWURF}{160}

\usepackage{xcolor}
\usepackage{soul}
\usepackage{xfrac}
\usepackage{booktabs}
\usepackage{multirow}
@@ -91,7 +92,7 @@
\makecell[l]{#1 \\ \emph{#2}}
}

\DeclareRobustCommand{\rev}[1]{\textcolor{red}{#1}}

% correct bad hyphenation
\hyphenation{}
@@ -252,13 +253,13 @@ Figure~\ref{fig:anomaly_detection_overview} depicts a simple but illustrative ex

\figc{anomaly_detection_overview}{figures/anomaly_detection_overview}{An illustrative example of anomalous and normal data containing 2-dimensional data with clusters of normal data $N_1$ and $N_2$ as well as two single anomalies $o_1$ and $o_2$ and a cluster of anomalies $O_3$. Reproduced from~\cite{anomaly_detection_survey}\rev{.}}{width=0.5\textwidth}

By their very nature, anomalies are rare and oftentimes unpredictable occurrences, which makes it hard to define all possible anomalies in any system. It is also very challenging to create an algorithm capable of detecting anomalies that may never have occurred before and may not have been known to exist when the detection algorithm was created. There are many possible approaches to this problem, though they can be roughly grouped into six distinct categories based on the techniques used~\cite{anomaly_detection_survey}:

\begin{enumerate}
\item \textbf{Classification Based} \\ A classification technique such as \rev{Support Vector Machine (SVM)~\cite{bg_svm}} is used to classify samples as either normal or anomalous based on labeled training data. Alternatively, if not enough labeled training data is available, a one-class classification algorithm can be employed. In that case, the algorithm assumes all training samples to be normal and then learns a boundary around the normal samples to differentiate them from anomalous samples.
\item \textbf{Clustering Based} \\ Clustering techniques such as \rev{K-Means~\cite{bg_kmeans}} or DBSCAN\rev{~\cite{bg_dbscan}} aim to group similar \rev{data into} clusters, differentiating it from dissimilar data which may belong to another or no cluster at all. Anomaly detection methods from this category employ such a technique, with the assumption that normal data will assemble into one or more clusters due to their similar properties, while anomalies may create their own smaller clusters, \rev{belong to no} cluster at all, or at least lie \rev{at} an appreciable distance from the closest normal cluster's center.
\item \textbf{Nearest Neighbor Based} \\ Similar to the clustering based category, these techniques assume normal data is more closely clustered than anomalies and therefore utilize either a sample's distance to its $k^{th}$ nearest neighbor or the density of its local neighborhood to judge whether a sample is anomalous (see the short sketch after this list).
\item \textbf{Statistical} \\ These methods try to fit a statistical model of the normal behavior to the data. After the distribution from which normal data originates is defined, samples can be found to be normal or anomalous based on their \rev{likelihood of arising from that} distribution.
\item \textbf{Information Theoretic} \\ The main assumption of information theoretic anomaly detection methods is that anomalies differ in their information content from normal data. An information theoretic measure is therefore used to determine \rev{irregularities} in the data's information content, enabling the detection of anomalous samples.
\item \textbf{Spectral} \\ Spectral approaches assume the possibility to map data into a lower-dimensional space, where normal data appears significantly different from anomalous data. To this end a dimensionality reduction technique such as Principal Component Analysis (PCA)\rev{~\cite{bg_pca}} is used to embed the data into a lower-dimensional \rev{subspace. Spectral} methods are oftentimes used as a pre-processing step followed by another anomaly detection method operating on the data's subspace.
\end{enumerate}
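
The nearest neighbor idea above is simple enough to sketch directly. The following toy example is illustrative only (the data and the choice $k=3$ are made up, not taken from this thesis); it scores each sample by its distance to its $k^{th}$ nearest neighbor using scikit-learn:

\begin{verbatim}
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(0)
normal = rng.normal(0.0, 1.0, size=(500, 2))  # dense cluster of normal data
outlier = np.array([[8.0, 8.0]])              # one far-away anomaly
data = np.vstack([normal, outlier])

k = 3
# n_neighbors = k + 1 because the nearest "neighbor" of a point is itself.
nn = NearestNeighbors(n_neighbors=k + 1).fit(data)
dist, _ = nn.kneighbors(data)
scores = dist[:, k]  # distance to the k-th nearest neighbor = anomaly score

print(scores[:500].mean())  # small for points in the normal cluster
print(scores[-1])           # much larger for the outlier
\end{verbatim}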
@@ -268,15 +269,15 @@ In this thesis we used an anomaly detection method, namely \citetitle{deepsad}\r

Chapter~\ref{chp:deepsad} describes DeepSAD in more detail, which shows that it is a clustering based approach with a spectral pre-processing component, in that it uses a neural network to reduce the input's dimensionality while simultaneously clustering normal data closely around a given centroid. It then produces an anomaly score by calculating the geometric distance between a data sample and the aforementioned cluster centroid, assuming the distance is shorter for normal than for anomalous data. Since our data is high dimensional it makes sense to use a spectral method to reduce \rev{its} dimensionality\rev{. Furthermore,} an approach which results in an analog value rather than a binary classification is useful for our use case, since we want to not only classify but quantify the data degradation.

There is a wide \rev{set} of problems in domains similar to the one we research in this \rev{thesis}, for which modeling them as anomaly detection problems has proven successful. The degradation of pointclouds, produced by an industrial 3D sensor, has been modeled as an anomaly detection task in \rev{\cite{bg_ad_pointclouds_scans}}. \citeauthor{bg_ad_pointclouds_scans} propose a student-teacher model capable of inferring a pointwise anomaly score for degradation in point clouds. The teacher network is trained on an anomaly-free dataset to extract dense features of the point clouds' local geometries, after which an identical student network is trained to emulate the teacher network's outputs. For degraded pointclouds, the regression error between the teacher's and student's outputs is calculated and interpreted as the anomaly score, with the rationalization that the student network has not observed features produced by anomalous geometries during training, leaving it incapable of reproducing the teacher's output for those regions. Another example would be \rev{\cite{bg_ad_pointclouds_poles}}, which proposes a method to detect and classify pole-like objects in urban point cloud data, to differentiate between natural and man-made objects such as street signs, for autonomous driving purposes. An anomaly detection method was used to identify the vertical pole-like objects in the point clouds, and the preprocessed objects were then grouped by similarity using a clustering algorithm to classify them as either trees or man-made poles.
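
The scoring rule of such student-teacher approaches can be condensed into a few lines. This is a generic sketch, not the architecture from \cite{bg_ad_pointclouds_scans}; the two small networks merely stand in for the actual point cloud feature extractors:

\begin{verbatim}
import torch
import torch.nn as nn

# Placeholder feature extractors; the real models operate on point clouds.
teacher = nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16))
student = nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16))

points = torch.randn(1024, 64)  # dummy per-point input features
with torch.no_grad():
    # Pointwise anomaly score: regression error between teacher and student.
    scores = (teacher(points) - student(points)).pow(2).sum(dim=1)
print(scores.shape)  # one score per point
\end{verbatim}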

As already briefly mentioned at the beginning of this section, anomaly detection methods and their usage are oftentimes challenged by the limited availability of anomalous data, owing to the very nature of anomalies, which are rare occurrences. Oftentimes the intended use case is precisely to find unknown anomalies in a given dataset which have not yet been identified. In addition, it can be challenging to classify anomalies correctly for complex data, since the very definition of an anomaly depends on many factors such as the type of data, the intended use case, or even how the data evolves over time. For these reasons most types of anomaly detection approaches limit their reliance on anomalous data during training, and many of them do not differentiate between normal and anomalous data at all. DeepSAD is a semi-supervised method which is characterized by using a mixture of labeled and unlabeled data.


\newsection{semi_supervised}{Semi-Supervised Learning Algorithms}


Machine learning refers to algorithms capable of learning patterns from existing data to perform tasks on previously unseen data, without being explicitly programmed to do so~\cite{machine_learning_first_definition}. Central to many approaches is the definition of an objective function that measures how well the model is performing. The model’s parameters are then adjusted to optimize this objective. By leveraging these data-driven methods, machine learning can handle complex tasks across a wide range of domains.

Among the techniques employed in machine \rev{learning,} neural networks have become especially prominent over the past few decades due to their ability to achieve state-of-the-art results across a wide variety of domains. They are most commonly composed of layers of interconnected artificial neurons. Each neuron computes a weighted sum of its inputs, adds a bias term, and then applies a nonlinear activation function, enabling them to model complex non-linear relationships. These layers are typically organized into three types:

@@ -290,7 +291,7 @@ As outlined above, neural network training is formulated as an optimization prob

Aside from the underlying technique, one can also categorize machine learning algorithms by the type of feedback provided during learning, for the network to improve. Broadly speaking, three main categories—supervised, unsupervised and reinforcement learning—exist, although many other approaches do not exactly fit any of these categories and have spawned less common categories like semi-supervised or self-supervised learning.

In supervised learning, each input sample is paired with a “ground-truth” label representing the desired output. During training, the model makes a prediction and a loss function quantifies the difference between the prediction and the truth label. The learning algorithm then adjusts its parameters to minimize this loss, improving its performance over time. Labels are typically categorical (used for classification tasks, such as distinguishing “cat” from “dog”) or continuous (used for regression tasks, like predicting a temperature or distance). Figure~\ref{fig:ml_learning_schema_concept}~\rev{(b)} illustrates this principle with a classification example, where labeled data is used to learn a boundary between two classes.


\figc{ml_learning_schema_concept}{figures/ml_learning_schema_concept.png}{Conceptual illustration of unsupervised (a) and supervised (b) learning. In (a), the inputs are two-dimensional data without labels, and the algorithm groups them into clusters without external guidance. In (b), the inputs have class labels (colors), which serve as training signals for learning a boundary between the two classes. Reproduced from~\cite{ml_supervised_unsupervised_figure_source}.}{width=0.6\textwidth}
@@ -302,7 +303,7 @@ In reinforcement learning, an agent learns by trial and error while interacting

Semi-supervised learning algorithms are an \rev{in-between} category of supervised and unsupervised algorithms, in that they use a mixture of labeled and unlabeled data. Typically vastly more unlabeled data is used during training of such algorithms than labeled data, due to the effort and expertise required to label large quantities of data correctly. Semi-supervised methods are oftentimes an effort to improve a machine learning algorithm belonging to either the supervised or unsupervised category. Supervised methods such as classification tasks are enhanced by using large amounts of unlabeled data to augment the supervised training without additional labeling work. Alternatively, unsupervised methods like clustering algorithms may not only use unlabeled data but improve their performance by considering some hand-labeled data during training.

Machine learning based anomaly detection methods can utilize techniques from all of the aforementioned categories, although their suitability varies. While supervised anomaly detection methods exist, their usability not only depends on the availability of labeled training data but also on a reasonable proportion between normal and anomalous data. Both requirements can be challenging due to labeling often being labor intensive and anomalies' intrinsic property of occurring rarely compared to normal data, making the capture of enough anomalous behavior a hard problem. Semi-supervised anomaly detection methods are of special interest in that they may overcome these difficulties inherently present in many anomaly detection tasks~\cite{semi_ad_survey}. These methods typically have the same goal as unsupervised anomaly detection methods, which is to model the normal class behavior and delimit it from anomalies, but they can incorporate some hand-labeled examples of normal and/or anomalous behavior to improve their performance over fully unsupervised methods. DeepSAD is a semi-supervised method which extends its unsupervised predecessor Deep SVDD~\cite{deep_svdd} by including some labeled samples during training. Both DeepSAD and Deep SVDD also utilize an autoencoder in a pretraining step, a machine learning architecture\rev{, which we will look at next}.

\newsection{autoencoder}{Autoencoder}

@@ -312,9 +313,9 @@ Autoencoders are a type of neural network architecture, whose main goal is learn
\fig{autoencoder_general}{figures/autoencoder_principle.png}{Illustration of an autoencoder’s working principle. The encoder $\mathbf{g_\phi}$ compresses the input into a lower-dimensional bottleneck representation $\mathbf{z}$, which is then reconstructed by the decoder $\mathbf{f_\theta}$. During training, the difference between input and output serves as the loss signal to optimize both the encoder’s feature extraction and the decoder’s reconstruction. Reproduced from~\cite{ml_autoencoder_figure_source}.
}

One key use case of autoencoders is to employ them as a dimensionality reduction technique. In that case, the latent space \rev{in between} the encoder and decoder is of a lower dimensionality than the input data itself. Due to the aforementioned reconstruction goal, the shared information between the input data and its latent space representation is maximized, which is known as following the Infomax principle\rev{~\cite{bg_infomax}}. After training such an autoencoder, it may be used to generate lower-dimensional representations of the given data type, enabling more performant computations which may have been infeasible to achieve on the original data. DeepSAD uses an autoencoder in a pretraining step to achieve this goal, among others.

Autoencoders have been shown to be useful in the anomaly detection domain by assuming that autoencoders trained on more normal than anomalous data are better at reconstructing normal behavior than anomalous behavior. This assumption allows methods to utilize the reconstruction error as an anomaly score. Examples of this are the methods in \rev{\cite{bg_autoencoder_ad} or \cite{bg_autoencoder_ad_2}}, which both employ an autoencoder and the aforementioned assumption. Autoencoders have also been shown to be a suitable dimensionality reduction technique for \rev{LiDAR} data, which is oftentimes high-dimensional and sparse, making feature extraction and dimensionality reduction popular preprocessing steps. As an example, \rev{\cite{bg_autoencoder_lidar}} shows the feasibility and advantages of using an autoencoder architecture to reduce \rev{LiDAR}-orthophoto fused features' dimensionality for their building detection method, which can recognize buildings in visual data taken from an airplane. Similarly, we can make use of the dimensionality reduction in DeepSAD's pretraining step, since our method is intended to work with high-dimensional \rev{LiDAR} data.
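
Both uses of an autoencoder described above, the bottleneck as a reduced representation and the reconstruction error as an anomaly score, fit into a short PyTorch sketch. All layer sizes here are arbitrary illustrative choices rather than the architectures used later in this thesis:

\begin{verbatim}
import torch
import torch.nn as nn

encoder = nn.Sequential(nn.Linear(784, 128), nn.ReLU(), nn.Linear(128, 32))
decoder = nn.Sequential(nn.Linear(32, 128), nn.ReLU(), nn.Linear(128, 784))
opt = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()))

x = torch.randn(256, 784)  # stand-in for a batch of (mostly normal) data
for _ in range(100):       # minimize the reconstruction error
    loss = ((decoder(encoder(x)) - x) ** 2).mean()
    opt.zero_grad(); loss.backward(); opt.step()

with torch.no_grad():
    z = encoder(x)  # 32-dimensional representation for downstream methods
    # Per-sample reconstruction error, usable directly as an anomaly score:
    scores = ((decoder(z) - x) ** 2).mean(dim=1)
\end{verbatim}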

\newsection{lidar_related_work}{\rev{LiDAR} - Light Detection and Ranging}

@@ -349,15 +350,15 @@ DeepSAD's overall mechanics are similar to clustering-based anomaly detection me
\fig{deep_svdd_transformation}{figures/deep_svdd_transformation}{DeepSAD teaches a neural network to transform data into a latent space and minimize the volume of a data-encompassing hypersphere centered around a predetermined centroid $\textbf{c}$. \\Reproduced from~\cite{deep_svdd}.}


Before DeepSAD's training can begin, a pretraining step is required, during which an autoencoder is trained on all available input data. One of DeepSAD's goals is to map input data onto a lower dimensional latent space, in which the separation between normal and anomalous data can be achieved. To this end DeepSAD and its predecessor Deep SVDD make use of the autoencoder's reconstruction goal, whose successful training ensures confidence in the encoder architecture's suitability for extracting the input data's most prominent information to the latent space \rev{in between} the encoder and decoder. DeepSAD goes on to use just the encoder as its main network architecture, discarding the decoder at this step, since reconstruction of the input is unnecessary.


The pretraining results are used in two more key ways. First, the encoder weights obtained from the autoencoder pretraining initialize DeepSAD’s network for the main training phase. Second, we perform an initial forward pass through the encoder on all training samples, and the mean of these latent representations is set as the hypersphere center, $\mathbf{c}$. According to \citeauthor{deepsad}, this initialization method leads to faster convergence during the main training phase compared to using a randomly selected centroid. An alternative would be to compute $\mathbf{c}$ using only the labeled normal examples, which would prevent the center from being influenced by anomalous samples; however, this requires a sufficient number of labeled normal samples. Once defined, the hypersphere center $\mathbf{c}$ remains fixed, as allowing it to be optimized freely could, in the unsupervised case, lead to a hypersphere collapse: a trivial solution where the network learns to map all inputs directly onto the centroid $\mathbf{c}$.
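
Concretely, the center initialization amounts to one forward pass and a mean. A minimal sketch, assuming a pretrained encoder and a PyTorch data loader yielding (inputs, ...) batches; the small nudge away from exactly zero coordinates mirrors a detail of the authors' reference implementation:

\begin{verbatim}
import torch

@torch.no_grad()
def init_center(encoder, train_loader, eps=0.1):
    """Hypersphere center c = mean latent representation of the training data."""
    latents = [encoder(inputs) for inputs, *_ in train_loader]
    c = torch.cat(latents).mean(dim=0)
    # Nudge near-zero coordinates away from zero so that no latent dimension
    # becomes trivially uninformative for the distance computation.
    c[(c.abs() < eps) & (c < 0)] = -eps
    c[(c.abs() < eps) & (c > 0)] = eps
    return c
\end{verbatim}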


In the main training step, DeepSAD's network is trained using SGD with backpropagation. The unlabeled training data is used with the goal of minimizing a data-encompassing hypersphere. Since one of the preconditions of training was the significant prevalence of normal data over anomalies in the training set, normal samples collectively cluster more tightly around the centroid, while the rarer anomalous samples do not contribute as significantly to the optimization, resulting in them staying further from the hypersphere center. The labeled data includes binary class labels signifying their status as either normal or anomalous samples. Labeled anomalies are pushed away from the center by defining their optimization target as maximizing the distance between them and $\mathbf{c}$. Labeled normal samples are treated similarly to unlabeled samples, with the difference that DeepSAD includes a hyperparameter capable of controlling the proportion with which labeled and unlabeled data contribute to the overall optimization. The resulting network has learned to map normal data samples closer to $\mathbf{c}$ in the latent space and anomalies further away.

\fig{deepsad_procedure}{diagrams/deepsad_procedure/deepsad_procedure}{Overview of the DeepSAD workflow. Training starts with unlabeled data and optional labeled samples, which are used to pretrain an autoencoder, compute the hypersphere center, and then perform main training with adjustable weighting of labeled versus unlabeled data. During inference, new samples are encoded and their distance to the hypersphere center is used as an anomaly score, with larger distances indicating stronger anomalies.}

To infer whether a previously unknown data sample is normal or anomalous, the sample is fed in a forward pass through the fully trained network. During inference, the centroid $\mathbf{c}$ needs to be known, to calculate the geometric distance between the sample's latent representation and $\mathbf{c}$. This distance \rev{serves as} an anomaly score, which correlates with the likelihood of the sample being anomalous. Due to differences in input data type, training success and latent space dimensionality, the anomaly score's magnitude has to be judged on an individual basis for each trained network. This means that scores produced by one network that signify normal data may very well clearly indicate an anomaly for another network. The geometric distance between two points in space is a scalar analog value; therefore, post-processing of the score is necessary to achieve a binary classification of normal and anomalous if desired.
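
The inference step is correspondingly small. A sketch, with the encoder and $\mathbf{c}$ obtained as described above; the threshold for an optional binary decision has to be calibrated per trained network:

\begin{verbatim}
import torch

@torch.no_grad()
def anomaly_score(encoder, c, x):
    """Squared distance of the latent representation to the centroid c."""
    return ((encoder(x) - c) ** 2).sum(dim=1)  # larger = more anomalous

# Optional binary decision: threshold the analog score per trained network.
# is_anomaly = anomaly_score(encoder, c, x) > tau
\end{verbatim}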
@@ -376,7 +377,7 @@ Since DeepSAD is heavily based on its predecessor \rev{Deep SVDD}~\cite{deep_svd

Deep SVDD is an unsupervised method which does not rely on labeled data to train the network to differentiate between normal and anomalous data. The first term of its optimization objective depicts the shrinking of the data-encompassing hypersphere around the given center $\mathbf{c}$. For each data sample $\{\mathbf{x}_1, \dots, \mathbf{x}_n\}$, its geometric distance to $\mathbf{c}$ in the latent space produced by the neural network $\phi(\wc; \mathcal{W})$ is minimized proportionally to the number of data samples $n$. The second term is a standard L2 regularization term which prevents overfitting, with hyperparameter $\lambda > 0$ and $\|\wc\|_F$ denoting the Frobenius norm.
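
For reference, this objective as introduced in \cite{deep_svdd} can be written, in the notation above with $L$ denoting the number of network layers and $\mathbf{W}^{\ell}$ the weights of layer $\ell$, as
\begin{equation*}
\min_{\mathcal{W}} \; \frac{1}{n} \sum_{i=1}^{n} \big\| \phi(\mathbf{x}_i; \mathcal{W}) - \mathbf{c} \big\|^2 + \frac{\lambda}{2} \sum_{\ell=1}^{L} \big\| \mathbf{W}^{\ell} \big\|_F^2 .
\end{equation*}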

\citeauthor{deepsad} argue that the pretraining step employing an autoencoder—originally introduced in Deep SVDD—not only allows a geometric interpretation of the method as minimum volume estimation, i.e., the shrinking of the data-encompassing hypersphere, but also a probabilistic one as entropy minimization over the latent distribution. The autoencoding objective during pretraining implicitly maximizes the mutual information between the data and its latent representation, aligning the approach with the Infomax principle while encouraging a latent space with minimal entropy. This insight enabled \citeauthor{deepsad} to introduce an additional term in DeepSAD’s objective, beyond that of its predecessor Deep SVDD, which incorporates labeled data to better capture the characteristics of normal and anomalous data. They demonstrate that DeepSAD’s objective effectively models the latent distribution of normal data as having low entropy, while that of anomalous data is characterized by higher entropy. In this framework, anomalies are interpreted as being generated from an infinite mixture of distributions that differ from the normal data distribution. The introduction of this term in DeepSAD's objective allows it to learn in a semi-supervised way, which helps the model better position known normal samples near the hypersphere center and push known anomalies farther away, thereby enhancing its ability to differentiate between normal and anomalous data.

From \rev{Equation}~\ref{eq:deepsvdd_optimization_objective} it is easy to understand DeepSAD's optimization objective seen in \rev{Equation}~\ref{eq:deepsad_optimization_objective}, which additionally \rev{uses} $m$ labeled data samples $\{(\mathbf{\tilde{x}}_1, \tilde{y}_1), \dots, (\mathbf{\tilde{x}}_m, \tilde{y}_m)\} \in \mathcal{X} \times \mathcal{Y}$ with $\mathcal{Y} = \{-1,+1\}$, for which $\tilde{y} = +1$ denotes normal and $\tilde{y} = -1$ anomalous samples, as well as a new hyperparameter $\eta > 0$ which can be used to balance the strength with which labeled and unlabeled samples contribute to the training.
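
Written out in the same notation, DeepSAD's objective from \cite{deepsad} reads
\begin{equation*}
\min_{\mathcal{W}} \; \frac{1}{n+m} \sum_{i=1}^{n} \big\| \phi(\mathbf{x}_i; \mathcal{W}) - \mathbf{c} \big\|^2
+ \frac{\eta}{n+m} \sum_{j=1}^{m} \Big( \big\| \phi(\mathbf{\tilde{x}}_j; \mathcal{W}) - \mathbf{c} \big\|^2 \Big)^{\tilde{y}_j}
+ \frac{\lambda}{2} \sum_{\ell=1}^{L} \big\| \mathbf{W}^{\ell} \big\|_F^2 ,
\end{equation*}
so that for labeled anomalies ($\tilde{y}_j = -1$) the distance term enters with a negative exponent and minimizing the objective pushes them away from $\mathbf{c}$.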
@@ -398,7 +399,7 @@ DeepSAD relies on several tuneable hyperparameters that influence different stag

\begin{itemize}
\item \textbf{Network architecture $\phi$} \\
The encoder architecture determines the representational capacity of the model. Because DeepSAD builds on a pretraining autoencoder, the architecture must be expressive enough to reconstruct input data during pretraining, but also compact enough to support separation of normal and anomalous samples in the latent space. The choice of architecture is therefore data-dependent: convolutional encoders are often used for images, while fully connected encoders or other architectures may be more suitable for various data modalities. The architecture directly constrains which patterns the network can learn and thus strongly shapes the latent space structure.
\item \textbf{Latent space dimensionality $d$} \\
The size of the latent bottleneck is a critical parameter. If $d$ is too small, the network cannot encode all relevant information, leading to information loss and weak representations. If $d$ is too large, the network risks overfitting by encoding irrelevant detail, while also increasing computational cost. These insights stem from autoencoder literature~\cite{deep_learning_book}, but it is unclear whether they apply directly to DeepSAD: here the autoencoder serves only for pretraining, and the encoder is subsequently fine-tuned with a different objective. Thus, the optimal choice of $d$ may not coincide with the value that would be ideal for autoencoder reconstruction alone.
\item \textbf{Label weighting $\eta$} \\
@@ -561,7 +562,7 @@ We built our experiments starting from the official DeepSAD PyTorch implementati
In the following sections, we detail our adaptations to this framework:

\begin{itemize}
\item Data integration: preprocessing and loading the dataset \rev{introduced in Chapter~\ref{chp:data_preprocessing}}.
\item Model architecture: configuring DeepSAD’s encoder to match our pointcloud input format, contrasting two distinct neural network architectures to investigate their impact on the method's output.
\item Training \& evaluation: training DeepSAD alongside two classical baselines—Isolation Forest and One-class SVM (OCSVM)—and comparing their degradation-quantification performance.
\item Experimental environment: the hardware and software stack used, with typical training and inference runtimes.
@@ -572,7 +573,7 @@ Together, these components define the full experimental pipeline, from data load
\section{Framework \& Data Preparation}


DeepSAD's PyTorch implementation—our starting point—includes implementations for training on standardized datasets such as MNIST, CIFAR-10 and datasets from \citetitle{odds}~\cite{odds}. The framework can train and test DeepSAD as well as a number of baseline algorithms, namely SSAD, OCSVM, Isolation Forest, KDE and SemiDGM with the loaded data, and evaluate their performance by calculating the Receiver Operating Characteristic (ROC) and its Area Under the Curve (AUC) for all given algorithms. We adapted this implementation, which was originally developed for Python 3.7, to work with Python 3.12 and changed or added \rev{functionality. We added support for loading our} chosen dataset, added DeepSAD models that work with the \rev{LiDAR} projections data type, and added more evaluation methods and an inference module.
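
The ROC/AUC evaluation can be reproduced with scikit-learn in a few lines; the labels and scores below are placeholders, not results from our experiments:

\begin{verbatim}
import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve

y_true = np.array([0, 0, 0, 1, 0, 1])              # 1 = anomalous (placeholder)
scores = np.array([0.1, 0.3, 0.2, 0.9, 0.4, 0.7])  # anomaly scores of a model

print("AUC =", roc_auc_score(y_true, scores))
fpr, tpr, thresholds = roc_curve(y_true, scores)   # points of the ROC curve
\end{verbatim}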

The raw SubTER dataset is provided as one ROS bag file per experiment, each containing a dense 3D point cloud from the Ouster OS1-32 \rev{LiDAR}. To streamline training and avoid repeated heavy computation, we project these point clouds offline into 2D “range images” as described in \rev{Section}~\ref{sec:preprocessing} and export them to files as NumPy arrays. Storing precomputed projections allows rapid data loading during training and evaluation. Many modern \rev{LiDARs} can be configured to output range images directly, which would bypass the need for post-hoc projection. When available, such native range-image streams can further simplify preprocessing or even allow skipping this step completely.
|
|
||||||
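For illustration, a minimal NumPy sketch of such a spherical projection; the image size, vertical field of view, and function name are placeholders rather than our exact pipeline settings:

\begin{verbatim}
import numpy as np

def project_to_range_image(points, h=32, w=2048, v_fov=(-16.6, 16.6)):
    # points: (N, 3) array of x, y, z coordinates
    x, y, z = points[:, 0], points[:, 1], points[:, 2]
    r = np.sqrt(x**2 + y**2 + z**2)                # per-point range
    azimuth = np.arctan2(y, x)                     # in [-pi, pi]
    elevation = np.arcsin(z / np.maximum(r, 1e-9))

    # Map azimuth to columns and elevation to rows.
    u = ((azimuth + np.pi) / (2 * np.pi) * w).astype(int) % w
    v_lo, v_hi = np.radians(v_fov[0]), np.radians(v_fov[1])
    v = np.round((v_hi - elevation) / (v_hi - v_lo) * (h - 1)).astype(int)
    valid = (v >= 0) & (v < h) & (r > 0)

    img = np.zeros((h, w), dtype=np.float32)       # 0 encodes "no return"
    img[v[valid], u[valid]] = r[valid]
    return img
\end{verbatim}
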
@@ -609,7 +610,7 @@ For inference (i.e.\ model validation on held-out experiments), we provide a sec

\section{Model Configuration \& Evaluation Protocol}

Since the neural network architecture trained in the \rev{DeepSAD} method is not fixed, as described in \rev{Section}~\ref{sec:algorithm_details}, but rather chosen based on the input data, we also had to choose an autoencoder architecture befitting our preprocessed \rev{LiDAR} data projections. Since \rev{\cite{degradation_quantification_rain}} reported success in training DeepSAD on similar data, we first adapted the network architecture utilized by them for our use case, which is based on the simple and well-understood LeNet architecture~\cite{lenet}. Additionally, we were interested in evaluating the importance and impact of a well-suited network architecture for DeepSAD's performance and therefore designed a second network architecture, henceforth \rev{referred} to as the ``efficient architecture'', incorporating a few modern techniques befitting our use case.

The LeNet-inspired autoencoder can be split into an encoder network (\rev{Figure}~\ref{fig:setup_arch_lenet_encoder}) and a decoder network (\rev{Figure}~\ref{fig:setup_arch_lenet_decoder}) with a latent space \rev{in between} the two parts. Such an arrangement is typical for autoencoder architectures, as we discussed in \rev{Section}~\ref{sec:autoencoder}. The encoder network is simultaneously DeepSAD's main training architecture, which, once trained, is used to infer the degradation quantification in our use case.

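Once trained, inferring a degradation score from the encoder reduces to measuring the squared distance between a frame's latent representation and the hypersphere center $c$ learned by DeepSAD. A minimal PyTorch sketch; the function name and tensor shapes are illustrative:

\begin{verbatim}
import torch

@torch.no_grad()
def degradation_score(encoder, frame, c):
    # frame: one range-image projection, shape (1, H, W)
    # c: hypersphere center in latent space, shape (d,)
    z = encoder(frame.unsqueeze(0))      # (1, d) latent representation
    # Larger distance to c means stronger suspected degradation.
    return torch.sum((z - c) ** 2).item()
\end{verbatim}
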
@@ -645,9 +646,9 @@ The decoder network (see \rev{Figure}~\ref{fig:setup_arch_lenet_decoder}) mirror
Even though the LeNet-inspired encoder proved capable of achieving our degradation quantification objective in initial experiments, we identified several shortcomings that motivated the design of a second, more efficient architecture. The most important issue concerns the shape of the CNN's receptive field (RF), which describes the region of the input that influences a single output activation. Its size and aspect ratio determine which structures the network can effectively capture: if the RF is too small, larger patterns cannot be detected, while an excessively large RF may hinder the network from learning to recognize fine details. For standard image data, the RF is often expressed as a symmetric $n \times n$ region, but in principle it can be computed independently per axis.

%\figc{setup_ef_concept}{figures/setup_ef_concept}{Receptive fields in a CNN. Each output activation aggregates information from a region of the input; stacking layers expands this region, while kernel size, stride, and padding control how quickly it grows and what shape it takes. (A) illustrates slower, fine-grained growth; (B) shows faster expansion, producing a larger—potentially anisotropic—receptive field and highlighting the trade-off between detail and context. Reproduced from~\cite{ef_concept_source}}{width=.6\textwidth}

The RF shape's issue arises from the fact that spinning multi-beam \rev{LiDARs} often produce point clouds possessing dense horizontal but limited vertical resolution. In our \rev{case, this} results in a pixel-per-degree resolution of approximately $5.69\,\sfrac{pixel}{deg}$ vertically and $1.01\,\sfrac{pixel}{deg}$ horizontally. Consequently, the LeNet-inspired encoder’s calculated receptive field of $16 \times 16$ pixels translates to an angular size of $15.88^{\circ} \times 2.81^{\circ}$, which is highly rectangular in angular space. Such a mismatch risks limiting the network’s ability to capture degradation patterns that extend differently across the two axes.

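The per-axis RF can be computed with the standard recursion $r_{\mathrm{out}} = r_{\mathrm{in}} + (k-1)\,j$, where $j$ is the cumulative stride and is updated as $j_{\mathrm{out}} = j_{\mathrm{in}} \cdot s$ per layer. The sketch below illustrates this arithmetic together with the pixel-to-degree conversion; the layer stack is illustrative, not the exact LeNet-inspired encoder:

\begin{verbatim}
def receptive_field(layers):
    # layers: (kernel, stride) pairs along one axis
    r, j = 1, 1                  # RF size and cumulative stride ("jump")
    for k, s in layers:
        r += (k - 1) * j
        j *= s
    return r

# Illustrative stack: two 5x5 convs, each followed by 2x2 stride-2 pooling.
rf = receptive_field([(5, 1), (2, 2), (5, 1), (2, 2)])   # -> 16 pixels
print(rf / 1.01, rf / 5.69)  # ~15.8 deg horizontal, ~2.8 deg vertical
\end{verbatim}
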
To adjust for this, we modified the network architecture and incorporated further refinements to improve the method's performance. The encoder (see \rev{Figure}~\ref{fig:setup_arch_ef_encoder}) follows the same general idea as the LeNet-inspired encoder, but with the following modifications:
@@ -678,7 +679,6 @@ To adjust for this, we decided to modify the network architecture and included f

}

The decoder (see \rev{Figure}~\ref{fig:setup_arch_ef_decoder}) mirrors the encoder’s structure but introduces changes to improve reconstruction stability:
\begin{itemize}
\item \textbf{Nearest-neighbor upsampling followed by convolution.} Instead of relying solely on transposed convolutions, each upsampling stage first enlarges the feature map using parameter-free nearest-neighbor interpolation, followed by a depthwise-separable convolution. This strategy reduces the risk of checkerboard artifacts while still allowing the network to learn fine detail; a minimal sketch of such a stage follows below.
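A minimal PyTorch sketch of one such upsampling stage; channel counts, kernel size, and the normalization and activation choices are illustrative rather than our exact configuration:

\begin{verbatim}
import torch
import torch.nn as nn

class UpsampleDSConv(nn.Module):
    """Nearest-neighbor upsampling, then a depthwise-separable conv."""
    def __init__(self, in_ch, out_ch, scale=2):
        super().__init__()
        self.up = nn.Upsample(scale_factor=scale, mode="nearest")
        self.depthwise = nn.Conv2d(in_ch, in_ch, kernel_size=3,
                                   padding=1, groups=in_ch, bias=False)
        self.pointwise = nn.Conv2d(in_ch, out_ch, kernel_size=1, bias=False)
        self.bn = nn.BatchNorm2d(out_ch)
        self.act = nn.ReLU(inplace=True)

    def forward(self, x):
        x = self.up(x)           # enlarge without learnable parameters
        x = self.depthwise(x)    # per-channel spatial filtering
        x = self.pointwise(x)    # cross-channel mixing
        return self.act(self.bn(x))

# e.g. doubling an 8x128 feature map while halving its channel count:
# y = UpsampleDSConv(64, 32)(torch.randn(1, 64, 8, 128))
\end{verbatim}
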
@@ -707,13 +707,13 @@ To compare the computational efficiency of the two architectures we show the num

\begin{table}[!ht]
\centering
\renewcommand{\arraystretch}{1.15}
\begin{tabularx}{\linewidth}{crrrr|rrrr}
\hline
 & \multicolumn{4}{c}{\textbf{Encoders}} & \multicolumn{4}{c}{\rev{\textbf{Autoencoders (Encoder $+$ Decoder)}}} \\
\cline{2-9}
 & \multicolumn{2}{c}{\textbf{LeNet}} & \multicolumn{2}{c}{\textbf{Efficient}} & \multicolumn{2}{c}{\textbf{LeNet}} & \multicolumn{2}{c}{\textbf{Efficient}} \\
\cline{2-9}
\textbf{Latent $\mathbb{R}^d$} & \textbf{Params} & \textbf{MACs} & \textbf{Params} & \textbf{MACs} & \textbf{Params} & \textbf{MACs} & \textbf{Params} & \textbf{MACs} \\
\hline
32 & 0.53M & 27.92M & 0.26M & 29.82M & 1.05M & 54.95M & 0.53M & 168.49M \\
64 & 1.05M & 28.44M & 0.53M & 30.08M & 2.10M & 56.00M & 1.06M & 169.02M \\
@@ -724,7 +724,7 @@ To compare the computational efficiency of the two architectures we show the num
1024 & 16.78M & 44.17M & 8.39M & 37.95M & 33.56M & 87.46M & 16.79M & 184.75M \\
\hline
\end{tabularx}
\caption{Comparison of parameter count and MACs for the DeepSAD LeNet-inspired and DeepSAD Efficient encoder \rev{and pretraining autoencoder (encoder plus decoder) networks} across different latent space sizes.}
\label{tab:params_lenet_vs_efficient}
\end{table}

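Parameter and MAC counts of this kind can be reproduced with an off-the-shelf profiler. The sketch below assumes the \texttt{thop} package; the encoder definition and input shape are stand-ins, not our actual networks:

\begin{verbatim}
import torch
from thop import profile  # pip install thop

# Stand-in encoder and input shape; not the exact thesis networks.
encoder = torch.nn.Sequential(
    torch.nn.Conv2d(1, 8, kernel_size=5, padding=2),
    torch.nn.ReLU(),
    torch.nn.MaxPool2d(2),
    torch.nn.Flatten(),
    torch.nn.Linear(8 * 16 * 1024, 32),  # latent dimension d = 32
)
dummy = torch.randn(1, 1, 32, 2048)      # one range-image projection
macs, params = profile(encoder, inputs=(dummy,))
print(f"{params / 1e6:.2f}M params, {macs / 1e6:.2f}M MACs")
\end{verbatim}
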
@@ -915,7 +915,7 @@ Together, these results provide a comprehensive overview of the computational re

\newchapter{results_discussion}{Results and Discussion}

\rev{The evaluation experiments, whose setup is described in Chapter~\ref{chp:experimental_setup},} are presented in this chapter. We begin in Section~\ref{sec:results_pretraining} with the pretraining stage, where the two autoencoder architectures were trained across multiple latent space dimensionalities. These results provide insight into the representational capacity of each architecture. In Section~\ref{sec:results_deepsad}, we turn to the main experiments: training DeepSAD models and benchmarking them against baseline algorithms (Isolation Forest and OCSVM). Finally, in Section~\ref{sec:results_inference}, we present inference results on \rev{data} that were held out during training. These plots illustrate how the algorithms behave when applied sequentially to unseen \rev{data}, offering a more practical perspective on their potential for real-world rescue robotics applications.

% --- Section: Autoencoder Pretraining Results ---
\newsection{results_pretraining}{Autoencoder Pretraining Results}
@@ -924,11 +924,11 @@ The results of pretraining the two autoencoder architectures are summarized in T

\begin{table}[t]
\centering
\caption{Autoencoder pretraining MSE losses across latent dimensions. Left: overall loss; right: anomaly-only loss. \rev{The mean across folds is reported}. Maximum observed standard deviation across all cells (not shown): 0.0067.}
\label{tab:pretraining_loss}
\begin{tabularx}{\textwidth}{c*{2}{Y}|*{2}{Y}}
\toprule
 & \multicolumn{2}{c}{Overall loss \rev{(MSE)}} & \multicolumn{2}{c}{Anomaly loss \rev{(MSE)}} \\
\cmidrule(lr){2-3} \cmidrule(lr){4-5}
Latent Dim. & LeNet & Efficient & LeNet & Efficient \\
\midrule
@@ -943,11 +943,11 @@ The results of pretraining the two autoencoder architectures are summarized in T

\end{tabularx}
\end{table}

\figc{ae_loss_overall}{figures/ae_elbow_test_loss_overall.png}{Reconstruction loss \rev{(MSE)} across latent dimensions for LeNet-inspired and Efficient autoencoder architectures.}{width=.9\textwidth}

Because \rev{the} overall reconstruction loss might obscure how well encoders represent anomalous samples, we additionally evaluate reconstruction errors only on degraded samples from manually-defined smoke segments (Figure~\ref{fig:ae_loss_degraded}). As expected, reconstruction losses are higher on these challenging samples than in the overall evaluation. However, the relative advantage of the Efficient architecture remains, suggesting that its improvements extend to anomalous inputs as well.

\figc{ae_loss_degraded}{figures/ae_elbow_test_loss_anomaly.png}{Reconstruction loss \rev{(MSE)} across latent dimensions for LeNet-inspired and Efficient autoencoder architectures, evaluated only on degraded data from manually-defined smoke experiments.}{width=.9\textwidth}

% --- Section: DeepSAD Training Results ---
\newsection{results_deepsad}{DeepSAD Detection Performance}
@@ -958,7 +958,7 @@ Due to the challenges of ground truth quality, evaluation results must be interp

\item \textbf{Manually-defined labels:} A cleaner ground truth, containing only clearly degraded frames. This removes mislabeled intervals and allows nearly perfect separation. However, it also simplifies the task too much, because borderline cases are excluded.
\end{itemize}

Table~\ref{tab:results_ap} summarizes average precision (AP) across latent dimensions, labeling regimes, and methods. Under experiment-based evaluation, both DeepSAD variants consistently outperform the baselines, reaching AP values around 0.60–0.66 compared to 0.21 for \rev{the} Isolation Forest and 0.31–0.49 for OCSVM. Under manually-defined evaluation, DeepSAD achieves nearly perfect AP in all settings, while the baselines remain much lower. This contrast shows that the lower AP under experiment-based evaluation is not a weakness of DeepSAD itself, but a direct result of mislabeled samples in the evaluation data. The manually-defined scheme therefore confirms that DeepSAD separates clearly normal from clearly degraded frames very well, while also highlighting that label noise must be kept in mind when interpreting the experiment-based results.

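For reference, AP and precision--recall curves are computed directly from frame-level anomaly scores. A minimal sketch using scikit-learn, with toy labels and scores in place of real model outputs:

\begin{verbatim}
import numpy as np
from sklearn.metrics import average_precision_score, precision_recall_curve

# y_true: 1 = degraded (anomalous) frame, 0 = normal frame
# scores: anomaly scores, e.g. DeepSAD distances to the hypersphere center
y_true = np.array([0, 0, 1, 1, 0, 1])
scores = np.array([0.1, 0.4, 0.9, 0.8, 0.2, 0.35])

ap = average_precision_score(y_true, scores)   # area under the PRC
precision, recall, thresholds = precision_recall_curve(y_true, scores)
\end{verbatim}
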
\begin{table}[t]
\centering
@@ -1004,7 +1004,7 @@ Table~\ref{tab:results_ap} summarizes average precision (AP) across latent dimen
\end{table}

The precision--recall curves \rev{for experiment-based evaluation} (Figure~\ref{fig:prc_representative}) illustrate these effects more clearly. For DeepSAD, precision stays close to 1 until about 0.5 recall, after which it drops off sharply. This plateau corresponds to the fraction of truly degraded frames in the anomalous set. Once recall moves beyond this point, the evaluation demands that the model also “find” the mislabeled anomalies near the run boundaries. To do so, the decision threshold must be lowered so far that many normal frames are also flagged, which causes precision to collapse. The baselines behave differently: OCSVM shows a smooth but weaker decline without a strong high-precision plateau, while Isolation Forest collapses to near-random performance. These operational differences are hidden in a single AP number but are important for judging how the methods would behave in deployment.

Taken together, the two evaluation schemes provide complementary insights. The experiment-based labels offer a noisy but realistic setting that shows how methods cope with ambiguous data, while the manually-defined labels confirm that DeepSAD can achieve nearly perfect separation when the ground truth is clean. The combination of both evaluations makes clear that (i) DeepSAD is stronger than the baselines under both conditions, (ii) the apparent performance limits under experiment-based labels are mainly due to label noise, and (iii) interpreting results requires care, since performance drops in the curves often reflect mislabeled samples rather than model failures. At the same time, both schemes remain binary classifications and therefore cannot directly evaluate the central question of whether anomaly scores can serve as a continuous measure of degradation. For this reason, we extend the analysis in Section~\ref{sec:results_inference}, where inference on entire unseen experiments is used to provide a more intuitive demonstration of the methods’ potential for quantifying \rev{LiDAR} degradation in practice.

@@ -1028,13 +1028,13 @@ Table~\ref{tab:results_ap} shows that the unsupervised regime \((0/0)\) achieves

The precision--recall curves in Figure~\ref{fig:prc_over_semi} show that the overall curve shapes are similar across regimes, but shifted relative to one another in line with the AP ordering \((0/0) > (500/100) > (50/10)\). We attribute these shifts to overfitting: when only a few anomalies are labeled, the model fits them too strongly, and if those examples differ too much from other anomalies, generalization suffers. This explains why lightly supervised training performs even worse than unsupervised training, which avoids this bias.

\figc{prc_over_semi}{figures/results_prc_over_semi.png}{\rev{PRCs} at latent dimension~32 for all three labeling regimes (unsupervised, lightly supervised, heavily supervised), shown separately for the LeNet-inspired (\rev{top}) and Efficient (\rev{bottom}) encoders. Baseline methods are included for comparison. Latent dimension~32 is shown as it achieved the best overall AP and is representative of the typical PRC shapes across dimensions.}{width=.7\textwidth}

The LeNet variant illustrates this effect most clearly, showing unusually high variance across folds in the lightly supervised case. In several folds, precision drops atypically early, which supports the idea that the model has overfit to a poorly chosen subset of labeled anomalies. The Efficient variant is less affected, maintaining more stable precision plateaus, suggesting it is more robust to such overfitting; we observe this consistently for nearly all latent dimensionalities.

With many labels \((500/100)\), the results become more stable again and the PRC curves closely resemble the unsupervised case, only shifted slightly left. A larger and more diverse set of labeled anomalies reduces the risk of unlucky sampling and improves generalization, but it still cannot fully match the unsupervised regime, where no overfitting to a specific labeled subset occurs. The only exception is an outlier at latent dimension 512 for LeNet, where the curve again resembles the lightly supervised case, likely due to label sampling effects amplified by higher latent capacity.

In summary, three consistent patterns emerge: (i) a very small number of labels can hurt performance by causing overfitting to specific examples, (ii) many labels reduce this problem but still do not surpass unsupervised generalization, and (iii) encoder architecture strongly affects robustness, with \rev{the LeNet-inspired encoder} being more sensitive to unstable behavior than \rev{the Efficient encoder}.

% --- Section: Inference on Held-Out Experiments ---
\newsection{results_inference}{Inference on Held-Out Experiments}
@@ -1074,11 +1074,11 @@ Our results indicate a qualified “yes.” Using anomaly detection (AD)—in pa

\item \textbf{Empirical comparison for \rev{LiDAR} degradation.} A systematic evaluation of DeepSAD against Isolation Forest and OCSVM across latent sizes and labeling regimes, showing that DeepSAD consistently outperforms the baselines under both evaluation schemes (Section~\ref{sec:results_deepsad}).

\item \textbf{Latent dimensionality insight.}
Autoencoder pretraining loss decreases with larger latent spaces, but DeepSAD performance shows the opposite trend: compact bottlenecks (32–128) achieve the highest \rev{mean average precision (mAP)}. This contrast demonstrates that pretraining performance does not directly predict DeepSAD performance—latent dimensionality cannot be tuned via autoencoder loss alone, even though it remains useful for comparing architectures.

\item \textbf{Semi-supervision insight.} In our data, \emph{unsupervised} DeepSAD performed best; \emph{light} labeling (50/10) performed worst; \emph{many} labels (500/100) partially recovered performance but did not surpass \rev{the unsupervised approach}. Evidence from \rev{precision--recall curve (PRC)} shapes and fold variance points to \emph{training-side overfitting to a small labeled set}, an effect that persists even under clean manually-defined evaluation (Table~\ref{tab:results_ap}, Figure~\ref{fig:prc_over_semi}).

\item \textbf{Encoder architecture matters.} The Efficient encoder\rev{, specifically tailored to the application at hand,} outperformed the LeNet-inspired variant in pretraining and downstream AD, indicating that representation quality substantially affects DeepSAD performance (Section~\ref{sec:results_pretraining}, Section~\ref{sec:results_deepsad}).

\item \textbf{Temporal inference recipe.} For deployment-oriented analysis we propose clean-run $z$-score normalization and causal EMA smoothing to obtain interpretable time-series anomaly scores on full experiments (Section~\ref{sec:results_inference}); a minimal sketch follows this list.
\end{itemize}
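As a sketch of the recipe above; the smoothing weight \texttt{alpha} is a placeholder, not a tuned value:

\begin{verbatim}
import numpy as np

def normalize_and_smooth(scores, clean_scores, alpha=0.1):
    # clean_scores: anomaly scores from a degradation-free reference run
    mu, sigma = clean_scores.mean(), clean_scores.std()
    z = (scores - mu) / sigma        # z-score relative to the clean run

    smoothed = np.empty_like(z)
    ema = z[0]
    for i, value in enumerate(z):    # causal: uses only past samples
        ema = alpha * value + (1 - alpha) * ema
        smoothed[i] = ema
    return smoothed
\end{verbatim}
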
|
@@ -15,6 +15,8 @@

let
  pkgs = import nixpkgs { inherit system; };

  aspellWithDicts = pkgs.aspellWithDicts (d: [ d.en ]);

  latex-packages = with pkgs; [
    texlive.combined.scheme-full
    which
@@ -33,6 +35,7 @@

buildInputs = [
  latex-packages
  dev-packages
  aspellWithDicts
];
};
