Compare commits

..

13 Commits

Author         SHA1         Message                               Date
Jan Kowalczyk  7b5accb6c5   fixed plots                           2025-10-21 19:04:19 +02:00
Jan Kowalczyk  8f983b890f   formatting                            2025-10-19 17:39:42 +02:00
Jan Kowalczyk  6cd2c7fbef   abstract lidar capitalization         2025-10-19 17:34:38 +02:00
Jan Kowalczyk  62c424cd54   grammarly done                        2025-10-19 17:29:31 +02:00
Jan Kowalczyk  bd9171f68e   grammarly data chapter                2025-10-19 16:46:29 +02:00
Jan Kowalczyk  efdc33035b   grammarly part 1 done                 2025-10-19 16:27:22 +02:00
Jan Kowalczyk  f2c8fe241d   cleanup                               2025-10-18 18:27:13 +02:00
Jan Kowalczyk  ece887860b   z-score rework                        2025-10-18 18:01:41 +02:00
Jan Kowalczyk  c3830db913   metrics section draft                 2025-10-18 17:23:18 +02:00
Jan Kowalczyk  3d21171a40   raw metrics section                   2025-10-18 17:02:22 +02:00
Jan Kowalczyk  5aca00ad67   better grammarly prep                 2025-10-18 12:47:16 +02:00
Jan Kowalczyk  374420727b   cleanup for raw txt (grammar check)   2025-10-18 12:19:26 +02:00
Jan Kowalczyk  8697c07c0f   reworked baselines                    2025-10-18 11:28:12 +02:00
34 changed files with 2556 additions and 262 deletions


@@ -2134,6 +2134,261 @@
\verb http://dx.doi.org/10.1109/CVPR.2018.00716
\endverb
\endentry
\entry{roc}{inproceedings}{}{}
\name{author}{1}{}{%
{{hash=296b45ce1995399650391e9bc8b09c22}{%
family={Metz},
familyi={M\bibinitperiod},
given={Charles\bibnamedelima E},
giveni={C\bibinitperiod\bibinitdelim E\bibinitperiod}}}%
}
\list{organization}{1}{%
{Elsevier}%
}
\strng{namehash}{296b45ce1995399650391e9bc8b09c22}
\strng{fullhash}{296b45ce1995399650391e9bc8b09c22}
\strng{fullhashraw}{296b45ce1995399650391e9bc8b09c22}
\strng{bibnamehash}{296b45ce1995399650391e9bc8b09c22}
\strng{authorbibnamehash}{296b45ce1995399650391e9bc8b09c22}
\strng{authornamehash}{296b45ce1995399650391e9bc8b09c22}
\strng{authorfullhash}{296b45ce1995399650391e9bc8b09c22}
\strng{authorfullhashraw}{296b45ce1995399650391e9bc8b09c22}
\field{sortinit}{6}
\field{sortinithash}{b33bc299efb3c36abec520a4c896a66d}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{booktitle}{Seminars in nuclear medicine}
\field{number}{4}
\field{title}{Basic principles of ROC analysis}
\field{volume}{8}
\field{year}{1978}
\field{pages}{283\bibrangedash 298}
\range{pages}{16}
\endentry
\entry{roc_vs_prc2}{article}{}{}
\name{author}{3}{}{%
{{hash=6c614dc489bb7775dc417b3ac5025f38}{%
family={Calikus},
familyi={C\bibinitperiod},
given={Ece},
giveni={E\bibinitperiod}}}%
{{hash=6db975f29e3fa71e616c18ec03af0af4}{%
family={Nowaczyk},
familyi={N\bibinitperiod},
given={Slawomir},
giveni={S\bibinitperiod}}}%
{{hash=bd821c16b0a8af3d78e72b89568fb1b2}{%
family={Dikmen},
familyi={D\bibinitperiod},
given={Onur},
giveni={O\bibinitperiod}}}%
}
\list{publisher}{2}{%
{Springer Science}%
{Business Media LLC}%
}
\strng{namehash}{5d40ff291ddd41f19115ab9b533809c6}
\strng{fullhash}{54b87a3930e5fe91fd9f2e0d38e53ee7}
\strng{fullhashraw}{54b87a3930e5fe91fd9f2e0d38e53ee7}
\strng{bibnamehash}{54b87a3930e5fe91fd9f2e0d38e53ee7}
\strng{authorbibnamehash}{54b87a3930e5fe91fd9f2e0d38e53ee7}
\strng{authornamehash}{5d40ff291ddd41f19115ab9b533809c6}
\strng{authorfullhash}{54b87a3930e5fe91fd9f2e0d38e53ee7}
\strng{authorfullhashraw}{54b87a3930e5fe91fd9f2e0d38e53ee7}
\field{sortinit}{6}
\field{sortinithash}{b33bc299efb3c36abec520a4c896a66d}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{issn}{2364-4168}
\field{journaltitle}{International Journal of Data Science and Analytics}
\field{month}{6}
\field{number}{1}
\field{title}{Context discovery for anomaly detection}
\field{volume}{19}
\field{year}{2024}
\field{pages}{99\bibrangedash 113}
\range{pages}{15}
\verb{doi}
\verb 10.1007/s41060-024-00586-x
\endverb
\verb{urlraw}
\verb http://dx.doi.org/10.1007/s41060-024-00586-x
\endverb
\verb{url}
\verb http://dx.doi.org/10.1007/s41060-024-00586-x
\endverb
\endentry
\entry{roc_vs_prc}{article}{}{}
\name{author}{8}{}{%
{{hash=6a81232d1cbcdd0f3888fb10d2d20c69}{%
family={Campos},
familyi={C\bibinitperiod},
given={Guilherme\bibnamedelima O.},
giveni={G\bibinitperiod\bibinitdelim O\bibinitperiod}}}%
{{hash=cbfafee6627ecbb346007c41a5787a4e}{%
family={Zimek},
familyi={Z\bibinitperiod},
given={Arthur},
giveni={A\bibinitperiod}}}%
{{hash=802157026f850823b2027c2100cb359a}{%
family={Sander},
familyi={S\bibinitperiod},
given={Jörg},
giveni={J\bibinitperiod}}}%
{{hash=382d8e986b9afdbfedb661be5cf9ce33}{%
family={Campello},
familyi={C\bibinitperiod},
given={Ricardo\bibnamedelimb J.\bibnamedelimi G.\bibnamedelimi B.},
giveni={R\bibinitperiod\bibinitdelim J\bibinitperiod\bibinitdelim G\bibinitperiod\bibinitdelim B\bibinitperiod}}}%
{{hash=38f453607e6cbb2c3efe156849a986dd}{%
family={Micenková},
familyi={M\bibinitperiod},
given={Barbora},
giveni={B\bibinitperiod}}}%
{{hash=d5aa8a82c7032184011fd502a43e205a}{%
family={Schubert},
familyi={S\bibinitperiod},
given={Erich},
giveni={E\bibinitperiod}}}%
{{hash=69b6af16c92b02af90eb0a2864250685}{%
family={Assent},
familyi={A\bibinitperiod},
given={Ira},
giveni={I\bibinitperiod}}}%
{{hash=3f5ba8771c6d99e9af9f7716ed7d180b}{%
family={Houle},
familyi={H\bibinitperiod},
given={Michael\bibnamedelima E.},
giveni={M\bibinitperiod\bibinitdelim E\bibinitperiod}}}%
}
\list{publisher}{2}{%
{Springer Science}%
{Business Media LLC}%
}
\strng{namehash}{2541e132e48ea3d61d11fb7ef5cc2fb4}
\strng{fullhash}{6b641fe45043d123c859110c492455cd}
\strng{fullhashraw}{6b641fe45043d123c859110c492455cd}
\strng{bibnamehash}{6b641fe45043d123c859110c492455cd}
\strng{authorbibnamehash}{6b641fe45043d123c859110c492455cd}
\strng{authornamehash}{2541e132e48ea3d61d11fb7ef5cc2fb4}
\strng{authorfullhash}{6b641fe45043d123c859110c492455cd}
\strng{authorfullhashraw}{6b641fe45043d123c859110c492455cd}
\field{sortinit}{6}
\field{sortinithash}{b33bc299efb3c36abec520a4c896a66d}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{issn}{1573-756X}
\field{journaltitle}{Data Mining and Knowledge Discovery}
\field{month}{1}
\field{number}{4}
\field{title}{On the evaluation of unsupervised outlier detection: measures, datasets, and an empirical study}
\field{volume}{30}
\field{year}{2016}
\field{pages}{891\bibrangedash 927}
\range{pages}{37}
\verb{doi}
\verb 10.1007/s10618-015-0444-8
\endverb
\verb{urlraw}
\verb http://dx.doi.org/10.1007/s10618-015-0444-8
\endverb
\verb{url}
\verb http://dx.doi.org/10.1007/s10618-015-0444-8
\endverb
\endentry
\entry{prc}{article}{}{}
\name{author}{3}{}{%
{{hash=1f216647f3d14e9e167b5279b02fd2b6}{%
family={Raghavan},
familyi={R\bibinitperiod},
given={Vijay},
giveni={V\bibinitperiod}}}%
{{hash=8fc430cb115c6f35cc6c715511c6d017}{%
family={Bollmann},
familyi={B\bibinitperiod},
given={Peter},
giveni={P\bibinitperiod}}}%
{{hash=bbea5d20580d37dee6fdc8f2ab689622}{%
family={Jung},
familyi={J\bibinitperiod},
given={Gwang\bibnamedelima S.},
giveni={G\bibinitperiod\bibinitdelim S\bibinitperiod}}}%
}
\list{publisher}{1}{%
{Association for Computing Machinery (ACM)}%
}
\strng{namehash}{8dbc985e2075b3f53854b49b85849232}
\strng{fullhash}{960dc590833332a78b4cf6bc2d8114f5}
\strng{fullhashraw}{960dc590833332a78b4cf6bc2d8114f5}
\strng{bibnamehash}{960dc590833332a78b4cf6bc2d8114f5}
\strng{authorbibnamehash}{960dc590833332a78b4cf6bc2d8114f5}
\strng{authornamehash}{8dbc985e2075b3f53854b49b85849232}
\strng{authorfullhash}{960dc590833332a78b4cf6bc2d8114f5}
\strng{authorfullhashraw}{960dc590833332a78b4cf6bc2d8114f5}
\field{sortinit}{6}
\field{sortinithash}{b33bc299efb3c36abec520a4c896a66d}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{issn}{1558-2868}
\field{journaltitle}{ACM Transactions on Information Systems}
\field{month}{7}
\field{number}{3}
\field{title}{A critical investigation of recall and precision as measures of retrieval system performance}
\field{volume}{7}
\field{year}{1989}
\field{pages}{205\bibrangedash 229}
\range{pages}{25}
\verb{doi}
\verb 10.1145/65943.65945
\endverb
\verb{urlraw}
\verb http://dx.doi.org/10.1145/65943.65945
\endverb
\verb{url}
\verb http://dx.doi.org/10.1145/65943.65945
\endverb
\endentry
\entry{zscore}{article}{}{}
\name{author}{3}{}{%
{{hash=c4141ad87d07f41e44c31cc3b342bb04}{%
family={Kreyszig},
familyi={K\bibinitperiod},
given={Erwin},
giveni={E\bibinitperiod}}}%
{{hash=a9bc276dc5f8f0a388ab7a862ced31db}{%
family={Stroud},
familyi={S\bibinitperiod},
given={K},
giveni={K\bibinitperiod}}}%
{{hash=f32f108a562b342127650ab203d3b303}{%
family={Stephenson},
familyi={S\bibinitperiod},
given={G},
giveni={G\bibinitperiod}}}%
}
\list{publisher}{1}{%
{John Wiley \& Sons}%
}
\strng{namehash}{f6ac776fd8cd938f842095a0fcfd6d6e}
\strng{fullhash}{39b0b87e985991ca0e9951b740c61064}
\strng{fullhashraw}{39b0b87e985991ca0e9951b740c61064}
\strng{bibnamehash}{39b0b87e985991ca0e9951b740c61064}
\strng{authorbibnamehash}{39b0b87e985991ca0e9951b740c61064}
\strng{authornamehash}{f6ac776fd8cd938f842095a0fcfd6d6e}
\strng{authorfullhash}{39b0b87e985991ca0e9951b740c61064}
\strng{authorfullhashraw}{39b0b87e985991ca0e9951b740c61064}
\field{sortinit}{6}
\field{sortinithash}{b33bc299efb3c36abec520a4c896a66d}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{journaltitle}{Integration}
\field{number}{4}
\field{title}{Advanced engineering mathematics}
\field{volume}{9}
\field{year}{2008}
\field{pages}{1014}
\range{pages}{1}
\endentry
\enddatalist
\endrefsection
\endinput

Binary file not shown.


@@ -53,9 +53,9 @@
% **************************************************************************************************
% template setup -- do not change these unless you know what you are doing!
\input{./base/documentclass_\DocumentType} \input{./base/documentclass_thesis}
\input{./base/packages}
\input{./base/layout_\DocumentType} \input{./base/layout_thesis}
\input{./base/macros}
% **************************************************************************************************
@@ -91,7 +91,8 @@
}
\DeclareRobustCommand{\rev}[1]{\textcolor{red}{#1}} %\DeclareRobustCommand{\rev}[1]{\textcolor{red}{#1}}
\DeclareRobustCommand{\rev}[1]{#1}
\DeclareRobustCommand{\mcah}[1]{}
% correct bad hyphenation
@@ -156,26 +157,27 @@
% variable for page numbering
\newcounter{mypageno}
% **************************************************************************************************
\begin{document}
% **************************************************************************************************
\input{./base/syntax_formatting}
% for thesis: switch to frontmatter (Roman numbering, etc.)
\ifthenelse{\equal{\DocumentType}{thesis}} \ifthenelse{\equal{thesis}{thesis}}
{
\frontmatter \pagestyle{plain} \pagenumbering{Roman}
}{}
% **************************************************************************************************
\begin{document}
% **************************************************************************************************
%title
\input{./base/titlepage_\DocumentType} \input{./base/titlepage_thesis}
% for thesis: abstract, kurzfassung, affidavit and statutory declaration
\ifthenelse{\equal{\DocumentType}{thesis}} \ifthenelse{\equal{thesis}{thesis}}
{
\emptydoublepage
\addcontentsline{toc}{chapter}{Statutory Declaration}
\input{./base/declaration_\DocumentLanguage} \input{./base/declaration_en}
\emptydoublepage
\input{thesis_preamble/acknowledgements}
\emptydoublepage
@@ -187,7 +189,7 @@
\tableofcontents
\ifthenelse{\equal{\DocumentType}{thesis}} \ifthenelse{\equal{thesis}{thesis}}
{
\emptydoublepage
\setcounter{mypageno}{\value{page}}
@@ -249,25 +251,25 @@ Because anomalies are, by nature, often unpredictable in form and structure, uns
Anomaly detection refers to the process of detecting unexpected patterns of data, outliers that deviate significantly from the majority of data, which is implicitly defined as normal by its prevalence. In classic statistical analysis, these techniques have been studied as early as the 19th century~\cite{anomaly_detection_history}. Since then, a multitude of methods and use cases for them have been proposed and studied. Examples of applications include healthcare, where computer vision algorithms are used to detect anomalies in medical images for diagnostics and early detection of diseases~\cite{anomaly_detection_medical}, detection of fraud in decentralized financial systems based on blockchain technology~\cite{anomaly_detection_defi}, as well as fault detection in industrial machinery using acoustic sound data~\cite{anomaly_detection_manufacturing}.
Figure~\ref{fig:anomaly_detection_overview} depicts a simple but illustrative example of data that can be classified as either normal or anomalous and shows the problem anomaly detection methods try to generally solve. A successful anomaly detection method would somehow learn to differentiate normal from anomalous data, for example, by learning the boundaries around the available normal data and classifying it as either normal or anomalous based on its location inside or outside of those boundaries. Another possible approach could calculate an analog value that correlates with the likelihood of a sample being anomalous, for example, by using the sample's distance from the closest normal data cluster's center. Figure~\ref{fig:anomaly_detection_overview} depicts a simple but illustrative example of data that can be classified as either normal or anomalous and shows the problem that anomaly detection methods try to generally solve. A successful anomaly detection method would somehow learn to differentiate normal from anomalous data, for example, by learning the boundaries around the available normal data and classifying it as either normal or anomalous based on its location inside or outside of those boundaries. Another possible approach could calculate an analog value that correlates with the likelihood of a sample being anomalous, for example, by using the sample's distance from the closest normal data cluster's center.
\figc{anomaly_detection_overview}{figures/anomaly_detection_overview}{An illustrative example of anomalous and normal data containing 2-dimensional data with clusters of normal data $N_1$ and $N_2$ as well as two single anomalies $o_1$ and $o_2$ and a cluster of anomalies $O_3$. Reproduced from~\cite{anomaly_detection_survey}\rev{.}}{width=0.5\textwidth} \figc{anomaly_detection_overview}{figures/anomaly_detection_overview}{An illustrative example of anomalous and normal data containing 2-dimensional data with clusters of normal data $N_1$ and $N_2$, as well as two single anomalies $o_1$ and $o_2$ and a cluster of anomalies $O_3$. Reproduced from~\cite{anomaly_detection_survey}\rev{.}}{width=0.55\textwidth}
By their very nature, anomalies are rare occurrences and oftentimes unpredictable in nature, which makes it hard to define all possible anomalies in any system. It also makes it very challenging to create an algorithm that is capable of detecting anomalies that may have never occurred before and may not have been known to exist during the creation of the detection algorithm. There are many possible approaches to this problem, though they can be roughly grouped into six distinct categories based on the techniques used~\cite{anomaly_detection_survey}:
\begin{enumerate}
\item \textbf{Classification Based} \\ A classification technique, such as \rev{Support Vector Machine (SVM)~\cite{bg_svm}}, is used to classify samples as either normal or anomalous based on labeled training data. Alternatively, if not enough labeled training data is available, a one-class classification algorithm can be employed. In that case, the algorithm assumes all training samples to be normal and then learns a boundary around the normal samples to differentiate them from anomalous samples. \item \textbf{Classification-Based} \\ A classification technique, such as \rev{Support Vector Machine (SVM)~\cite{bg_svm}}, is used to classify samples as either normal or anomalous based on labeled training data. Alternatively, if not enough labeled training data is available, a one-class classification algorithm can be employed. In that case, the algorithm assumes all training samples to be normal and then learns a boundary around the normal samples to differentiate them from anomalous samples.
\item \textbf{Clustering Based} \\ Clustering techniques such as \rev{K-Means~\cite{bg_kmeans}} or DBSCAN\rev{~\cite{bg_dbscan}} aim to group similar \rev{data into} clusters, differentiating it from dissimilar data, which may belong to another cluster or no cluster at all. Anomaly detection methods from this category employ such a technique, with the assumption that normal data will assemble into one or more clusters due to their similar properties, while anomalies may create their own smaller clusters, not \rev{belonging} to any cluster at all, or at least be \rev{at} an appreciable distance from the closest normal cluster's center. \item \textbf{Clustering-Based} \\ Clustering techniques such as \rev{K-Means~\cite{bg_kmeans}} or DBSCAN\rev{~\cite{bg_dbscan}} aim to group similar \rev{data into} clusters, differentiating it from dissimilar data, which may belong to another cluster or no cluster at all. Anomaly detection methods from this category employ such a technique, with the assumption that normal data will assemble into one or more clusters due to their similar properties, while anomalies may create their own smaller clusters, not \rev{belonging} to any cluster at all, or at least be \rev{at} an appreciable distance from the closest normal cluster's center.
\item \textbf{Nearest Neighbor Based} \\ Similar to the clustering based category, these techniques assume normal data is more closely clustered than anomalies and therefore utilize either a sample's distance to its $k^{th}$ nearest neighbor or the density of its local neighborhood to judge whether a sample is anomalous. \item \textbf{Nearest Neighbor Based} \\ Similar to the clustering-based category, these techniques assume normal data is more closely clustered than anomalies and therefore utilize either a sample's distance to its $k^{th}$ nearest neighbor or the density of its local neighborhood to judge whether a sample is anomalous.
\item \textbf{Statistical} \\ These methods try to fit a statistical model of the normal behavior to the data. After the distribution from which normal data originates is defined, samples can be found to be normal or anomalous based on their likelihood \rev{of arising from that} distribution (see the sketch after this list).
\item \textbf{Information Theoretic} \\ The main assumption for information theoretic anomaly detection methods is that anomalies differ somehow in their information content from anomalous data. An information theoretic measure is therefore used to determine \rev{irregularities} in the data's information content, enabling the detection of anomalous samples. \item \textbf{Information-Theoretic} \\ The main assumption for information-theoretic anomaly detection methods is that anomalies differ somehow in their information content from anomalous data. An information-theoretic measure is therefore used to determine \rev{irregularities} in the data's information content, enabling the detection of anomalous samples.
\item \textbf{Spectral} \\ Spectral approaches assume the possibility of mapping data into a lower-dimensional space, where normal data appears significantly different from anomalous data. To this end, a dimensionality reduction technique such as Principal Component Analysis (PCA)\rev{~\cite{bg_pca}} is used to embed the data into a lower-dimensional \rev{subspace. Spectral} methods are often used as a pre-processing step followed by another anomaly detection method operating on the data's subspace.
\end{enumerate}
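To make the statistical category above concrete, the following minimal sketch (an illustration added here, not part of the thesis pipeline) scores samples by their z-score, i.e., how many standard deviations a sample lies from the mean of the data assumed to be normal; higher scores indicate more anomalous samples. All names and dimensions are hypothetical.
\begin{verbatim}
import numpy as np

def zscore_anomaly_score(train, test):
    """Score each test sample by its largest per-feature |z-score|
    relative to the training data assumed to be normal."""
    mu = train.mean(axis=0)
    sigma = train.std(axis=0) + 1e-12           # guard against zero variance
    return np.abs((test - mu) / sigma).max(axis=1)

rng = np.random.default_rng(0)
normal_data = rng.normal(size=(1000, 2))        # stand-in for "normal" samples
queries = np.array([[0.1, -0.2], [6.0, 0.5]])   # second row is a clear outlier
print(zscore_anomaly_score(normal_data, queries))  # outlier gets the larger score
\end{verbatim}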
In this thesis, we used an anomaly detection method, namely \citetitle{deepsad}\rev{~(DeepSAD)~\cite{deepsad}}, to model our problem\rev{}how to quantify the degradation of \rev{LiDAR} sensor data\rev{}as an anomaly detection problem. We do this by classifying good-quality data as normal and degraded data as anomalous, and rely on a method that can express each sample's likelihood of being anomalous as an analog anomaly score, which enables us to interpret it as the \rev{data} degradation quantification value.
Chapter~\ref{chp:deepsad} describes DeepSAD in more detail, which shows that it is a clustering based approach with a spectral pre-processing component, in that it uses a neural network to reduce the input's dimensionality while simultaneously clustering normal data closely around a given centroid. It then produces an anomaly score by calculating the geometric distance between a data sample and the aforementioned cluster centroid, assuming the distance is shorter for normal than for anomalous data. Since our data is high-dimensional, it makes sense to use a spectral method to reduce \rev{its} dimensionality. \rev{Moreover} reporting an analog value rather than a binary classification is useful for our use case since we want to quantify not only classify the data degradation. Chapter~\ref{chp:deepsad} describes DeepSAD in more detail, which shows that it is a clustering-based approach with a spectral pre-processing component, in that it uses a neural network to reduce the input's dimensionality while simultaneously clustering normal data closely around a given centroid. It then produces an anomaly score by calculating the geometric distance between a data sample and the aforementioned cluster centroid, assuming the distance is shorter for normal than for anomalous data. Since our data is high-dimensional, it makes sense to use a spectral method to reduce \rev{its} dimensionality. \rev{Moreover}, reporting an analog value rather than a binary classification is useful for our use case since we want to quantify not only classify the data degradation.
There is a wide \rev{set} of problems in domains similar to the one we research in this \rev{thesis}, for which modeling them as anomaly detection problems has been proven successful. The degradation of point clouds, produced by an industrial 3D sensor, has been modeled as an anomaly detection task in \rev{\cite{bg_ad_pointclouds_scans}}. \citeauthor{bg_ad_pointclouds_scans} propose a student-teacher model capable of inferring a pointwise anomaly score for degradation in point clouds. The teacher network is trained on an anomaly-free dataset to extract dense features of the point clouds' local geometries, after which an identical student network is trained to emulate the teacher network's outputs. For degraded point clouds, the regression between the teacher's and student's outputs is calculated and interpreted as the anomaly score, with the rationalization that the student network has not observed features produced by anomalous geometries during training, leaving it incapable of producing a similar output as the teacher for those regions. Another example would be \rev{\cite{bg_ad_pointclouds_poles}}, which proposes a method to detect and classify pole-like objects in urban point cloud data, to differentiate between natural and man-made objects such as street signs, for autonomous driving purposes. An anomaly detection method was used to identify the vertical pole-like objects in the point clouds, and then the preprocessed objects were grouped by similarity using a clustering algorithm to classify them as either trees or man-made poles.
@@ -303,12 +305,12 @@ In reinforcement learning, an agent learns by trial and error while interacting
Semi-supervised learning algorithms are an \rev{in-between} category of supervised and unsupervised algorithms, in that they use a mixture of labeled and unlabeled data. Typically, vastly more unlabeled data is used during training of such algorithms than labeled data, due to the effort and expertise required to label large quantities of data correctly. Semi-supervised methods are often an effort to improve a machine learning algorithm belonging to either the supervised or unsupervised category. Supervised methods, such as classification tasks, are enhanced by using large amounts of unlabeled data to augment the supervised training without the need for additional labeling work. Alternatively, unsupervised methods like clustering algorithms may not only use unlabeled data but also improve their performance by considering some hand-labeled data during training.
Machine learning based anomaly detection methods can utilize techniques from all of the aforementioned categories, although their suitability varies. While supervised anomaly detection methods exist, their usability not only depends on the availability of labeled training data but also on a reasonable proportionality between normal and anomalous data. Both requirements can be challenging due to labeling often being labor-intensive and anomalies' intrinsic property to occur rarely when compared to normal data, making the capture of enough anomalous behavior a hard problem. Semi-supervised anomaly detection methods are of special interest in that they may overcome these difficulties inherently present in many anomaly detection tasks~\cite{semi_ad_survey}. These methods typically have the same goal as unsupervised anomaly detection methods which is to model the normal class behavior and delimitate it from anomalies, but they can incorporate some hand-labeled examples of normal and/or anomalous behavior to improve their performance over fully unsupervised methods. DeepSAD is a semi-supervised method that extends its unsupervised predecessor Deep SVDD~\cite{deep_svdd} by including some labeled samples during training. Both DeepSAD and Deep SVDD also utilize an autoencoder in a pretraining step, a machine learning architecture\rev{, which we will look at next}. Machine learning based anomaly detection methods can utilize techniques from all of the aforementioned categories, although their suitability varies. While supervised anomaly detection methods exist, their usability not only depends on the availability of labeled training data but also on a reasonable proportionality between normal and anomalous data. Both requirements can be challenging due to labeling often being labor-intensive and anomalies' intrinsic property to occur rarely when compared to normal data, making the capture of enough anomalous behavior a hard problem. Semi-supervised anomaly detection methods are of special interest in that they may overcome these difficulties inherently present in many anomaly detection tasks~\cite{semi_ad_survey}. These methods typically have the same goal as unsupervised anomaly detection methods, which is to model the normal class behavior and delimitate it from anomalies, but they can incorporate some hand-labeled examples of normal and/or anomalous behavior to improve their performance over fully unsupervised methods. DeepSAD is a semi-supervised method that extends its unsupervised predecessor Deep SVDD~\cite{deep_svdd} by including some labeled samples during training. Both DeepSAD and Deep SVDD also utilize an autoencoder in a pretraining step, a machine learning architecture\rev{, which we will look at next}.
\newsection{autoencoder}{Autoencoder}
Autoencoders are a type of neural network architecture whose main goal is learning to encode input data into a representative state, from which the same input can be reconstructed, hence the name. They typically consist of two functions, an encoder and a decoder with a latent space \rev{in between} them as depicted in the toy example in \rev{Figure}~\ref{fig:autoencoder_general}. The encoder learns to extract the most significant features from the input and to convert them into the input's latent space representation. The reconstruction goal ensures that the most prominent features of the input are retained during the encoding phase, due to the inherent inability to reconstruct the input if too much relevant information is missing. The decoder simultaneously learns to reconstruct the original input from its encoded latent space representation by minimizing the error between the input sample and the autoencoder's output. This optimization goal complicates the categorization of autoencoders as unsupervised methods. Although they do not require labeled data, they still compute an error against a known target—the input itself. For this reason, some authors describe them as a form of self-supervised learning, where the data provides its own supervisory signal without requiring expert labeling. Autoencoders are a type of neural network architecture whose main goal is learning to encode input data into a representative state, from which the same input can be reconstructed, hence the name. They typically consist of two functions, an encoder and a decoder with a latent space \rev{in between} them, as depicted in the toy example in \rev{Figure}~\ref{fig:autoencoder_general}. The encoder learns to extract the most significant features from the input and to convert them into the input's latent space representation. The reconstruction goal ensures that the most prominent features of the input are retained during the encoding phase, due to the inherent inability to reconstruct the input if too much relevant information is missing. The decoder simultaneously learns to reconstruct the original input from its encoded latent space representation by minimizing the error between the input sample and the autoencoder's output. This optimization goal complicates the categorization of autoencoders as unsupervised methods. Although they do not require labeled data, they still compute an error against a known target—the input itself. For this reason, some authors describe them as a form of self-supervised learning, where the data provides its own supervisory signal without requiring expert labeling.
\fig{autoencoder_general}{figures/autoencoder_principle.png}{Illustration of an autoencoder's working principle. The encoder $\mathbf{g_\phi}$ compresses the input into a lower-dimensional bottleneck representation $\mathbf{z}$, which is then reconstructed by the decoder $\mathbf{f_\theta}$. During training, the difference between input and output serves as the loss signal to optimize both the encoder's feature extraction and the decoder's reconstruction. Reproduced from~\cite{ml_autoencoder_figure_source}.
}
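To make the encoder/decoder split and the reconstruction loss described above concrete, here is a minimal fully connected autoencoder sketch in PyTorch. It is an illustrative example with hypothetical layer sizes, not the architecture used in the thesis.
\begin{verbatim}
import torch
from torch import nn

class TinyAutoencoder(nn.Module):
    """Minimal dense autoencoder: encoder g_phi -> bottleneck z -> decoder f_theta."""
    def __init__(self, in_dim=784, latent_dim=32):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(in_dim, 128), nn.ReLU(),
                                     nn.Linear(128, latent_dim))
        self.decoder = nn.Sequential(nn.Linear(latent_dim, 128), nn.ReLU(),
                                     nn.Linear(128, in_dim))

    def forward(self, x):
        z = self.encoder(x)            # latent (bottleneck) representation
        return self.decoder(z), z      # reconstruction and bottleneck

model = TinyAutoencoder()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
x = torch.rand(64, 784)                           # dummy batch standing in for real inputs
reconstruction, z = model(x)
loss = nn.functional.mse_loss(reconstruction, x)  # reconstruction error drives training
loss.backward()
optimizer.step()
\end{verbatim}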
@@ -366,7 +368,7 @@ DeepSAD's full training and inference procedure is visualized in \rev{Figure}~\r
\newsection{algorithm_details}{Algorithm Details and Hyperparameters}
Since DeepSAD is heavily based on its predecessor \rev{Deep SVDD}~\cite{deep_svdd} it is helpful to first understand Deep SVDD's optimization objective, so we start with explaining it here. For input space $\mathcal{X} \subseteq \mathbb{R}^D$, output space $\mathcal{Z} \subseteq \mathbb{R}^d$, and a neural network $\phi(\wc; \mathcal{W}) : \mathcal{X} \to \mathcal{Z}$, where $\mathcal{W}$ depicts the neural network's weights with $L$ layers $\{\mathbf{W}_1, \dots, \mathbf{W}_L\}$, $n$ the number of unlabeled training samples $\{\mathbf{x}_1, \dots, \mathbf{x}_n\}$, $\mathbf{c}$ the center of the hypersphere in the latent space, Deep SVDD teaches the neural network to cluster normal data closely together in the latent space by defining its optimization objective as \rev{follows.} Since DeepSAD is heavily based on its predecessor \rev{Deep SVDD}~\cite{deep_svdd}, it is helpful to first understand Deep SVDD's optimization objective, so we start by explaining it here. For input space $\mathcal{X} \subseteq \mathbb{R}^D$, output space $\mathcal{Z} \subseteq \mathbb{R}^d$, and a neural network $\phi(\wc; \mathcal{W}) : \mathcal{X} \to \mathcal{Z}$, where $\mathcal{W}$ depicts the neural network's weights with $L$ layers $\{\mathbf{W}_1, \dots, \mathbf{W}_L\}$, $n$ the number of unlabeled training samples $\{\mathbf{x}_1, \dots, \mathbf{x}_n\}$, $\mathbf{c}$ the center of the hypersphere in the latent space, Deep SVDD teaches the neural network to cluster normal data closely together in the latent space by defining its optimization objective as \rev{follows.}
\begin{equation}
\label{eq:deepsvdd_optimization_objective}
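The body of this equation falls outside the hunk shown here; for reference, the Deep SVDD objective as defined in~\cite{deep_svdd}, written in the notation introduced above (with $\lambda > 0$ a weight-decay hyperparameter), reads:
\[
\min_{\mathcal{W}} \; \frac{1}{n} \sum_{i=1}^{n} \big\lVert \phi(\mathbf{x}_i; \mathcal{W}) - \mathbf{c} \big\rVert^{2} \;+\; \frac{\lambda}{2} \sum_{l=1}^{L} \big\lVert \mathbf{W}_l \big\rVert_{F}^{2}
\]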
@@ -425,7 +427,7 @@ To ensure our chosen dataset meets the needs of reliable degradation quantificat
\begin{enumerate}
\item \textbf{Data Modalities:}\\
The dataset must include \rev{LiDAR} sensor data, since we decided to train and evaluate our method on what should be the most universally used sensor type in the given domain. To keep our method as generalized as possible, we chose to only require range-based point cloud data and \rev{opt out of} sensor-specific data such as intensity or reflectivity, though it may be of interest for future work. It is also desirable to have complementary visual data such as camera images, for better context, manual verification and understanding of the data. The dataset must include \rev{LiDAR} sensor data, since we decided to train and evaluate our method on what should be the most universally used sensor type in the given domain. To keep our method as generalized as possible, we chose to only require range-based point cloud data and neglect sensor-specific data such as intensity or reflectivity, though it may be of interest for future work. It is also desirable to have complementary visual data, such as camera images, for better context, manual verification, and understanding of the data.
\item \textbf{Context \& Collection Method:}\\
To mirror the real-world conditions of autonomous rescue robots, the data should originate from locations such as subterranean environments (tunnels, caves, collapsed structures), which closely reflect what would be encountered during rescue missions. Ideally, it should be captured from a ground-based, self-driving robot platform in motion instead of aerial, handheld, or stationary collection, to ensure similar circumstances to the target domain.
@@ -443,13 +445,13 @@ To ensure our chosen dataset meets the needs of reliable degradation quantificat
Quantitative benchmarking of degradation quantification requires a degradation label for every scan. Ideally that label would be a continuous degradation score, although a binary label would still enable meaningful comparison. As the rest of this section shows, producing any reliable label is already challenging and assigning meaningful analog scores may not be feasible at all. Compounding the problem, no public search-and-rescue (SAR) \rev{LiDAR} data set offers such ground truth as far as we know. To understand the challenges around labeling \rev{LiDAR} data degradation, we will look at what constitutes degradation in this context. Quantitative benchmarking of degradation quantification requires a degradation label for every scan. Ideally, that label would be a continuous degradation score, although a binary label would still enable meaningful comparison. As the rest of this section shows, producing any reliable label is already challenging, and assigning meaningful analog scores may not be feasible at all. Compounding the problem, no public search-and-rescue (SAR) \rev{LiDAR} data set offers such ground truth as far as we know. To understand the challenges around labeling \rev{LiDAR} data degradation, we will look at what constitutes degradation in this context.
In \rev{Section}~\ref{sec:lidar_related_work} we discussed some internal and environmental error causes of \rev{LiDAR} sensors, such as multi-return ambiguities or atmospheric scattering respectively. While we are aware of research into singular failure \rev{modes~\cite{lidar_errormodel_particles}} or research trying to model the totality of error souces occuring in other \rev{domains~\cite{lidar_errormodel_automotive}}, there appears to be no such model for the search and rescue domain and its unique environmental circumstances. Although, scientific consensus appears to be, that airborne particles are the biggest contributor to degradation in SAR~\cite{lidar_errormodel_consensus}, we think that a more versatile definition is required to ensure confidence during critical SAR missions, which are often of a volatile nature. We are left with an ambiguous definition of what constitutes \rev{LiDAR} point cloud degradation in the SAR domain. In \rev{Section}~\ref{sec:lidar_related_work}, we discussed some internal and environmental error causes of \rev{LiDAR} sensors, such as multi-return ambiguities or atmospheric scattering, respectively. While we are aware of research into singular failure \rev{modes~\cite{lidar_errormodel_particles}} or research trying to model the totality of error sources occurring in other \rev{domains~\cite{lidar_errormodel_automotive}}, there appears to be no such model for the search and rescue domain and its unique environmental circumstances. Although scientific consensus appears to be that airborne particles are the biggest contributor to degradation in SAR~\cite{lidar_errormodel_consensus}, we think that a more versatile definition is required to ensure confidence during critical SAR missions, which are often of a volatile nature. We are left with an ambiguous definition of what constitutes \rev{LiDAR} point cloud degradation in the SAR domain.
We considered which types of objective measurements may be available to produce ground-truth labels, such as particulate matter sensors, \rev{LiDAR} point clouds' inherent properties such as range-dropout rate and others, but we fear that using purely objective measures to label the data, would limit our learning based method to imitating the labels' sources instead of differentiating all possible degradation patterns from high quality data. Due to the incomplete error model in this domain, there may be novel or compound error sources that would not be captured using such an approach. As an example, we did observe dense smoke reflecting enough rays to produce phantom objects, which may fool SLAM algorithms. Such a case may even be labeleled incorrectly as normal by one of the aforementioned objective measurement labeling options, if the surroundings do not exhibit enough dispersed smoke particles already. We considered which types of objective measurements may be available to produce ground-truth labels, such as particulate matter sensors, \rev{LiDAR} point clouds' inherent properties such as range-dropout rate and others, but we fear that using purely objective measures to label the data, would limit our learning based method to imitating the labels' sources instead of differentiating all possible degradation patterns from high quality data. Due to the incomplete error model in this domain, there may be novel or compound error sources that would not be captured using such an approach. As an example, we did observe dense smoke reflecting enough rays to produce phantom objects, which may fool SLAM algorithms. Such a case may even be labeled incorrectly as normal by one of the aforementioned objective measurement labeling options, if the surroundings do not already exhibit enough dispersed smoke particles.
To mitigate the aforementioned risks we adopt a human-centric, binary labelling strategy. We judged analog and multi-level discrete rating scales to be too subjective for human consideration, which only left us with the simplistic, but hopefully more reliable binary choice. We used two labeling approaches, producing two evaluation sets, whose motivation and details will be discussed in more detail in \rev{Section}~\ref{sec:preprocessing}. Rationale for the exact labeling procedures requires knowledge of the actual dataset we ended up choosing, which we will present in the next section. To mitigate the aforementioned risks, we adopt a human-centric, binary labelling strategy. We judged analog and multi-level discrete rating scales to be too subjective for human consideration, which only left us with the simplistic, but hopefully more reliable, binary choice. We used two labeling approaches, producing two evaluation sets, whose motivation and details will be discussed in more detail in \rev{Section}~\ref{sec:preprocessing}. Rationale for the exact labeling procedures requires knowledge of the actual dataset we ended up choosing, which we will present in the next section.
\newsection{data_dataset}{\rev{Dataset}}
@@ -500,60 +502,60 @@ We use data from the \emph{Ouster OS1-32} \rev{LiDAR} sensor, which was configur
\end{figure}
During the measurement campaign, a total of 14 experiments were conducted—10 prior to operating the artificial smoke machine (hereafter referred to as normal experiments) and 4 after it has already been running for some time (anomalous experiments). In 13 of these experiments, the sensor platform was in near-constant motion (either translating at roughly 1m/s or rotating), with only one anomalous experiment conducted while the platform remained stationary. Although this means we do not have two stationary experiments from the same exact position for a direct comparison between normal and anomalous conditions, the overall experiments are similar enough to allow for meaningful comparisons. In addition to the presence of water vapor from the smoke machine, the experiments vary in illumination conditions, the presence of humans on the measurement grounds, and additional static artifacts. For our purposes, only the artificial smoke is relevant; differences in lighting or incidental static objects do not affect our analysis. Regardless of illumination, the \rev{LiDAR} sensor consistently produces comparable point clouds, and the presence of static objects does not influence our quantification of point cloud degradation. During the measurement campaign, a total of 14 experiments were conducted—10 prior to operating the artificial smoke machine (hereafter referred to as normal experiments) and 4 after it had already been running for some time (anomalous experiments). In 13 of these experiments, the sensor platform was in near-constant motion (either translating at roughly 1m/s or rotating), with only one anomalous experiment conducted while the platform remained stationary. Although this means we do not have two stationary experiments from the same exact position for a direct comparison between normal and anomalous conditions, the overall experiments are similar enough to allow for meaningful comparisons. In addition to the presence of water vapor from the smoke machine, the experiments vary in illumination conditions, the presence of humans on the measurement grounds, and additional static artifacts. For our purposes, only the artificial smoke is relevant; differences in lighting or incidental static objects do not affect our analysis. Regardless of illumination, the \rev{LiDAR} sensor consistently produces comparable point clouds, and the presence of static objects does not influence our quantification of point cloud degradation.
In the anomalous experiments, the artificial smoke machine appears to have been running for some time before data collection began, as evidenced by both camera images and \rev{LiDAR} data showing an even distribution of water vapor around the machine. The stationary experiment is particularly unique: the smoke machine was positioned very close to the sensor platform and was actively generating new, dense smoke, to the extent that the \rev{LiDAR} registered the surface of the fresh water vapor as if it were a solid object.
The \rev{Figures}~\ref{fig:data_screenshot_pointcloud}~and~\ref{fig:data_screenshot_camera} show an representative depiction of the environment of the experiments as a camera image of the IR camera and the point cloud created by the OS1 \rev{LiDAR} sensor at practically the same time. \rev{Figures}~\ref{fig:data_screenshot_pointcloud}~and~\ref{fig:data_screenshot_camera} show a representative depiction of the environment of the experiments as a camera image of the IR camera and the point cloud created by the OS1 \rev{LiDAR} sensor at practically the same time.
\figc{data_screenshot_pointcloud}{figures/data_screenshot_pointcloud.png}{Screenshot of 3D rendering of an experiment's point cloud produced by the OS1-32 \rev{LiDAR} sensor without smoke and with illumination (same frame and roughly same alignment as \rev{Figure}~\ref{fig:data_screenshot_camera}). Point color corresponds to measurement range and axis in \rev{the center of the figure marks} the \rev{LiDAR}'s position.}{width=.9\textwidth} \figc{data_screenshot_pointcloud}{figures/data_screenshot_pointcloud.png}{Screenshot of 3D rendering of an experiment's point cloud produced by the OS1-32 \rev{LiDAR} sensor without smoke and with illumination (same frame and roughly same alignment as \rev{Figure}~\ref{fig:data_screenshot_camera}). The point color corresponds to the measurement range, and the axis in \rev{the center of the figure marks} the \rev{LiDAR}'s position.}{width=.9\textwidth}
\figc{data_screenshot_camera}{figures/data_screenshot_camera.png}{Screenshot of IR camera output of an experiment without smoke and with illumination (same frame and roughly same alignment as \rev{Figure}~\ref{fig:data_screenshot_pointcloud})\rev{.}}{width=.9\textwidth}
Regarding the dataset volume, the 10 normal experiments ranged from 88.7 to 363.1 seconds, with an average duration of 157.65 seconds. At a capture rate of 10 frames per second, these experiments yield 15,765 non-degraded point clouds. In contrast, the 4 anomalous experiments, including one stationary experiment lasting 11.7 seconds and another extending to 62.1 seconds, averaged 47.33 seconds, resulting in 1,893 degraded point clouds. In total, the dataset comprises 17,658 point clouds, with approximately 89.28\% classified as non-degraded (normal) and 10.72\% as degraded (anomalous). The distribution of experimental data is visualized in \rev{Figure}~\ref{fig:data_points_pie}.
\fig{data_points_pie}{figures/data_points_pie.png}{Pie chart visualizing the amount and distribution of normal and anomalous point clouds in \cite{subter}\rev{.}} \fig{data_points_pie}{figures/data_points_pie.png}{Pie chart visualizing the amount and distribution of normal and anomalous LiDAR frames (i.e., point clouds) in \cite{subter}\rev{.}}
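The frame counts quoted above follow directly from the stated durations and the 10 Hz capture rate; a quick arithmetic check:
\begin{verbatim}
normal_frames    = round(10 * 157.65 * 10)   # 10 runs, avg 157.65 s, 10 fps -> 15765
anomalous_frames = round(4 * 47.33 * 10)     # 4 runs, avg 47.33 s, 10 fps -> 1893
total = normal_frames + anomalous_frames     # 17658
print(round(100 * normal_frames / total, 2),     # 89.28
      round(100 * anomalous_frames / total, 2))  # 10.72
\end{verbatim}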
The artificial smoke introduces measurable changes that clearly separate the \textit{anomalous} runs from the \textit{normal} baseline. One change is a larger share of missing points per scan: smoke particles scatter or absorb the laser beam before it reaches a solid target, so the sensor reports an error instead of a distance. Figure~\ref{fig:data_missing_points} shows the resulting rightshift of the missing-point histogram, a known effect for \rev{LiDAR} sensors in aerosol-filled environments. Another demonstrative effect is the appearance of many spurious returns very close to the sensor; these near-field points arise when back-scatter from the aerosol itself is mistaken for a surface echo. The box-plot in \rev{Figure}~\ref{fig:particles_near_sensor} confirms a pronounced increase in sub-50 cm hits under smoke, a range at which we do not expect any non-erroneous measurements. Both effects are consistent with the behaviour reported in \rev{\cite{when_the_dust_settles}}. The artificial smoke introduces measurable changes that clearly separate the \textit{anomalous} runs from the \textit{normal} baseline. One change is a larger share of missing points per scan: smoke particles scatter or absorb the laser beam before it reaches a solid target, so the sensor reports an error instead of a distance. Figure~\ref{fig:data_missing_points} shows the resulting rightshift of the missing-point histogram, a known effect for \rev{LiDAR} sensors in aerosol-filled environments. Another demonstrative effect is the appearance of many spurious returns very close to the sensor; these near-field points arise when back-scatter from the aerosol itself is mistaken for a surface echo. The box plot in \rev{Figure}~\ref{fig:particles_near_sensor} confirms a pronounced increase in sub-50 cm hits under smoke, a range at which we do not expect any non-erroneous measurements. Both effects are consistent with the behaviour reported in \rev{\cite{when_the_dust_settles}}.
\fig{data_missing_points}{figures/data_missing_points.png}{Density histogram showing the percentage of missing measurements per scan for normal experiments without degradation and anomalous experiments with artifical smoke introduced as degradation.} \fig{data_missing_points}{figures/data_missing_points.png}{Density histogram showing the percentage of missing measurements per scan for normal experiments without degradation and anomalous experiments with artificial smoke introduced as degradation.}
\fig{particles_near_sensor}{figures/particles_near_sensor_boxplot_zoomed_500.png}{Box plot depicting the percentage of measurements closer than 50 centimeters to the sensor for normal and anomalous experiments.}
Taken together, the percentage of missing points and the proportion of near-sensor returns provide a concise indication of how strongly the smoke degrades our scans—capturing the two most prominent aerosol effects, drop-outs and back-scatter spikes. They do not, however, reveal the full error landscape discussed earlier (compound errors, temperature drift, multipath, \dots), so they should be read as an easily computed synopsis rather than an exhaustive measure of \rev{LiDAR} quality. Next, we will discuss how the \rev{LiDAR} scans were preprocessed before use and how we actually assigned ground-truth labels to each scan, so \rev{that} we could train and evaluate our degradation quantification methods.
\newsection{preprocessing}{Preprocessing Steps and Labeling}
As described in Section~\ref{sec:algorithm_description}, the method under evaluation is data type agnostic and can be adapted to work with any kind of data by choosing a suitable autoencoder architecture. In our case, the input data are point clouds produced by a \rev{LiDAR} sensor. Each point cloud contains up to 65,536 points, with each point represented by its \emph{X}, \emph{Y}, and \emph{Z} coordinates. To tailor the DeepSAD architecture to this specific data type, we would need to design an autoencoder suitable for processing three-dimensional point clouds. Although autoencoders can be developed for various data types, \rev{\cite{autoencoder_survey} observed} that over 60\% of recent research on autoencoders focuses on two-dimensional image classification and reconstruction. Consequently, there is a more established understanding of autoencoder architectures for images compared to those for three-dimensional point clouds.
For this reason and to simplify the architecture, we converted the point clouds into two-dimensional grayscale images using a spherical projection. This approach—proven successful in related work~\cite{degradation_quantification_rain}—encodes each \rev{LiDAR} measurement as a single pixel, where the pixel's grayscale value is determined by the reciprocal range, calculated as $v = \frac{1}{\sqrt{\emph{X}^2 + \emph{Y}^2 + \emph{Z}^2}}$. Given the \rev{LiDAR} sensor's configuration, the resulting images have a resolution of 2048 pixels in width and 32 pixels in height. Missing measurements in the point cloud are mapped to pixels with a brightness value of $v = 0$.
To create this mapping, we leveraged the available measurement indices and channel information inherent in the dense point clouds, which are ordered from 0 to 65,535 in a horizontally ascending, channel-by-channel manner. For sparse point clouds without such indices, one would need to rely on the pitch and yaw angles relative to the sensor's origin to correctly map each point to its corresponding pixel, although this often leads to ambiguous mappings due to numerical errors in angle estimation.
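As an illustration, the following Python sketch shows one possible implementation of this projection, assuming the index-ordered dense point cloud described above; the function name and the convention that missing returns appear as all-zero points are illustrative assumptions, not the exact implementation.
\begin{verbatim}
import numpy as np

H, W = 32, 2048  # channels (rows) and azimuth steps (columns)

def project_to_range_image(points):
    """Project an index-ordered (H*W, 3) point cloud to an (H, W) image
    of reciprocal ranges; missing returns are mapped to 0."""
    img = np.zeros((H, W), dtype=np.float32)
    rng = np.linalg.norm(points, axis=1)   # Euclidean range per point
    valid = rng > 0                        # assumption: zero range marks a missing return
    idx = np.arange(points.shape[0])
    rows, cols = idx // W, idx % W         # channel-by-channel, horizontally ascending
    img[rows[valid], cols[valid]] = 1.0 / rng[valid]
    return img
\end{verbatim}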
Figure~\ref{fig:data_projections} displays two examples of \rev{LiDAR} point cloud projections to aid in the reader's understanding. Although the original point clouds were converted into grayscale images with a resolution of 2048×32 pixels, these raw images can be challenging to interpret. To enhance human readability, we applied the viridis colormap and vertically stretched the images so that each measurement occupies multiple pixels in height. The projection in (a) is derived from a scan without artificial smoke—and therefore minimal degradation—while the projection in (b) comes from an experiment where artificial smoke introduced significant degradation.
\fig{data_projections}{figures/data_2d_projections.png}{Two-dimensional projections of two point clouds, (a) from an experiment without degradation and (b) from an experiment with artificial smoke as degradation. To aid the reader's perception, the images are vertically stretched, and a colormap has been applied to the pixels' reciprocal range values, while the actual training data is grayscale.}
The remaining challenge was labeling a large enough portion of the dataset in a reasonably accurate manner, whose difficulties and general approach we described in \rev{Section}~\ref{sec:data_req}. Since, to our knowledge, neither our chosen dataset nor any other publicly available one provides objective labels for \rev{LiDAR} data degradation in the SAR domain, we had to define our own labeling approach. With objective measures of degradation unavailable, we explored alternative labeling methods—such as using \rev{the statistical} properties like the number of missing measurements per point cloud or the higher incidence of erroneous measurements near the sensor that we described in \rev{Section~\ref{sec:data_dataset}}. Ultimately, we were concerned that these statistical approaches might lead the method to simply mimic the statistical evaluation rather than to quantify degradation in a generalized and robust manner. After considering these options, we decided to label all point clouds from experiments with artificial smoke as anomalies, while point clouds from experiments without smoke were labeled as normal data. This labeling strategy—based on the presence or absence of smoke—is fundamentally an environmental indicator, independent of the intrinsic data properties recorded during the experiments.
The simplicity of this labeling approach has both advantages and disadvantages. On the positive side, it is easy to implement and creates a clear distinction between normal and anomalous data. However, its simplicity is also its drawback: some point clouds from experiments with artificial smoke do not exhibit perceptible degradation, yet they are still labeled as anomalies. The reason for this is that during the three non-static anomalous experiments, the sensor platform starts recording in a tunnel roughly 20 meters from the smoke machine's location. It starts by approaching the smoke machine, navigates close to the machine for some time, and then leaves its perimeter once again. Since the artificial smoke's density is far larger near the machine it originates from, the time the sensor platform spent close to it produced highly degraded point clouds, whereas the beginnings and ends of the anomalous experiments capture point clouds that are subjectively not degraded and appear similar to those from the normal experiments. This effect is clearly illustrated by the degradation indicators which we talked about earlier\rev{--}the proportion of missing points and the amount of erroneous points close to the sensor per point cloud\rev{--}as can be seen in \rev{Figure}~\ref{fig:data_anomalies_timeline}.
\fig{data_anomalies_timeline}{figures/data_combined_anomalies_timeline.png}{Missing points and points with a measured range smaller than 50 cm per point cloud over a normalized timeline of the individual experiments. This illustrates the rise, plateau, and fall of degradation intensity during the anomalous experiments, owing to the spatial proximity between the LiDAR sensor and the degradation source (smoke machine). One of the normal experiments (without artificial smoke) is included as a baseline \rev{in gray}.}
Concerned that the incorrectly labeled data might negatively impact DeepSAD's semi-supervised training, we chose to manually remove the anomalous labels from the beginning and end of the anomalous experiments for training purposes. This refinement gave us more confidence in the training signal but reduced the number of labeled anomalies. For evaluation, we therefore report results under both schemes:
\begin{enumerate}
\item \textbf{Experiment-based labels:} All scans from anomalous experiments are marked anomalous, including border cases. This yields conservative performance metrics that reflect real-world label noise.
\item \textbf{Manually-defined labels:} Only unequivocally degraded scans are marked anomalous, producing near-ideal separation in many cases.
\end{enumerate}
Under both evaluation schemes, all frames from normal experiments were marked as normal, since they appear to have produced high-quality data throughout. A visualization of how the two evaluation schemes compare in terms of the number of samples per class can be seen in \rev{Figure}~\ref{fig:data_eval_labels}.
\fig{data_eval_labels}{figures/data_eval_labels.png}{Pie charts visualizing the number of normal and anomalous labels applied to the dataset for (a) the experiment-based labeling scheme and (b) the manually-defined labeling scheme. A large part of the experiment-based anomalous labels had to be removed for the manually-defined scheme, since, subjectively, they were either clearly or possibly not degraded.}
By evaluating and comparing both approaches, we hope to demonstrate a more thorough performance investigation than with only one of the two \rev{labeling schemes}.
\newchapter{experimental_setup}{Experimental Setup}
In the following sections, we detail our adaptations to this framework:
\item Experimental environment: the hardware and software stack used, with typical training and inference runtimes.
\end{itemize}
Together, these components define the full experimental pipeline, from data loading, preprocessing, and method training to the evaluation and comparison of methods.
\section{Framework \& Data Preparation}
DeepSAD's PyTorch implementation—our starting point—includes implementations for training on standardized datasets such as MNIST, CIFAR-10, and datasets from \citetitle{odds}~\cite{odds}. The framework can train and test DeepSAD as well as a number of baseline algorithms, namely SSAD, OCSVM, Isolation Forest, KDE, and SemiDGM, with the loaded data and evaluate their performance by calculating the Receiver Operating Characteristic (ROC) and its Area Under the Curve (AUC) for all given algorithms. We adapted this implementation, which was originally developed for Python 3.7, to work with Python 3.12 and changed or added \rev{functionality. We allowed loading data from our} chosen dataset, added DeepSAD models that work with the \rev{LiDAR} projections datatype, added more evaluation methods, and implemented an inference module.
The raw SubTER dataset is provided as one ROS bag file per experiment, each containing a dense 3D point cloud from the Ouster OS1-32 \rev{LiDAR}. To streamline training and avoid repeated heavy computation, we project these point clouds offline into 2D “range images” as described in \rev{Section}~\ref{sec:preprocessing} and export them to files as NumPy arrays. Storing precomputed projections allows rapid data loading during training and evaluation. Many modern \rev{LiDARs} can be configured to output range images directly, which would bypass the need for post-hoc projection. When available, such native range-image streams can further simplify preprocessing or even allow skipping this step completely.
We extended the DeepSAD framework's PyTorch \texttt{DataLoader} by implementing a custom \texttt{Dataset} class that ingests our precomputed NumPy range-image files and attaches appropriate evaluation labels. Each experiment's frames are stored as individual \texttt{.npy} files with the NumPy array shape \((\text{Number of Frames}, H, W)\), containing the point clouds' reciprocal range values. Our \texttt{Dataset} initializer scans a directory of these files, loads the NumPy arrays from file into memory, transforms them into PyTorch tensors, and assigns evaluation and training labels accordingly.
The first labeling scheme, called \emph{experiment-based labels}, assigns
\[
y_{\mathrm{exp}} =
\begin{cases}
-1, & \text{if the scan stems from an experiment with artificial smoke,}\\
+1, & \text{otherwise.}
\end{cases}
\]
At load time, any file with “smoke” in its name is treated as anomalous (label \(-1\)), and all others (normal experiments) are labeled \(+1\).
To obtain a second source of ground truth, we also support \emph{manually-defined labels}. A companion JSON file specifies a start and end frame index for each of the four smoke experiments—defining the interval of unequivocal degradation. During loading, the second label $y_{man}$ is assigned as follows:
\[
y_{\mathrm{man}} =
\begin{cases}
-1, & \text{if the frame lies within the manually defined degradation interval,}\\
+1, & \text{otherwise.}
\end{cases}
\]
We pass instances of this \texttt{Dataset} to PyTorch's \texttt{DataLoader}, enabling batch sampling, shuffling, and multi-worker loading. The dataloader returns the preprocessed \rev{LiDAR} projection, both evaluation labels, and a semi-supervised training label.
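The following sketch outlines the core of such a \texttt{Dataset} class; the class name, file layout, and JSON interval format are illustrative assumptions rather than the exact implementation, and the semi-supervised training label is omitted for brevity.
\begin{verbatim}
import json
from pathlib import Path

import numpy as np
import torch
from torch.utils.data import Dataset

class SubterRangeImages(Dataset):
    """Loads per-experiment .npy files of shape (num_frames, H, W) and
    attaches both evaluation labels (+1 normal, -1 anomalous)."""

    def __init__(self, root, manual_intervals_json):
        intervals = json.loads(Path(manual_intervals_json).read_text())
        self.frames, self.y_exp, self.y_man = [], [], []
        for npy in sorted(Path(root).glob("*.npy")):
            data = np.load(npy)                  # (num_frames, H, W)
            smoke = "smoke" in npy.name          # experiment-based label
            start, end = intervals.get(npy.stem, (None, None))
            for i, frame in enumerate(data):
                self.frames.append(torch.from_numpy(frame).float().unsqueeze(0))
                self.y_exp.append(-1 if smoke else 1)
                degraded = smoke and start is not None and start <= i <= end
                self.y_man.append(-1 if degraded else 1)

    def __len__(self):
        return len(self.frames)

    def __getitem__(self, idx):
        return self.frames[idx], self.y_exp[idx], self.y_man[idx]
\end{verbatim}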
To control the supervision of DeepSAD's training, our custom PyTorch \texttt{Dataset} accepts two integer parameters, \texttt{num\_labelled\_normal} and \texttt{num\_labelled\_anomalous}, which specify how many samples of each class should retain their labels during training. We begin with the manually-defined evaluation labels, so as not to use mislabeled anomalous frames for the semi-supervision. Then, we randomly un-label (set to 0) enough samples of each class until exactly \texttt{num\_labelled\_normal} normals and \texttt{num\_labelled\_anomalous} anomalies remain labeled.
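A minimal sketch of this un-labeling step, assuming labels encoded as $+1$ (normal), $-1$ (anomalous), and $0$ (unlabeled); the function name and seed are illustrative.
\begin{verbatim}
import numpy as np

def make_semi_labels(y_man, num_labelled_normal, num_labelled_anomalous, seed=0):
    """Keep only the requested number of labels per class; all other
    samples become unlabeled (0) for DeepSAD's semi-supervised training."""
    rng = np.random.default_rng(seed)
    semi = np.asarray(y_man).copy()
    for cls, keep in ((1, num_labelled_normal), (-1, num_labelled_anomalous)):
        idx = np.flatnonzero(semi == cls)
        drop = rng.choice(idx, size=max(len(idx) - keep, 0), replace=False)
        semi[drop] = 0
    return semi
\end{verbatim}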
To obtain robust performance estimates on our relatively small dataset, we implement $k$-fold cross-validation. A single integer parameter, \texttt{num\_folds}, controls the number of splits. We use scikit-learn's \texttt{KFold} (from \texttt{sklearn.model\_selection}) with \texttt{shuffle=True} and a fixed random seed to partition each experiment's frames into \texttt{num\_folds} disjoint folds. Training then proceeds across $k$ rounds, each time training on $(k-1)/k$ of the data and evaluating on the remaining $1/k$. In our experiments, we set \texttt{num\_folds=5}, yielding an 80/20 train/evaluation split per fold.
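A condensed sketch of the cross-validation split; the seed value and the \texttt{num\_frames\_in\_experiment} placeholder are assumptions.
\begin{verbatim}
import numpy as np
from sklearn.model_selection import KFold

num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)  # fixed seed (assumed value)

frame_indices = np.arange(num_frames_in_experiment)
for fold, (train_idx, eval_idx) in enumerate(kf.split(frame_indices)):
    # each fold trains on ~80% of the frames and evaluates on the remaining ~20%
    train_frames, eval_frames = frame_indices[train_idx], frame_indices[eval_idx]
\end{verbatim}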
For inference (i.e., model validation on held-out experiments), we provide a second \texttt{Dataset} class that loads a single experiment's NumPy file (no k-fold splitting), does not assign any labels to the frames, nor does it shuffle frames, preserving temporal order. This setup enables seamless, frame-by-frame scoring of complete runs—crucial for analyzing degradation dynamics over an entire experiment.
\section{Model Configuration}
Since the neural network architecture trained in the \rev{DeepSAD} method is not fixed as described in \rev{Section}~\ref{sec:algorithm_details} but rather chosen based on the input data, we also had to choose an autoencoder architecture befitting our preprocessed \rev{LiDAR} data projections. Because \rev{\cite{degradation_quantification_rain}} reported success in training DeepSAD on similar data, we first adapted the network architecture utilized by them for our use case, which is based on the simple and well-understood LeNet architecture~\cite{lenet}. Additionally, we were interested in evaluating the importance and impact of a well-suited network architecture for DeepSAD's performance and therefore designed a second network architecture, henceforth \rev{referred} to as the ``efficient architecture'', which incorporates a few modern techniques befitting our use case.
The LeNet-inspired autoencoder can be split into an encoder network (\rev{Figure}~\ref{fig:setup_arch_lenet_encoder}) and a decoder network (\rev{Figure}~\ref{fig:setup_arch_lenet_decoder}) with a latent space \rev{in between} the two parts. Such an arrangement is typical for autoencoder architectures, as we discussed in \rev{Section}~\ref{sec:autoencoder}. The encoder network is simultaneously DeepSAD's main training architecture, which is used to infer the degradation quantification in our use case, once trained.
\figc{setup_arch_lenet_encoder}{diagrams/arch_lenet_encoder}{
Architecture of the LeNet-inspired encoder. The input is a \rev{LiDAR} range image of size
$4\times 512\times 8$ (channels $\times$ width $\times$ height).
The first upsampling stage applies interpolation with scale factor 2, followed by a
transpose convolution with 8 output channels, batch normalization, and LeakyReLU activation,
yielding $8\times 1024\times 16$. The second stage again upsamples by a factor of 2 and applies
a transpose convolution, reducing the channels to 1. This produces the reconstructed output
of size $1\times 2048\times 32$, which matches the original input dimensionality required
for the autoencoding objective.
The decoder network (see \rev{Figure}~\ref{fig:setup_arch_lenet_decoder}) mirrors the encoder and reconstructs the input from its latent representation. A dense layer first expands the latent vector into a feature map of shape $4\times 512\times 8$, which is then upsampled and refined in two successive stages. Each stage consists of an interpolation step that doubles the spatial resolution, followed by a transpose convolution that learns how to add structural detail. The first stage operates on 4 channels, and the second on 8 channels, with the final transpose convolution reducing the output to a single channel. The result is a reconstructed output of size $1\times 2048\times 32$, matching the original input dimensionality required for the autoencoding objective.
Even though the LeNet-inspired encoder proved capable of achieving our degradation quantification objective in initial experiments, we identified several shortcomings that motivated the design of a second, more efficient architecture. The most important issue concerns the shape of the CNN's receptive field (RF), which describes the region of the input that influences a single output activation. Its size and aspect ratio determine which structures the network can effectively capture: if the RF is too small, larger patterns cannot be detected, while an excessively large RF may hinder the network from learning to recognize fine details. For standard image data, the RF is often expressed as a symmetric $n \times n$ region, but in principle it can be computed independently per axis.
%\figc{setup_ef_concept}{figures/setup_ef_concept}{Receptive fields in a CNN. Each output activation aggregates information from a region of the input; stacking layers expands this region, while kernel size, stride, and padding control how quickly it grows and what shape it takes. (A) illustrates slower, fine-grained growth; (B) shows faster expansion, producing a larger—potentially anisotropic—receptive field and highlighting the trade-off between detail and context. Reproduced from~\cite{ef_concept_source}}{width=.6\textwidth}
The issue with the RF shape arises from the fact that spinning multi-beam \rev{LiDARs} often produce point clouds possessing dense horizontal but limited vertical resolution. In our \rev{case, this} results in a pixel-per-degree resolution of approximately $5.69\,\sfrac{pixel}{deg}$ vertically and $1.01\,\sfrac{pixel}{deg}$ horizontally. Consequently, the LeNet-inspired encoder's calculated receptive field of $16 \times 16$ pixels translates to an angular size of $15.88^{\circ} \times 2.81^{\circ}$, which is highly rectangular in angular space. Such a mismatch risks limiting the network's ability to capture degradation patterns that extend differently across the two axes.
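For reference, the angular extent follows directly from dividing the receptive field size in pixels by the per-axis pixel-per-degree resolution; the short sketch below uses the values quoted above and is purely illustrative.
\begin{verbatim}
px_per_deg = {"vertical": 5.69, "horizontal": 1.01}  # values as quoted above
rf_px = 16                                           # LeNet-inspired encoder: 16 x 16 px RF
rf_deg = {axis: rf_px / res for axis, res in px_per_deg.items()}
# -> roughly 2.8 degrees along the 5.69 px/deg axis and 15.8 degrees along the
#    1.01 px/deg axis, in line with the 15.88 x 2.81 degree field reported above
\end{verbatim}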
To adjust for this, we modified the network architecture and included further adjustments to improve the method's performance. The encoder (see \rev{Figure}~\ref{fig:setup_arch_ef_encoder}) follows the same general idea as the LeNet-inspired encoder, but incorporates the following modifications:
\begin{itemize}
\item \textbf{Non-square convolution kernels.} Depthwise-separable convolutions with kernel size $3 \times 17$ are used instead of square kernels, resulting in an RF of $10 \times 52$ pixels, corresponding to $9.93^{\circ} \times 9.14^{\circ}$, substantially more balanced than the LeNet-inspired network's RF.
\item \textbf{Circular padding along azimuth.} The horizontal axis is circularly padded to respect the wrap-around of $360^{\circ}$ \rev{LiDAR} data, preventing artificial seams at the image boundaries.
\end{itemize}
The decoder (see \rev{Figure}~\ref{fig:setup_arch_ef_decoder}) mirrors the encoder and incorporates the following adjustments:
\begin{itemize}
\item \textbf{Nearest-neighbor upsampling followed by convolution.} Instead of relying solely on transposed convolutions, each upsampling stage first enlarges the feature map using parameter-free nearest-neighbor interpolation, followed by a depthwise-separable convolution. This strategy reduces the risk of checkerboard artifacts while still allowing the network to learn fine detail.
\item \textbf{Asymmetric upsampling schedule.} Horizontal resolution is restored more aggressively (e.g., scale factor $1 \times 4$) to reflect the anisotropic downsampling performed in the encoder.
\item \textbf{Final convolution with circular padding.} The output is generated using a $(3 \times 17)$ convolution with circular padding along the azimuth, similar to the new encoder, ensuring consistent treatment of the 360° \rev{LiDAR} input.
\end{itemize}
\fig{setup_arch_ef_decoder}{diagrams/arch_ef_decoder}{
}
To compare the computational efficiency of the two architectures, we show the number of trainable parameters and the number of multiply-accumulate operations (MACs) for different latent space sizes used in our experiments in \rev{Table}~\ref{tab:params_lenet_vs_efficient}. Even though the efficient architecture employs more layers and channels, which allows the network to learn to recognize more types of patterns when compared to the LeNet-inspired one, the encoders' MACs are quite similar. The more complex decoder design of the efficient network appears to contribute substantially more MACs, leading to longer pretraining times, which we report in \rev{Section}~\ref{sec:setup_experiments_environment}.
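Trainable-parameter counts of this kind can be obtained with a one-line reduction over the instantiated PyTorch modules; the sketch below is illustrative, and the MAC counts additionally require a FLOP-counting tool.
\begin{verbatim}
import torch

def count_trainable_params(model: torch.nn.Module) -> int:
    """Number of trainable parameters of an encoder or autoencoder module."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# MACs are not exposed by PyTorch directly; a separate profiling/FLOP-counting
# tool run on a single range-image-sized input yields the multiply-accumulate counts.
\end{verbatim}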
%& \multicolumn{4}{c}{\textbf{Encoders}} & \multicolumn{4}{c}{\rev{\textbf{Autoencoders (Encoder $+$ Decoder)}}} \\
\FloatBarrier
\newsection{setup_baselines_evaluation}{Baseline Methods \& Evaluation Metrics}
To contextualize the performance of DeepSAD, we compare against two widely used baselines: Isolation Forest and OCSVM. Both are included in the original DeepSAD codebase and associated paper, and they represent well-understood yet conceptually distinct families of anomaly detection. Together, these baselines provide complementary perspectives: raw input tree-based partitioning (Isolation Forest) and dimensionality-reduced kernel-based boundary learning (OCSVM), offering a broad and well-established basis for comparison.
\paragraph{Isolation Forest} is an ensemble method for anomaly detection that builds on the principle that anomalies are easier to separate from the rest of the data. It constructs many binary decision trees, each by recursively splitting the data at randomly chosen features and thresholds. In this process, the “training” step consists of building the forest of trees: each tree captures different random partitions of the input space, and together they form a diverse set of perspectives on how easily individual samples can be isolated.
Once trained, the method assigns an anomaly score to new samples by measuring their average path length through the trees. Normal samples, being surrounded by other similar samples, typically require many recursive splits and thus end up deep in the trees. Anomalies, by contrast, stand out in one or more features, which means they can be separated much earlier and end up closer to the root. The shorter the average path length, the more anomalous the sample is considered. This makes Isolation Forest highly scalable and robust: training is efficient, and the resulting model is fast to apply to new data. In our setup, we apply Isolation Forest directly to the \rev{LiDAR} input representation, providing a strong non-neural baseline for comparison against DeepSAD.
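A minimal sketch of this baseline, assuming the frames are flattened range images held in NumPy arrays; the file names and hyperparameters are illustrative assumptions.
\begin{verbatim}
import numpy as np
from sklearn.ensemble import IsolationForest

X_train = np.load("train_frames.npy").reshape(-1, 32 * 2048)  # flattened range images
X_test = np.load("test_frames.npy").reshape(-1, 32 * 2048)

iso = IsolationForest(n_estimators=100, random_state=0)
iso.fit(X_train)

# score_samples is higher for normal data; negate it so that larger values
# indicate stronger anomalies, mirroring the convention used by DeepSAD.
anomaly_scores = -iso.score_samples(X_test)
\end{verbatim}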
\paragraph{OCSVM} takes a very different approach by learning a flexible boundary around normal samples. It assumes all training data to be normal, with the goal of enclosing the majority of these samples in such a way that new points lying outside this boundary can be identified as anomalies. The boundary itself is learned using the support vector machine framework. In essence, OCSVM looks for a hyperplane in some feature space that maximizes the separation between the bulk of the data and the origin. To make this possible, even when the normal data has a complex, curved shape, OCSVM uses a kernel function such as the radial basis function (RBF). The kernel implicitly maps the input data into a higher-dimensional space, where the cluster of normal samples becomes easier to separate with a simple hyperplane. When this separation is mapped back to the original input space, it corresponds to a flexible, nonlinear boundary that can adapt to the structure of the data.
During training, the algorithm balances two competing objectives: capturing as many of the normal samples as possible inside the boundary, while keeping the region compact enough to exclude potential outliers. Once this boundary is established, applying OCSVM is straightforward — any new data point is checked against the learned boundary, with points inside considered normal and those outside flagged as anomalous.
In our setting, the raw input dimensionality ($2048 \times 32$ per frame) is too high for a direct OCSVM fit, so we reuse the autoencoder's encoder from DeepSAD's pretraining as a learned dimensionality reduction (to the same latent size as DeepSAD) to allow OCSVM training on this latent space. The dimensionality reduction step is always performed with the corresponding DeepSAD encoder and its autoencoder pretraining weights that match the evaluated setting (i.e., same latent size and backbone).
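A sketch of this two-stage setup, assuming a pretrained encoder module and index arrays for the current fold; the handles \texttt{encoder}, \texttt{frames}, \texttt{train\_idx}, \texttt{test\_idx}, and the OCSVM hyperparameters are illustrative assumptions.
\begin{verbatim}
import torch
from sklearn.svm import OneClassSVM

encoder.eval()                               # pretrained DeepSAD encoder (assumed handle)
with torch.no_grad():
    latent = encoder(frames).cpu().numpy()   # (n, 1, 32, 2048) frames -> (n, latent_dim)

ocsvm = OneClassSVM(kernel="rbf", nu=0.1, gamma="scale")
ocsvm.fit(latent[train_idx])                 # fit on the training fold's latent codes

# decision_function is positive inside the learned boundary and negative outside;
# negate it so that larger values indicate stronger anomalies.
scores = -ocsvm.decision_function(latent[test_idx])
\end{verbatim}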
We adapted the baseline implementations to our data loader and input format and added support for multiple evaluation targets per frame (two labels per data point), reporting both results per experiment. Both baselines, like DeepSAD, output continuous anomaly scores, which allows us to evaluate them directly without committing to a fixed threshold.
\paragraph{Evaluation Metrics}
As discussed in Section~\ref{sec:preprocessing}, evaluating model performance in our setup is challenging due to the absence of analog ground truth. Instead, we rely on binary labels that are additionally noisy and subjective. All models under consideration produce continuous anomaly scores: DeepSAD outputs a positive-valued distance to the center of a hypersphere, Isolation Forest measures deviation from the mean tree depth (which can be negative), and OCSVM returns a signed distance to the decision boundary. Because these scores differ in scale and sign—and due to the lack of a reliable degradation threshold—it is not appropriate to evaluate performance using metrics such as accuracy or F1 score, both of which require classification at a fixed threshold.
Instead, we adopt threshold-independent evaluation curves that illustrate model behavior across the full range of possible thresholds. The most commonly used of these is the Receiver Operating Characteristic (ROC)~\cite{roc} curve, along with its scalar summary metric, ROC AUC. ROC curves plot the true positive rate (TPR) against the false positive rate (FPR), providing insight into how well a model separates the two classes. However, as noted in~\cite{roc_vs_prc2,roc_vs_prc} and confirmed in our own testing, ROC AUC can be misleading under strong class imbalance—a common condition in anomaly detection.
To address this, we instead rely on Precision--Recall Curves (PRC)~\cite{prc}, which better capture model behavior on the minority class. PRC plots precision—the fraction of predicted anomalies that are correct—against recall—the fraction of true anomalies that are detected. As the decision threshold is lowered, recall increases but typically at the cost of precision, since more false positives are included. This trade-off is captured across all thresholds. The metric definitions are as follows:
\[
\text{Precision} = \frac{\text{TP}}{\text{TP} + \text{FP}}, \quad
\text{Recall} = \frac{\text{TP}}{\text{TP} + \text{FN}}.
\]
In our evaluation, this distinction proved practically significant. As illustrated in Figure~\ref{fig:roc_vs_prc}, ROC AUC values in (a) appear similarly strong for both Isolation Forest and DeepSAD (0.693 vs. 0.782), suggesting comparable performance. However, the PRC in (b) reveals a clear divergence: while DeepSAD maintains high precision across recall levels, Isolation Forest suffers a steep decline in precision as recall increases, due to a high number of false positives. The resulting Average Precision (AP)—the area under the PRC—is much lower for Isolation Forest (0.207 vs. 0.633), offering a more realistic account of its performance under imbalance.
\figc{roc_vs_prc}{figures/setup_roc_vs_prc.png}{Comparison of ROC (a) and PRC (b) for the same evaluation run. ROC fails to reflect the poor performance of Isolation Forest, which misclassifies many normal samples as anomalous at lower thresholds. The PRC exposes this effect, resulting in a substantially lower AP for Isolation Forest than for DeepSAD.}{width=.9\textwidth}
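Both curve-based metrics are readily available in scikit-learn; a minimal sketch, assuming anomalies are encoded as the positive class (1) in \texttt{y\_true} and larger scores mean more anomalous (both arrays are placeholders).
\begin{verbatim}
from sklearn.metrics import (average_precision_score, precision_recall_curve,
                             roc_auc_score)

# y_true: 1 for degraded (anomalous) frames, 0 for normal frames
# scores: continuous anomaly scores, larger = more anomalous
roc_auc = roc_auc_score(y_true, scores)
precision, recall, thresholds = precision_recall_curve(y_true, scores)
ap = average_precision_score(y_true, scores)
\end{verbatim}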
In addition to cross-validated performance comparisons, we also apply the trained models to previously unseen, temporally ordered experiments to simulate inference in realistic conditions. Since each method produces scores on a different scale—with different signs and ranges—raw scores are not directly comparable. To enable comparison, we compute a $z$-score~\cite{zscore} per frame, defined as the number of standard deviations a score deviates from the mean of the normal data. To perform the normalization, we compute the mean and standard deviation of anomaly scores on a clean reference experiment. These values are then used to normalize scores from degraded experiments, making it easy to see how much each method's output deviates from its own baseline under degradation. It also facilitates a unified view across methods, even though their outputs are otherwise heterogeneous. In this way, $z$-score normalization supports threshold-free interpretation and enables consistent model comparison during inference.
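A minimal sketch of this normalization, where \texttt{ref\_scores} are the anomaly scores of a clean reference experiment and \texttt{run\_scores} those of the experiment under inspection (both names are placeholders).
\begin{verbatim}
import numpy as np

mu, sigma = ref_scores.mean(), ref_scores.std()

# per-frame z-score: how many standard deviations a score lies above the
# method's own clean baseline
z = (run_scores - mu) / sigma
\end{verbatim}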
In conclusion, the combination of unreliable thresholds and pronounced class imbalance renders traditional threshold-based metrics unsuitable for our setting. PRC and AP provide a more faithful representation of model behavior across thresholds, while $z$-score normalization extends this by enabling direct comparison of inference-time outputs across methods and conditions.
\newsection{setup_experiments_environment}{Experiment Overview \& Computational Environment}
Across all experiments, we vary three factors: (i) latent space dimensionality, (ii) encoder architecture (LeNet-inspired vs. Efficient), and (iii) the amount of semi-supervision (labeling regime). To keep results comparable, we fix the remaining training hyperparameters: all autoencoders are pretrained for $E_A = 50$~epochs with ADAM as an optimizer at a starting learning rate of $L_A = 1\cdot 10^{-5}$; all DeepSAD models are then trained for $E_M = 150$~epochs with the same optimizer and starting learning rate ($L_M = 1\cdot 10^{-5}$). The DeepSAD label-weighting parameter is kept at $\eta = 1$ and the regularization rate at $\lambda = 1\cdot 10^{-6}$ for all runs. Every configuration is evaluated with 5-fold cross-validation, and we report fold means.
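For reference, the fixed hyperparameters translate to the following configuration (the key names are illustrative; the values are those stated above).
\begin{verbatim}
config = {
    "ae_epochs": 50,        # autoencoder pretraining epochs (E_A)
    "ae_lr": 1e-5,          # pretraining learning rate (L_A), ADAM
    "deepsad_epochs": 150,  # DeepSAD training epochs (E_M)
    "deepsad_lr": 1e-5,     # DeepSAD learning rate (L_M), ADAM
    "eta": 1.0,             # label-weighting parameter
    "weight_decay": 1e-6,   # regularization rate (lambda)
    "num_folds": 5,         # cross-validation folds
}
\end{verbatim}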
We first search over the latent bottleneck size by pretraining autoencoders only. For both encoder backbones, we evaluate latent sizes $32, 64, 128, 256, 512, 768,$ and $1024$. The goal is to identify compact yet expressive representations and to compare the autoencoding performance between the two network architectures, LeNet-inspired and Efficient. Additionally, we are interested in finding possible correlations between the autoencoder performance and the DeepSAD anomaly detection performance.
Using the same latent sizes and backbones, we train full DeepSAD models initialized from the pretrained encoders. We study three supervision regimes, from unsupervised to strongly supervised (see Table~\ref{tab:labeling_regimes} for proportions within the training folds):
\begin{itemize}
@@ -772,7 +796,7 @@ Using the same latent sizes and backbones, we train full DeepSAD models initiali
\item \textbf{Low supervision:} $(50,10)$ labeled samples.
\item \textbf{High supervision:} $(500,100)$ labeled samples.
\end{itemize}
Percentages in Table~\ref{tab:labeling_regimes} are computed relative to the training split of each fold (80\% of the data) from the experiment-based labeling scheme. Importantly, for semi-supervised labels, we \emph{only} use hand-selected, unambiguous smoke intervals from the manually-defined evaluation scheme to avoid injecting mislabeled data into training.
\begin{table}
\centering
@@ -878,7 +902,7 @@ Pretraining runtimes for the autoencoders are reported in Table~\ref{tab:ae_pret
\end{tabularx}
\end{table}
The full DeepSAD training times are shown in Table~\ref{tab:train_runtimes_compact}, alongside the two classical baselines, Isolation Forest and OCSVM. Here, the contrast between methods is clear: while DeepSAD requires on the order of 15--20 minutes of GPU training per configuration and fold, both baselines complete training in seconds on CPU. The OCSVM training can only be this fast due to the reduced input dimensionality from utilizing DeepSAD's pretraining encoder as a preprocessing step, although other dimensionality reduction methods may also be used, which could require less computational resources for this step.
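The following minimal sketch illustrates this preprocessing idea with scikit-learn; the latent features below are stand-ins for the output of the frozen, pretrained encoder, and the OCSVM hyperparameters are placeholders rather than the values used in our experiments.
\begin{verbatim}
import numpy as np
from sklearn.svm import OneClassSVM

# Placeholder for latent features produced by the frozen, pretrained
# encoder (in our setup these would come from the DeepSAD autoencoder).
rng = np.random.default_rng(0)
latent_train = rng.normal(size=(5000, 32))   # (n_frames, latent_dim)
latent_test  = rng.normal(size=(1000, 32))

# Because the OCSVM only sees compact latent vectors instead of full
# LiDAR frames, fitting takes seconds on a CPU.
ocsvm = OneClassSVM(kernel="rbf", nu=0.1).fit(latent_train)

# decision_function is positive for inliers, so flip the sign to obtain
# "higher = more anomalous" scores.
anomaly_scores = -ocsvm.decision_function(latent_test)
\end{verbatim}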
\begin{table}
\centering
@@ -929,7 +953,7 @@ Together, these results provide a comprehensive overview of the computational re
\newchapter{results_discussion}{Results and Discussion}
In this chapter, we present the \rev{evaluation experiments, based on the experimental setup described in Chapter~\ref{chp:experimental_setup}}. We begin in Section~\ref{sec:results_pretraining} with the pretraining stage, where the two autoencoder architectures were trained across multiple latent space dimensionalities. These results provide insight into the representational capacity of each architecture. In Section~\ref{sec:results_deepsad}, we turn to the main experiments: training DeepSAD models and benchmarking them against baseline algorithms (Isolation Forest and OCSVM). Finally, in Section~\ref{sec:results_inference}, we present inference results on \rev{data} that were held out during training. These plots illustrate how the algorithms behave when applied sequentially to unseen \rev{data}, offering a more practical perspective on their potential for real-world rescue robotics applications.
% --- Section: Autoencoder Pretraining Results ---
\newsection{results_pretraining}{Autoencoder Pretraining Results}
@@ -972,7 +996,7 @@ Due to the challenges of ground truth quality, evaluation results must be interp
\item \textbf{Manually-defined labels:} A cleaner ground truth, containing only clearly degraded frames. This removes mislabeled intervals and allows nearly perfect separation. However, it also simplifies the task too much, because borderline cases are excluded.
\end{itemize}
Table~\ref{tab:results_ap} summarizes average precision (AP) across latent dimensions, labeling regimes, and methods. Under experiment-based evaluation, both DeepSAD variants consistently outperform the baselines, reaching AP values around 0.60--0.66 compared to 0.21 for \rev{the} Isolation Forest and 0.31--0.49 for OCSVM. Under manually-defined evaluation, DeepSAD achieves nearly perfect AP in all settings, while the baselines remain much lower. This contrast shows that the lower AP under experiment-based evaluation is not a weakness of DeepSAD itself, but a direct result of mislabeled samples in the evaluation data. Therefore, the manually-defined scheme confirms that DeepSAD separates clearly normal from clearly degraded frames very well, while also highlighting that label noise must be kept in mind when interpreting the experiment-based results.
\begin{table}
\centering
@@ -1022,53 +1046,53 @@ The precision--recall curves \rev{for experiment-based evaluation} (Figure~\ref{
Taken together, the two evaluation schemes provide complementary insights. The experiment-based labels offer a noisy but realistic setting that shows how methods cope with ambiguous data, while the manually-defined labels confirm that DeepSAD can achieve nearly perfect separation when the ground truth is clean. The combination of both evaluations makes clear that (i) DeepSAD is stronger than the baselines under both conditions, (ii) the apparent performance limits under experiment-based labels are mainly due to label noise, and (iii) interpreting results requires care, since performance drops in the curves often reflect mislabeled samples rather than model failures. At the same time, both schemes remain binary classifications and therefore cannot directly evaluate the central question of whether anomaly scores can serve as a continuous measure of degradation. For this reason, we extend the analysis in Section~\ref{sec:results_inference}, where inference on entire unseen experiments is used to provide a more intuitive demonstration of the methods' potential for quantifying \rev{LiDAR} degradation in practice.
\fig{prc_representative}{figures/results_prc.png}{Representative precision--recall curves (a)--(g) over all latent dimensionalities 32--1024 for semi-labeling regime 0/0 from experiment-based evaluation labels. DeepSAD maintains a large high-precision operating region before collapsing; OCSVM declines more smoothly but exhibits high standard deviation between folds; IsoForest collapses quickly and remains flat. DeepSAD's fall-off is at least partly due to known mislabeled evaluation targets.}
\FloatBarrier
\paragraph{Effect of latent space dimensionality.}
During autoencoder pretraining, we observed that reconstruction loss decreased monotonically with larger latent spaces, as expected: a bigger bottleneck allows the encoder--decoder to retain more information. If autoencoder performance were directly predictive of DeepSAD performance, we would therefore expect average precision to improve with larger latent dimensions. The actual results, however, show the opposite trend (Figure~\ref{fig:latent_dim_ap}): compact latent spaces (32--128) achieve the highest AP, while performance declines as the latent size grows. This inverse correlation is most clearly visible in the unsupervised case. Part of this effect can be attributed to evaluation label noise, which larger spaces amplify. More importantly, it shows that autoencoder performance does not translate directly into DeepSAD performance. Pretraining losses can still help compare different architectures for robustness and performance, but they cannot be used to tune the latent dimensionality: the dimensionality that minimizes reconstruction loss in pretraining is not necessarily the one that maximizes anomaly detection performance in DeepSAD.
% \paragraph{Effect of latent space dimensionality.}
% Figure~\ref{fig:latent_dim_ap} shows how average precision changes with latent dimension under the experiment-based evaluation. The best performance is reached with compact latent spaces (32--128), while performance drops as the latent dimension grows. This can be explained by how the latent space controls the separation between normal and anomalous samples. Small bottlenecks act as a form of regularization, keeping the representation compact and making it easier to distinguish clear anomalies from normal frames. Larger latent spaces increase model capacity, but this extra flexibility also allows more overlap between normal frames and the mislabeled anomalies from the evaluation data. As a result, the model struggles more to keep the two groups apart.
%
% This effect is clearly visible in the precision--recall curves. For DeepSAD at all dimensionalities we observe high initial precision and a steep drop once the evaluation demands that mislabeled anomalies be included. However, the sharpness of this drop depends on the latent size: at 32 dimensions the fall is comparably more gradual, while at 1024 it is almost vertical. In practice, this means that higher-dimensional latent spaces amplify the label-noise problem and lead to sudden precision collapses once the clear anomalies have been detected. Compact latent spaces are therefore more robust under noisy evaluation conditions and appear to be the safer choice for real-world deployment.
\figc{latent_dim_ap}{figures/results_ap_over_latent.png}{AP as a function of latent dimension (experiment-based evaluation). DeepSAD shows an inverse correlation between AP and latent space size.}{width=.7\textwidth}
\FloatBarrier
\paragraph{Effect of semi-supervised labeling.}
Table~\ref{tab:results_ap} shows that the unsupervised regime \((0/0)\) achieves the best AP, while the lightly supervised regime \((50/10)\) performs worst. With many labels \((500/100)\), performance improves again but remains slightly below the unsupervised case. This pattern also appears under the manually-defined evaluation, which excludes mislabeled frames. Consequently, the drop with light supervision cannot be explained by noisy evaluation targets, but must stem from the training process itself.
The precision--recall curves in Figure~\ref{fig:prc_over_semi} show that the overall curve shapes are similar across regimes, but shifted relative to one another in line with the AP ordering \((0/0) > (500/100) > (50/10)\). We attribute these shifts to overfitting: when only a few anomalies are labeled, the model fits them too strongly, and if those examples differ too much from other anomalies, generalization suffers. This explains why lightly supervised training performs even worse than unsupervised training, which avoids this bias.
\figc{prc_over_semi}{figures/results_prc_over_semi.png}{\rev{PRCs} from experiment-based evaluation for all three labeling regimes (unsupervised, lightly supervised, heavily supervised), shown separately for the LeNet-inspired (\rev{a}) and Efficient (\rev{b}) encoders. Baseline methods are included for comparison. Latent dimension~32 is shown as it achieved the best overall AP and is representative of the typical PRC shapes across dimensions.}{width=.7\textwidth}
The LeNet variant illustrates this effect most clearly, showing unusually high variance across folds in the lightly supervised case. In several folds, precision drops atypically early, which supports the idea that the model has overfit to a poorly chosen subset of labeled anomalies. The Efficient variant is less affected, maintaining more stable precision plateaus across nearly all latent dimensionalities, which suggests it is more robust to such overfitting.
With many labels \((500/100)\), the results become more stable again, and the PRC curves closely resemble the unsupervised case, only shifted slightly left. A larger and more diverse set of labeled anomalies reduces the risk of unlucky sampling and improves generalization, but it still cannot fully match the unsupervised regime, where no overfitting to a specific labeled subset occurs. The only exception is an outlier at latent dimension 512 for LeNet, where the curve again resembles the lightly supervised case, likely due to label sampling effects amplified by higher latent capacity.
In summary, three consistent patterns emerge: (i) a very small number of labels can hurt performance by causing overfitting to specific examples, (ii) many labels reduce this problem but still do not surpass unsupervised generalization, and (iii) encoder architecture strongly affects robustness, with \rev{the LeNet-inspired encoder} being more sensitive to unstable behavior than \rev{the Efficient encoder}.
% --- Section: Inference on Held-Out Experiments ---
\newsection{results_inference}{Inference on Held-Out Experiments}
In addition to the evaluation of PRC and AP obtained from $k$-fold cross-validation with varying hyperparameters, we also examine the behavior of the fully trained methods when applied to previously unseen, held-out experiments.
While the prior analysis provided valuable insights into the classification capabilities of the methods, it was limited by two factors: first, the binary ground-truth labels were of uneven quality due to the aforementioned mislabeling of frames, and second, the binary formulation does not reflect our overarching goal of quantifying sensor degradation on a continuous scale.
To provide a more intuitive understanding of how the methods might perform in real-world applications, we therefore present results from running inference sequentially on entire experiments.
These frame-by-frame time-axis plots simulate online inference and illustrate how anomaly scores evolve as data is captured, thereby serving as a candidate metric for quantifying the degree of \rev{LiDAR} degradation during operation.
%\fig{results_inference_normal_vs_degraded}{figures/results_inference_normal_vs_degraded.png}{Comparison of anomaly detection methods with statistical indicators across clean (dashed) and degraded (solid) experiments. Each subplot shows one method (DeepSAD--LeNet, DeepSAD--Efficient, OCSVM, Isolation Forest). Red curves denote how strongly the anomaly score deviates from clean-experiment baseline; blue and green curves denote the percentage of missing \rev{LiDAR} points and near-sensor particle hits, respectively. Latent Space Dimensionality was 32 and semi-supervised labeling regime was 0 normal and 0 anomalous samples during training.}
\fig{results_inference_normal_vs_degraded}{figures/results_inference_normal_vs_degraded.png}{Comparison of inference on unseen data for clean (dashed) vs. degraded (solid) experiments. Each subplot, (a)--(d), compares one method's anomaly score deviation from its clean baseline in red to statistical indicators in blue and green, which indicate the percentage of missing \rev{LiDAR} points and near-sensor particle hits, respectively. Latent dimension: 32; training regime: 0 normal, 0 anomalous samples. Smoothed with EMA $\alpha=0.1$.}
As discussed in Section~\ref{sec:setup_baselines_evaluation}, we apply $z$-score normalization to enable comparison of the different methods during inference: the mean and standard deviation are estimated exclusively from the clean experiment and then used to normalize the scores of the degraded experiment as well. After normalization, the resulting time series were still highly noisy, which motivated the application of exponential moving average (EMA) smoothing. EMA was chosen because it is causal (does not rely on future data) and thus suitable for real-time inference. Although it introduces a small time delay, this delay is shorter than for other smoothing techniques, such as running averages.
The plots in Figure~\ref{fig:results_inference_normal_vs_degraded} highlight important differences in how well the tested methods distinguish between normal and degraded sensor conditions. They show how strongly each method's scores deviate from its clean-data baseline and include statistical indicators (missing points and near-sensor particle hits) in blue and green.
Among the four approaches, the strongest separation is achieved by DeepSAD Efficient (b), followed by DeepSAD LeNet (a), then OCSVM (c). For Isolation Forest (d), the anomaly scores are already elevated in the clean experiment, which prevents reliable differentiation between normal and degraded runs and makes the method unsuitable in this context.
When comparing the methods to the statistical indicators, some similarities in shape may suggest that the methods partly capture these statistics, although such interpretations should be made with caution.
The anomaly detection models are expected to have learned additional patterns that are not directly observable from simple statistics, and these may also contribute to their ability to separate degraded from clean data.
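For reproducibility, the normalization and smoothing applied to these curves can be sketched as follows; variable names are chosen here, and only the EMA factor $\alpha=0.1$ matches the setting used for the plots.
\begin{verbatim}
import numpy as np

def zscore_to_clean_baseline(scores, clean_scores):
    # Mean and standard deviation are estimated ONLY on the clean
    # reference experiment and then applied to any other run.
    mu, sigma = clean_scores.mean(), clean_scores.std()
    return (scores - mu) / sigma

def ema(x, alpha=0.1):
    # Causal exponential moving average: y[t] = a*x[t] + (1-a)*y[t-1].
    y = np.empty(len(x), dtype=float)
    y[0] = x[0]
    for t in range(1, len(x)):
        y[t] = alpha * x[t] + (1.0 - alpha) * y[t - 1]
    return y

# smoothed = ema(zscore_to_clean_baseline(degraded_scores, clean_scores))
\end{verbatim}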
@@ -1081,20 +1105,20 @@ This thesis set out to answer the research question stated in Chapter~\ref{chp:i
\begin{quote}
Can autonomous robots quantify the reliability of \rev{LiDAR} sensor data in hazardous environments to make more informed decisions?
\end{quote}
Our results indicate a qualified “yes.” Using anomaly detection (AD)—in particular DeepSAD—we can obtain scores that (i) separate clearly normal from clearly degraded scans and (ii) track degradation trends over time on held-out experiments (see Sections~\ref{sec:results_deepsad} and \ref{sec:results_inference}). At the same time, the absence of robust ground truth limits how confidently we can assess \emph{continuous} quantification quality and complicates cross-method comparisons. The remainder of this chapter summarizes what we contributed, what we learned, and what is still missing.
\paragraph{Main contributions.}
\begin{itemize}
\item \textbf{Empirical comparison for \rev{LiDAR} degradation.} A systematic evaluation of DeepSAD against Isolation Forest and OCSVM across latent sizes and labeling regimes, showing that DeepSAD consistently outperforms the baselines under both evaluation schemes (Section~\ref{sec:results_deepsad}).
\item \textbf{Latent dimensionality insight.}
Autoencoder pretraining loss decreases with larger latent spaces, but DeepSAD performance shows the opposite trend: compact bottlenecks (32--128) achieve the highest \rev{average precision (AP)}. This contrast demonstrates that pretraining performance does not directly predict DeepSAD performance—latent dimensionality cannot be tuned via autoencoder loss alone, even though it remains useful for comparing architectures.
\item \textbf{Semi-supervision insight.} In our data, \emph{unsupervised} DeepSAD performed best; \emph{light} labeling (50/10) performed worst; \emph{many} labels (500/100) partially recovered performance but did not surpass \rev{the unsupervised approach}. Evidence from \rev{precision--recall curve (PRC)} shapes and fold variance points to \emph{training-side overfitting to a small labeled set}, an effect that persists even under clean manually-defined evaluation (Table~\ref{tab:results_ap}, Figure~\ref{fig:prc_over_semi}).
\item \textbf{Encoder architecture matters.} The Efficient encoder, \rev{specifically tailored to the application at hand,} outperformed the LeNet-inspired variant in pretraining and downstream AD, indicating that representation quality substantially affects DeepSAD performance (Section~\ref{sec:results_pretraining}, Section~\ref{sec:results_deepsad}).
\item \textbf{Temporal inference recipe.} For deployment-oriented analysis, we propose $z$-score normalization based on clean data and causal EMA smoothing to obtain interpretable time-series anomaly scores on full experiments (Section~\ref{sec:results_inference}).
\end{itemize}
\paragraph{Practical recommendations.}
@@ -1110,9 +1134,9 @@ We now turn to the main limiting factor that emerged throughout this work: the l
\newsection{conclusion_data}{Missing Ground Truth as an Obstacle}
The most significant obstacle identified in this work is the absence of a robust and comprehensive ground truth for \rev{LiDAR} degradation. As discussed in Chapter~\ref{chp:data_preprocessing}, it is not trivial to define what “degradation” precisely means in practice. Although error models for \rev{LiDAR} and theoretical descriptions of how airborne particles affect laser returns exist, these models typically quantify errors at the level of individual points (e.g., missing returns, spurious near-range hits). Such metrics, however, may not be sufficient to assess the impact of degraded data on downstream perception. For example, a point cloud with relatively few but highly localized errors—such as those caused by a dense smoke cloud—may cause a SLAM algorithm to misinterpret the region as a solid obstacle. In contrast, a point cloud with a greater number of dispersed errors might be easier to filter and thus cause little or no disruption in mapping. Consequently, the notion of “degradation” must extend beyond point-level error statistics to include how different error patterns propagate to downstream modules.
To our knowledge, no public datasets with explicit ground truth for \rev{LiDAR} degradation exist. Even if such data were collected, for example, with additional smoke sensors, it is unclear whether this would provide a usable ground truth. A smoke sensor measures only at a single point in space, while \rev{LiDAR} observes many points across the environment from a distance, so the two do not directly translate. In our dataset, we relied on the fact that clean and degraded experiments were clearly separated: data from degraded runs was collected only after artificial smoke had been released. However, the degree of degradation varied strongly within each run. Because the smoke originated from a single machine in the middle of the sensor platform's traversal path, early and late frames were often nearly as clear as those from clean experiments. This led to mislabeled frames at the run boundaries and limited the reliability of experiment-based evaluation. As shown in Section~\ref{sec:results_deepsad}, this effect capped achievable AP scores even for strong models. The underlying difficulty is not only label noise, but also the challenge of collecting labeled subsets that are representative of the full range of anomalies.
One promising direction is to evaluate degradation not directly on raw \rev{LiDAR} frames but via its downstream impact. For example, future work could assess degradation based on discrepancies between a previously mapped 3D environment and the output of a SLAM algorithm operating under degraded conditions. In such a setup, subjective labeling may still be required in special cases (e.g., dense smoke clouds treated as solid obstacles by SLAM), but it would anchor evaluation more closely to the ultimate users of the data.
@@ -1120,11 +1144,11 @@ Finally, the binary ground truth employed here is insufficient for the quantific
\newsection{conclusion_ad}{Insights into DeepSAD and AD for Degradation Quantification}
This work has shown that the DeepSAD principle is applicable to \rev{LiDAR} degradation in hazardous environments and yields promising detection performance as well as runtime feasibility (see Sections~\ref{sec:results_deepsad} and~\ref{sec:setup_experiments_environment}). Compared to simpler baselines such as Isolation Forest and OCSVM, DeepSAD achieved much stronger separation between clean and degraded data. While OCSVM showed smoother but weaker separation, and Isolation Forest produced high false positives even in clean runs, both DeepSAD variants maintained large high-precision regions before collapsing under mislabeled evaluation targets.
However, the semi-supervised component of DeepSAD did not improve results in our setting. In fact, adding a small number of labels often reduced performance due to overfitting to narrow subsets of anomalies. While larger labeled sets stabilized training, they still did not surpass the unsupervised regime (see Section~\ref{sec:results_deepsad}). This suggests that without representative and diverse labeled anomalies, unsupervised training remains the safer choice.
We also observed that the choices of encoder architecture and latent dimensionality are critical. The Efficient encoder consistently outperformed the LeNet-inspired baseline, producing more stable precision--recall curves and stronger overall results. Similarly, compact latent spaces (32--128 dimensions) yielded the best performance and proved more robust under noisy evaluation conditions, while larger latent spaces amplified the impact of mislabeled samples and caused sharper precision collapses. These findings underline the importance of representation design for robust anomaly detection.
Finally, inference experiments showed that DeepSAD's anomaly scores can track degradation trends over time when normalized and smoothed, suggesting potential for real-world quantification. Future work could explore per-sample weighting of semi-supervised targets, especially if analog ground truth becomes available, allowing DeepSAD to capture varying degrees of degradation as a graded rather than binary signal.
@@ -1146,7 +1170,7 @@ In summary, while this thesis demonstrates the feasibility of using anomaly dete
% **************************************************************************************************
\appendix
\ifthenelse{\equal{thesis}{thesis}}
{
\setcounter{mypageno}{\value{page}}
\frontmatter \pagestyle{plain} \pagenumbering{Roman}

View File

@@ -24,15 +24,12 @@
not used other than the declared sources/resources, and that I have
explicitly indicated all material which has been quoted either
literally or by content from the sources used.
The text document uploaded to TUGRAZonline is identical to the present \ThesisTitle.
\par\vspace*{4cm}
\centerline{
\begin{tabular}{m{1.5cm}cm{1.5cm}m{3cm}m{1.5cm}cm{1.5cm}}
\cline{1-3} \cline{5-7}
& date & & & & (signature) & \\
\end{tabular}}

View File

@@ -684,6 +684,73 @@ article{ef_concept_source,
year = {1986},
month = dec,
pages = {5668},
},
@article{roc_vs_prc2,
title = {Context discovery for anomaly detection},
volume = {19},
ISSN = {2364-4168},
url = {http://dx.doi.org/10.1007/s41060-024-00586-x},
DOI = {10.1007/s41060-024-00586-x},
number = {1},
journal = {International Journal of Data Science and Analytics},
publisher = {Springer Science and Business Media LLC},
author = {Calikus, Ece and Nowaczyk, Slawomir and Dikmen, Onur},
year = {2024},
month = jun,
pages = {99--113},
},
@article{roc_vs_prc,
title = {On the evaluation of unsupervised outlier detection: measures,
datasets, and an empirical study},
volume = {30},
ISSN = {1573-756X},
url = {http://dx.doi.org/10.1007/s10618-015-0444-8},
DOI = {10.1007/s10618-015-0444-8},
number = {4},
journal = {Data Mining and Knowledge Discovery},
publisher = {Springer Science and Business Media LLC},
author = {Campos, Guilherme O. and Zimek, Arthur and Sander, J\"{o}rg and
Campello, Ricardo J. G. B. and Micenková, Barbora and Schubert, Erich
and Assent, Ira and Houle, Michael E.},
year = {2016},
month = jan,
pages = {891--927},
},
@inproceedings{roc,
title = {Basic principles of ROC analysis},
author = {Metz, Charles E},
booktitle = {Seminars in nuclear medicine},
volume = {8},
number = {4},
pages = {283--298},
year = {1978},
organization = {Elsevier},
},
@article{prc,
title = {A critical investigation of recall and precision as measures of
retrieval system performance},
volume = {7},
ISSN = {1558-2868},
url = {http://dx.doi.org/10.1145/65943.65945},
DOI = {10.1145/65943.65945},
number = {3},
journal = {ACM Transactions on Information Systems},
publisher = {Association for Computing Machinery (ACM)},
author = {Raghavan, Vijay and Bollmann, Peter and Jung, Gwang S.},
year = {1989},
month = jul,
pages = {205--229},
},
@book{zscore,
title = {Advanced Engineering Mathematics},
author = {Kreyszig, Erwin and Stroud, K and Stephenson, G},
edition = {9},
publisher = {John Wiley \& Sons},
year = {2006},
}

[10 binary image files changed (filenames not shown); sizes before → after: 93→85 KiB, 95→88 KiB, 1.4→1.4 MiB, 220→211 KiB, 31→26 KiB, 45→37 KiB, 42→36 KiB, 732→718 KiB, 688→691 KiB, 380→365 KiB]

View File

@@ -0,0 +1,11 @@
-- drop-images.lua
-- Replaces all images (figures, graphics) with a short placeholder.
function Image(el) return pandoc.Str("[image omitted]") end
-- For LaTeX figures that are still raw
function RawBlock(el)
  if el.format:match("tex") and el.text:match("\\begin%s*{%s*figure%s*}") then
    return pandoc.Plain({pandoc.Str("[figure omitted]")})
  end
end
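-- Usage sketch (illustrative, not part of the build script):
--   pandoc -f latex -t plain --lua-filter=drop-images.lua Main.tex -o Main_plain.txt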

View File

@@ -0,0 +1,11 @@
-- drop-tables.lua
-- Removes LaTeX tabular and tabularx environments (and their contents).
function RawBlock(el)
  if el.format:match("tex") then
    -- Check for tabular or tabularx environment
    if el.text:match("\\begin%s*{%s*tabularx?%s*}") then
      return pandoc.Plain({pandoc.Str("[table omitted]")})
    end
  end
end

View File

@@ -0,0 +1,43 @@
-- keep-citations.lua
-- Replace citations with a placeholder and eat any preceding space.
local PH = "[citation]"
-- Pandoc-native citations (if the reader produced Cite nodes)
function Cite(el) return pandoc.Str(PH) end
-- Raw LaTeX \cite-like macros (when not parsed as Cite)
function RawInline(el)
if el.format and el.format:match("tex") and el.text:match("\\%a-cite%*?") then
return pandoc.Str(PH)
end
end
-- Remove a single leading Space before our placeholder
local function squash_spaces(inlines)
local out = {}
local i = 1
while i <= #inlines do
local cur = inlines[i]
local nxt = inlines[i + 1]
if cur and cur.t == "Space" and nxt and nxt.t == "Str" and nxt.text ==
PH then
table.insert(out, nxt)
i = i + 2
else
table.insert(out, cur)
i = i + 1
end
end
return out
end
function Para(el)
el.content = squash_spaces(el.content)
return el
end
function Plain(el)
el.content = squash_spaces(el.content)
return el
end

View File

@@ -0,0 +1,48 @@
-- math-omit.lua
-- Replace any math with a placeholder and ensure a space before it when appropriate.
local PH = "[math omitted]"
function Math(el)
-- Emit the placeholder as a Str; spacing is fixed in Para/Plain below.
return pandoc.Str(PH)
end
local function ensure_space_before_ph(inlines)
local out = {}
for i = 1, #inlines do
local cur = inlines[i]
if cur.t == "Str" and cur.text == PH then
local prev = out[#out]
local need_space = true
-- No space if it's the first token in the block
if not prev then
need_space = false
elseif prev.t == "Space" then
need_space = false
elseif prev.t == "Str" then
-- If previous char is an opening bracket/paren/slash/hyphen or whitespace, skip
local last = prev.text:sub(-1)
if last:match("[%(%[%{%/%-]") or last:match("%s") then
need_space = false
end
end
if need_space then table.insert(out, pandoc.Space()) end
table.insert(out, cur)
else
table.insert(out, cur)
end
end
return out
end
function Para(el)
el.content = ensure_space_before_ph(el.content)
return el
end
function Plain(el)
el.content = ensure_space_before_ph(el.content)
return el
end

View File

@@ -28,7 +28,10 @@
zathura
wmctrl
python312
pandoc
pandoc-lua-filters
];
filtersPath = "${pkgs.pandoc-lua-filters}/share/pandoc/filters";
in
{
devShell = pkgs.mkShell {
@@ -39,6 +42,28 @@
];
};
shellHook = ''
set -eu
# local folder in your repo to reference in commands
link_target="pandoc-filters"
# refresh symlink each time you enter the shell
ln -sfn ${filtersPath} "$link_target"
echo "Linked $link_target -> ${filtersPath}"
# (optional) write a defaults file that uses the relative symlink
if [ ! -f pandoc.defaults.yaml ]; then
cat > pandoc.defaults.yaml <<'YAML'
from: latex
to: plain
wrap: none
lua-filter:
- pandoc-filters/latex-hyphen.lua
- pandoc-filters/pandoc-quotes.lua
YAML
echo "Wrote pandoc.defaults.yaml"
fi
'';
} }
); );
} }
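With the symlink and defaults file created by this shellHook, a conversion can then be driven entirely from the defaults file; a minimal sketch, assuming the dev shell has been entered so that pandoc-filters/ and pandoc.defaults.yaml exist:
# use the generated defaults (latex -> plain, no wrapping, bundled lua filters)
pandoc --defaults=pandoc.defaults.yaml Main.tex -o Main.txt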

thesis/tex2plaintext.sh Executable file
View File

@@ -0,0 +1,61 @@
#!/usr/bin/env bash
set -euo pipefail
# Usage:
# ./tex2plaintext.sh [INPUT_TEX] [OUT_BASENAME]
#
# Defaults:
# INPUT_TEX = Main.tex (your original file name)
# OUT_BASENAME = thesis (produces thesis.txt, thesis_part1.txt, thesis_part2.txt)
INPUT_TEX="${1:-Main.tex}"
OUT_BASE="${2:-thesis}"
FLAT_TEX="flat.tex"
NO_TABLES_TEX="flat_notables.tex"
PLAIN_TXT="${OUT_BASE}.txt"
PART1_TXT="${OUT_BASE}_part1.txt"
PART2_TXT="${OUT_BASE}_part2.txt"
MARKER="Data and Preprocessing"
echo "[1/5] Flattening with latexpand -> ${FLAT_TEX}"
latexpand "${INPUT_TEX}" > "${FLAT_TEX}"
echo "[2/5] Removing tabular/tabularx environments -> ${NO_TABLES_TEX}"
# Replace entire tabular / tabularx environments with a placeholder
perl -0777 -pe 's/\\begin\{(tabularx?)\}.*?\\end\{\1\}/[table omitted]/gs' \
"${FLAT_TEX}" > "${NO_TABLES_TEX}"
echo "[3/5] Converting to plain text with pandoc -> ${PLAIN_TXT}"
pandoc -f latex -t plain --wrap=none \
--lua-filter=filters/keep-citations.lua \
--lua-filter=filters/math-omit.lua \
"${NO_TABLES_TEX}" -o "${PLAIN_TXT}"
echo "[4/5] Replacing [] placeholders with [figure]"
sed -i 's/\[\]/[figure]/g' "${PLAIN_TXT}"
echo "[5/5] Splitting ${PLAIN_TXT} before the marker line: \"${MARKER}\""
# Ensure the marker exists exactly on its own line
if ! grep -xq "${MARKER}" "${PLAIN_TXT}"; then
echo "ERROR: Marker line not found exactly as \"${MARKER}\" in ${PLAIN_TXT}."
echo " (It must be the only content on that line.)"
exit 1
fi
# Clean previous outputs if present
rm -f -- "${PART1_TXT}" "${PART2_TXT}"
# Split so the marker line becomes the FIRST line of part 2
awk -v marker="${MARKER}" -v out1="${PART1_TXT}" -v out2="${PART2_TXT}" '
BEGIN { current = out1 }
$0 == marker { current = out2; print $0 > current; next }
{ print $0 > current }
' "${PLAIN_TXT}"
echo "Done."
echo " - ${PLAIN_TXT}"
echo " - ${PART1_TXT}"
echo " - ${PART2_TXT}"

View File

@@ -1,9 +1,9 @@
 \addcontentsline{toc}{chapter}{Abstract}
 \begin{center}\Large\bfseries Abstract\end{center}\vspace*{1cm}\noindent
-Autonomous robots are increasingly used in search and rescue (SAR) missions. In these missions, lidar sensors are often the most important source of environmental data. However, lidar data can degrade under hazardous conditions, especially when airborne particles such as smoke or dust are present. This degradation can lead to errors in mapping and navigation and may endanger both the robot and humans. Therefore, robots need a way to estimate the reliability of their lidar data, so \rev{that} they can make better-informed decisions.
+Autonomous robots are increasingly used in search and rescue (SAR) missions. In these missions, LiDAR sensors are often the most important source of environmental data. However, LiDAR data can degrade under hazardous conditions, especially when airborne particles such as smoke or dust are present. This degradation can lead to errors in mapping and navigation and may endanger both the robot and humans. Therefore, robots need a way to estimate the reliability of their LiDAR data, so that they can make better-informed decisions.
 \bigskip
-This thesis investigates whether anomaly detection methods can be used to quantify lidar data degradation \rev{caused by airborne particles such as smoke and dust}. We apply a semi-supervised deep learning approach called DeepSAD, which produces an anomaly score for each lidar scan, serving as a measure of data reliability.
+This thesis investigates whether anomaly detection methods can be used to quantify LiDAR data degradation caused by airborne particles such as smoke and dust. We apply a semi-supervised deep learning approach called DeepSAD, which produces an anomaly score for each LiDAR scan, serving as a measure of data reliability.
 \bigskip
-We evaluate this method against baseline methods on a subterranean dataset that includes lidar scans degraded by artificial smoke. Our results show that DeepSAD consistently outperforms the baselines and can clearly distinguish degraded from normal scans. At the same time, we find that the limited availability of labeled data and the lack of robust ground truth remain major challenges. Despite these limitations, our work demonstrates that anomaly detection methods are a promising tool for lidar degradation quantification in SAR scenarios.
+We evaluate this method against baseline methods on a subterranean dataset that includes LiDAR scans degraded by artificial smoke. Our results show that DeepSAD consistently outperforms the baselines and can clearly distinguish degraded from normal scans. At the same time, we find that the limited availability of labeled data and the lack of robust ground truth remain major challenges. Despite these limitations, our work demonstrates that anomaly detection methods are a promising tool for LiDAR degradation quantification in SAR scenarios.

View File

@@ -1,6 +1,6 @@
 { pkgs, ... }:
 let
-  native_dependencies = with pkgs.python312Packages; [
+  native_dependencies = with pkgs.python311Packages; [
     torch-bin
     torchvision-bin
     aggdraw # for visualtorch
@@ -16,7 +16,7 @@ in
   packages = native_dependencies ++ tools;
   languages.python = {
     enable = true;
-    package = pkgs.python312;
+    package = pkgs.python311;
     uv = {
       enable = true;
       sync.enable = true;

View File

@@ -12,7 +12,7 @@ import numpy as np
 import polars as pl
 # CHANGE THIS IMPORT IF YOUR LOADER MODULE IS NAMED DIFFERENTLY
-from plot_scripts.load_results import load_pretraining_results_dataframe
+from load_results import load_pretraining_results_dataframe
 # ----------------------------
 # Config
@@ -78,8 +78,8 @@ def build_arch_curves_from_df(
         "overall": (dims, means, stds),
     } }
     """
-    if "split" not in df.columns:
-        raise ValueError("Expected 'split' column in AE dataframe.")
+    # if "split" not in df.columns:
+    #     raise ValueError("Expected 'split' column in AE dataframe.")
     if "scores" not in df.columns:
         raise ValueError("Expected 'scores' column in AE dataframe.")
     if "network" not in df.columns or "latent_dim" not in df.columns:
@@ -88,7 +88,7 @@ def build_arch_curves_from_df(
         raise ValueError(f"Expected '{label_field}' column in AE dataframe.")
     # Keep only test split
-    df = df.filter(pl.col("split") == "test")
+    # df = df.filter(pl.col("split") == "test")
     groups: dict[tuple[str, int], dict[str, list[float]]] = {}
@@ -201,7 +201,7 @@ def plot_multi_loss_curve(arch_results, title, output_path, colors=None):
     plt.xlabel("Latent Dimensionality")
     plt.ylabel("Test Loss")
-    plt.title(title)
+    # plt.title(title)
     plt.legend()
     plt.grid(True, alpha=0.3)
     plt.xticks(all_dims)

View File

@@ -171,28 +171,28 @@ def plot_combined_timeline(
         range(num_bins), near_sensor_binned, color=color, linestyle="--", alpha=0.6
     )
-    # Add vertical lines for manually labeled frames if available
-    if all_paths[i].with_suffix(".npy").name in manually_labeled_anomaly_frames:
-        begin_frame, end_frame = manually_labeled_anomaly_frames[
-            all_paths[i].with_suffix(".npy").name
-        ]
-        # Convert frame numbers to normalized timeline positions
-        begin_pos = (begin_frame / exp_len) * (num_bins - 1)
-        end_pos = (end_frame / exp_len) * (num_bins - 1)
-        # Add vertical lines with matching color and loose dotting
-        ax1.axvline(
-            x=begin_pos,
-            color=color,
-            linestyle=":",
-            alpha=0.6,
-        )
-        ax1.axvline(
-            x=end_pos,
-            color=color,
-            linestyle=":",
-            alpha=0.6,
-        )
+    # # Add vertical lines for manually labeled frames if available
+    # if all_paths[i].with_suffix(".npy").name in manually_labeled_anomaly_frames:
+    #     begin_frame, end_frame = manually_labeled_anomaly_frames[
+    #         all_paths[i].with_suffix(".npy").name
+    #     ]
+    #     # Convert frame numbers to normalized timeline positions
+    #     begin_pos = (begin_frame / exp_len) * (num_bins - 1)
+    #     end_pos = (end_frame / exp_len) * (num_bins - 1)
+    #     # Add vertical lines with matching color and loose dotting
+    #     ax1.axvline(
+    #         x=begin_pos,
+    #         color=color,
+    #         linestyle=":",
+    #         alpha=0.6,
+    #     )
+    #     ax1.axvline(
+    #         x=end_pos,
+    #         color=color,
+    #         linestyle=":",
+    #         alpha=0.6,
+    #     )
     # Customize axes
     ax1.set_xlabel("Normalized Timeline")
@@ -202,7 +202,7 @@ def plot_combined_timeline(
     ax1.set_ylabel("Missing Points (%)")
     ax2.set_ylabel("Points with <0.5m Range (%)")
-    plt.title(title)
+    # plt.title(title)
     # Create legends without fixed positions
     # First get all lines and labels for experiments
@@ -221,7 +221,8 @@
     )
     # Create single legend in top right corner with consistent margins
-    fig.legend(all_handles, all_labels, loc="upper right", borderaxespad=4.8)
+    # fig.legend(all_handles, all_labels, loc="upper right", borderaxespad=2.8)
+    fig.legend(all_handles, all_labels, bbox_to_anchor=(0.95, 0.99))
     plt.grid(True, alpha=0.3)

View File

@@ -122,8 +122,8 @@ def plot_data_points_pie(normal_experiment_frames, anomaly_experiment_frames):
     # prepare data for pie chart
     labels = [
-        "Normal Lidar Frames\nNon-Degraded Pointclouds",
-        "Anomalous Lidar Frames\nDegraded Pointclouds",
+        "Normal Lidar Frames\nNon-Degraded Point Clouds",
+        "Anomalous Lidar Frames\nDegraded Point Clouds",
     ]
     sizes = [total_normal_frames, total_anomaly_frames]
     explode = (0.1, 0)  # explode the normal slice
@@ -150,9 +150,9 @@ def plot_data_points_pie(normal_experiment_frames, anomaly_experiment_frames):
         va="center",
         color="black",
     )
-    plt.title(
-        "Distribution of Normal and Anomalous\nPointclouds in all Experiments (Lidar Frames)"
-    )
+    # plt.title(
+    #     "Distribution of Normal and Anomalous\nPointclouds in all Experiments (Lidar Frames)"
+    # )
     plt.tight_layout()
     # save the plot

View File

@@ -5,7 +5,6 @@ from pathlib import Path
 import matplotlib.pyplot as plt
 import numpy as np
-from pointcloudset import Dataset
 # define data path containing the bag files
 all_data_path = Path("/home/fedex/mt/data/subter")
@@ -82,7 +81,7 @@ def plot_data_points(normal_experiment_paths, anomaly_experiment_paths, title):
     plt.figure(figsize=(10, 5))
     plt.hist(missing_points_normal, bins=100, alpha=0.5, label="Normal Experiments")
     plt.hist(missing_points_anomaly, bins=100, alpha=0.5, label="Anomaly Experiments")
-    plt.title(title)
+    # plt.title(title)
     plt.xlabel("Number of Missing Points")
     plt.ylabel("Number of Pointclouds")
     plt.legend()
@@ -109,7 +108,7 @@ def plot_data_points(normal_experiment_paths, anomaly_experiment_paths, title):
         label="Anomaly Experiments",
         orientation="horizontal",
     )
-    plt.title(title)
+    # plt.title(title)
     plt.xlabel("Number of Pointclouds")
     plt.ylabel("Number of Missing Points")
     plt.legend()
@@ -142,7 +141,7 @@ def plot_data_points(normal_experiment_paths, anomaly_experiment_paths, title):
         label="Anomaly Experiments",
         density=True,
     )
-    plt.title(title)
+    # plt.title(title)
     plt.xlabel("Number of Missing Points")
     plt.ylabel("Density")
     plt.legend()
@@ -169,7 +168,7 @@ def plot_data_points(normal_experiment_paths, anomaly_experiment_paths, title):
         label="Anomaly Experiments (With Artifical Smoke)",
         density=True,
     )
-    plt.title(title)
+    # plt.title(title)
     plt.xlabel("Percentage of Missing Lidar Measurements")
     plt.ylabel("Density")
     # display the x axis as percentages
@@ -210,7 +209,7 @@ def plot_data_points(normal_experiment_paths, anomaly_experiment_paths, title):
         alpha=0.5,
         label="Anomaly Experiments",
     )
-    plt.title(title)
+    # plt.title(title)
     plt.xlabel("Number of Missing Points")
     plt.ylabel("Normalized Density")
     plt.legend()

View File

@@ -5,7 +5,6 @@ from pathlib import Path
 import matplotlib.pyplot as plt
 import numpy as np
-from pointcloudset import Dataset
 # define data path containing the bag files
 all_data_path = Path("/home/fedex/mt/data/subter")
@@ -164,7 +163,7 @@ def plot_data_points(normal_experiment_paths, anomaly_experiment_paths, title):
     plt.gca().set_yticklabels(
         ["{:.0f}%".format(y * 100) for y in plt.gca().get_yticks()]
     )
-    plt.title("Particles Closer than 0.5m to the Sensor")
+    # plt.title("Particles Closer than 0.5m to the Sensor")
     plt.ylabel("Percentage of measurements closer than 0.5m")
     plt.tight_layout()
     plt.savefig(output_datetime_path / f"particles_near_sensor_boxplot_{rt}.png")
@@ -186,7 +185,7 @@ def plot_data_points(normal_experiment_paths, anomaly_experiment_paths, title):
     plt.gca().set_yticklabels(
         ["{:.0f}%".format(y * 100) for y in plt.gca().get_yticks()]
     )
-    plt.title("Particles Closer than 0.5m to the Sensor")
+    # plt.title("Particles Closer than 0.5m to the Sensor")
     plt.ylabel("Percentage of measurements closer than 0.5m")
     plt.ylim(0, 0.05)
     plt.tight_layout()

View File

@@ -112,18 +112,27 @@ cmap = get_colormap_with_special_missing_color(
     args.colormap, args.missing_data_color, args.reverse_colormap
 )
-# --- Create a figure with 2 vertical subplots ---
+# --- Create a figure with 2 vertical subplots and move titles to the left ---
 fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(10, 5))
-for ax, frame, title in zip(
+# leave extra left margin for the left-side labels
+fig.subplots_adjust(left=0.14, hspace=0.05)
+for ax, frame, label in zip(
     (ax1, ax2),
     (frame1, frame2),
-    (
-        "Projection of Lidar Frame without Degradation",
-        "Projection of Lidar Frame with Degradation (Artifical Smoke)",
-    ),
+    ("(a)", "(b)"),
 ):
     im = ax.imshow(frame, cmap=cmap, aspect="auto", vmin=global_vmin, vmax=global_vmax)
-    ax.set_title(title)
+    # place the "title" to the left, vertically centered relative to the axes
+    ax.text(
+        -0.02,  # negative x places text left of the axes (in axes coordinates)
+        0.5,
+        label,
+        transform=ax.transAxes,
+        va="center",
+        ha="right",
+        fontsize=12,
+    )
     ax.axis("off")
 # Adjust layout to fit margins for a paper
View File

@@ -260,11 +260,11 @@ def baseline_transform(clean: np.ndarray, other: np.ndarray, mode: str):
 def pick_method_series(gdf: pl.DataFrame, label: str) -> Optional[np.ndarray]:
-    if label == "DeepSAD (LeNet)":
+    if label == "DeepSAD LeNet":
         sel = gdf.filter(
             (pl.col("network") == "subter_LeNet") & (pl.col("model") == "deepsad")
         )
-    elif label == "DeepSAD (efficient)":
+    elif label == "DeepSAD Efficient":
         sel = gdf.filter(
             (pl.col("network") == "subter_efficient") & (pl.col("model") == "deepsad")
         )
@@ -311,8 +311,8 @@ def compare_two_experiments_progress(
     include_stats: bool = True,
 ):
     methods = [
-        "DeepSAD (LeNet)",
-        "DeepSAD (efficient)",
+        "DeepSAD LeNet",
+        "DeepSAD Efficient",
         "OCSVM",
         "Isolation Forest",
     ]
@@ -392,8 +392,8 @@
     axes = axes.ravel()
     method_to_axidx = {
-        "DeepSAD (LeNet)": 0,
-        "DeepSAD (efficient)": 1,
+        "DeepSAD LeNet": 0,
+        "DeepSAD Efficient": 1,
         "OCSVM": 2,
         "Isolation Forest": 3,
     }
@@ -404,6 +404,8 @@
     if not stats_available:
         print("[WARN] One or both stats missing. Subplots will include methods only.")
+    letters = ["a", "b", "c", "d"]
     for label, axidx in method_to_axidx.items():
         ax = axes[axidx]
         yc = curves_clean.get(label)
@@ -412,7 +414,7 @@
             ax.text(
                 0.5, 0.5, "No data", ha="center", va="center", transform=ax.transAxes
             )
-            ax.set_title(label)
+            ax.set_title(f"({letters[axidx]}) {label}")
             ax.grid(True, alpha=0.3)
             continue
@@ -435,6 +437,7 @@
         )
         ax.set_ylabel(y_label)
         ax.set_title(label)
+        ax.set_title(f"({letters[axidx]}) {label}")
         ax.grid(True, alpha=0.3)
     # Right axis #1 (closest to plot): Missing points (%)
@@ -550,11 +553,11 @@ def compare_two_experiments_progress(
     for ax in axes:
         ax.set_xlabel("Progress through experiment (%)")
-    fig.suptitle(
-        f"AD Method vs Stats Inference — progress-normalized\n"
-        f"Transform: z-score normalized to non-degraded experiment | EMA(α={EMA_ALPHA_METHODS})",
-        fontsize=14,
-    )
+    # fig.suptitle(
+    #     f"AD Method vs Stats Inference — progress-normalized\n"
+    #     f"Transform: z-score normalized to non-degraded experiment | EMA(α={EMA_ALPHA_METHODS})",
+    #     fontsize=14,
+    # )
     fig.tight_layout(rect=[0, 0, 1, 0.99])
     out_name = (

View File

@@ -161,7 +161,7 @@ def _ensure_dim_axes(fig_title: str):
     fig, axes = plt.subplots(
         nrows=4, ncols=2, figsize=(12, 16), constrained_layout=True
     )
-    fig.suptitle(fig_title, fontsize=14)
+    # fig.suptitle(fig_title, fontsize=14)
     axes = axes.ravel()
     return fig, axes
@@ -213,11 +213,13 @@ def plot_grid_from_df(
     legend_labels = []
     have_legend = False
+    letters = ["a", "b", "c", "d", "e", "f", "g", "h"]
     for i, dim in enumerate(LATENT_DIMS):
         if i >= 7:
             break  # last slot reserved for legend
         ax = axes[i]
-        ax.set_title(f"Latent Dim. = {dim}")
+        ax.set_title(f"({letters[i]}) Latent Dim. = {dim}")
         ax.grid(True, alpha=0.3)
         if kind == "roc":

View File

@@ -260,9 +260,9 @@ def make_figures_for_dim(
     fig_roc, axes = plt.subplots(
         nrows=2, ncols=1, figsize=(7, 10), constrained_layout=True
     )
-    fig_roc.suptitle(
-        f"ROC — {EVALS_LABELS[eval_type]} — Latent Dim.={latent_dim}", fontsize=14
-    )
+    # fig_roc.suptitle(
+    #     f"ROC — {EVALS_LABELS[eval_type]} — Latent Dim.={latent_dim}", fontsize=14
+    # )
     _plot_panel(
         axes[0],
@@ -272,7 +272,7 @@
         latent_dim=latent_dim,
         kind="roc",
     )
-    axes[0].set_title("DeepSAD (LeNet) + Baselines")
+    axes[0].set_title("(a) DeepSAD (LeNet) + Baselines")
     _plot_panel(
         axes[1],
@@ -282,7 +282,7 @@
         latent_dim=latent_dim,
         kind="roc",
     )
-    axes[1].set_title("DeepSAD (Efficient) + Baselines")
+    axes[1].set_title("(b) DeepSAD (Efficient) + Baselines")
     out_roc = out_dir / f"roc_{latent_dim}_{eval_type}.png"
     fig_roc.savefig(out_roc, dpi=150, bbox_inches="tight")
@@ -292,9 +292,9 @@
     fig_prc, axes = plt.subplots(
         nrows=2, ncols=1, figsize=(7, 10), constrained_layout=True
     )
-    fig_prc.suptitle(
-        f"PRC — {EVALS_LABELS[eval_type]} — Latent Dim.={latent_dim}", fontsize=14
-    )
+    # fig_prc.suptitle(
+    #     f"PRC — {EVALS_LABELS[eval_type]} — Latent Dim.={latent_dim}", fontsize=14
+    # )
     _plot_panel(
         axes[0],
@@ -304,7 +304,7 @@
         latent_dim=latent_dim,
         kind="prc",
     )
-    axes[0].set_title("DeepSAD (LeNet) + Baselines")
+    axes[0].set_title("(a)")
     _plot_panel(
         axes[1],
@@ -314,7 +314,7 @@
         latent_dim=latent_dim,
         kind="prc",
    )
-    axes[1].set_title("DeepSAD (Efficient) + Baselines")
+    axes[1].set_title("(b)")
     out_prc = out_dir / f"prc_{latent_dim}_{eval_type}.png"
     fig_prc.savefig(out_prc, dpi=150, bbox_inches="tight")

View File

@@ -6,6 +6,7 @@ readme = "README.md"
 requires-python = ">=3.11.9"
 dependencies = [
     "pandas>=2.3.2",
+    "pointcloudset>=0.11.0",
     "polars>=1.33.0",
     "pyarrow>=21.0.0",
     "tabulate>=0.9.0",

tools/uv.lock generated

File diff suppressed because it is too large