% **************************************************************************************************
% ** SPSC Report and Thesis Template
% **************************************************************************************************
%
% ***** Authors *****
% Daniel Arnitz, Paul Meissner, Stefan Petrik, Dietmar Malli, Johanna Rock
% Signal Processing and Speech Communication Laboratory (SPSC)
% Graz University of Technology (TU Graz), Austria
%
% ***** Changelog *****
% 0.1 2010-01-25 extracted from report template by Daniel Arnitz (not ready yet)
% 0.2 2010-02-08 added thesis titlepage and modified layout (not ready yet)
% 0.3 2010-02-18 added TUG logo and statutory declaration
% 0.4 2010-02-18 moved the information fields below \input{./base/packages} (encoding...)
% 0.5 2010-03-02 added \ShortTitle to fix problems with long thesis titles
%                     added \ThesisType (makes the template suitable for MSc, BSc, PhD, ... Thesis)
% 0.6 2010-06-05 added pagestyle and pagenumbering after frontmatter, packages has now type
% 0.7 2010-09 \Advisors -> \Assessors, inserted frontmatter for thesis
% 0.8 2010-11 added examples
% 0.9 2011-04 \Twosided now {true,false}, scrbook for thesis (\front-, \main-, \backmatter)
%                     added \SpecialNote for titlepage (funding, etc.), added type "homework"
% 0.10 2011-10-18 fixed two typos in \bibliographystyle{} (bug reported by Michael Tauch)
% 0.11 2011-11-09 fixed/modified preamble (bug reported by Michael Tauch)
% 0.12 2012-07-20 added ./base/opt_macros to deal with optional macros
% 0.13 2012-07-27 added \PaperSize
% 0.14 2017-11-03 Fixed thispagestyle issue
%                     Implemented automatic setting of correct page number after switching from
%                     roman numbering back to normal numbering
%                     Implemented \DraftText hack
%                     Moved makeindex from external program to newer stuff (package...)
%                     Made confidential dependent on \DraftText
%                     Made OptDraftMode and DisplayContentBoxes dependent on \DraftText
%                     Included some syntax formatting definitions
%                     Fixed wrong usage of scrbook class and \emptydoublepage mess... One should
%                     NOT need to adjust/tweak the layout by hand. That's what latex is for...
%                     Replaced bibtex with more modern biblatex (utf8 support in bibliography...)
%                     Added \printbibliography, \listoffigures, \listoftables and
%                     \printglossary[type=\acronymtype]
%                     Renewed and extended Introduction/Usage
% 0.15 2018-03-20 Homework and report now compileable again. Fixed a missing if.
% 0.16 2018-08-08 fixed/modified title according to official TUG template
% 0.17 2018-08-09 updated placeholder commands for new title page
%
% ***** Todo *****
%
% **************************************************************************************************
% basic setup
\newcommand{\DocumentType}{thesis} % "thesis" / "report" / "homework"
\newcommand{\DocumentLanguage}{en} % "en" / "de"
\newcommand{\PaperSize}{a4paper} % "a4paper" / "letterpaper"
\newcommand{\Twosided}{true} % "true" / "false" (=Duplex...)
\newcommand{\FramedLinks}{false} %"true" / "false"
% **************************************************************************************************
% template setup -- do not change these unless you know what you are doing!
\input{./base/documentclass_\DocumentType}
\input{./base/packages}
\input{./base/layout_\DocumentType}
\input{./base/macros}
% **************************************************************************************************
% uncomment to get watermarks:
% \usepackage[first,bottom,light,draft]{draftcopy}
% \draftcopyName{ENTWURF}{160}
\usepackage{xcolor}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage[colorinlistoftodos]{todonotes}
%\usepackage[disable]{todonotes}
\usepackage{makecell}
\DeclareRobustCommand{\threadtodo}[4]{%
\todo[inline,
% \todo[disable,
backgroundcolor=red!20,
bordercolor=red!50,
textcolor=black!80,
size=\small,
caption={Common Thread Note}]{%
\textbf{Goal:} #1 \newline
\textbf{Context:} #2 \newline
\textbf{Method:} #3 \newline
\textbf{Transition:} #4
}%
}
\DeclareRobustCommand{\sensorcell}[2]{%
\makecell[l]{#1 \\ \emph{#2}}
}
% correct bad hyphenation
\hyphenation{}
% switches
\newboolean{OptDraftMode}
\newboolean{DisplayContentBoxes}
% \setboolean{OptDraftMode}{true} % optional draft mode for pixel graphics (speed up generation; add \OptDraft to options)
\ifthenelse{\boolean{OptDraftMode}}
{
\setboolean{DisplayContentBoxes}{true}
}
{
\setboolean{DisplayContentBoxes}{false}
}
% **************************************************************************************************
% information fields
% general
\newcommand{\DocumentTitle}{Lidar Degradation Quantification for Robot Navigation in Hazy Environments}
%\newcommand{\DocumentSubtitle}{}
\newcommand{\ShortTitle}{} % used in headers (keep short!)
% for thesis: Firstname Surname, current university degree (e.g. BSc)
% for report, homework: Firstname Surname, Mat.Nr.
\newcommand{\DocumentAuthor}{Jan Kowalczyk}
\newcommand{\DocumentPlace}{Graz}
% FOR THESIS ONLY
% used for the title page and statutory declaration
% one out of: "bachelor's thesis" / "Bachelorarbeit" /
% "master's thesis" / "Masterarbeit" /
% "diploma thesis" / "Diplomarbeit" /
% "doctoral thesis" / "Dissertation"
% ATTENTION: use correct language! Otherwise statutory declaration is faulty.
\newcommand{\ThesisTitle}{master's thesis}
\newcommand{\Institute}{Signal Processing and Speech Communication Laboratory}
\newcommand{\OrganizationsAdditional}{in cooperation with \\[0.2cm] \par Virtual Vehicle Research GmbH \\ Graz, Austria \\[2.0cm] \par}
\newcommand{\Supervisors}{Univ.-Prof. Dipl.-Ing. Dr.mont Franz Pernkopf} % Supervisor 1 \\ Supervisor 2 ...
\newcommand{\SpecialNote}{}
% FOR REPORT ONLY
%revision numbers
\newcommand{\RevPrefix}{alpha~}
\newcommand{\RevLarge}{1}
\newcommand{\RevSmall}{0}
% confidential? (can of course also be used for other messages/notes)
\newcommand{\ConfidNote}{\ifthenelse{\boolean{OptDraftMode}}{
\textbf{DRAFT}, \today
}{
%\textbf{CONFIDENTIAL}
}}
\DeclareMathAlphabet\mathbfcal{OMS}{cmsy}{b}{n}
\newcommand*\wc{{\mkern 2mu\cdot\mkern 2mu}}
\input{./base/opt_macros}
% variable for page numbering
\newcounter{mypageno}
% **************************************************************************************************
\begin{document}
% **************************************************************************************************
\input{./base/syntax_formatting}
% for thesis: switch to frontmatter (Roman numbering, etc.)
\ifthenelse{\equal{\DocumentType}{thesis}}
{
\frontmatter \pagestyle{plain} \pagenumbering{Roman}
}{}
%title
\input{./base/titlepage_\DocumentType}
% for thesis: abstract, kurzfassung, affidavit and statutory declaration
\ifthenelse{\equal{\DocumentType}{thesis}}
{
\emptydoublepage
\addcontentsline{toc}{chapter}{Statutory Declaration}
\input{./base/declaration_\DocumentLanguage}
\emptydoublepage
\input{thesis_preamble/acknowledgements}
\emptydoublepage
\input{thesis_preamble/abstract}
\emptydoublepage
\input{thesis_preamble/kurzfassung}
\emptydoublepage
}{}
\tableofcontents
\ifthenelse{\equal{\DocumentType}{thesis}}
{
\emptydoublepage
\setcounter{mypageno}{\value{page}}
\mainmatter \pagestyle{scrheadings} \pagenumbering{arabic}
\setcounter{page}{\value{mypageno}}
}{}
% **************************************************************************************************
% mainmatter (=content)
\newchapter{introduction}{Introduction}
%\todo[inline, color=green!40]{its a master thesis where we try to know how trustworthy the sensor data for robot navigation is}
%\newsection{Motivation and Problem Statement}{motivation}
%\todo[inline]{lidar and its role in robot navigation. discuss sensor degradation and its effects on navigation.}
\threadtodo
{\textit{"What should the reader know after reading this section?"}}
{\textit{"Why is that of interest to the reader at this point?"}}
{\textit{"How am I achieving the stated goal?"}}
{\textit{"How does this lead to the next question or section?"}}
\threadtodo
{Create interest in topic, introduce main goal of thesis, summarize results}
{Reader only has abstract as context, need to create interest at beginning}
{emotional rescue missions, explain why data may be bad, state core research question}
{what has and hasn't been done $\rightarrow$ Scope of Research}
Autonomous robots have become increasingly prevalent in search and rescue (SAR) missions because they can take on difficult tasks without endangering additional human lives: navigating hazardous environments such as collapsed structures, identifying and locating victims, and assessing how safe the environment is for human rescue teams. To understand their environment, robots employ multiple sensor systems such as lidar, radar, time-of-flight (ToF) cameras, ultrasound, optical cameras, or infrared cameras, of which lidar is the most prominently used due to its accuracy. The robots use the sensors' data to map their environment, navigate their surroundings, and make decisions such as which paths to prioritize. Many of the aforementioned algorithms are deep learning-based and are trained on large amounts of data whose characteristics the models learn.
Search and rescue environments provide challenging conditions for sensor systems to produce reliable data. One of the most prominent examples are aerosol particles from smoke and dust, which can obstruct the view and lead sensors to produce erroneous data. If such degraded data was not present in the training data of the robots' algorithms, these errors may lead to unexpected outputs and potentially endanger the robot or even the human rescue targets. This is especially important for autonomous robots, whose decisions are based entirely on their sensor data without any human intervention. To safeguard against these problems, robots need a way to assess the trustworthiness of their sensor systems' data.
For remote-controlled robots a human operator can make these decisions, but many search and rescue missions do not allow remote control due to environmental factors such as radio signal attenuation or the size of the search area, and therefore demand autonomous robots. When designing such robots, we therefore arrive at the following critical question:
\begin{quote} Can autonomous robots quantify the reliability of lidar sensor data in hazardous environments to make more informed decisions? \end{quote}
In this thesis we aim to answer this question by assessing a deep learning-based anomaly detection method and its performance in quantifying the degradation of sensor data. The employed algorithm is a semi-supervised anomaly detection method which uses manually labeled training data to improve its performance over unsupervised methods. We show how much the introduction of these labeled samples improves the method's performance. The model's output is an anomaly score which quantifies the data's reliability and can be used by algorithms that rely on the sensor data. These downstream algorithms may decide, for example, to slow down the robot to collect more data, choose alternative routes, signal for help, or rely more heavily on other sensors' input data.
\todo[inline]{discuss results (we showed X)}
%\todo[inline, color=green!40]{autonomous robots have many sensors for understanding the world around them, especially visual sensors (lidar, radar, ToF, ultrasound, optical cameras, infrared cameras), they use that data for navigation mapping, SLAM algorithms, and decision making. these are often deep learning algorithms, oftentimes only trained on good data}
%\todo[inline, color=green!40]{difficult environments for sensors to produce good data quality (earthquakes, rescue robots), produced data may be unreliable, we don't know how trustworthy that data is (no quantification, confidence), since all navigation and decision making is based on input data, this makes the whole pipeline untrustworthy/problematic}
%\todo[inline, color=green!40]{contribution/idea of this thesis is to calculate a confidence score which describes how trustworthy input data is. algorithms further down the pipeline (slam, navigation, decision) can use this to make more informed decisions - examples: collect more data by reducing speed, find alternative routes, signal for help, do not attempt navigation, more heavily weight input from other sensors}
\newsection{scope_research}{Scope of Research}
\threadtodo
{clearly state what has and hasn't been researched + explanation why}
{from intro its clear what thesis wants to achieve, now we explain how we do that}
{state limit on data domain, sensors, output of method + reasoning for decisions}
{clear what we want to achieve $\rightarrow$ how is thesis structured to show this work}
%\todo[inline]{output is score, thresholding (yes/no), maybe confidence in sensor/data? NOT how this score is used in navigation/other decisions further down the line}
%\todo[inline]{Sensor degradation due to dust/smoke not rain/fog/...}
%\todo[inline, color=green!40]{we look at domain of rescue robots which save buried people after earthquakes, or in dangerous conditions (after fires, collapsed buildings) which means we are mostly working with indoors or subterranean environments which oftentimes are polluted by smoke and a lot of dust, ideally works for any kind of sensor data degradation but we only explore this domain}
%In this thesis we limit the domain of our research to that of autonomous rescue robots and their unique challenges. The degradation of sensor data in this domain appears to mainly stem from airborne particles, which we evaluate. Other kinds of degradation like from adverse weather effects, specific material properties, irrationally moving structures like leaves from trees and others are out of scope of our research due to the low likelihood of them occuring in the rescue scenarios of autonomous robots. While our approach does not specifically exclude these types of degradation and actually a case can be built for the employed method allowing for quantifying any of these and more kinds of degradation, we do not explicitely look at them or evaluate any of them other than airborne particles.
In this thesis, we focus our research on the unique challenges faced by autonomous rescue robots, specifically the degradation of sensor data caused by airborne particles. While degradation in sensor data can also arise from adverse weather, material properties, or dynamic elements such as moving leaves, these factors are considered less relevant to the rescue scenarios targeted by our study and are therefore excluded. Although our method is versatile enough to quantify various types of degradation, our evaluation is limited to degradation from airborne particles, as this is the most prevalent issue in the operational environments of autonomous rescue robots.
%\todo[inline, color=green!40]{mostly use lidar (state of the art) since they are very accurate in 3d mapping environments, so we focus on quantifying how trustworthy the lidar data is by itself. we do not look at other sensor data (tof, ultrasound, optical)}
%While computer vision systems of robots oftentimes include a multitude of sensor types like time of flight cameras, IR cameras, ultrasound sensors and others, we found that for autonomous robots in rescue missions mapping and navigation challenges are so hard that they require and mostly rely on lidar sensor data which is very accurate, high resolution and allow for mapping the whole surroundings due to their high field of view which oftentimes contains the whole 360° horizontal fov and a quite large vertical fov as well. additionally the cost of these lidar sensors has plummeted over the last decades due to their use in autonomous driving, drones and robotics as well as advancements in manufacturing like the utilisation of microeletromechanical systems which makes their proliferation for these types of vision problems near universal. for these reasons we limit our research to data produced by lidar sensors, namely the pointclouds of measurements in a coordinate system they produce. oftentimes sensor data is fused to achieve better accuracy and higher confidence in data but firstly examining these sensor fusion data would most likely increase the initial complexity of the research too much and secondly it would limit our research to platforms which utilise all the sensors we included instead of being able to quantify the sensor degradation by the lidar data itself.
While robotic computer vision systems often incorporate a variety of sensors—such as time-of-flight cameras, infrared cameras, and ultrasound sensors—we found that autonomous rescue robots primarily depend on LiDAR data for mapping and navigation. LiDAR sensors offer high accuracy, high resolution, and an extensive field of view (often a full 360° horizontally and a substantial vertical coverage), which are essential for constructing comprehensive environmental maps in challenging scenarios. Furthermore, the cost of LiDAR sensors has decreased significantly in recent decades, driven by their widespread adoption in autonomous driving, drones, and robotics, as well as manufacturing advancements like microelectromechanical systems (MEMS). For these reasons, our research is focused exclusively on LiDAR sensor data—specifically, the point clouds generated within a defined coordinate system. Although sensor fusion techniques are commonly used to enhance data accuracy and confidence, incorporating fused data would not only add significant complexity to our study but also limit our analysis to platforms equipped with all the sensor types involved. Consequently, we concentrate on quantifying sensor degradation solely through LiDAR data.
%\todo[inline, color=green!40]{intended output is confidence score which simply means higher score = worse data quality, lower score = trustworthy data. this score can be interpreted by algorithms in pipeline. we do not look at how this is implemented in the algorithms, no binary classifier but analog value, if this is wished followup algorithm has to decide (example by threshold or other methods)}
%The output of the method utilized by us and in our experiments is an analog score which relates to the confidence in the data and inversely to the data degradation. We do not look at how such a score might be utilized but can see many applications like simple thresholding and depending on the outcome deciding not to proceed in a certain direction or a direct usage by for example tying the robots speed to its confidence in the data, if necessary slowing down and collecting more data before progressing. The output score is independent of the time dimension and just a snapshot of each lidar scans degradation. In reality many lidars produce multiple scans per second, which would allow for including the time series of data and the scores they produce into an analysis as well such as for example a running average of the score or more complex statistical analysis. We do not investigate the differences between lidar scans which were taken with a small time delta, only at single snapshots of data in time.
The method we employ produces an analog score that reflects the confidence in the sensor data, with lower confidence indicating higher degradation. Although we do not investigate the direct applications of this score, potential uses include simple thresholding to decide whether to proceed with a given action as well as dynamically adjusting the robot's speed based on data quality to collect additional data when confidence is low. Importantly, this output score is a snapshot for each LiDAR scan and does not incorporate temporal information. While many LiDAR sensors capture multiple scans per second—enabling the possibility of time-series analyses such as running averages or more advanced statistical evaluations—we focus solely on individual scans without examining the differences between successive scans.
\newsection{thesis_structure}{Structure of the Thesis}
\threadtodo
{explain how structure will guide reader from zero knowledge to answer of research question}
{since reader knows what we want to show, an outlook over content is a nice transition}
{state structure of thesis and explain why specific background is necessary for next section}
{reader knows what to expect $\rightarrow$ necessary background info and related work}
\todo[inline]{brief overview of thesis structure}
\todo[inline, color=green!40]{in section x we discuss anomaly detection, semi-supervised learning since such an algorithm was used as the chosen method, we also discuss how lidar works and the data it produces. then in we discuss in detail the chosen method Deep SAD in section X, in section 4 we discuss the traing and evaluation data, in sec 5 we describe our setup for training and evaluation (whole pipeline). results are presented and discussed in section 6. section 7 contains a conclusion and discusses future work}
\newchapter{background}{Background and Related Work}
%\todo[inline, color=green!40]{in this section we will discuss necessary background knowledge for our chosen method and the sensor data we work with. related work exists mostly from autonomous driving which does not include subter data and mostly looks at precipitation as source of degradation, we modeled after one such paper and try to adapt the same method for the domain of rescue robots, this method is a semi-supervised deep learning approach to anomaly detection which we describe in more detail in sections 2.1 and 2.2. in the last subsection 2.3 we discuss lidar sensors and the data they produce}
%As the domain of robotics and embedded systems often does, this thesis constitutes quite broad interdisciplinary challenge of various fields of study. As we will see in this chapter, anomaly detection-the methodology we posed our degradation quantification problem as-has roots in statistical analysis and finds utility in many domains. As is the case for many fields of study, there has been success in incorporating learning based techniques-especially deep learning-into it to better or more efficiently solve problems anchored in interpretation of large data amounts. The very nature of anomalies often times makes their form and structure unpredictable, which lends itself to unsupervised learning techniques-ones where the training data is not assigned labels beforehand, since you cannot label what you cannot expect. These unsupervised techniques can oftentimes be improved by utilizing a small but impactful number of labeled training data, which results in semi-supervised methods. The method we evaluate for our task-Deep SAD-is a not only a semi-supervised deep learning approach but also employs an autoencoder in its architecture, a type of neural network architecture which has found widespread use in many deep learning applications over the last few years due to its feature extraction capability which solely relies on unlabeleld data. Its approach typically lends itself especially well to complex data for which feature extraction of conventional manual methods is hard to achieve, like the lidar data we are working with in this thesis. Lidar sensors measure the range from the sensor to the next reflective object for many angles simultaneously by projecting a laser in a specified direction and measuring the time it takes a reflected ray to return to the sensor. From the output angles of the rays and the measured travel time the sensor can construct a point cloud which is oftentimes dense enough to map out the sensors surroundings. In this chapter we will discuss these necessary technologies, give an overview of their history, use-cases and describe how it will be utilized in this thesis.
\threadtodo
{explain which background knowledge is necessary and why + mention related work}
{reader learns what he needs to know and which related work exists}
{state which background subsections will follow + why and mention related work}
{necessity of knowledge and order of subsections are explained $\rightarrow$ essential background}
This thesis tackles a broad, interdisciplinary challenge at the intersection of robotics, computer vision, and data science. In this chapter, we introduce the background of anomaly detection, the framework in which we formulate our degradation quantification problem. Anomaly detection has its roots in statistical analysis and has been successfully applied in various domains. Recently, the incorporation of learning-based techniques, particularly deep learning, has enabled more efficient and effective analysis of large datasets.
Because anomalies are, by nature, often unpredictable in form and structure, unsupervised learning methods are widely used since they do not require pre-assigned labels—a significant advantage when dealing with unforeseen data patterns. However, these methods can be further refined through the integration of a small amount of labeled data, giving rise to semi-supervised approaches. The method evaluated in this thesis, DeepSAD, is a semi-supervised deep learning approach that also leverages an autoencoder architecture in its design. Autoencoders have gained widespread adoption in deep learning for their ability to extract features from unlabeled data, which is particularly useful for handling complex data types such as LiDAR scans.
LiDAR sensors function by projecting lasers in multiple directions near-simultaneously, measuring the time it takes for each reflected ray to return. Using the angles and travel times, the sensor constructs a point cloud that is often accurate enough to map the sensor's surroundings. In the following sections, we delve into these technologies, review how they work and how they are generally used, describe how we employ them in this thesis, and explore related work from these fields.
%\todo[inline, color=green!40]{mention related work + transition to anomaly detection}
\newsection{anomaly_detection}{Anomaly Detection}
\threadtodo
{explain AD in general, allude to DeepSAD which was core method here}
{problem is formulated as AD problem, so reader needs to understand AD}
{give overview of AD goals, categories and challenges. explain we use DeepSAD}
{lots of data but few anomalies + hard to label $\rightarrow$ semi-supervised learning}
Anomaly detection refers to the process of detecting unexpected patterns in data: outliers which deviate significantly from the majority of the data, which is implicitly defined as normal by its prevalence. In classical statistical analysis such techniques have been studied since as early as the 19th century~\cite{anomaly_detection_history}. Since then, a multitude of methods and use-cases have been proposed and studied. Examples of applications include healthcare, where computer vision algorithms detect anomalies in medical images for diagnostics and early detection of diseases~\cite{anomaly_detection_medical}, fraud detection in decentralized financial systems based on blockchain technology~\cite{anomaly_detection_defi}, and fault detection in industrial machinery using acoustic sound data~\cite{anomaly_detection_manufacturing}.
Figure~\ref{fig:anomaly_detection_overview} depicts a simple but illustrative example of data which can be classified as either normal or anomalous and illustrates the general problem anomaly detection methods try to solve. A successful anomaly detection method must learn to differentiate normal from anomalous data, for example by learning the boundaries around the available normal data and classifying a sample as either normal or anomalous based on its location inside or outside of those boundaries. Another possible approach is to calculate an analog value which correlates with the likelihood of a sample being anomalous, for example by using the sample's distance from the center of the closest normal data cluster.
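As a minimal illustration of such a distance-based score (with notation introduced only for this example), a sample $x$ could be assigned the score
\[
  s(x) = \min_{k} \left\| x - c_k \right\| ,
\]
where $c_k$ denotes the center of the $k$-th cluster of normal training data; the larger $s(x)$, the more likely $x$ is to be anomalous.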
\begin{figure}
\begin{center}
\includegraphics[width=0.5\textwidth]{figures/anomaly_detection_overview}
\end{center}
\caption{An illustrative example of normal and anomalous 2-dimensional data, with clusters of normal data $N_1$ and $N_2$, two single anomalies $o_1$ and $o_2$, and a cluster of anomalies $O_3$. Reproduced from~\cite{anomaly_detection_survey}.}\label{fig:anomaly_detection_overview}
\end{figure}
By their very nature, anomalies are rare and often unpredictable occurrences, which makes it hard to define all possible anomalies in any system. It is also very challenging to create an algorithm capable of detecting anomalies which may never have occurred before and may not have been known to exist when the detection algorithm was created. There are many possible approaches to this problem, though they can be roughly grouped into six distinct categories based on the techniques used~\cite{anomaly_detection_survey}:
\begin{enumerate}
\item \textbf{Classification Based} - A classification technique such as an SVM or a suitable neural network is used to classify samples as either normal or anomalous based on labeled training data. Alternatively, if not enough labeled training data is available, a one-class classification algorithm can be employed. In that case, the algorithm assumes all training samples to be normal and learns a boundary around them, so that samples lying outside the learnt boundary are classified as anomalous.
\item \textbf{Clustering Based} - Clustering techniques such as K-Means clustering or DBSCAN aim to group similar data together into clusters, differentiating it from dissimilar data which may belong to another or no cluster at all. Anomaly detection methods from this category employ such a technique, with the assumption that normal data will assemble into one or more clusters due to their similar properties, while anomalies may create their own smaller clusters, not belong to any cluster at all or at least be an appreciable distance from the closest normal cluster's center.
\item \textbf{Nearest Neighbor Based} - Similar to the clustering based category, these techniques assume that normal data is more closely clustered than anomalies and therefore use either a sample's distance to its $k$-th nearest neighbor or the density of its local neighborhood to judge whether the sample is anomalous.
\item \textbf{Statistical} - These methods try to fit a statistical model of the normal behaviour to the data. Once the distribution from which normal data originates has been estimated, samples can be classified as normal or anomalous based on their likelihood of arising from that distribution (a brief example follows after this list).
\item \textbf{Information Theoretic} - The main assumption of information theoretic anomaly detection methods is that anomalies differ in their information content from normal data. An information theoretic measure is therefore used to detect irregularities in the data's information content, enabling the detection of anomalous samples.
\item \textbf{Spectral} - Spectral approaches assume that data can be mapped into a lower-dimensional space in which normal data appears significantly different from anomalous data. To this end, a dimensionality reduction technique such as PCA is used to embed the data into a lower-dimensional subspace. Although it may be easier to differentiate normal and anomalous data in that subspace, a technique capable of doing so is still required, which is why spectral methods are often used as a pre-processing step followed by an anomaly detection method operating on the data's subspace.
\end{enumerate}
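To make the statistical category from the list above concrete (using notation introduced only for this example), suppose the normal data is modeled by a multivariate Gaussian with mean $\mu$ and covariance $\Sigma$. A sample $x$ can then be flagged as anomalous whenever its Mahalanobis distance exceeds a chosen threshold $\tau$,
\[
  \sqrt{(x - \mu)^{\top} \Sigma^{-1} (x - \mu)} > \tau ,
\]
which corresponds to $x$ being unlikely under the fitted distribution.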
In this thesis we use an anomaly detection method, namely \citetitle{deepsad}~\cite{deepsad}, to model our problem (how to quantify the degradation of lidar sensor data) as an anomaly detection problem. We do this by classifying good quality data as normal and degraded data as anomalous and rely on a method which can express each sample's likelihood of being anomalous as an analog anomaly score, which enables us to interpret the score as a quantification of the data's degradation.
Chapter~\ref{chp:deepsad} describes DeepSAD in more detail and shows that it is a clustering based approach with a spectral pre-processing component: it uses a neural network to reduce the input's dimensionality while simultaneously clustering normal data closely around a given centroid. It then produces an anomaly score by calculating the geometric distance between a data sample's representation and the aforementioned cluster centroid, assuming that this distance is shorter for normal than for anomalous data. Since our data is high dimensional, it makes sense to use a spectral method to reduce the data's dimensionality, and an approach which results in an analog value rather than a binary classification is useful for our use-case, since we want to quantify, not only classify, the data degradation.
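Using illustrative notation (the precise formulation follows in Chapter~\ref{chp:deepsad}), this anomaly score can be sketched as
\[
  s(x) = \left\| \phi(x; \mathcal{W}) - \mathbf{c} \right\| ,
\]
where $\phi(\wc\,; \mathcal{W})$ denotes the neural network mapping a sample into the lower-dimensional latent space and $\mathbf{c}$ the centroid around which normal data is clustered.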
%\todo[inline, color=green!40]{data availability leading into semi-supervised learning algorithms}
There is a wide array of problems in domains similar to the one researched in this thesis, for which modeling them as anomaly detection problems has proven successful. The degradation of point clouds produced by an industrial 3D sensor has been modeled as an anomaly detection task in \citetitle{bg_ad_pointclouds_scans}~\cite{bg_ad_pointclouds_scans}. \citeauthor{bg_ad_pointclouds_scans} propose a student-teacher model capable of inferring a pointwise anomaly score for degradation in point clouds. The teacher network is trained on an anomaly-free dataset to extract dense features of the point clouds' local geometries, after which an identical student network is trained to emulate the teacher network's outputs. For degraded point clouds, the regression error between the teacher's and student's outputs is calculated and interpreted as the anomaly score, with the rationale that the student network has not observed features produced by anomalous geometries during training, leaving it incapable of producing an output similar to the teacher's for those regions. Another example is \citetitle{bg_ad_pointclouds_poles}~\cite{bg_ad_pointclouds_poles}, which proposes a method to detect and classify pole-like objects in urban point cloud data, in order to differentiate between natural and man-made objects such as street signs for autonomous driving purposes. An anomaly detection method is used to identify the vertical pole-like objects in the point clouds, and the preprocessed objects are then grouped by similarity using a clustering algorithm in order to classify them as either trees or man-made poles.
As briefly mentioned at the beginning of this section, anomaly detection methods are often challenged by the limited availability of anomalous data, owing to the very nature of anomalies as rare occurrences. Often the intended use-case is even to find unknown anomalies in a given dataset which have not yet been identified. In addition, it can be challenging to classify anomalies correctly for complex data, since the very definition of an anomaly depends on many factors such as the type of data, the intended use-case, or even how the data evolves over time. For these reasons, most anomaly detection approaches limit their reliance on anomalous data during training, and many of them do not differentiate between normal and anomalous data at all. DeepSAD is a semi-supervised method, characterized by using a mixture of labeled and unlabeled data.
% strategies of anomaly detection algorithnms according to x include classification, neirest neighbor, clustering, spectral, information theoretic, statistical
%\todo[inline, color=green!40]{cite exists since X and has been used to find anomalous data in many domains and works with all kinds of data types/structures (visual, audio, numbers). examples healthcare (computer vision diagnostics, early detection), financial anomalies (credit card fraud, maybe other example), security/safety video cameras (public, traffic, factories).}
%\todo[inline, color=green!40]{the goal of these algorithms is to differentiate between normal and anomalous data by finding statistically relevant information which separates the two, since these methods learn how normal data typically is distributed they do not have to have prior knowledge of the types of all anomalies, therefore can potentially detect unseen, unclassified anomalies as well. main challenges when implementing are that its difficult to cleanly separate normal from anormal data}
%\todo[inline, color=green!40]{typically no or very little labeled data is available and oftentimes the kinds of possible anomalies are unknown and therefore its not possible to label all of them. due to these circumstances anomaly detection methods oftentimes do not rely on labeled data but on the fact that normal circumstances make up the majority of training data (quasi per defintion)}
%\todo[inline, color=green!40]{figure example shows 2d data but anomaly detection methods work with any kind of dimensionality/shape. shows two clusters of normal data with clear boundaries and outside examples of outliers (anomalous data two single points and one cluster), anomaly detection methods learn to draw these boundaries from the training data given to them which can then be used to judge if unseen data is normal or anormal}
%\todo[inline, color=green!40]{as discussed in motivation, and same as in reference paper (rain autonomous driving) we model our problem as an anomaly detection problem where we define that good quality sensor data is normal data and degraded sensor data (in our case due to dust/smoke) is defined as an anomaly. this allows us to quantify the degradation of data by using the anomaly detection method to check how likely new data is an anomaly}
\iffalse
Anomaly detection algorithms are designed to detect or quantify the likelihood of a pattern in data deviating significantly from a well-defined expected norm. Deviations such as these are classified as anomalies or outliers and often signify critical or actionable information.
\begin{figure}
\begin{center}
\includegraphics[width=0.5\textwidth]{figures/anomaly_detection_overview}
\end{center}
\caption{An example of a 2-dimensional data set with anomalies. Reproduced from~\cite{Chandola2009AnomalyDA}}\label{fig:anomaly_detection_overview}
\end{figure}
\todo[inline]{Figure example normal data boundaries, single outliers o1, o2, cluster of outliers o3. difficult to define boundaries so that all normal data inside and anomalies outside }
\fi
\newsection{semi_supervised}{Semi-Supervised Learning Algorithms}
\threadtodo
{Give machine learning overview, focus on semi-supervised (what \& why)}
{used method is semi-supervised ML algorithm, reader needs to understand}
{explain what ML is, how the different approaches work, why to use semi-supervised}
{autoencoder special case (un-/self-supervised) used in DeepSAD $\rightarrow$ explain autoencoder}
%Machine learning defines types of algorithms capable of learning from existing data to perform tasks on previously unseen data without being explicitely programmed to do so~\cite{machine_learning_first_definition}. Among the techniques employed in machine learning algorithms, neural networks have become especially prominent over the past few decades due to their flexibility and ability to achieve state-of-the-art results across a wide variety of domains. They are most commonly composed of layers of interconnected artificial neurons. Each neuron computes a weighted sum of its inputs, adds a bias term, and then applies a nonlinear activation function (such as ReLU, sigmoid, or tanh) to produce its output. These layers are typically organized into three types:
Machine learning refers to algorithms capable of learning patterns from existing data to perform tasks on previously unseen data, without being explicitly programmed to do so~\cite{machine_learning_first_definition}. Central to many approaches is the definition of an objective function that measures how well the model is performing. The model's parameters are then adjusted to optimize this objective. By leveraging these data-driven methods, machine learning can handle complex tasks across a wide range of domains.
Among the techniques employed in machine learning algorithms, neural networks have become especially prominent over the past few decades due to their ability to achieve state-of-the-art results across a wide variety of domains. They are most commonly composed of layers of interconnected artificial neurons. Each neuron computes a weighted sum of its inputs, adds a bias term, and then applies a nonlinear activation function, enabling them to model complex non-linear relationships. These layers are typically organized into three types:
\begin{itemize}
\item Input layer, which receives raw data.
\item Hidden layers, where the network transforms and extracts complex features by combining signals through successive nonlinear operations. Networks with at least two hidden layers are typically called deep learning networks.
\item Output layer, which produces the network's final prediction.
\end{itemize}
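As a compact illustration of the neuron computation described above (with notation chosen only for this sketch), the output of a single neuron with inputs $\mathbf{x}$, weights $\mathbf{w}$, bias $b$, and activation function $\sigma$ can be written as
\[
  y = \sigma\!\left(\mathbf{w}^{\top}\mathbf{x} + b\right) ,
\]
and a full layer simply stacks many such neurons, applying $\sigma$ element-wise to $\mathbf{W}\mathbf{x} + \mathbf{b}$.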
As outlined above, neural network training is formulated as an optimization problem: we define an objective function that measures how well the model is achieving its task and then adjust the network's parameters to optimize that objective. The most common approach is stochastic gradient descent (SGD) or one of its variants (e.g., Adam). In each training iteration, the network first performs a forward pass to compute its outputs and evaluate the objective, then a backward pass, known as backpropagation, to calculate the gradients of the objective with respect to every weight in the network. These gradients indicate the direction in which each weight should change to improve performance, and the weights are updated accordingly. Repeating this process over many iterations (or epochs) allows the network to progressively refine its parameters and better fulfill its task.
%To train neural networks, we express the task as an optimization problem: we define a loss function that quantifies the discrepancy between the networks predictions and the ground-truth labels (or target values), and we seek to minimize this loss across a dataset. The most common method for doing so is stochastic gradient descent (SGD) or one of its variants (e.g., Adam). During each training iteration, the network performs a forward pass to compute its predictions and the associated loss and then a backward pass—known as backpropagation—to calculate gradients of the loss with respect to each weight in the network. These gradients indicate how to adjust the weights to reduce the loss, and the weights are updated accordingly. Over many iterations also known as epochs, the network progressively refines its parameters, improving its ability to optimally fulfil the given task.
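For reference, a single SGD step under this scheme (with $\theta$ denoting the network parameters, $\eta$ the learning rate, and $L$ the objective; notation used only for this sketch) takes the form
\[
  \theta \leftarrow \theta - \eta \, \nabla_{\theta} L(\theta) ,
\]
where the gradient $\nabla_{\theta} L$ is obtained via backpropagation and, in practice, is estimated on a small mini-batch of training samples rather than the full dataset.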
Aside from the underlying technique, machine learning algorithms can also be categorized by the type of feedback provided during learning. Broadly speaking, there are three main categories: supervised, unsupervised, and reinforcement learning. Many approaches do not fit exactly into any of these categories and have spawned less common categories such as semi-supervised or self-supervised learning.
In supervised learning, each input sample is paired with a “ground-truth” label representing the desired output. During training, the model makes a prediction and a loss function quantifies the difference between the prediction and the true label. The learning algorithm then adjusts its parameters to minimize this loss, improving its performance over time. Labels are typically categorical (used for classification tasks, such as distinguishing “cat” from “dog”) or continuous (used for regression tasks, like predicting a temperature or distance).
%For supervised learning each data sample is augmented by including a label depicting the ideal output the algorithm can produce for the given input. During the learning step these algorithms can compare their generated output with the one provided by an expert and calculate the error between them, minimizing the error to improve performance. Such labels are typically either a categorical or continuous target which are most commonly used for classification and regression tasks respectively.
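As a small illustration (symbols chosen only for this sketch), a regression model with predictions $\hat{y}_i$ and ground-truth labels $y_i$ might use the mean squared error as its loss,
\[
  L = \frac{1}{N} \sum_{i=1}^{N} \left( y_i - \hat{y}_i \right)^2 ,
\]
while classification tasks typically use a cross-entropy loss over the predicted class probabilities.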
\fig{ml_supervised_learning}{figures/ml_supervised_learning_placeholder.jpg}{PLACEHOLDER - An illustration of supervised learning: the training data is augmented to include the algorithm's optimal output for each data sample, called labels.}
In unsupervised learning, models work directly with raw data, without any ground-truth labels to guide the learning process. Instead, they optimize an objective that reflects the discovery of useful structure, whether that is grouping similar data points together or finding a compact representation of the data. For example, cluster analysis partitions the dataset into groups so that points within the same cluster are more similar to each other (according to a chosen similarity metric) than to points in other clusters. Dimensionality reduction methods, on the other hand, project high-dimensional data into a lower-dimensional space while minimizing the loss of the original data's meaningful information. By focusing purely on the data itself, unsupervised algorithms can reveal hidden patterns and relationships that might be difficult to uncover with manual analysis.
%Unsupervised learning algorithms use raw data without a target label that can be used during the learning process. These types of algorithms are often utilized to identify underlying patterns in data which may be hard to discover using classical data analysis due to for example large data size or high data complexity. Cluster analysis depicts one common use case, in which data is grouped into clusters such that data from one cluster resembles other data from the same cluster more closely than data from other clusters, according to some predesignated criteria. Another important use case are dimensionality reduction tasks which transform high-dimensional data into a lower-dimensional subspace while retaining meaningful information of the original data.
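To make the clustering example concrete (symbols used only for this sketch), a standard objective such as the one used by K-Means partitions the data into clusters $C_1, \dots, C_K$ with centers $\mu_k$ by minimizing the within-cluster distances,
\[
  \min_{C_1, \dots, C_K} \; \sum_{k=1}^{K} \sum_{x \in C_k} \left\| x - \mu_k \right\|^2 ,
\]
an objective defined purely on the data itself, without any labels.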
\fig{ml_unsupervised_learning}{figures/ml_unsupervised_learning_placeholder.png}{PLACEHOLDER - An illustration of unsupervised learning: the training data does not contain any additional information such as labels. The algorithm learns to group similar input data together.}
%A more interactive approach to learning is taken by reinforcement learning, which provides the algorithm with an environment and an interpreter of the environment's state. During training the algorithm explores new possible actions and their impact on the provided environment. The interpreter can then reward or punish the algorithm based on the outcome of its actions. To improve the algorithms capability it will try to maximize the rewards received from the interpreter, retaining some randomness as to enable the exploration of different actions and their outcomes. Reinforcement learning is usually used for cases where an algorithm has to make sequences of decisions in complex environments e.g., autonomous driving tasks.
In reinforcement learning, the model, often called an agent, learns by interacting with an environment that provides feedback in the form of rewards or penalties. At each step, the agent observes the environment's state and selects an action, and an interpreter judges the action's outcome based on how the environment changed, providing a scalar reward or penalty that reflects the desirability of that outcome. The agent's objective is to adjust its decision-making strategy to maximize the cumulative reward over time, balancing exploration of new actions with exploitation of known high-reward behaviors. This trial-and-error approach is well suited to sequential decision problems in complex settings, such as autonomous navigation or robotic control, where each choice affects both the immediate state and future possibilities.
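Stated compactly (notation used only for this sketch), the agent seeks a strategy that maximizes the expected cumulative reward
\[
  G_t = \sum_{k=0}^{\infty} \gamma^{k} \, r_{t+k+1} , \qquad \gamma \in [0, 1) ,
\]
where $r_{t+k+1}$ is the reward received $k$ steps after time $t$ and the discount factor $\gamma$ weighs immediate against future rewards.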
\todo[inline, color=green!40]{illustration reinforcement learning}
Semi-supervised learning algorithms form an intermediate category between supervised and unsupervised algorithms, in that they use a mixture of labeled and unlabeled data. Typically, vastly more unlabeled than labeled data is used during training of such algorithms, due to the effort and expertise required to label large quantities of data correctly. Semi-supervised methods are often an effort to improve a machine learning algorithm belonging to either the supervised or the unsupervised category. Supervised methods such as classifiers are enhanced by using large amounts of unlabeled data to augment the supervised training without additional labeling work. Alternatively, unsupervised methods like clustering algorithms may not only use unlabeled data but improve their performance by considering some hand-labeled data during training.
%Semi-Supervised learning algorithms are an inbetween category of supervised and unsupervised algorithms, in that they use a mixture of labeled and unlabeled data. Typically vastly more unlabeled data is used during training of such algorithms than labeled data, due to the effort and expertise required to label large quantities of data correctly. The type of task performed by semi-supervised methods can originate from either supervised learningor unsupervised learning domain. For classification tasks which are oftentimes achieved using supervised learning the additional unsupervised data is added during training with the hope to achieve a better outcome than when training only with the supervised portion of the data. In contrast for unsupervised learning use cases such as clustering algorithms, the addition of labeled samples can help guide the learning algorithm to improve performance over fully unsupervised training.
Machine learning based anomaly detection methods can utilize techniques from all of the aforementioned categories, although their usability varies depending on the available training data. While supervised anomaly detection methods exist, their suitability depends mostly on the availability of labeled training data as well as a reasonable proportion between normal and anomalous data. Both requirements can be challenging to meet, since labeling is often labour intensive and anomalies, by their intrinsic nature, occur rarely compared to normal data, making the capture of enough anomalous behaviour a hard problem. Semi-supervised anomaly detection methods are of special interest in that they may overcome these difficulties inherent to many anomaly detection tasks~\cite{semi_ad_survey}. These methods typically share the goal of unsupervised anomaly detection methods, which is to model the behaviour of the normal class and delimit it from anomalies, but they can incorporate some hand-labeled examples of normal and/or anomalous behaviour to improve their performance over fully unsupervised methods. DeepSAD is such a semi-supervised method: it extends its unsupervised predecessor Deep SVDD by including some labeled samples during training. Both DeepSAD and Deep SVDD also utilize an autoencoder in a pre-training step, a machine learning architecture frequently grouped with unsupervised algorithms, even though that classification can be contested when scrutinized in more detail, which we will do next.
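Schematically, and using notation introduced only for this illustration, many semi-supervised objectives can be written as a combination of an unsupervised term over the $n$ unlabeled samples $x_i$ and a supervised term over the $m$ labeled samples $(\tilde{x}_j, \tilde{y}_j)$,
\[
  L = \sum_{i=1}^{n} L_{\mathrm{unsup}}(x_i) + \eta \sum_{j=1}^{m} L_{\mathrm{sup}}(\tilde{x}_j, \tilde{y}_j) , \qquad m \ll n ,
\]
where the weight $\eta$ balances the influence of the few labeled samples against the bulk of unlabeled data.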
\newsection{autoencoder}{Autoencoder}
\threadtodo
{Explain how autoencoders work and what they are used for}
{autoencoder used in deepSAD}
{explain basic idea, unfixed architecture, infomax, mention usecases, dimension reduction}
{dimensionality reduction, useful for high dim data $\rightarrow$ pointclouds from lidar}
Autoencoders are a type of neural network architecture whose main goal is to learn to encode input data into a representative state from which the same input can be reconstructed, hence the name. They typically consist of two functions, an encoder and a decoder, with a latent space in between, as depicted in the toy example in figure~\ref{fig:autoencoder_general}. The encoder learns to extract the most significant features from the input and to convert them into the input's latent space representation. The reconstruction goal ensures that the most prominent features of the input are retained during the encoding phase, since reconstruction becomes impossible if too much relevant information is missing. The decoder simultaneously learns to reconstruct the original input from its encoded latent space representation by minimizing the error between the input sample and the autoencoder's output. This optimization goal creates some uncertainty when categorizing autoencoders as an unsupervised method, although the literature commonly defines them as such. While they do not require any labeling of the input data, their optimization target can still calculate the error between the output and the optimal target, which is typically not available for unsupervised methods. For this reason, they are sometimes considered a case of self-supervised learning, a type of machine learning where the data itself is used to generate a supervisory signal without the need for a domain expert to provide one.
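In its simplest form (notation used only for this sketch), an autoencoder with encoder $f$ and decoder $g$ is trained to minimize a reconstruction loss such as the mean squared error
\[
  L(x) = \left\| x - g\!\left(f(x)\right) \right\|^2 ,
\]
where $z = f(x)$ is the latent space representation of the input $x$.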
\fig{autoencoder_general}{figures/autoencoder_principle_placeholder.png}{PLACEHOLDER - An illustration of autoencoders' general architecture and reconstruction task.}
%\todo[inline, color=green!40]{explain figure}
\todo[inline, color=green!40]{Paragraph about Variational Autoencoders? generative models vs discriminative models, enables other common use cases such as generating new data by changing parameterized generative distribution in latent space - VAES are not really relevant, maybe leave them out and just mention them shortly, with the hint that they are important but too much to explain since they are not key knowledge for this thesis}
One key use case of autoencoders is to employ them as a dimensionality reduction technique. In that case, the latent space between the encoder and decoder is of a lower dimensionality than the input data itself. Due to the aforementioned reconstruction goal, the shared information between the input data and its latent space representation is maximized, which is known as following the infomax principle. After training such an autoencoder, it may be used to generate lower-dimensional representations of the given data type, enabling computations which may have been infeasible on the original data. DeepSAD uses an autoencoder in a pre-training step to achieve this goal among others.
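Expressed compactly (again with illustrative notation), the infomax principle amounts to choosing the encoder $f$ such that the mutual information between the input $X$ and its latent representation $Z = f(X)$ is as large as possible,
\[
  \max_{f} \; I\!\left(X; Z\right) ,
\]
and minimizing the reconstruction error encourages exactly this: a latent code from which the input can be reconstructed well must have retained most of the input's relevant information.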
Autoencoders have been shown to be useful in the anomaly detection domain under the assumption that an autoencoder trained on mostly normal data is better at reconstructing normal behaviour than anomalous behaviour. This assumption allows methods to utilize the reconstruction error as an anomaly score. Examples of this are the method in \citetitle{bg_autoencoder_ad}~\cite{bg_autoencoder_ad} and the one in \citetitle{bg_autoencoder_ad_2}~\cite{bg_autoencoder_ad_2}, which both employ an autoencoder and the aforementioned assumption. Autoencoders have also been shown to be a suitable dimensionality reduction technique for lidar data, which is often high-dimensional and sparse, making feature extraction and dimensionality reduction popular preprocessing steps. As an example, \citetitle{bg_autoencoder_lidar}~\cite{bg_autoencoder_lidar} shows the feasibility and advantages of using an autoencoder architecture to reduce the dimensionality of fused lidar-orthophoto features for their building detection method, which recognizes buildings in visual data taken from an airplane. Similarly, we can make use of the dimensionality reduction in DeepSAD's pre-training step, since our method is intended to work with high-dimensional lidar data.
%Another way to employ autoencoders is to use them as a generative technique. The decoder in autoencoders is trained to reproduce the input state from its encoded representation, which can also be interpreted as the decoder being able to generate data of the input type, from an encoded representation. A classic autoencoder trains the encoder to map its input to a single point in the latent space-a distriminative modeling approach, which can succesfully learn a predictor given enough data. In generative modeling on the other hand, the goal is to learn the distribution the data originates from, which is the idea behind variational autoencoders (VAE). VAEs have the encoder produce an distribution instead of a point representation, samples from which are then fed to the decoder to reconstruct the original input. The result is the encoder learning to model the generative distribution of the input data, which enables new usecases, due to the latent representation
\newsection{lidar_related_work}{Lidar - Light Detection and Ranging}
\threadtodo
{Explain how lidars work and what data they produce}
{understand why data is degraded, and how data looks}
{explain how radar/lidar works, usecases, output = pointclouds, what errors}
{rain degradation paper used deepsad $\rightarrow$ explained in detail in next chapter}
LiDAR (Light Detection and Ranging) measures distance by emitting short laser pulses and timing how long they take to return, an approach many may be familiar with from the more commonly known radar technology, which uses radio-frequency pulses and measures their return time to gauge an object's range. Unlike radar, however, LiDAR operates at much shorter wavelengths and can fire millions of pulses per second, achieving centimeter-level range precision and dense, high-resolution 3D point clouds. This fine granularity makes LiDAR ideal for applications such as detailed obstacle mapping, surface reconstruction, and autonomous navigation in complex environments.
Because the speed of light in air is effectively constant, multiplying half the round-trip time by that speed gives the distance between the lidar sensor and the reflecting object, as visualized in figure~\ref{fig:lidar_working_principle}. Modern spinning multi-beam LiDAR systems emit millions of these pulses every second. Each pulse is sent at a known combination of horizontal and vertical angles, creating a regular grid of measurements: for example, 32 vertical channels swept through 360° horizontally at a fixed angular spacing. While newer solid-state designs (flash, MEMS, phased-array) are emerging, spinning multi-beam LiDAR remains the most common type in autonomous vehicles and robotics because of its proven range, reliability, and mature manufacturing base.
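As a brief worked example of this time-of-flight principle, a pulse whose echo arrives $\Delta t = 200\,\mathrm{ns}$ after emission corresponds to a target at roughly
\[
d = \frac{c \cdot \Delta t}{2} \approx \frac{3 \times 10^{8}\,\mathrm{m/s} \cdot 200 \times 10^{-9}\,\mathrm{s}}{2} = 30\,\mathrm{m},
\]
which also illustrates why sub-nanosecond timing resolution is needed to reach centimeter-level range accuracy.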
\fig{lidar_working_principle}{figures/bg_lidar_principle_placeholder.png}{PLACEHOLDER - An illustration of lidar sensors' working principle.}
Each time a lidar emits and receives a laser pulse, it can use the ray's direction and the calculated distance to produce a single three-dimensional point. By collecting millions of such points each second, the sensor constructs a “point cloud”, a dense set of 3D coordinates relative to the LiDAR's own position. In addition to X, Y, and Z, many LiDARs also record the intensity or reflectivity of each return, providing extra information about the surface properties of the object hit by the pulse.
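The conversion from a single measurement (range plus the known emission angles) to a 3D point is a straightforward spherical-to-Cartesian transform. The following sketch illustrates this for a full scan; the angle grids are illustrative assumptions (2048 horizontal steps and 32 elevation angles spanning an assumed 42.4° vertical field of view), and the exact angle conventions differ between sensor models.
\begin{verbatim}
import numpy as np

def ranges_to_pointcloud(ranges, azimuth, elevation):
    """Convert per-beam ranges (meters) and emission angles (radians)
    into an array of XYZ points."""
    x = ranges * np.cos(elevation) * np.cos(azimuth)
    y = ranges * np.cos(elevation) * np.sin(azimuth)
    z = ranges * np.sin(elevation)
    return np.stack([x, y, z], axis=-1)

# Example: a 32-channel scan with 2048 horizontal steps per channel.
azimuth = np.tile(np.linspace(0, 2 * np.pi, 2048, endpoint=False), (32, 1))
elevation = np.repeat(
    np.deg2rad(np.linspace(-21.2, 21.2, 32))[:, None], 2048, axis=1)
ranges = np.random.uniform(1.0, 50.0, size=(32, 2048))  # placeholder data
points = ranges_to_pointcloud(ranges, azimuth, elevation).reshape(-1, 3)
\end{verbatim}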
%LiDARs high accuracy, long range, and full-circle field of view make it indispensable for tasks like obstacle detection, simultaneous localization and mapping (SLAM), and terrain modeling in autonomous driving and mobile robotics. While vehicles and robots often carry complementary sensors like time-of-flight cameras, ultrasonic sensors and RGB cameras, LiDAR outperforms them when it comes to precise 3D measurements over medium to long distances, operates reliably in varying light conditions, and delivers the spatial density needed for safe navigation. However, intrinsic sensor noise remains: range quantization can introduce discrete jumps in measured distance, angle jitter can blur fine features, and multireturn ambiguities may arise when a single pulse generates several echoes (for example, from foliage or layered surfaces). Environmental factors further degrade data quality: specular reflections and multipath echoes can produce ghost points, beam occlusion by intermediate objects leads to missing measurements, and atmospheric scattering in rain, fog, snow, dust, or smoke causes early returns and spurious points.
%In subterranean and disaster environments—such as collapsed tunnels or earthquakedamaged structures—LiDAR has become the de facto sensing modality for mapping and navigation. Its ability to rapidly generate accurate 3D point clouds enables simultaneous localization and mapping (SLAM) even in GPSdenied conditions. Yet, the airborne particles prevalent in these scenarios—dust stirred by collapse, smoke from fires—introduce significant noise: early returns from nearby aerosols obscure real obstacles, and missing returns behind particle clouds can conceal hazards. While many SLAM and perception algorithms assume clean, highquality data, realworld rescue deployments must contend with degraded point clouds. This mismatch motivates our work: rather than simply removing noise, we aim to quantify the degree of degradation so that downstream mapping and decisionmaking algorithms can adapt to and compensate for varying data quality.
LiDAR's high accuracy, long range, and full-circle field of view make it indispensable for tasks like obstacle detection, simultaneous localization and mapping (SLAM), and terrain modeling in autonomous driving and mobile robotics. While complementary sensors—such as time-of-flight cameras, ultrasonic sensors, and RGB cameras—have their strengths at short range or in particular lighting conditions, only LiDAR delivers the combination of precise 3D measurements over medium to long distances, consistent performance regardless of illumination, and the point-cloud density needed for safe navigation. LiDAR systems do exhibit intrinsic noise (e.g., range quantization or occasional multi-return ambiguities), but in most robotic applications these effects are minor compared to environmental degradation.
In subterranean and search-and-rescue scenarios, the dominant challenge is airborne particles: dust kicked up by debris or smoke from fires. These aerosols create early returns that can mask real obstacles and cause missing data behind particle clouds, undermining SLAM and perception algorithms designed for cleaner data. This degradation is a type of atmospheric scattering, which can be caused by any kind of airborne particulate (e.g., snowflakes) or liquid (e.g., water droplets). Other kinds of environmental noise exist as well, such as specular reflections caused by smooth surfaces, beam occlusion due to close objects blocking the sensor's field of view, or thermal drift, where temperature changes affect the sensor's circuits and mechanics and introduce biases in the measurements.
All of these effects may create unwanted noise in the point cloud produced by the lidar, making this domain an important research topic. \citetitle{lidar_denoising_survey}~\cite{lidar_denoising_survey} gives an overview of the current state of research into denoising methods for lidar in adverse environments, categorizes them according to their approach (distance-, intensity- or learning-based) and concludes that all approaches have merits but also open challenges to solve before autonomous systems can safely navigate these adverse environments. Current research is heavily focused on the automotive domain, which can be observed in the vastly higher number of methods filtering noise from adverse weather effects (environmental scattering from rain, snow and fog) than from dust, smoke or other particles occurring rarely in the automotive domain.
A learning-based method to filter dust-caused degradation from lidar is introduced in \citetitle{lidar_denoising_dust}~\cite{lidar_denoising_dust}. The authors employ a convolutional neural network to classify dust particles in lidar point clouds, enabling the filtering of those points, and compare their method to more conservative approaches such as various outlier removal algorithms. Another relevant example is the filtering method proposed in \citetitle{lidar_subt_dust_removal}~\cite{lidar_subt_dust_removal}, which enables the filtration of point clouds degraded by smoke or dust in subterranean environments, with a focus on the search and rescue domain. To achieve this, the authors formulated a filtration framework that relies on dynamic onboard statistical cluster outlier removal to classify and remove dust particles in point clouds.
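For illustration, the kind of conservative outlier-removal baseline mentioned above can be applied with a few lines of code, for example using the Open3D library; the file name and parameter values below are placeholders, and this is not the pipeline of the cited works.
\begin{verbatim}
import open3d as o3d

# Load a point cloud (path is a placeholder) and apply statistical outlier
# removal: points whose mean distance to their 20 nearest neighbours deviates
# by more than 2 standard deviations from the global average are discarded.
pcd = o3d.io.read_point_cloud("scan.pcd")
filtered, kept_indices = pcd.remove_statistical_outlier(nb_neighbors=20,
                                                        std_ratio=2.0)
\end{verbatim}
Such filters remove isolated points regardless of their cause; the learning-based methods cited above instead try to recognize the appearance of dust or smoke directly.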
Our method does not aim to remove the noise or degraded points in the lidar data, but to quantify its degradation in order to inform other systems of the autonomous robot about the data's quality, enabling more informed decisions. One such approach, though from the autonomous driving rather than the search and rescue domain, can be found in \citetitle{degradation_quantification_rain}~\cite{degradation_quantification_rain}. There, a learning-based method to quantify the degradation of lidar sensor data caused by adverse weather effects was proposed, implemented by posing the problem as an anomaly detection task and utilizing DeepSAD to learn degraded data as anomalous and high-quality data as normal behaviour. DeepSAD's anomaly score was used as the degradation quantification score. We decided to adapt this approach to the search and rescue domain, although this proved challenging due to the more limited data availability. Since the approach was effective in the closely related setting of \citetitle{degradation_quantification_rain}~\cite{degradation_quantification_rain}, we also employ DeepSAD, whose detailed workings we present in the following chapter.
%\todo[inline]{related work, survey on lidar denoising, noise removal in subt - quantifying same as us in rain, also used deepsad - transition}
%\todo[inline]{related work in lidar}
%\todo[inline, color=green!40]{the older more commonly known radar works by sending out an electromagnetic wave in the radiofrequency and detecting the time it takes to return (if it returns at all) signalling a reflective object in the path of the radiowave. lidar works on the same principle but sends out a lightray produced by a laser (citation needed) and measuring the time it takes for the ray to return to the sensor. since the speed of light is constant in air the system can calculate the distance between the sensor and the measured point. modern lidar systems send out multiple, often millions of measurement rays per second which results in a three dimensional point cloud, constructed from the information in which direction the ray was cast and the distance that was measured}
%\todo[inline, color=green!40]{lidar is used in most domains reliant on accurate 3d representations of the world like autonomous driving, robot navigation, (+ maybe quickly look up two other domains), its main advantage is high measurement accuracy, precision (use correct term), and high resolution (possible due to single point measurements instead of cones like radar, ToF, Ultrasonic) which enables more detailed mappings of the environment}
%\todo[inline, color=green!40]{due to point precision, lidar is sensitive to noise/degradation of airborne particles, which may produce early returns, deflections, errrors of light rays, this results in noise in the 3d point cloud and possibly missing data of the measurement behind the aerosol particle.}
%\todo[inline, color=green!40]{because of the given advantages of lidar it is most commonly used nowadays on robot platforms for environment mapping and navigiation - so we chose to demonstrate our method based on degraded data collected by a lidar sensor as discussed in more dtail in section (data section)}
%\newsection{related_work}{Related Work}
%\threadtodo
%{What other research has been done on this topic}
%{reader knows all background, what is starting point of research}
%{talk about rain degradation paper from automotive, cleaning pointclouds?}
%{Rain paper successful with DeepSAD $\rightarrow$ what is DeepSAD}
\newchapter{deepsad}{Deep SAD: Semi-Supervised Anomaly Detection}
%In this chapter we explore the method \emph{Deep Semi-Supervised Anomaly Detection}~\cite{deepsad} which we employed during our experiments to quanitfy the degradation of lidar scans caused by artifically introduced water vapor from a theater smoke machine. The same approach of modeling a degradation quantification problem as an anomaly detection task was succesfully used in \cite{degradation_quantification_rain} to quantify the degradation caused to lidar scans by bad weather conditions such as rain, fog and snow for autonomous driving tasks. Deep SAD is characterized by it being a deep-learning approach to anomaly detection which enables it to learn more complex anomalous data patterns than more classic statistical approaches and its capability of employing hand-labeled data samples-both normal and anomalous-during its training step to better teach the model to differentiate between know anomalies and normal data than if only an unsupervised approach was used which basically just learns the most common patterns in the implicitely more common normal data and to differentiate anything from that.
\threadtodo
{Introduce DeepSAD, how and why do we use it}
{let reader know why they need to know about Deepsad in detail}
{explain use-case, similar use-case worked, allude to core features}
{interest/curiosity created $\rightarrow$ wants to learn about DeepSAD}
In this chapter, we explore the method \citetitle{deepsad}~(Deep SAD)~\cite{deepsad}, which we employ to quantify the degradation of LiDAR scans caused by airborne particles in the form of artificially introduced water vapor from a theater smoke machine. A similar approach—modeling degradation quantification as an anomaly detection task—was successfully applied in \citetitle{degradation_quantification_rain}~\cite{degradation_quantification_rain} to assess the impact of adverse weather conditions on LiDAR data for autonomous driving applications. Deep SAD leverages deep learning to capture complex anomalous patterns that classical statistical methods might miss. Furthermore, by incorporating a limited amount of hand-labeled data (both normal and anomalous), it can more effectively differentiate between known anomalies and normal data compared to purely unsupervised methods, which typically learn only the most prevalent patterns in the dataset~\cite{deepsad}.
%Deep Semi-Supervised Anomaly Detection~\cite{deepsad} is a deep-learning based anomaly detection method whose performance in regards to sensor degradation quantification we explore in this thesis. It is a semi-supervised method which allows the introduction of manually labeled samples in addition to the unlabeled training data to improve the algorithm's performance over its unsupervised predecessor Deep One-Class Classification~\cite{deepsvdd}. The working principle of the method is to encode the input data onto a latent space and train the network to cluster normal data close together while anomalies get mapped further away in that latent space.
%\todo[inline, color=green!40]{Deep SAD is a semi-supervised anomaly detection method proposed in cite, which is based on an unsupervised method (Deep SVDD) and additionally allows for providing some labeled data which is used during the training phase to improve the method's performance}
\newsection{algorithm_description}{Algorithm Description}
%\todo[inline]{explain deepsad in detail}
%Deep SAD is a typical clustering based anomaly detection technique which is described in \cite{anomaly_detection_survey} to generally have a two step approach to anomaly detection. First a clustering algorithm is used to cluster data closely together around a centroid and secondly the distances from data to that centroid is calculated and interpreted as an anomaly score. This general idea can also be found in the definition of the Deep SAD algorithm, which uses the encoder part of an autoencoder architecture which is trained to cluster data around a centroid in the latent space of its output. The datas geometric distance to that centroid in the latent space is defined as an anomaly score. Deep SAD is a semi-supervised training based method which can work completely unsupervised (no labeled data available) in which case it falls back to its predecessor method Deep SVDD but additionally allows the introduction of labeleld data samples during training to more accurately map known normal samples near the centroid and known anomalous samples further away from it.
\threadtodo
{give general overview about how it works}
{overview helps reader understand method, then go into detail}
{how clustering AD generally works, how it does in DeepSAD}
{since the reader knows the general idea $\rightarrow$ what is the step-by-step?}
Deep SAD's overall mechanics are similar to clustering-based anomaly detection methods, which according to \citetitle{anomaly_detection_survey}~\cite{anomaly_detection_survey} typically follow a two-step approach. First, a clustering algorithm groups data points around a centroid; then, the distances of individual data points from this centroid are calculated and used as an anomaly score. In Deep SAD, these concepts are implemented by employing a neural network, which is jointly trained to map input data onto a latent space and to minimize the volume of a data-encompassing hypersphere, whose center is the aforementioned centroid. A data sample's geometric distance in the latent space to the hypersphere center is used as the anomaly score, where a larger distance between sample and centroid corresponds to a higher probability of the sample being anomalous. This is achieved by shrinking the data-encompassing hypersphere during training over all training data, which is required to contain significantly more normal than anomalous samples. The outcome of this approach is that normal data gets clustered more closely around the centroid, while anomalies end up further away from it, as can be seen in the toy example depicted in figure~\ref{fig:deep_svdd_transformation}.
%Deep SAD is an anomaly detection algorithm that belongs to the category of clustering-based methods, which according to~\cite{anomaly_detection_survey} typically follow a two-step approach. First, a clustering algorithm groups data points around a centroid; then, the distances of individual data points from this centroid are calculated and used as an anomaly score. In addition to that, DeepSAD also utilizes a spectral component by mapping the input data onto a lower-dimensional space, which enables it to detect anomalies in high-dimensional complex data types. In Deep SAD, these concepts are implemented by employing a neural network, which is jointly trained to map data into a latent space and to minimize the volume of an data-encompassing hypersphere whose center is the aforementioned centroid. The geometric distance in the latent space to the hypersphere center is used as the anomaly score, where a larger distance between data and centroid corresponds to a higher probability of a sample being anomalous. This is achieved by shrinking the data-encompassing hypersphere during training, proportionally to all training data, of which is required that there is significantly more normal than anomalous data present. The outcome of this approach is that normal data gets clustered more closely around the centroid, while anomalies appear further away from it as can be seen in the toy example depicted in figure~\ref{fig:deep_svdd_transformation}.
\fig{deep_svdd_transformation}{figures/deep_svdd_transformation}{DeepSAD teaches a neural network to transform data into a latent space and minimize the volume of a data-encompassing hypersphere centered around a predetermined centroid $\textbf{c}$. \\Reproduced from~\cite{deepsvdd}.}
\threadtodo
{first step is pre-training, why does it use an autoencoder?}
{go chronologically over how the algorithm works. starting at pre-training}
{pre-training is autoencoder, self-supervised, dimensionality reduction}
{pre-training done $\rightarrow$ how are the pre-training results used?}
Before DeepSAD's training can begin, a pre-training step is required, during which an autoencoder is trained on all available input data. One of DeepSAD's goals is to map input data onto a lower-dimensional latent space, in which the separation between normal and anomalous data can be achieved. To this end, DeepSAD and its predecessor Deep SVDD make use of the autoencoder's reconstruction goal: successful training of the autoencoder provides confidence that the encoder architecture is suitable for extracting the input data's most prominent information into the latent space between the encoder and decoder. DeepSAD then uses just the encoder as its main network architecture, discarding the decoder at this step, since reconstruction of the input is no longer necessary.
%The results of the pre-training are used twofold. Firstly the encoders' weights at the end of pre-training can be used to initialize Deep SAD's weights for the main training step which aligns with the aforementioned Infomax principle, since we can assume the autoencoder maximized the shared information between the input and the latent space represenation. Secondly an initial forward-pass is run on the encoder network for all available training data samples and the results' mean position in the latent space is used to define the hypersphere center $\mathbf{c}$ which according to~\cite{deepsad} allows for faster convergence during the main training step than randomly chosen centroids. An alternative method of initializing the hypersphere center could be to use only labeled normal examples for the forward-pass, so not to pollute $\mathbf{c}$'s position with anomalous samples, which would only be possible if sufficient labeled normal samples are available. From this point on, the hypersphere center $\mathbf{c}$ stays fixed and does not change, which is necessary since it being a free optimization variable could lead to a trivial hypersphere collapse solution if the network was trained fully unsupervised.
\threadtodo
{what is pre-training output used for, how is centroid calculated and why}
{reader knows about pre-training, what are next steps and how is it used}
{pre-training weights used to init main network, c is mean of forward pass, collapse}
{network built and initialized, centroid fixed $\rightarrow$ start main training}
The pre-training results are used in two more key ways. First, the encoder weights obtained from the autoencoder pre-training initialize DeepSAD's network for the main training phase. Second, we perform an initial forward pass through the encoder on all training samples, and the mean of these latent representations is set as the hypersphere center, $\mathbf{c}$. According to \citeauthor{deepsad}, this initialization method leads to faster convergence during the main training phase compared to using a randomly selected centroid. An alternative would be to compute $\mathbf{c}$ using only the labeled normal examples, which would prevent the center from being influenced by anomalous samples; however, this requires a sufficient number of labeled normal samples. Once defined, the hypersphere center $\mathbf{c}$ remains fixed, as allowing it to be optimized freely could, in the unsupervised case, lead to a hypersphere collapse: a trivial solution where the network learns to map all inputs directly onto the centroid $\mathbf{c}$.
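A minimal sketch of this initialization is given below, assuming a trained PyTorch encoder and a data loader over the training set; all names are illustrative and any labels yielded by the loader are ignored.
\begin{verbatim}
import torch

def init_hypersphere_center(encoder, data_loader, device="cpu"):
    """Set c to the mean latent representation of all training samples."""
    encoder.eval()
    latent_sum, n_samples = 0.0, 0
    with torch.no_grad():
        for batch in data_loader:
            # loaders may yield (x, label) tuples or bare tensors
            x = batch[0] if isinstance(batch, (list, tuple)) else batch
            z = encoder(x.to(device))
            latent_sum = latent_sum + z.sum(dim=0)
            n_samples += z.shape[0]
    return latent_sum / n_samples   # fixed centroid c for the main training
\end{verbatim}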
\threadtodo
{how does the main training work, what data is used, what is the optimization target}
{main training is next step since all preconditions are met}
{main training is SGD backpropagation, minimizing volume, un-/labeled data used}
{network is trained $\rightarrow$ how does one use it for AD?}
In the main training step, DeepSAD's network is trained using SGD backpropagation. The unlabeled training data is used with the goal of minimizing the volume of a data-encompassing hypersphere. Since one of the preconditions of training is the significant prevalence of normal data over anomalies in the training set, normal samples collectively cluster more tightly around the centroid, while the rarer anomalous samples do not contribute as significantly to the optimization, resulting in them remaining further from the hypersphere center. The labeled data includes binary class labels signifying their status as either normal or anomalous samples. Labeled anomalies are pushed away from the center by defining their optimization target as maximizing the distance between them and $\mathbf{c}$. Labeled normal samples are treated similarly to unlabeled samples, with the difference that DeepSAD includes a hyperparameter controlling the proportion with which labeled and unlabeled data contribute to the overall optimization. The resulting network has learned to map normal data samples closer to $\mathbf{c}$ in the latent space and anomalies further away.
\fig{deepsad_procedure}{diagrams/deepsad_procedure/deepsad_procedure}{(WORK IN PROGRESS) Depiction of DeepSAD's training procedure, including data flows and tweakable hyperparameters.}
\threadtodo
{how to use the trained network?}
{since we finished training, we need to know how to utilize it}
{forward pass, calculate distance from c = anomaly score, analog, unknown magnitude}
{General knowledge of the algorithm achieved $\rightarrow$ go into more detail}
To infer whether a previously unknown data sample is normal or anomalous, the sample is fed through the fully trained network in a forward pass. During inference, the centroid $\mathbf{c}$ needs to be known in order to calculate the geometric distance of the sample's latent representation to $\mathbf{c}$. This distance acts as an anomaly score, which correlates with the likelihood of the sample being anomalous. Due to differences in input data type, training success and latent space dimensionality, the anomaly score's magnitude has to be judged individually for each trained network: scores that signify normal data for one network may clearly indicate an anomaly for another. Since the geometric distance between two points in space is a continuous scalar value, post-processing of the score is necessary if a binary classification into normal and anomalous is desired.
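A small sketch of this inference step is shown below, assuming a trained PyTorch encoder and the fixed centroid computed earlier; the threshold used for the optional binarization is application-specific and would have to be calibrated per trained network.
\begin{verbatim}
import torch

def anomaly_score(encoder, x, c):
    """Squared Euclidean distance of the latent representation to c."""
    encoder.eval()
    with torch.no_grad():
        z = encoder(x)
    return ((z - c) ** 2).sum(dim=1)   # one analog score per sample

def classify(scores, threshold):
    """Optional post-processing: binarize the analog anomaly scores."""
    return scores > threshold          # True = anomalous
\end{verbatim}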
DeepSAD's full training and inference procedure is visualized in figure~\ref{fig:deepsad_procedure}, which gives a comprehensive overview of the dataflows, tuneable hyperparameters and individual steps involved.
\newsection{algorithm_details}{Algorithm Details and Hyperparameters}
%\todo[inline]{backpropagation optimization formula, hyperaparameters explanation}
%As a pre-training step an autoencoder architecture is trained and its weights are used to initialize its encoder part before training of the method itself begins. \citeauthor{deepsad} argue in~\cite{deepsad} that this pre-training step which was already present in~\cite{deepsvdd}, allows them to not only interpret the method in geometric terms as minimum volume estimation but also in probalistic terms as entropy minimization over the latent distribution, since the autoencoding objective implicitely maximizes the mutual information between the data and its latent space represenation. This insight-that the method follows the Infomax principle with the additional objective of the latent distribution having mininmal entropy-allowed \citeauthor{deepsad} to introduce an additional term in Deep SAD's - over Deep SVDD's objective, which encorporates labeled data to better model the nature of normal and anomalous data. They show that Deep SAD's objective can be interpreted as normal data's distribution in the latent space being modeled to have low entropy and anomalous data's distribution in that latent space being modeled as having high entropy, which they argue captures the nature of the difference between normal and anomalous data by interpreting anomalies ``as being generated from an infinite mixture of distributions that are different from normal data distribution''~\cite{deepsad}.
Since Deep SAD is heavily based on its predecessor \citetitle{deepsvdd}~(Deep SVDD)~\cite{deepsvdd}, it is helpful to first understand Deep SVDD's optimization objective, which we therefore explain here. For input space $\mathcal{X} \subseteq \mathbb{R}^D$, output space $\mathcal{Z} \subseteq \mathbb{R}^d$ and a neural network $\phi(\wc; \mathcal{W}) : \mathcal{X} \to \mathcal{Z}$, where $\mathcal{W}$ denotes the neural network's weights over $L$ layers $\{\mathbf{W}_1, \dots, \mathbf{W}_L\}$, $n$ the number of unlabeled training samples $\{\mathbf{x}_1, \dots, \mathbf{x}_n\}$ and $\mathbf{c}$ the center of the hypersphere in the latent space, Deep SVDD teaches the neural network to cluster normal data closely together in the latent space by defining its optimization objective as seen in~\ref{eq:deepsvdd_optimization_objective}.
\begin{equation}
\label{eq:deepsvdd_optimization_objective}
\min_{\mathcal{W}} \quad
\frac{1}{n} \sum_{i=1}^{n}\|\phi(\mathbf{x}_{i};\mathcal{W})-\mathbf{c}\|^{2}
+\frac{\lambda}{2}\sum_{\ell=1}^{L}\|\mathbf{W}^{\ell}\|_{F}^{2}.
\end{equation}
As can be seen from \ref{eq:deepsvdd_optimization_objective}, Deep SVDD is an unsupervised method which does not rely on labeled data to train the network to differentiate between normal and anomalous data. The first term of the optimization objective represents the shrinking of the data-encompassing hypersphere around the given center $\mathbf{c}$: for each data sample $\{\mathbf{x}_1, \dots, \mathbf{x}_n\}$, its geometric distance to $\mathbf{c}$ in the latent space produced by the neural network $\phi(\wc; \mathcal{W})$ is minimized, normalized by the number of data samples $n$. The second term is a standard L2 regularization term which prevents overfitting, with hyperparameter $\lambda > 0$ and $\|\wc\|_F$ denoting the Frobenius norm.
\citeauthor{deepsad} argue that the pre-training step employing an autoencoder—originally introduced in Deep SVDD—not only allows a geometric interpretation of the method as minimum volume estimation, i.e., the shrinking of the data-encompassing hypersphere, but also a probabilistic one as entropy minimization over the latent distribution. The autoencoding objective during pre-training implicitly maximizes the mutual information between the data and its latent representation, aligning the approach with the Infomax principle while encouraging a latent space with minimal entropy. This insight enabled \citeauthor{deepsad} to introduce an additional term in Deep SAD's objective, beyond that of its predecessor Deep SVDD, which incorporates labeled data to better capture the characteristics of normal and anomalous data. They demonstrate that Deep SAD's objective effectively models the latent distribution of normal data as having low entropy, while that of anomalous data is characterized by higher entropy. In this framework, anomalies are interpreted as being generated from an infinite mixture of distributions that differ from the normal data distribution~\cite{deepsad}.
The introduction of the aforementioned term in Deep SAD's objective allows it to learn in a semi-supervised way, though it can operate in a fully unsupervised mode—effectively reverting to its predecessor, Deep SVDD—when no labeled data are available. This additional supervision helps the model better position known normal samples near the hypersphere center and push known anomalies farther away, thereby enhancing its ability to differentiate between normal and anomalous data.
From \ref{eq:deepsvdd_optimization_objective} it is easy to understand Deep SAD's optimization objective seen in \ref{eq:deepsad_optimization_objective}, which additionally defines $m$ labeled data samples $\{(\mathbf{\tilde{x}}_1, \tilde{y}_1), \dots, (\mathbf{\tilde{x}}_m, \tilde{y}_m)\} \in \mathcal{X} \times \mathcal{Y}$ with $\mathcal{Y} = \{-1,+1\}$, where $\tilde{y} = +1$ denotes normal and $\tilde{y} = -1$ anomalous samples, as well as a new hyperparameter $\eta > 0$ which can be used to balance the strength with which labeled and unlabeled samples contribute to the training.
\begin{equation}
\label{eq:deepsad_optimization_objective}
\min_{\mathcal{W}} \quad
\frac{1}{n+m} \sum_{i=1}^{n}\|\phi(\mathbf{x}_{i};\mathcal{W})-\mathbf{c}\|^{2}
+\frac{\eta}{n+m}\sum_{j=1}^{m}\left(\|\phi(\tilde{\mathbf{x}}_{j};\mathcal{W})-\mathbf{c}\|^{2}\right)^{\tilde{y}_{j}}
+\frac{\lambda}{2}\sum_{\ell=1}^{L}\|\mathbf{W}^{\ell}\|_{F}^{2}.
\end{equation}
The first term of \ref{eq:deepsad_optimization_objective} stays mostly the same, differing only in that the $m$ labeled data samples are now included in the normalization. The second term is newly introduced to incorporate the labeled data samples, weighted by hyperparameter $\eta$, by either minimizing or maximizing the distance between a sample's latent representation and $\mathbf{c}$ depending on the sample's label $\tilde{y}$. The third term is kept identical to Deep SVDD's standard L2 regularization. It can also be observed that in the case of $m = 0$ labeled samples, Deep SAD falls back to the same optimization objective as Deep SVDD and can therefore be used in a completely unsupervised fashion as well.
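For illustration, the first two terms of \ref{eq:deepsad_optimization_objective} can be computed for a single mini-batch as sketched below; this is a hedged sketch rather than the authors' reference code. We assume the latent representations have already been produced by the encoder, that \texttt{semi\_targets} encodes $\tilde{y} \in \{-1,+1\}$ for labeled and $0$ for unlabeled samples, that averaging over the batch approximates the $1/(n+m)$ normalization, and that the L2 regularization term is handled separately, e.g., through the optimizer's weight decay setting.
\begin{verbatim}
import torch

def deepsad_batch_loss(z, c, semi_targets, eta=1.0, eps=1e-6):
    """Mini-batch version of the first two terms of Deep SAD's objective.

    z            -- latent representations phi(x; W), shape (B, d)
    c            -- fixed hypersphere center, shape (d,)
    semi_targets -- 0 unlabeled, +1 labeled normal, -1 labeled anomalous
    eta          -- balance between unlabeled and labeled contributions
    """
    dist = ((z - c) ** 2).sum(dim=1)                      # squared distances
    unlabeled = dist                                      # pull toward c
    labeled = eta * (dist + eps) ** semi_targets.float()  # +1 pulls, -1 pushes
    losses = torch.where(semi_targets == 0, unlabeled, labeled)
    return losses.mean()
\end{verbatim}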
\newsubsubsectionNoTOC{Hyperparameters}
The neural network architecture of DeepSAD is not fixed but rather depends on the data type the algorithm is supposed to operate on. This is due to the way it employs an autoencoder for pre-training and the encoder part of that network for its main training step. This makes the adaptation of an autoencoder architecture suitable to the specific application necessary, but also allows for flexibility in choosing a fitting architecture depending on the application's requirements. For this reason, the specific architecture employed may be considered a hyperparameter of the Deep SAD algorithm.
%The latent space dimensionality is also freely choosable and should intuitively be chosen as small as possible while still being able to capture
% not necessary since arch is discussed in setup chhapter \todo[inline]{Talk about choosing the correct architecture (give example receptive fields for image data from object detection?)}
\todo[inline]{latent space size, talk about auto encoder performance, trying out sensible dimensionalities and find reconstruction elbow, choose smallest possible, but as large as necessary}
\todo[inline]{latent space size for AE shows that most likely all of the important data may be captured inside this dim (since recons;truction is possible) but we may only require some of the encoded patterns to differentiate normal from anomaly so smaller may still be possible? should this be discussed here or not? maybe only discuss; AE considerations and then move this discussion to discussion / results}
\todo[inline]{eta, think of possible important scenarios, learning rate, epochs}
%\todo[inline, color=green!40]{Core idea of the algorithm is to learn a transformation to map input data into a latent space where normal data clusters close together and anomalous data gets mapped further away. to achieve this the methods first includes a pretraining step of an auto-encoder to extract the most relevant information, second it fixes a hypersphere center in the auto-encoders latent space as a target point for normal data and third it traings the network to map normal data closer to that hypersphere center. Fourth The resulting network can map new data into this latent space and interpret its distance from the hypersphere center as an anomaly score which is larger the more anomalous the datapoint is}
%\todo[inline, color=green!40]{explanation pre-training step: architecture of the autoencoder is dependent on the input data shape, but any data shape is generally permissible. for the autoencoder we do not need any labels since the optimization target is always the input itself. the latent space dimensionality can be chosen based on the input datas complexity (search citations). generally a higher dimensional latent space has more learning capacity but tends to overfit more easily (find cite). the pre-training step is used to find weights for the encoder which genereally extract robust and critical data from the input because TODO read deepsad paper (cite deepsad). as training data typically all data (normal and anomalous) is used during this step.}
%\todo[inline, color=green!40]{explanation hypersphere center step: an additional positive ramification of the pretraining is that the mean of all pre-training's latent spaces can be used as the hypersphere target around which normal data is supposed to cluster. this is advantageous because it allows the main training to converge faster than choosing a random point in the latent space as hypersphere center. from this point onward the center C is fixed for the main training and inference and does not change anymore.}
%\todo[inline, color=green!40]{explanation training step: during the main training step the method starts with the pre-trained weights of the encoder but removes the decoder from the architecture since it optimizes the output in the latent space and does not need to reproduce the input data format. it does so by minimizing the geometric distance of each input data's latent space represenation to the previously defined hypersphere center c. Due to normal data being more common in the inputs this results in normal data clustering closely to C and anormal data being pushed away from it. additionally during this step the labeled data is used to more correctly map normal and anormal data}
%\todo[inline, color=green!40]{explanation inference step: with the trained network we can transform new input data into the latent space and calculate its distance from the hypersphere center which will be smaller the more confident the network is in the data being normal and larger the more likely the data is anomalous. This output score is an analog value dependent on multiple factors like the latent space dimensionality, encoder architecture and ??? and has to be interpreted further to be used (for example thresholding)}
%\todo[inline, color=green!40]{in formula X we see the optimization target of the algorithm. explain in one paragraph the variables in the optimization formula}
%\todo[inline, color=green!40]{explain the three terms (unlabeled, labeled, regularization)}
\newchapter{data_preprocessing}{Data and Preprocessing}
\threadtodo
{Introduce data chapter, what will be covered here, incite interest}
{all background covered, deepsad explained, data natural next step}
{emotional why data scarce, lot of data necessary, what will be covered}
{what will we talk about next $\rightarrow$ requirements}
%\todo[inline, color=green!40]{good data important for learning based methods and for evaluation. in this chapter we talk about the requirements we have for our data and the difficulties that come with them and will then give some information about the dataset that was used as well as how the data was preprocessed for the experiments (sec 4.2)}
%Fortunately situations like earthquakes, structural failures and other circumstances where rescue robots need to be employed are uncommon occurences. When such an operation is conducted, the main focus lies on the fast and safe rescue of any survivors from the hazardous environment, therefore it makes sense that data collection is not a priority. Paired with the rare occurences this leads to a lack of publicly available data of such situations. To improve any method, a large enough, diversified and high quality dataset is always necessary to provide a comprehensive evaluation. Additionally, in this work we evaluate a training based method, which increases the requirements on the data manifold, which makes it all the more complex to find a suitable dataset. In this chapter we will state the requirements we defined for the data, talk about the dataset that was chosen for this task, including some statistics and points of interest, as well as how it was preprocessed for the training and evaluation of the methods.
Situations such as earthquakes, structural failures, and other emergencies that require rescue robots are fortunately rare. When these operations do occur, the primary focus is on the rapid and safe rescue of survivors rather than on data collection. Consequently, there is a scarcity of publicly available data from such scenarios. To improve any method, however, a large, diverse, and high-quality dataset is essential for comprehensive evaluation. This challenge is further compounded in our work, as we evaluate a training-based approach that imposes even higher demands on the data, in particular requiring a large number of diverse training samples, which makes it difficult to find a suitable dataset.
In this chapter, we outline the specific requirements we established for the data, describe the dataset selected for this task—including key statistics and notable features—and explain the preprocessing steps applied for training and evaluating the methods.
%\todo[inline]{describe data sources, limitations}
%\todo[inline]{screenshots of camera/3d data?}
%\todo[inline]{difficulties: no ground truth, different lidar sensors/settings, different data shapes, available metadata, ...}
%\todo[inline, color=green!40]{we require lidar sensor data that was collected in a domain as closely related to our target domain (rescue robots indoors, cave-ins, ) as possible which also includes some kind of appreciable degradation for which we have some kind of labeling possibility. ideally the degradation should be from smoke/dust/aerosol particles. most data should be without degradation (since we require more normal than anormal data to train the method as described in X) but we need enough anormal data so we can confidently evaluate the methods performance}
%Our main requirement for the data was for it to be as closely related to the target domain of rescue operations as possible. Since autonomous robots get largely used in situations where a structural failures occured we require of the data to be subterranean. This provides the additional benefit, that data from this domain oftentimes already has some amount of airborne particles like dust due to limited ventilation and oftentimes exposed rock, which is to be expected to also be present in rescue situations. The second and by far more limiting requirement on the data, was that there has to be appreciable degradation due to airborne particles as would occur during a fire from smoke. The type of data has to at least include lidar but for better understanding other types of visual data e.g., visual camera images would be benefical. The amount of data has to be sufficient for training the learning based methods while containing mostly good quality data without degradation, since the semi-supervised method implicitely requires a larger amount of normal than anomalous training for successful training. Nonetheless, the number of anomalous data samples has to be large enough that a comprehensive evaluation of the methods' performance is possible.
\newsection{data_req}{Data Requirements and Challenges}
\threadtodo
{list requirements we had for data}
{what were our requirements for choosing a dataset}
{list from basic to more complex with explanations}
{ground truth for evaluation $\rightarrow$ ground truth/labeling challenges}
%Our primary requirement for the dataset was that it closely reflects the target domain of rescue operations. Because autonomous robots are predominantly deployed in scenarios involving structural failures, the data should be taken from subterranean environments. This setting not only aligns with the operational context but also inherently includes a larger than normal amount of airborne particles (e.g., dust) from limited ventilation and exposed rock surfaces, which is typically encountered during rescue missions.
%A second, more challenging requirement is that the dataset must exhibit significant degradation due to airborne particles, as would be expected in scenarios involving smoke from fires. The dataset should at minimum include LiDAR data, and ideally also incorporate other visual modalities (e.g., camera images) to provide a more comprehensive understanding of the environment.
%Additionally, the dataset must be sufficiently large for training learning-based methods. Since the semi-supervised approach we utilize relies on a predominance of normal data over anomalous data, it is critical that the dataset predominantly consists of high-quality, degradation-free samples. At the same time, there must be enough anomalous samples to allow for a thorough evaluation of the methods performance.
To ensure our chosen dataset meets the needs of reliable degradation quantification in subterranean rescue scenarios, we imposed the following requirements:
\begin{enumerate}
\item \textbf{Data Modalities:}\\
The dataset must include LiDAR sensor data, since we decided to train and evaluate our method on what is arguably the most universally used sensor type in the given domain. To keep our method as general as possible, we chose to require only range-based point cloud data and forego sensor-specific data such as intensity or reflectivity, though these may be of interest for future work. Complementary visual data such as camera images is also desirable, for better context, manual verification and understanding of the data.
\item \textbf{Context \& Collection Method:}\\
To mirror the real-world conditions of autonomous rescue robots, the data should originate from locations such as subterranean environments (tunnels, caves, collapsed structures), which closely reflect what would be encountered during rescue missions. Ideally, it should be captured from a ground-based, self-driving robot platform in motion instead of aerial, handheld, or stationary collection, to ensure similar circumstances to the target domain.
\item \textbf{Degradation Characteristics:}\\
Because our goal is to quantify the degradation of lidar data encountered by rescue robots, the dataset must exhibit significant degradation of LiDAR returns caused by aerosols (e.g., dust or smoke particles), which are expected to be the most frequent and challenging source of degradation encountered. This requirement is key to evaluating how well our method detects and measures the severity of such challenging conditions.
\item \textbf{Volume \& Class Balance:}\\
The dataset must be large enough to train deep learning models effectively. Since our semi-supervised approach depends on learning a robust model of “normal” data, the majority of samples should be high-quality, degradation-free scans. Simultaneously, there must be a sufficient number of degraded (anomalous) scans to permit a comprehensive evaluation of quantification performance.
\item \textbf{Ground-Truth Labels:}\\
Finally, to evaluate and tune our method, we need some form of ground truth indicating which scans are degraded. Obtaining reliable labels in this domain is challenging. Manual annotation of degradation levels is laborious and somewhat subjective. We address these challenges and issues of labeling data for evaluation next.
\end{enumerate}
%\newsubsubsectionNoTOC{Labeling Challenges}
\threadtodo
{What are the challenges for correctly labeling the data}
{we alluded to labels being challenging before this}
{difficult to define degradation, difficult to capture, objective leads to puppet}
{with all requirements and challenges know $\rightarrow$ what dataset did we choose}
Quantitative benchmarking of degradation quantification requires a degradation label for every scan. Ideally, that label would be a continuous degradation score, although a binary label would still enable meaningful comparison. As the rest of this section shows, producing any reliable label is already challenging, and assigning meaningful analog scores may not be feasible at all. Compounding the problem, as far as we know no public search-and-rescue (SAR) LiDAR dataset offers such ground truth. To understand the challenges around labeling lidar data degradation, we first look at what constitutes degradation in this context.
In section~\ref{sec:lidar_related_work} we discussed some internal and environmental error causes of lidar sensors, such as multi-return ambiguities and atmospheric scattering, respectively. While we are aware of research into singular failure modes, such as \citetitle{lidar_errormodel_particles}~\cite{lidar_errormodel_particles}, and research trying to model the totality of error sources occurring in other domains, such as \citetitle{lidar_errormodel_automotive}~\cite{lidar_errormodel_automotive}, there appears to be no such model for the search and rescue domain and its unique environmental circumstances. Although scientific consensus appears to be that airborne particles are the biggest contributor to degradation in SAR~\cite{lidar_errormodel_consensus}, we think that a more versatile definition is required to ensure confidence during critical SAR missions, which are often of a volatile nature. We are therefore left with an ambiguous definition of what constitutes lidar point cloud degradation in the SAR domain.
We considered which types of objective measurements may be available to produce ground-truth labels, such as particulate matter sensors or the lidar point clouds' inherent properties such as the range-dropout rate, but we fear that using purely objective measures to label the data would limit our learning-based method to imitating the labels' sources instead of differentiating all possible degradation patterns from high-quality data. Due to the incomplete error model in this domain, there may be novel or compound error sources that would not be captured using such an approach. As an example, we did observe dense smoke reflecting enough rays to produce phantom objects, which may fool SLAM algorithms. Such a case may even be labeled incorrectly as normal by one of the aforementioned objective measurement labeling options, if the surroundings do not yet exhibit enough dispersed smoke particles.
To mitigate the aforementioned risks, we adopt a human-centric, binary labelling strategy. We judged analog and multi-level discrete rating scales to be too subjective for human annotators, which left us with the simplistic but hopefully more reliable binary choice. We used two labeling approaches, producing two evaluation sets, whose motivation and details are discussed in section~\ref{sec:preprocessing}. The rationale for the exact labeling procedures requires knowledge of the dataset we ended up choosing, which we present in the next section.
\newsection{data_dataset}{Chosen Dataset}
\threadtodo
{give a comprehensive overview about chosen dataset}
{all requirements/challenges are clear, now reader wants to know about subter dataset}
{overview, domain, sensors, lidar, experiments, volume, statistics}
{statistics about degradation, not full picture $\rightarrow$ how did we preprocess and label}
%\todo[inline, color=green!40]{list sensors on the platform}
%Based on the previously discussed requirements and labeling difficulties we decided to train and evaluate the methods on \emph{Multimodal Dataset from Harsh Sub-Terranean Environment with Aerosol Particles for Frontier Exploration}~\cite{subter}. The dataset is comprised of data from multiple sensors on a moving sensor platform which was driven through tunnels and rooms in a subterranean setting. What makes it especially fitting for our use case is that during some of the experiments, an artifical smoke machine was employed to simulate aerosol particles.
%The sensors employed during capture of the dataset include:
Based on the previously discussed requirements and the challenges of obtaining reliable labels, we selected the \citetitle{subter}~\cite{subter} for training and evaluation. This dataset comprises multimodal sensor data collected from a robotic platform navigating tunnels and rooms in a subterranean environment, an underground tunnel in Luleå, Sweden. Notably, some experiments incorporated an artificial smoke machine to simulate heavy degradation from aerosol particles, making the dataset particularly well-suited to our use case. A Pioneer 3-AT2 robotic platform, which can be seen in figure~\ref{fig:subter_platform_photo}, was used to mount a multitude of sensors that are described in table~\ref{tab:subter-sensors} and whose mounting locations are depicted in figure~\ref{fig:subter_platform_sketch}.
% \begin{itemize}
% \item Lidar - Ouster OS1-32
% \item mmWave RADARs - 4 IWR6843AoP ES2.0 based radar models
% \item Lidar - Velodyne Velarray M1600
% \item IR-enabled RBG-D Camera - OAK-D Pro
% \item IMU - Pixhawk 2.1 Cube Orange,
% \end{itemize}
%-------------------------------------------------
% Compact sensor overview (row numbers follow Fig.~\ref{fig:subter_platform})
%-------------------------------------------------
\todo[inline]{todo: check table for accuracy/errors}
\begin{table}[htbp]
\centering
\caption{Onboard sensors recorded in the \citetitle{subter} dataset. Numbers match the labels in Fig.~\ref{fig:subter_platform}; only the most salient details are shown for quick reference.}
\label{tab:subter-sensors}
\setlength{\tabcolsep}{4pt}
\renewcommand{\arraystretch}{1.25}
\rowcolors{2}{gray!08}{white}
\scriptsize
\begin{tabular}{cp{4cm}p{4.5cm}p{5.5cm}}
\textbf{\#} & \textbf{Sensor} & \textbf{Recorded Data} & \textbf{Key Specs} \\
1 & \sensorcell{Spinning 3-D LiDAR}{Ouster OS1-32} & 3-D cloud, reflectivity & 10 Hz, 32 ch, 360° × 42.4°, $\leq$ 120 m \rule{0pt}{2.6ex} \\
2 & \sensorcell{mm-wave RADAR (×4)}{TI IWR6843AoP} & 4 × 60° RADAR point clouds & 30 Hz, 60 GHz, 9 m max, 0.05 m res. \\
3 & \sensorcell{Solid-state LiDAR}{Velodyne Velarray M1600} & Forward LiDAR cloud & 10 Hz, 160 ch, 120° × 32°, 0.1--30 m \\
4 & \sensorcell{RGB-D / stereo cam}{Luxonis OAK-D Pro} & stereo b/w images, depth map & 15 fps, 75 mm baseline, active IR 930 nm \\
5 & \sensorcell{LED flood-light}{RS PRO WL28R} & Illumination for stereo cam & 7 W, 650 lm (no data stream) \\
6 & \sensorcell{IMU}{Pixhawk 2.1 Cube Orange} & Accel, gyro, mag, baro & 190 Hz, 9-DoF, vibration-damped \\
7 & \sensorcell{On-board PC}{Intel NUC i7} & Time-synced logging & Quad-core i7, 16 GB RAM, 500 GB SSD \\
\end{tabular}
\end{table}
%-------------------------------------------------
% Compact sensor overview (row numbers follow Fig.~2b)
%-------------------------------------------------
% \begin{table}[htbp]
% \centering
% \caption{Key on-board sensors deployed during data collection in \citetitle{subter}~\cite{subter}. Sensor numbers correspond to the labels in figure~\ref{fig:subter_platform}.}
% \label{tab:sensor-suite-compact}
% \scriptsize
% \begin{tabular}{@{}clp{3.5cm}p{4.8cm}@{}}
% \# & Sensor (model) & Recorded data type(s) & Core specifications used in the dataset \\
% 1 & Ouster OS1-32 & 3-D spinning-LiDAR cloud + reflectivity & 10 Hz, 32 ch, 360° by 45° FOV, <= 120 m range \\
% 2 & 4 × TI IWR6843AoP & 360° mm-wave RADAR point clouds & 30 Hz, 60 GHz, 9 m max range, 0.05 m range-res. \\
% 3 & Velodyne Velarray M1600 & Solid-state LiDAR cloud & 10 Hz, 160 ch, 120° by 32° FOV, 0.130 m range \\
% 4 & Luxonis OAK-D Pro & RGB, stereo depth, point cloud & 15 fps, 75 mm baseline, active IR (930 nm) \\
% 5 & RS PRO WL28R LED & Scene illumination only & 650 lm flood-light (no data stream) \\
% 6 & Pixhawk 2.1 Cube Orange & IMU (accel, gyro, mag, baro) & 190 Hz, 9-DoF, vibration damped \\
% 7 & Intel NUC i7 (ROS host) & Time-synced logging & Quad-core i7, 16 GB RAM, 500 GB SSD \\
% \end{tabular}
% \end{table}
%\todo[inline, color=green!40]{lidar data of 360° sensor is captured at 10 frames per second. each sensor output consists of point cloud which resulted from measurement of 32 vertical channels for each of which 2048 measurement points are taken during each measurement equiangular distributed around the whole horizontal 360°, so the sensor measures 32 * 2048 = 65536 measurements 10 times a second for which ideally every one produces a point in the point cloud consisting of x,y,z coordinates (relative to sensor platform) as well as some other values per measurement (reflectivity, intensity originally measured range value)}
%We mainly utilize the data from the \emph{Ouster OS1-32} lidar sensor, which produces 10 frames per second with a resolution of 32 vertical channels by 2048 measurements per channel, both equiangularly spaced over the vertical and horizontal fields of view of 42.4° and 360° respectively. Every measurement of the lidar therefore results in a point cloud with a maximum of 65536 points. Every point contains the \emph{X}, \emph{Y} and \emph{Z} coordinates in meters with the sensor location as origin, as well as values for the \emph{range}, \emph{intensity} and \emph{reflectivity} which are typical data measured by lidar sensors. The data is dense, meaning missing measurements are still present in the data of each point cloud with zero values for most fields.
%\todo[inline, color=green!40]{short description of sensor platform and refer to photo}
We use data from the \emph{Ouster OS1-32} LiDAR sensor, which was configured to capture 10 frames per second with a resolution of 32 vertical channels and 2048 measurements per channel. These settings yield equiangular measurements across a vertical field of view of 42.4° and a complete 360° horizontal field of view. Consequently, every LiDAR scan can generate up to 65,536 points. Each point contains the \emph{X}, \emph{Y}, and \emph{Z} coordinates (in meters, with the sensor location as the origin) along with values for \emph{range}, \emph{intensity}, and \emph{reflectivity}—typical metrics measured by LiDAR sensors. The dataset's point clouds are saved in a dense format, meaning each of the 65,536 measurements is present in the data, although the fields of missing measurements contain zeroes.
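Because the scans are stored densely, each one can be viewed as a 32 × 2048 range image, which also makes it straightforward to compute simple statistics such as the fraction of missing returns. The sketch below assumes the range field of one scan has already been extracted into a flat NumPy array of 65,536 values; variable names are illustrative.
\begin{verbatim}
import numpy as np

def to_range_image(ranges_flat):
    """Reshape a dense scan into a 32 x 2048 range image and report the
    fraction of missing returns (encoded as zero range)."""
    img = np.asarray(ranges_flat, dtype=np.float32).reshape(32, 2048)
    valid = img > 0.0
    dropout_rate = 1.0 - valid.mean()
    return img, dropout_rate
\end{verbatim}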
% \begin{figure}
% \centering
% \subfigure{\includegraphics[width=0.45\textwidth]{figures/data_subter_platform_photo.jpg}\label{fig:subter_platform_sketch}}%
% \hfill
% \subfigure{\includegraphics[width=0.45\textwidth]{figures/data_subter_platform_sketch.png}\label{fig:subter_platform_photo}}%
% \caption{\todo[inline, color=green!40]{better caption} 1-OS1-32, 2-mmWave RADARs, 3-M1600, 4-OAK-D Pro. 5-LED, 6-IMU, and 7-Intel NUC. Reproduced from~\cite{subter}}\label{fig:subter_platform}
% \end{figure}
%-------------------------------------------------
% Platform photographs (a) Pioneer base, (b) numbered sensor layout
%-------------------------------------------------
\begin{figure}[htbp]
\centering
\subfigure[Pioneer 3-AT2 mobile base carrying the sensor tower. The four-wheel, skid-steered platform supports up to 30 kg payload and can negotiate rough terrain—providing the mobility required for subterranean data collection.]
{\includegraphics[width=0.45\textwidth]{figures/data_subter_platform_photo.jpg}
\label{fig:subter_platform_photo}}
\hfill
\subfigure[Sensor layout and numbering. Components: 1 OS1-32 LiDAR, 2 mm-wave RADARs, 3 M1600 LiDAR, 4 OAK-D Pro camera, 5 LED flood-light, 6 IMU, 7 Intel NUC. See Table~\ref{tab:subter-sensors} for detailed specifications.]
{\includegraphics[width=0.45\textwidth]{figures/data_subter_platform_sketch.png}
\label{fig:subter_platform_sketch}}
\caption{Robotic platform and sensor configuration used to record the dataset.}
\label{fig:subter_platform}
\end{figure}
%During the measurement campaign 14 experiments were conducted, of which 10 did not contain the utilization of the artifical smoke machine and 4 which did contain the artifical degradation, henceforth refered to as normal and anomalous experiments respectively. During 13 of the experiments the sensor platform was in near constant movement (sometimes translation - sometimes rotation) with only 1 anomalous experiment having the sensor platform stationary. This means we do not have 2 stationary experiments to directly compare the data from a normal and an anomalous experiment, where the sensor platform was not moved, nonetheless the genereal experiments are similar enough for direct comparisons. During anomalous experiments the artifical smoke machine appears to have been running for some time before data collection, since in camera images and lidar data alike, the water vapor appears to be distributed quite evenly throughout the closer perimeter of the smoke machine. The stationary experiment is also unique in that the smoke machine is quite close to the sensor platform and actively produces new smoke, which is dense enough for the lidar data to see the surface of the newly produced water vapor as a solid object.
During the measurement campaign, a total of 14 experiments were conducted: 10 before the artificial smoke machine was operated (hereafter referred to as normal experiments) and 4 after it had already been running for some time (anomalous experiments). In 13 of these experiments, the sensor platform was in near-constant motion (either translating at roughly 1~m/s or rotating), with only one anomalous experiment conducted while the platform remained stationary. Although this means we do not have two stationary experiments from the exact same position for a direct comparison between normal and anomalous conditions, the overall experiments are similar enough to allow for meaningful comparisons. In addition to the presence of water vapor from the smoke machine, the experiments vary in illumination conditions, the presence of humans on the measurement grounds, and additional static artifacts. For our purposes, only the artificial smoke is relevant; differences in lighting or incidental static objects do not affect our analysis. Regardless of illumination, the LiDAR sensor consistently produces comparable point clouds, and the presence of static objects does not influence our quantification of point cloud degradation.
In the anomalous experiments, the artificial smoke machine appears to have been running for some time before data collection began, as evidenced by both camera images and LiDAR data showing an even distribution of water vapor around the machine. The stationary experiment stands out: the smoke machine was positioned very close to the sensor platform and was actively generating new, dense smoke, to the extent that the LiDAR registered the surface of the fresh water vapor as if it were a solid object.
%\todo[inline, color=green!40]{shortly mention the differences in conditions for these experiments and why they do not matter for us}
%The 14 experiments differ regarding the available illumination, the presence of humans-traversing the measurement grounds- or additional static objects as artifcats and of course regarding the presence of the water vapor from the smoke machine. Aside from the artifical smoke which is essential for our use case, the other differences during the individual experiments are of no interestet to us and do not affect it in any way. Regardless of illumination, the lidar sensor produces indistinguishable point clouds and any static objects do not factor into our quantification of the point clouds' degradation.
%\todo[inline, color=green!40]{include representative image of point cloud and camera image}
Figures~\ref{fig:data_screenshot_pointcloud} and~\ref{fig:data_screenshot_camera} show a representative depiction of the experiment environment: an image from the IR camera and the point cloud created by the OS1 LiDAR sensor at practically the same instant.
\fig{data_screenshot_pointcloud}{figures/data_screenshot_pointcloud.png}{Screenshot of 3D rendering of an experiment without smoke and with illumination (same frame and roughly same alignment as figure~\ref{fig:data_screenshot_camera}). Point color corresponds to measurement range and axis in center of figure is the lidar's position.}
\fig{data_screenshot_camera}{figures/data_screenshot_camera.png}{Screenshot of IR camera output of an experiment without smoke and with illumination (same frame and roughly same alignment as figure~\ref{fig:data_screenshot_pointcloud})}
%\todo[inline, color=green!40]{talk about how much data is available (maybe a plot about data?), number of experiments with/without degradation, other factors in these experiments which do not concern our use-case of them}
%Regarding the amount of data, of the 10 normal experiments the shortest was 88.7 seconds and the longest 363.1 seconds with a mean of 157.65 seconds between all 10 experiments, which results in 15765 non-degraded point clouds. Of the 4 anomalous experiments, the shortest was the stationary one with 11.7 seconds and the longest was 62.1 seconds, having a mean of 47.325 seconds, resulting in 1893 degraded point clouds. This gives us 17658 point clouds alltogether with 89.28\% of them being non-degraded/normal samples and the other 10.72\% of them begin degraded/anomalous samples.
Regarding the dataset volume, the 10 normal experiments ranged from 88.7 to 363.1 seconds, with an average duration of 157.65 seconds. At a capture rate of 10 frames per second, these experiments yield 15,765 non-degraded point clouds. In contrast, the 4 anomalous experiments ranged from the 11.7-second stationary experiment to 62.1 seconds, averaging 47.33 seconds and resulting in 1,893 degraded point clouds. In total, the dataset comprises 17,658 point clouds, with approximately 89.28\% classified as non-degraded (normal) and 10.72\% as degraded (anomalous). The distribution of experimental data is visualized in figure~\ref{fig:data_points_pie}.
\fig{data_points_pie}{figures/data_points_pie.png}{Pie chart visualizing the amount and distribution of normal and anomalous point clouds in \cite{subter}}
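These scan counts follow directly from the experiment durations and the 10~Hz capture rate:
\[
10 \cdot 157.65\,\mathrm{s} \cdot 10\,\mathrm{Hz} \approx 15\,765
\qquad\text{and}\qquad
4 \cdot 47.33\,\mathrm{s} \cdot 10\,\mathrm{Hz} \approx 1\,893\,,
\]
which together account for the 17,658 point clouds stated above.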
%BEGIN missing points
%\todo[inline]{merge next two paragraphs into: \emph{how the degradation affects the poinntclouds - statistics. as discussed this may not be all but it gives an overview about the introduced degradation }}
%As we can see in figure~\ref{fig:data_missing_points}, the artifical smoke introduced as explicit degradation during some experiments results in more missing measurements during scans, which can be explained by measurement rays hitting airborne particles but not being reflected back to the sensor in a way it can measure.
%END missing points
The artificial smoke introduces measurable changes that clearly separate the \textit{anomalous} runs from the \textit{normal} baseline. One change is a larger share of missing points per scan: smoke particles scatter or absorb the laser beam before it reaches a solid target, so the sensor reports an error instead of a distance. Figure~\ref{fig:data_missing_points} shows the resulting rightward shift of the missing-point histogram, a known effect for LiDAR sensors in aerosol-filled environments~\cite{when_the_dust_settles}. A second characteristic effect is the appearance of many spurious returns very close to the sensor; these near-field points arise when back-scatter from the aerosol itself is mistaken for a surface echo. The box plot in figure~\ref{fig:particles_near_sensor} confirms a pronounced increase in sub-50~cm hits under smoke, consistent with the behaviour reported in \citetitle{when_the_dust_settles}~\cite{when_the_dust_settles}.
\fig{data_missing_points}{figures/data_missing_points.png}{Density histogram showing the percentage of missing measurements per scan for normal experiments without degradation and anomalous experiments with artificial smoke introduced as degradation.}
\fig{particles_near_sensor}{figures/particles_near_sensor_boxplot_zoomed_500.png}{Box plot depicting the percentage of measurements closer than 50 centimeters to the sensor for normal and anomalous experiments.}
Taken together, the percentage of missing points and the proportion of near-sensor returns provide a concise indication of how strongly the smoke degrades our scans, capturing the two most prominent aerosol effects: drop-outs and back-scatter spikes. They do not, however, reveal the full error landscape discussed earlier (compound errors, temperature drift, multipath, \dots), so they should be read as an easily computed synopsis rather than an exhaustive measure of LiDAR quality. Next, we discuss how the LiDAR scans were preprocessed before use and how we assigned ground-truth labels to each scan, so that we could train and evaluate our degradation quantification methods.
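Both indicators can be computed per scan with a few array operations. The following minimal sketch assumes the scan is available as an $N \times 3$ array of Cartesian coordinates in which missing returns are stored as all-zero rows, as described above; the function and variable names are illustrative rather than part of the dataset's tooling.
\begin{verbatim}
import numpy as np

def degradation_indicators(points, near_threshold=0.5):
    """Per-scan indicators: % missing returns, % returns below 0.5 m.

    points: (N, 3) array of x, y, z coordinates in meters; missing
            returns are encoded as all-zero rows (dense scan format).
    """
    ranges = np.linalg.norm(points, axis=1)        # range per measurement
    missing = ranges == 0.0                        # no return recorded
    near = (~missing) & (ranges < near_threshold)  # near-field hits
    n = len(ranges)
    return 100.0 * missing.sum() / n, 100.0 * near.sum() / n
\end{verbatim}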
%BEGIN early returns
% In experiments with artifical smoke present, we observe many points in the point cloud very close to the sensor where there are no solid objects and therefore the points have to be produced by airborne particles from the artifical smoke. The phenomenon can be explained, in that the closer to the sensor an airborne particle is hit, the higher the chance of it reflecting the ray in a way the lidar can measure. In \ref{fig:particles_near_sensor} we see a box diagram depicting how significantly more measurements of the anomaly expirements produce a range smaller than 50 centimeters. Due to the sensor platform's setup and its paths taken during experiments we can conclude that any measurement with a range smaller than 50 centimeters has to be erroneous. While the amount of these returns near the sensor could most likely be used to estimate the sensor data quality while the sensor itself is located inside an environment containing airborne particles, this method would not allow to anticipate sensor data degradation before the sensor itself enters the affected area. Since lidar is used to sense the visible geometry from a distance, it would be desireable to quantify the data degradation of an area before the sensor itself enters it. Due to these reasons we did not use this phenomenon in our work.
%In experiments with artificial smoke, we observe numerous points in the point cloud very close to the sensor, even though no solid objects exist at that range, which is consistent with lidar's expected behavior for these circumstances~\cite{when_the_dust_settles}. These points are generated by airborne particles in the artificial smoke, for which hold that closer airborne particles to the sensor have a higher probability of reflecting the laser beam in a measurable way. As shown in Figure~\ref{fig:particles_near_sensor}, a box diagram illustrates that significantly more measurements during these experiments report ranges shorter than 50 centimeters. Given the sensor platform's setup and its experimental trajectory, we conclude that any measurement with a range under 50 centimeters is erroneous.
%While the density of these near-sensor returns might be used to estimate data quality when the sensor is already in an environment with airborne particles, this method cannot anticipate data degradation before the sensor enters such an area. Since LiDAR is intended to capture visible geometry from a distance, it is preferable to quantify potential degradation of an area in advance. For these reasons, we did not incorporate this phenomenon into our subsequent analysis.
%END early returns
\newsection{preprocessing}{Preprocessing Steps and Labeling}
\threadtodo
{explain preprocessing and rationale}
{raw dataset has been explained, how did we change the data before use}
{projection because easier autoencoder, how does projection work, index-based}
{preprocessing known $\rightarrow$ how did we label the data for use}
%\newsubsubsectionNoTOC{Preprocessing}
%\todo{describe how 3d lidar data was preprocessed (2d projection), labeling}
%\todo[inline]{screenshots of 2d projections?}
%\todo[inline, color=green!40]{while as described in sec X the method Deep SAD is not dependend on any specific type/structure of data it requires to train an auto encoder in the pretraining step. such autoencoders are better understood in the image domain since there are many uses cases for this such as X (TODO citation needed), there are also 3d data auto encoders such as X (todo find example). same as the reference paper (rain cite) we chose to transform the 3d data to 2d by using a spherical spherical projection to map each of the 3d points onto a 2d plane where the range of each measurement can be expressed as the brightness of a single pixel. this leaves us with a 2d image of resolution 32x2048 (channels by horizontal measurements), which is helpful for visualization as well as for choosing a simpler architecture for the autoencoder of deepsad, the data in the rosbag is sparse meaning that measurements of the lidar which did not produce any value (no return ray detected before sensor specific timeout) are simply not present in the lidar scan. meaning we have at most 65xxx measurements per scan but mostly fewer than this, (maybe statistic about this? could aslo be interesting to show smoke experiment stuff)}
%As described in section~\ref{sec:algorithm_description} the method we want to evaluate is datatype agnostic and can be adjusted to work with any kind of data. The data from~\cite{subter} that we will train on is a point cloud per scan created by the lidar sensor which contains up to 65536 points with \emph{X}, \emph{Y}, and \emph{Z} coordinates (in meters) per point. To adjust the architecture of Deep SAD to work with a specific datatype, we have to define an autoencoder architecture that works for the given datatype. While autoencoders can be created for any datatype, as~\cite{autoencoder_survey} points out over 60\% of research papers pertaining autoencoders in recent years look at image classification and reconstruction, so we have a better understanding of their architectures for two dimensional images than for three dimensional point clouds.
As described in Section~\ref{sec:algorithm_description}, the method under evaluation is data type agnostic and can be adapted to work with any kind of data by choosing a suitable autoencoder architecture. In our case, the input data are point clouds produced by a lidar sensor. Each point cloud contains up to 65,536 points, with each point represented by its \emph{X}, \emph{Y}, and \emph{Z} coordinates. To tailor the Deep SAD architecture to this specific data type, we would need to design an autoencoder suitable for processing three-dimensional point clouds. Although autoencoders can be developed for various data types, \citetitle{autoencoder_survey}~\cite{autoencoder_survey} noted that over 60\% of recent research on autoencoders focuses on two-dimensional image classification and reconstruction. Consequently, there is a more established understanding of autoencoder architectures for images compared to those for three-dimensional point clouds.
%\todo[inline, color=green!40]{to achieve this transformation we used the helpful measurement index and channel present in each measurement point of the dataset which allowed a perfect reconstruction of the 2d projection without calculating the pixel position in the projection of each measurement via angles which in our experience typically leads to some ambiguity in the projection (multiple measurements mapping to the same pixel due to precision loss/other errors) the measurement index increases even for unavailable measurements (no ray return) so we can simply create the 2d projection by mapping the normalized range (FIXME really normalized) value to the pixel position y = channel, x = measurement index. by initalizing the array to NaN values originally we have a 2d data structure with the range values and NaN on pixel positions where originally no measurement took place (missing measurements in scans due to no ray return)}
%For this reason we decided to preprocess the point clouds by converting them to two dimensional grayscale images using spherical projection. Additionally, \cite{degradation_quantification_rain}-which we modeled our approach after-successfully chose this approach. In the projected image each measurement is encoded to a single pixel, whose grayscale value $v$ is the normalized range of the measurement $v = \sqrt{\emph{X}^2 + \emph{Y}^2 + \emph{Z}^2}$. Due to the settings of the datasets' lidar, this results in images with the resolution of 2048 pixels wide by 32 pixels tall. Missing measurements of the point cloud are mapped to pixels with a brightness of 0. To create the mapping we used the measurements indices and channels which are available since the dataset contains dense point clouds and which can be used since the point indices are ordered from 0 to 65535 horizontally ascending channel by channel. For point clouds without indices which can be directly mapped, as is often the case for sparse ones, it would be necessary to use the pitch and yaw angles to the sensor origin to map each point to a pixel on the projection.
For this reason, and to simplify the architecture, we converted the point clouds into two-dimensional grayscale images using a spherical projection. This approach, which proved successful in related work~\cite{degradation_quantification_rain}, encodes each LiDAR measurement as a single pixel whose grayscale value is the reciprocal range, calculated as $v = \frac{1}{\sqrt{X^2 + Y^2 + Z^2}}$. Given the LiDAR sensor's configuration, the resulting images have a resolution of 2048 pixels in width and 32 pixels in height. Missing measurements in the point cloud are mapped to pixels with a brightness value of $v = 0$.
To create this mapping, we leveraged the available measurement indices and channel information inherent in the dense point clouds, which are ordered from 0 to 65,535 in a horizontally ascending, channel-by-channel manner. For sparser point clouds without such indices, one would need to rely on the pitch and yaw angles relative to the sensor's origin to correctly map each point to its corresponding pixel.
%In figure~\ref{fig:data_projections} we see two projections of lidar point clouds from the experiments, which are visually different from the preprocessed data for better understanding of the reader. While the point clouds were converted to grayscale images with a resolution of 2048 by 32 pixels, these projections can be hard to interpret for humans. For this reason the projections are depicted using the viridis colormap and vertically stretched, so single measurements are multiple pixels tall, since otherwise the image is only 32 pixels tall and hard to decipher. The top projection was created from a lidar scan where no artifical smoke and therefore appreciable degradation was present, whereas the lower projection is from an experiment with artifical smoke and a lot of degradation.
Figure~\ref{fig:data_projections} displays two examples of LiDAR point cloud projections to aid the reader's understanding. Although the original point clouds were converted into grayscale images with a resolution of 2048×32 pixels, these raw images can be challenging to interpret. To enhance human readability, we applied the viridis colormap and vertically stretched the images so that each measurement occupies multiple pixels in height. The top projection is derived from a scan without artificial smoke, and therefore with minimal degradation, while the lower projection comes from an experiment where artificial smoke introduced significant degradation.
\todo[inline, color=green!40]{add same projections as they are used in training? grayscale without vertical scaling}
\fig{data_projections}{figures/data_2d_projections.png}{Two-dimensional projections of two point clouds, one from an experiment without degradation and one from an experiment with artificial smoke as degradation. To aid the reader's perception, the images are vertically stretched and a colormap has been applied to the pixels' reciprocal range values, while the actual training data is grayscale.}
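For completeness, the stretched, colormapped depiction used in figure~\ref{fig:data_projections} can be reproduced along the following lines (a small matplotlib sketch; the stretch factor and function name are our own choices for illustration):
\begin{verbatim}
import matplotlib.pyplot as plt
import numpy as np

def show_projection(proj, stretch=8):
    """Display a 32 x 2048 reciprocal-range image for inspection."""
    stretched = np.repeat(proj, stretch, axis=0)  # rows become taller
    plt.imshow(stretched, cmap="viridis")         # colormap, not grayscale
    plt.axis("off")
    plt.show()
\end{verbatim}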
%\newsubsubsectionNoTOC{Labeling}
\threadtodo
{explain labeling techniques and rationales}
{raw dataset has been explained, how did we label the data before use}
{experiment-based labeling, problems, manual labeling for traing, both for evaluation}
{method and labeled preprocessed data known $\rightarrow$ explain experimental setup}
%\todo[inline, color=green!40]{another important preprocessing step is labeling of the lidar frames as normal/anormal. this is one hand used during training (experiments with zero labeled up to most of the data being labeled) and on the other hand is important for evaluation of the method performance. originally we do not have any labels on the data regarding degradation and no analog values from another sensor which measures current smoke particles in the air. our simple approach was to label all frames from experiments which included artifical degradation by fog machine smoke as anomalous and all frames from experiments without artifical degradation as normal.}
%We discussed the requirements to data labels in section~\ref{sec:data}, where we mentioned the challenges but also importance of correctly labeled data, especially for evaluation. Since to our knowledege no public dataset with objective labels regarding dataset degradation of lidar data in subterranean environments is available and the dataset chosen for evaluation in this thesis \cite{subter} does not contain any explicit data or measurements about the dedata degradation, we had to choose a method of how we would label the data ourselves for evaluation. After considering multiple avenues, we decided to simply label all point clouds created during experiments with artifical smoke present as anomalies and all point clouds from other experiments as normal data.
The remaining challenge was labeling a large enough portion of the dataset in a reasonably accurate manner; the difficulties and our general approach were described in section~\ref{sec:data_req}. Since, to our knowledge, neither our chosen dataset nor any other publicly available dataset provides objective labels for LiDAR data degradation in the SAR domain, we had to define our own labeling approach. With objective measures of degradation unavailable, we explored alternative labeling methods, such as using the data's statistical properties, for example the number of missing measurements per point cloud or the higher incidence of erroneous measurements near the sensor described in section~\ref{sec:data_req}. Ultimately, we were concerned that these statistical approaches might lead the method to simply mimic the statistical evaluation rather than to quantify degradation in a generalized and robust manner. After considering these options, we decided to label all point clouds from experiments with artificial smoke as anomalies, while point clouds from experiments without smoke were labeled as normal data. This labeling strategy, based on the presence or absence of smoke, is fundamentally an environmental indicator, independent of the intrinsic data properties recorded during the experiments.
%\todo[inline, color=green!40]{this simple labeling method is quite flawed since we do not label based on the actual degradation of the scan (not by some kind of threshold of analog measurement threshold, statistical info about scan) since (TODO FIXME) this would result in training which only learns this given metric (example missing measurement points) which would make this methodology useless since we could simply use that same measurement as an more simple way to quantify the scan's degradation. }
%This simplistic approach has both Advantages and disadvantages. The approach is simple to implement and provides a clear and straightforward distinction between normal and anomalous data. As a negative, there are clearly point clouds without subjective degradation present in the experiments with added degradation, which-using this method-get labeled as anomalies even though for actual trainging and evaluation purposes they should not be labeleld as such. Since we do not have an objective measure available, we looked into other ways to label the data such as statistical data about missing measurements per point cloud or the aforementioned phenomenon of more erroneous measurements up close to the sensor in degraded environments, but we feared that any statistical property of the data or any combination of them would only result in the method learning to replicate those statistical evaluations rather than to actually quantify the degradation in a generalized way. The classification of wether smoke was present during an experiment or not is different here in that it is not dependent on the data but is rather an expression of the environment itself, during the recording of the data.
The simplicity of this labeling approach has both advantages and disadvantages. On the positive side, it is easy to implement and creates a clear distinction between normal and anomalous data. However, its simplicity is also its drawback: some point clouds from experiments with artificial smoke do not exhibit perceptible degradation, yet they are still labeled as anomalies. The reason for this is that during the three non-static anomalous experiments the sensor platform starts recording in a tunnel roughly 20 meters from the smoke machine's location. It begins by approaching the smoke machine, navigates close to the machine for some time, and then leaves its perimeter again. Since the artificial smoke's density is far higher near the machine it originates from, the time the sensor platform spent close to it produced highly degraded point clouds, whereas the temporal beginnings and ends of the anomalous experiments capture point clouds that are subjectively not degraded and appear similar to those from the normal experiments. This effect is clearly illustrated by the degradation indicators introduced earlier (the proportion of missing points and the number of erroneous points close to the sensor per point cloud), as can be seen in figure~\ref{fig:data_anomalies_timeline}.
\fig{data_anomalies_timeline}{figures/data_combined_anomalies_timeline.png}{Missing points and points with a measured range smaller than 50 cm per point cloud over a normalized timeline of the individual experiments. This illustrates the rise, plateau and fall of degradation intensity during the anomalous experiments, owing to the spatial proximity to the degradation source (smoke machine). One of the normal experiments (without artificial smoke) is included as a baseline.}
Concerned that the incorrectly labeled data might negatively impact DeepSAD's semi-supervised training, we chose to manually remove the anomalous labels from the beginning and end of the anomalous experiments for training purposes. This refinement gave us more confidence in the training signal but reduced the number of labeled anomalies. For evaluation, we therefore report results under both schemes:
\begin{enumerate}
    \item \textbf{Experiment-based labels:} All scans from anomalous experiments marked anomalous, including border cases, yielding conservative performance metrics that reflect real-world label noise.
    \item \textbf{Manually-refined labels:} Only unequivocally degraded scans marked anomalous, producing near-ideal separation in many cases.
\end{enumerate}
By evaluating and comparing both approaches, we hope to provide a more thorough performance investigation than either labeling scheme would allow on its own.
%. This resulted in less labeled anomalies which are subjectively easy to assign a binary label to, due to the high degree of degradation present in them. Figure~\ref{fig:data_anomalies_timeline} also alludes to an issue, we faced when using the experiment-based labeling approach during evaluation, namely that the normal experiment which we included as a baseline, shows indication of similar degrees of degradation to the beginning and end states of the anomalous experiments. From this we figured, that while still subjective, an unknown amount of samples will most likely be incorrectly labeled, resulting in a lowered ceiling for performance metrics when using them for evaluation. For this reason we decided to utilize both, the experiment-based and the manually selected labels for evaluation, to achieve a more complete understanding of the compared methods' performance. While we can be quite certain that the manually labeled samples were correctly labeled, the strong level of data degradation present in them led to perfect classifiers in our training, which alone is not very useful when gauging the methods performance. On the other hand, the experiment-based labels resulted in lower performance metrics than initially expected, but which we can explain due to the incorrectly labeled samples, we highlighted in this section.
%\todo[inline]{TODO maybe evaluate based on different thresholds? missing datapoints, number of detected outliers, number of particles in phantom circle around sensor?}
%\todo[inline]{maybe also mention that we considered labeling using output of down-the-pipeline algorithm (e.g., SLAM) and how it performs/how confident it is and retrospectively label the quality of the data based on that}
\newchapter{experimental_setup}{Experimental Setup}
\threadtodo
{introduce experimental setup, give overview of what will be covered}
{motivation, bg, method and data is know and understood, how was it used}
{codebase, hardware description overview of training setup, details of deepsad setup}
{overview of chapter given $\rightarrow$ give sequential setup overview}
%The authors of \citetitle{deepsad} share the code to their implementation and testing methodologies online on \url{https://github.com/lukasruff/Deep-SAD-PyTorch}, which we used in our thesis as a base to work from. The codebase implements a framework for loading predefined datasets, training and testing DeepSAD as well as some baselines that were used in the original paper. In the coming sections we will describe how we adapted this code to our needs, how we loaded the data from \citetitle{subter}, how we set up DeepSAD's network, how we trained, tested and compared DeepSAD and two baseline algorithms, namely an isolation forest and an one-class support vector machine. We also shortly go over the hardware and software used during experiments and give guidelines for training and inference times.
We built our experiments on the official DeepSAD PyTorch implementation and evaluation framework, available at \url{https://github.com/lukasruff/Deep-SAD-PyTorch}. This codebase provides routines for loading standard datasets, training DeepSAD and several baseline models, and evaluating their performance.
In the following sections, we detail our adaptations to this framework:
\begin{itemize}
\item Data integration: preprocessing and loading the dataset from \citetitle{subter}.
    \item Model architecture: configuring DeepSAD's encoder to match our point cloud input format, contrasting two distinct neural network architectures to investigate their impact on the method's output.
    \item Training \& evaluation: training DeepSAD alongside two classical baselines, Isolation Forest and one-class SVM, and comparing their degradation-quantification performance.
\item Experimental environment: the hardware and software stack used, with typical training and inference runtimes.
\end{itemize}
Together, these components define the full experimental pipeline, from data preprocessing to the evaluation metrics we use to compare methods.
\section{Framework \& Data Preparation}
% Combines: Framework Initialization + Data Integration
% Goals: introduce codebase, how you adapted it, dataset loading/preprocessing, labeling
\newsubsubsectionNoTOC{DeepSAD PyTorch codebase and our adaptations}
\threadtodo
{Explain deepsad codebase as starting point}
{what is the starting point?}
{codebase, github, dataloading, training, testing, baselines}
{codebase understood $\rightarrow$ how was it adapted}
DeepSAD's PyTorch implementation includes standardized datasets such as MNIST, CIFAR-10 and datasets from \citetitle{odds}~\cite{odds}, as well as suitable network architectures for the corresponding data types. The framework can train and test DeepSAD as well as a number of baseline algorithms, namely SSAD, OCSVM, Isolation Forest, KDE and SemiDGM, on the loaded data and evaluate their performance by calculating the area under the ROC curve for each algorithm. We adapted this implementation, originally developed for Python 3.7, to work with Python 3.12, added data loading for our chosen dataset, added DeepSAD models that work with the LiDAR projection data type, and implemented additional evaluation methods as well as an inference module.
\newsubsubsectionNoTOC{SubTER dataset preprocessing, train/test splits, and label strategy}
\threadtodo
{explain how dataloading was adapted}
{loading data first point step to new training}
{preprocessed numpy (script), load, labels/meta, split, k-fold}
{k-fold $\rightarrow$ also adapted in training/testing}
%dataset in rosbag format (one bag file per experiment) was preprocessed as mentioned in chapter X by projecting the 3d lidar data (xzy pointcloud) using a spherical projection in a python script and saved as a npy araray of dimensions frames by height by width with value normalized distance (1 over sqrt(distance)) using numpy save method for simplicity while loading and to avoid having to do this preprocessing during each experiment. the projection was done using the meta information in the bag which includes the channel (height/row) and the index which is available since the data is non-sparse/dense, which means that for each possible measurement a data is available in the original rosbag even if the sensor did not record a return ray for this measurement, which means there is no data and it could be left out in a sparse array saving file size. this is very helpful since it allows the direct mapping of all measurements to the spherical projection using channel as the height index and measurement index modulo (measurements / channel) as the width index for each measurement. the reason that this is useful is that otherwise the projection would have to be calculated, meaning the angles between the origin and each point from the point cloud would have to be used to reconstruct the mapping between each measurement and a pixel in the projection. we also tried this method originally which lead to many ambiguities in the mappings were sometimes multiple measurements were erroneously mapped to the same pixel with no clear way to differentiate between which of them was mapped incorrectly. this is most likely due to quantification errors, systematic and sporadic measurement errors and other unforseen problems. for these reasons the index based mapping is a boon to us in this dataset. it should also be mentioned that lidar sensors originally calculate the distance to an object by measuring the time it takes for an emitted ray to return (bg chapter lidar ref) and the point cloud point is only calculated using this data and the known measurement angles. for this reason it is typically possible to configure lidar sensors to provide this original data which is basically the same as the 2d projection directly, without having to calculate it from the pointcloud.
%\todo[inline]{why normalize range?}
The raw SubTER dataset is provided as one ROS bag file per experiment, each containing dense 3D point clouds from the Ouster OS1-32 LiDAR. To streamline training and avoid repeated heavy computation, we project these point clouds offline into 2D “range images” and save them as NumPy arrays. We apply a spherical projection that maps each LiDAR measurement to a pixel in a 2D image of size Height × Width, where Height is the number of vertical channels (32) and Width is the number of measurements per rotation (2048). Instead of computing per-point azimuth and elevation angles at runtime, we exploit the sensor's metadata:
\begin{itemize}
\item \textbf{Channel index:} directly gives the row (vertical position) of each measurement.
\item \textbf{Measurement index:} by taking the measurement index modulo Width, we obtain the column (horizontal position) in the 360° sweep.
\end{itemize}
The measurement index can be used this way only because the SubTER data is dense: every possible channel × measurement pair appears in the bag, even if the LiDAR did not record a return. This allows a direct 1:1 mapping without collisions or missing entries and avoids the ambiguities we previously encountered when reconstructing the projection via angle computations alone, which sometimes mapped multiple points to the same pixel due to numerical errors in angle estimation.
For each projected pixel, we compute $v_i = \frac{1}{\sqrt{x_i^2 + y_i^2 + z_i^2}}$, where $v_i$ is the reciprocal range value assigned to the pixel and $x_i$, $y_i$ and $z_i$ are the corresponding measurement's 3D coordinates. This transformation both compresses the dynamic range and emphasizes close-range returns, which is critical for detecting near-sensor degradation. We then save the resulting tensor of shape (Number of Frames, Height, Width) using NumPy's \texttt{save} function. Storing precomputed projections allows rapid data loading during training and evaluation.
Many modern LiDARs can be configured to output range images directly which would bypass the need for post-hoc projection. When available, such native range-image streams can further simplify preprocessing or even allow skipping this step completely.
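The index-based projection therefore reduces to a few array operations per scan. The sketch below assumes that the per-measurement channel index, running measurement index, and Cartesian coordinates have already been extracted from the ROS bag; the function and variable names are illustrative rather than taken from our actual preprocessing script.
\begin{verbatim}
import numpy as np

H, W = 32, 2048  # vertical channels x measurements per rotation

def project_scan(channel, index, xyz):
    """Map one dense LiDAR scan to an (H, W) reciprocal-range image.

    channel: (N,) integer row of each measurement (0..31)
    index:   (N,) running measurement index (0..65535)
    xyz:     (N, 3) coordinates; all-zero rows mark missing returns
    """
    image = np.zeros((H, W), dtype=np.float32)  # missing returns stay 0
    ranges = np.linalg.norm(xyz, axis=1)
    valid = ranges > 0.0
    cols = index % W                 # horizontal position in the sweep
    image[channel[valid], cols[valid]] = 1.0 / ranges[valid]
    return image

# All frames of one experiment are stacked and stored for fast loading,
# e.g. np.save("experiment.npy", np.stack(list_of_projected_frames)).
\end{verbatim}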
\newsubsubsectionNoTOC{Custom data loaders and label assignment}
%the original code base utilized pytorch's dataloaders, so we added a new one which used the framework's existing structure but loaded the aforementioned numpy files. in addition to loading these we assign the two kinds of evaluation labels discussed in section~\ref{sec:preprocessing} by using the original experiment's name which either contained the word smoke or not as the deciding factor for correspdondingly anomalous and normal labels (which are -1 and +1 respectively) as the first type of evaluation labels called henceforth "experiment-based labels" and loaded a JSON file which contained manually chosen start and end frames for the 4 experiments containing smoke degradation and used these to only assign -1 (the anomalous label) to these frames which were manually selected to be definitely degraded and +1 to all experiments which once again didn't have the word smoke in their file names and therefore did not contain artifical degradation by smoke machine. the frames before the manually chosen start frame and the ones after the manually chosen end frame were labeled as "unknown" with a 0 value and not used in the evaluation. this second type of evaluation label method is henceforth called "manually defined" evaluation labels.
We extended the DeepSAD framework's data loading by implementing a custom PyTorch \texttt{Dataset} class that ingests our precomputed NumPy range-image files and attaches the appropriate evaluation labels.
Each experiment's frames are stored as a single \texttt{.npy} file of shape \((\text{Number of Frames}, H, W)\), containing the reciprocal range values described in Section~\ref{sec:preprocessing}. Our \texttt{Dataset} initializer scans a directory of these files, loads the NumPy arrays into memory, transforms them into PyTorch tensors, and assigns evaluation and training labels accordingly.
The first labeling scheme, called \emph{experiment-based labels}, assigns
\[
y_{\mathrm{exp}} =
\begin{cases}
-1 & \text{if the filename contains “smoke”, signifying anomalous/degraded data,} \\
+1 & \text{otherwise, signifying normal data.}
\end{cases}
\]
At load time, any file with “smoke” in its name is treated as anomalous (label \(-1\)), and all others (normal experiments) are labeled \(+1\).
To obtain a second source of ground truth, we also support \emph{manually-defined labels}. A companion JSON file specifies a start and end frame index for each of the four smoke experiments—defining the interval of unequivocal degradation. During loading the second label $y_{man}$ is assigned as follows:
\[
y_{\mathrm{man}} =
\begin{cases}
-1 & \text{Frames within the manually selected window from smoke experiments} \\
+1 & \text{All frames from non-smoke experiments} \\
0 & \text{Frames outside the manually selected window from smoke experiments}
\end{cases}
\]
We pass instances of this \texttt{Dataset} to PyTorch's \texttt{DataLoader}, enabling batch sampling, shuffling, and multi-worker loading. The data loader returns the preprocessed LiDAR projection, both evaluation labels, and a semi-supervised training label. This modular design lets us train and evaluate DeepSAD under both labeling regimes without duplicating data-handling code.
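A condensed sketch of how both evaluation label types can be assigned at load time is given below; the file naming convention and the JSON window file follow the description above, while the helper name and the JSON key format are our own illustrative choices.
\begin{verbatim}
import json
import os
import numpy as np

def load_experiment(npy_path, window_json_path):
    """Load one experiment's frames and attach both label types."""
    frames = np.load(npy_path)              # (num_frames, 32, 2048)
    n = frames.shape[0]
    name = os.path.splitext(os.path.basename(npy_path))[0]
    is_smoke = "smoke" in name              # experiment-based rule

    y_exp = np.full(n, -1 if is_smoke else +1, dtype=np.int64)

    y_man = np.full(n, +1, dtype=np.int64)  # non-smoke runs: normal
    if is_smoke:
        with open(window_json_path) as f:
            windows = json.load(f)          # e.g. {"smoke_run_1": [s, e]}
        start, end = windows[name]
        y_man[:] = 0                        # outside the window: unknown
        y_man[start:end + 1] = -1           # clearly degraded frames
    return frames, y_exp, y_man
\end{verbatim}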
%since deepsad is a semi-supervised method which allows for optional training labels to improve performance over the fully unsupervised case, the pytorch dataset is passed parameters which define the number of samples which should have training labels during training, individually for anomalies and normal data samples. this allows for experiments that compare unsupervised to semi-supervised cases and how the introduction of different number of labeled data during training affects the model's performance. for the semi-supervised training labels the manually defined evaluation labels are used as an initial source of which randomized labels are removed (by replacing their value with 0 which indicates no known status of either anomalous or normal) until the desired number of training labels per class (normal / anomalous) which was passed as a configuration parameter was is reached.
%because the amount of data is not too large, we also implemented k-fold training and evaluation to improve confidence in the evaluation results, which means an integer is passed as the number of desired folds, which in turn will define the split between training and evaluation data. for our trainings we always chose a 5 fold training and evaluation which results in a 20 80 split of training evaluation data. for the implementation of the k-fold crossvalidation data loading we utilized the KFold class included in sklearn's model\_selection module.
%additionally we implemented another pytorch dataset which loads a single experiment from the corresponding numpy file for inference, which does not use k-fold crossvalidation nor shuffles the frames around to allow for sequential calculations of forward passes on fully trained models to validate their funcationality and produce sequential frame-by-frame infreence results (anomaly scores) for one complete experiment from start to finish
%\todo[inline]{semi-supervised trainingn labels, k_fold, inference dataloader}
DeepSAD supports both unsupervised and semi-supervised training by optionally incorporating a small number of labeled samples. To control this, our custom PyTorch \texttt{Dataset} accepts two integer parameters, \texttt{num\_labelled\_normal} and \texttt{num\_labelled\_anomalous}, which specify how many samples of each class should retain their labels during training. All other samples are assigned a label of 0 (``unknown'') and treated as unlabeled.
When using semi-supervised mode, we begin with the manually-defined evaluation labels. We then randomly un-label (set to 0) samples of each class until exactly \texttt{num\_labelled\_normal} normals and \texttt{num\_labelled\_anomalous} anomalies remain labeled. This mechanism allows us to systematically compare the unsupervised mode, where \texttt{num\_labelled\_normal} = \texttt{num\_labelled\_anomalous} = 0, with semi-supervised modes under varying label budgets.
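The label-masking step can be sketched as follows; the function name and the use of a fixed seed are illustrative, but the parameters mirror the configuration options described above.
\begin{verbatim}
import numpy as np

def mask_training_labels(y_man, num_labelled_normal,
                         num_labelled_anomalous, seed=0):
    """Keep only the requested number of labels per class, rest -> 0."""
    rng = np.random.default_rng(seed)
    y_train = np.zeros_like(y_man)          # 0 = unlabeled / unknown
    budgets = {+1: num_labelled_normal, -1: num_labelled_anomalous}
    for label, budget in budgets.items():
        candidates = np.flatnonzero(y_man == label)
        keep = rng.choice(candidates, size=min(budget, len(candidates)),
                          replace=False)
        y_train[keep] = label
    return y_train
\end{verbatim}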
To obtain robust performance estimates on our relatively small dataset, we implement $k$-fold cross-validation. A single integer parameter, \texttt{num\_folds}, controls the number of splits. We use scikit-learn's \texttt{KFold} (from \texttt{sklearn.model\_selection}) with \texttt{shuffle=True} and a fixed random seed to partition each experiment's frames into \texttt{num\_folds} disjoint folds. Training then proceeds across $k$ rounds, each time training on $(k-1)/k$ of the data and evaluating on the remaining $1/k$. In our experiments, we set \texttt{num\_folds=5}, yielding an 80/20 train/evaluation split per fold.
For inference (i.e.\ model validation on held-out experiments), we provide a second \texttt{Dataset} class that loads a single experiment's NumPy file without $k$-fold splitting, assigns no labels, and does not shuffle the frames, preserving their temporal order. This setup enables seamless, frame-by-frame scoring of complete runs, which is crucial for analyzing degradation dynamics over an entire traversal.
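The cross-validation split itself follows the standard scikit-learn pattern, sketched below with five folds as in our experiments; the random seed and the flat frame indexing are illustrative simplifications.
\begin{verbatim}
import numpy as np
from sklearn.model_selection import KFold

num_frames = 17658                    # total number of scans in the dataset
frame_indices = np.arange(num_frames)

kf = KFold(n_splits=5, shuffle=True, random_state=0)
for fold, (train_idx, eval_idx) in enumerate(kf.split(frame_indices)):
    # roughly 80% of the frames for training, the held-out 20% for evaluation
    train_frames = frame_indices[train_idx]
    eval_frames = frame_indices[eval_idx]
\end{verbatim}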
\section{Model Configuration \& Evaluation Protocol}
Since the neural network architecture trained in the DeepSAD method is not fixed, as described in section~\ref{sec:algorithm_details}, but rather chosen based on the input data, we also had to choose an autoencoder architecture befitting our preprocessed LiDAR data projections. Since \citetitle{degradation_quantification_rain}~\cite{degradation_quantification_rain} reported success in training DeepSAD on similar data, we first adapted the network architecture they used for our use case, which is based on the simple and well-understood LeNet architecture~\cite{lenet}. Additionally, we were interested in evaluating the importance and impact of a well-suited network architecture for DeepSAD's performance and therefore designed a second network architecture, henceforth referred to as the ``efficient architecture'', which incorporates a few modern techniques befitting our use case.
\newsubsubsectionNoTOC{Network architectures (LeNet variant, custom encoder) and how they suit the point cloud input}
\todo[inline]{STANDARDIZE ALL DIMENSIONS TO (CHANNEL, WIDTH, HEIGHT)}
The LeNet-inspired autoencoder can be split into an encoder network (figure~\ref{fig:setup_arch_lenet_encoder}) and a decoder network (figure~\ref{fig:setup_arch_lenet_decoder}) with a latent space between the two parts. Such an arrangement is typical for autoencoder architectures, as we discussed in section~\ref{sec:autoencoder}. The encoder network is simultaneously DeepSAD's main training architecture, which, once trained, is used to infer the degradation quantification in our use case.
%The LeNet-inspired encoder network (see figure~\ref{fig:setup_arch_lenet_encoder}) consists of two convolution steps with pooling layers, and finally a dense layer which populates the latent space. \todo[inline]{lenet explanation from chatgpt?} The first convolutional layer uses a 3x3 kernel and outputs 8 channels, which depicts the number of features/structures/patterns the network can learn to extract from the input and results in an output dimensionality of 2048x32x8 which is reduced to 1024x16x8 by a 2x2 pooling layer. \todo[inline]{batch normalization, and something else like softmax or relu blah?} The second convolution reduces the 8 channels to 4 with another 3x3 kernel \todo[inline]{why? explain rationale} and is followed by another 2x2 pooling layer resulting in a 512x8x4 dimensionality, which is then flattened and input into a dense layer. The dense layer's output dimension is the chosen latent space dimensionality, which is as previously mentioned another tuneable hyperparameter.
\fig{setup_arch_lenet_encoder}{diagrams/arch_lenet_encoder}{UNFINISHED - Visualization of the original LeNet-inspired encoder architecture.}
The LeNet-inspired encoder network (see figure~\ref{fig:setup_arch_lenet_encoder}) is a compact convolutional neural network that reduces image data into a lower-dimensional latent space. It consists of two stages of convolution, normalization, non-linear activation, and pooling, followed by a dense layer that defines the latent representation. Conceptually, the convolutional layers learn small filters that detect visual patterns in the input (such as edges or textures). Batch normalization ensures that these learned signals remain numerically stable during training, while a LeakyReLU activation introduces non-linearity, allowing the network to capture more complex relationships. Pooling operations then downsample the feature maps, which reduces the spatial size of the data and emphasizes the most important features. Finally, a dense layer transforms the extracted feature maps into the latent space, yielding the data's reduced-dimensionality representation.
Concretely, the first convolutional layer uses a $3\times 3$ kernel with 8 output channels, corresponding to 8 learnable filters. For input images of size $2048\times 32\times 1$, this produces an intermediate representation of shape $2048\times 32\times 8$, which is reduced to $1024\times 16\times 8$ by a $2\times 2$ pooling layer. The second convolutional layer again applies a $3\times 3$ kernel but outputs 4 channels, followed by another pooling step, resulting in a feature map of shape $512\times 8\times 4$. This feature map is flattened and passed into a fully connected layer. The dimensionality of the output of this layer corresponds to the latent space, whose size is a tunable hyperparameter chosen according to the needs of the application.
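A compact PyTorch sketch of this encoder is shown below. The layer sizes follow the description above; the padding scheme, pooling type, activation slope, and the latent dimension \texttt{rep\_dim} are assumptions made for illustration and may differ from our actual implementation.
\begin{verbatim}
import torch.nn as nn

class LeNetEncoder(nn.Module):
    """LeNet-inspired encoder: two conv/pool stages, dense latent map."""

    def __init__(self, rep_dim=128):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 8, kernel_size=3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(8)
        self.conv2 = nn.Conv2d(8, 4, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(4)
        self.pool = nn.AvgPool2d(2)   # 2x2 pooling; type is an assumption
        self.act = nn.LeakyReLU()
        self.fc = nn.Linear(4 * 8 * 512, rep_dim)  # flattened (4, 8, 512)

    def forward(self, x):                              # (B, 1, 32, 2048)
        x = self.pool(self.act(self.bn1(self.conv1(x))))  # (B, 8, 16, 1024)
        x = self.pool(self.act(self.bn2(self.conv2(x))))  # (B, 4, 8, 512)
        return self.fc(x.flatten(start_dim=1))            # (B, rep_dim)
\end{verbatim}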
% Its decoder network (see figure~\ref{fig:setup_arch_lenet_decoder}) is a mirrored version of the encoder, with a dense layer after the latent space and two pairs of 2x2 upsampling and transpose convolution layers which use 4 and 8 input channels respectively with the second one reducing its output to one channel resulting in the 2048x32x1 output dimensionality, equal to the input's, which is required for the autoencoding objective to be possible.
\fig{setup_arch_lenet_decoder}{diagrams/arch_lenet_decoder}{UNFINISHED - Visualization of the original LeNet-inspired decoder architecture.}
The decoder network (see figure~\ref{fig:setup_arch_lenet_decoder}) mirrors the encoder and reconstructs the input from its latent representation. A dense layer first expands the latent vector into a feature map of shape $512\times 8\times 4$, which is then upsampled and refined in two successive stages. Each stage consists of an interpolation step that doubles the spatial resolution, followed by a transpose convolution that learns how to add structural detail. The first stage operates on 4 channels, and the second on 8 channels, with the final transpose convolution reducing the output to a single channel. The result is a reconstructed output of size $2048\times 32 \times 1$, matching the original input dimensionality required for the autoencoding objective.
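A matching sketch of the mirrored decoder follows; the padding, normalization placement, and the absence of a final activation are again illustrative assumptions.
\begin{verbatim}
import torch.nn as nn

class LeNetDecoder(nn.Module):
    """Mirrored decoder: dense expansion, then two upsampling stages."""

    def __init__(self, rep_dim=128):
        super().__init__()
        self.fc = nn.Linear(rep_dim, 4 * 8 * 512)
        self.up = nn.Upsample(scale_factor=2)   # doubles both spatial dims
        self.deconv1 = nn.ConvTranspose2d(4, 8, kernel_size=3,
                                          padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(8)
        self.deconv2 = nn.ConvTranspose2d(8, 1, kernel_size=3,
                                          padding=1, bias=False)
        self.act = nn.LeakyReLU()

    def forward(self, z):                              # (B, rep_dim)
        x = self.fc(z).view(-1, 4, 8, 512)             # (B, 4, 8, 512)
        x = self.act(self.bn1(self.deconv1(self.up(x))))  # (B, 8, 16, 1024)
        return self.deconv2(self.up(x))                # (B, 1, 32, 2048)
\end{verbatim}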
%\todo[inline]{what problems and possible improvements did we find when investigating this architecture}
%\todo[inline]{starting point - receptive field, possible loss of information due to narrow RF during convolutions, which motivated us to investigate the impact of an improved arch}
%Even though this architecture is quite simplistic, it also appeared capable to achieve our degradation quantification objective during initial testing. Nonetheless, we identified a few possible shortcomings and improvements, which motivated us to improve upon this architecture and evaluate the impact of choosing a capable architecture for DeepSAD on our usecase.
%The biggest deficiency we identified was the unproportional aspect ratio of the convolutional layer's receptive field. We proposed that it may lead to worse performance and generalization by hindering the capture of certain degradation patterns during the encoding step, due to its badly chosen size and aspect ratio.
%The receptive field of a neural network is a measure of how many input data affect each output data. Is is oftentimes calculated for convolutional networks, which have twodimensional images as input data and can be useful for understanding the network's field of view. The RF size has been shown to be an important factor in a network's capability of extracting/identifying a feature/pattern, in that the RF size has to be at least as large as the pattern meant to capture but should also not be chosen too large, since this can lead to bad performance when capturing smaller more fine-grained patterns. \todo[inline]{refer to graphic 1}
%\todo[inline]{2 subgraphics, left overall neural field , right zoomed in n x n RF affecting single output neuron}
%The receptieve field of convolutional neural networks is often given as a single integer $n$, implying a symmetric receptive field of size $n \times n$ affects each output neuron \todo[inline]{refer to graphic 2}. Formally the RF can be calculated independently for each input dimension and does not have to be square.
%For the LeNet-inspired encoder we calculated the RF size to be \todo[inline]{result and calculation here}. The problem we identified stems from the vast difference in resolution per axis in the input data. Due to the working mechanics of spinning lidar sensors, it is typical for the point cloud's produced by them to have a denser resolution in their x axis than in their y axis. This is the result of a fixed number of vertical channels (typically 32 to 128) rotating on the sensor's x axis and producing oftentimes over 1000 measurements per channel over the 360 degree field of view. We can therefore calculate the measurements per degree for both axis, which is tantamount to pixels per degree for our spherical projection images. The vertical pixel per degree are with A ppd nearly 10 times larger than the B ppd of the horizontal axis \todo[inline]{show calculation and correct values of ppd}. This discrapency in resolution per axis means that a square pixel RF results in a unproportional RF when viewed in terms of degrees, namely N by M degrees of RF.
%Since our understanding of the kinds of degradation present in lidar data is limited we want to make sure the network is capable of capturing many types of degradation patterns. To increase the network's chance of learning to identify such patterns we explored a new network architecture possessing a more square RF when viewed in terms of degrees and also included some other improvements.
Even though the LeNet-inspired encoder proved capable of achieving our degradation quantification objective in initial experiments, we identified several shortcomings that motivated the design of a second, more efficient architecture. The most important issue concerns the shape of the CNN's receptive field (RF), which describes the region of the input that influences a single output activation. Its size and aspect ratio determine which structures the network can effectively capture: if the RF is too small, larger patterns cannot be detected, while an excessively large RF may hinder the network from learning to recognize fine details. For standard image data, the RF is often expressed as a symmetric $n \times n$ region, but in principle it can be computed independently per axis.
\todo[inline]{RF concept figur}
The issue with the RF shape arises from the fact that spinning multi-beam LiDAR sensors often produce point clouds with dense horizontal but limited vertical resolution. In our case, this results in a pixel-per-degree resolution of approximately $0.99^{\circ}$/pixel vertically and $0.18^{\circ}$/pixel horizontally \todo[inline]{double-check with calculation graphic/table}. Consequently, the LeNet-inspired encoder's calculated receptive field of $16 \times 16$ pixels translates to an angular size of $15.88^{\circ} \times 2.81^{\circ}$, which is highly rectangular in angular space. Such a mismatch risks limiting the network's ability to capture degradation patterns that extend differently across the two axes.
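The conversion from pixel extent to angular extent used here is simply the per-axis angular resolution of the projection multiplied by the pixel extent of the RF:
\[
\mathrm{RF}_{\theta} = \mathrm{RF}_{\mathrm{px}} \cdot \frac{\mathrm{FOV}}{N_{\mathrm{px}}}\,,
\]
evaluated separately per axis; for the horizontal axis this gives $16 \cdot \frac{360^{\circ}}{2048} \approx 2.81^{\circ}$, matching the value stated above.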
\todo[inline]{add schematic showing rectangular angular RF overlaid on LiDAR projection}
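As a reference for the values quoted above, the per-axis RF follows from the standard recurrence over kernel sizes and strides. The following Python sketch illustrates the calculation; the listed layer configuration (two convolutions with $5 \times 5$ kernels, each followed by $2 \times 2$ max pooling) is an illustrative assumption and not necessarily the exact LeNet-inspired configuration.
\begin{verbatim}
# Per-axis receptive-field recurrence (illustrative layer list, not
# necessarily the exact LeNet-inspired configuration).
def receptive_field(layers):
    """layers: list of (kernel, stride) pairs along ONE axis."""
    rf, jump = 1, 1
    for kernel, stride in layers:
        rf += (kernel - 1) * jump   # growth scales with the accumulated stride
        jump *= stride
    return rf

# Example: two 5x5 convolutions, each followed by 2x2 max pooling
layers_per_axis = [(5, 1), (2, 2), (5, 1), (2, 2)]
print(receptive_field(layers_per_axis))   # -> 16 pixels along this axis
# Multiplying the pixel RF by the per-axis angular resolution yields the
# angular RF discussed above.
\end{verbatim}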
%\todo[inline]{start by explaining lenet architecture, encoder and decoder split, encoder network is the one being trained during the main training step, together as autoencoder during pretraining, decoder of lenet pretty much mirrored architecture of encoder, after preprocessing left with image data (2d projections, grayscale = 1 channel) so input is 2048x32x1. convolutional layers with pooling afterwards (2 convolution + pooling) convolutions to multiple channels (8, 4?) each channel capable of capturing a different pattern/structure of input. fully connected layer before latent space, latent space size not fixed since its also a hyperparameter and depended on how well the normal vs anomalous data can be captured and differentiated in the dimensionality of the latent space}
%\todo[inline]{batch normalization, relu? something....}
To address this, we redesigned the network architecture and included further modifications to improve the method's efficiency and performance. The encoder (see figure~\ref{fig:setup_arch_ef_encoder}) follows the same general idea as the LeNet-inspired encoder but incorporates the following modifications (a minimal code sketch of the central building block is given after the figure):
\begin{itemize}
\item \textbf{Non-square convolution kernels.} Depthwise-separable convolutions with kernel size $3 \times 17$ are used instead of square kernels, resulting in an RF of $10 \times 52$ pixels, corresponding to $9.93^{\circ} \times 9.14^{\circ}$, substantially more balanced than the LeNet-inspired network's RF.
\item \textbf{Circular padding along azimuth.} The horizontal axis is circularly padded to respect the wrap-around of $360^{\circ}$ LiDAR data, preventing artificial seams at the image boundaries.
\item \textbf{Aggressive horizontal pooling.} A $1 \times 4$ pooling operation is applied early in the network, which reduces the over-sampled horizontal resolution (2048~px to 512~px) while keeping vertical detail intact.
\item \textbf{Depthwise-separable convolutions with channel shuffle.} Inspired by MobileNet and ShuffleNet, this reduces the number of parameters and computations while retaining representational capacity, making the network more suitable for embedded platforms, while simultaneously allowing more learnable channels without increasing computational demand.
\item \textbf{Max pooling.} Standard max pooling is used instead of average pooling, since it preserves sharp activations that are often indicative of localized degradation.
\item \textbf{Channel compression before latent mapping.} After feature extraction, a $1 \times 1$ convolution reduces the number of channels before flattening, which lowers the parameter count of the final fully connected layer without sacrificing feature richness.
\end{itemize}
\fig{setup_arch_ef_encoder}{diagrams/arch_ef_encoder}{UNFINISHED - Visualization of the efficient encoder architecture.}
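To make the combination of these design choices more concrete, the following minimal PyTorch sketch shows one possible implementation of the central encoder building block, a depthwise-separable convolution with circular padding along the azimuth. The module name, channel handling, and normalization/activation choices are illustrative assumptions rather than the exact implementation in our codebase.
\begin{verbatim}
# Minimal sketch of a depthwise-separable block with azimuth-only circular
# padding; names and hyperparameters are illustrative placeholders.
import torch.nn as nn
import torch.nn.functional as F

class DepthwiseSeparableBlock(nn.Module):
    def __init__(self, in_ch, out_ch, kernel=(3, 17)):
        super().__init__()
        self.pad_h, self.pad_w = kernel[0] // 2, kernel[1] // 2
        # depthwise: one spatial filter per input channel
        self.depthwise = nn.Conv2d(in_ch, in_ch, kernel, groups=in_ch, bias=False)
        # pointwise: 1x1 convolution mixes information across channels
        self.pointwise = nn.Conv2d(in_ch, out_ch, 1, bias=False)
        self.bn = nn.BatchNorm2d(out_ch)

    def forward(self, x):
        # wrap-around padding along the azimuth (width) axis only,
        # zero padding along the elevation (height) axis
        x = F.pad(x, (self.pad_w, self.pad_w, 0, 0), mode="circular")
        x = F.pad(x, (0, 0, self.pad_h, self.pad_h))
        return F.relu(self.bn(self.pointwise(self.depthwise(x))))
\end{verbatim}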
\paragraph{Decoder.}
The decoder (see figure~\ref{fig:setup_arch_ef_decoder}) mirrors the encoder's structure but introduces changes to improve reconstruction stability (a minimal code sketch of one upsampling stage is given after the figure):
\begin{itemize}
\item \textbf{Nearest-neighbor upsampling followed by convolution.} Instead of relying solely on transposed convolutions, each upsampling stage first enlarges the feature map using parameter-free nearest-neighbor interpolation, followed by a depthwise-separable convolution. This strategy reduces the risk of checkerboard artifacts while still allowing the network to learn fine detail.
\item \textbf{Asymmetric upsampling schedule.} Horizontal resolution is restored more aggressively (e.g., scale factor $1 \times 4$) to reflect the anisotropic downsampling performed in the encoder.
\item \textbf{Final convolution with circular padding.} The output is generated using a $3 \times 17$ convolution with circular padding along the azimuth, as in the new encoder, ensuring consistent treatment of the $360^{\circ}$ LiDAR input.
\end{itemize}
\fig{setup_arch_ef_decoder}{diagrams/arch_ef_decoder}{UNFINISHED - Visualization of the efficient decoder architecture.}
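As with the encoder, the following minimal PyTorch sketch illustrates one decoder upsampling stage, i.e.\ parameter-free nearest-neighbor enlargement followed by a learned convolution. Channel counts, the scale factor, and the use of PyTorch's built-in circular padding mode (which wraps both axes, unlike the azimuth-only scheme sketched for the encoder) are illustrative assumptions.
\begin{verbatim}
# Minimal sketch of one decoder upsampling stage: nearest-neighbor
# interpolation followed by a convolution (instead of a transposed
# convolution); all sizes are illustrative placeholders.
import torch.nn as nn

def upsample_stage(in_ch, out_ch, scale=(1, 4)):
    return nn.Sequential(
        # parameter-free enlargement avoids checkerboard artifacts
        nn.Upsample(scale_factor=scale, mode="nearest"),
        # learned refinement; note that padding_mode="circular" wraps both
        # axes -- an azimuth-only wrap would use explicit per-axis padding
        # as in the encoder sketch above
        nn.Conv2d(in_ch, out_ch, kernel_size=(3, 17), padding=(1, 8),
                  padding_mode="circular", bias=False),
        nn.BatchNorm2d(out_ch),
        nn.ReLU(inplace=True),
    )
\end{verbatim}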
Even though both encoders were designed for the same input dimensionality of $2048 \times 32$, their computational requirements differ significantly. To quantify this, we compared the number of trainable parameters and the number of multiply-accumulate operations (MACs) for different latent space sizes used in our experiments.
\begin{table}[h]
\centering
\caption{Comparison of parameter count and MACs for SubTer\_LeNet and SubTer\_Efficient encoders across different latent space sizes.}
\begin{tabular}{c|cc|cc}
\toprule
\multirow{2}{*}{Latent dim} & \multicolumn{2}{c|}{SubTer\_LeNet} & \multicolumn{2}{c}{SubTer\_Efficient} \\
& Params & MACs & Params & MACs \\
\midrule
32 & 8.40M & 17.41G & 1.17M & 2.54G \\
64 & 16.38M & 17.41G & 1.22M & 2.54G \\
128 & 32.35M & 17.41G & 1.33M & 2.54G \\
256 & 64.30M & 17.41G & 1.55M & 2.54G \\
512 & 128.19M & 17.41G & 1.99M & 2.54G \\
768 & 192.07M & 17.41G & 2.43M & 2.54G \\
1024 & 255.96M & 17.41G & 2.87M & 2.54G \\
\bottomrule
\end{tabular}
\label{tab:lenet_vs_efficient}
\end{table}
\todo[inline]{rework table and calculate with actual scripts and network archs in deepsad codebase}
As can be seen, the efficient encoder requires an order of magnitude fewer parameters and significantly fewer operations while maintaining a comparable representational capacity. The key reasons are the use of depthwise-separable convolutions, the aggressive pooling along the densely sampled horizontal axis, and the channel compression before the fully connected layer. Interestingly, the efficient network also processes more intermediate channels (up to 32 compared to only 8 in the LeNet variant), which increases its ability to capture a richer set of patterns despite the reduced computational cost. This combination of efficiency and representational power makes the efficient encoder a more suitable backbone for our anomaly detection task.
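To make such a comparison reproducible, parameters and MACs can be determined with a short script along the lines of the following PyTorch sketch. The model and input shape are placeholders for the actual networks and loader output in the DeepSAD codebase, and the hook-based MAC estimate only covers convolutional and fully connected layers.
\begin{verbatim}
# Sketch for counting trainable parameters and estimating MACs; the model
# and the input shape (1, 1, 32, 2048) are placeholders.
import torch
import torch.nn as nn

def count_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def count_macs(model, input_shape=(1, 1, 32, 2048)):
    macs, hooks = 0, []

    def conv_hook(m, _inp, out):
        nonlocal macs
        k = m.weight.shape[2] * m.weight.shape[3]
        macs += out.numel() * k * (m.in_channels // m.groups)

    def linear_hook(m, _inp, _out):
        nonlocal macs
        macs += m.in_features * m.out_features

    for m in model.modules():
        if isinstance(m, nn.Conv2d):
            hooks.append(m.register_forward_hook(conv_hook))
        elif isinstance(m, nn.Linear):
            hooks.append(m.register_forward_hook(linear_hook))
    with torch.no_grad():
        model(torch.zeros(*input_shape))
    for h in hooks:
        h.remove()
    return macs

# example usage with a trivial placeholder model
model = nn.Sequential(nn.Conv2d(1, 8, kernel_size=(3, 17), padding=(1, 8)))
print(count_params(model), count_macs(model))
\end{verbatim}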
\todo[inline]{mention that as we see in AE results the efficient arch is capable of reproducing inputs better and especially so in lower dimensional latent spaces}
\threadtodo
{how was training/testing adapted (networks overview), inference, ae tuning}
{data has been loaded, how is it processed}
{networks defined, training/testing k-fold, more metrics, inference + ae tuning implemented}
{training procedure known $\rightarrow$ what methods were evaluated}
\threadtodo
{custom arch necessary, first lenet then second arch to evaluate importance of arch}
{training process understood, but what networks were actually trained}
{custom arch, lenet from paper and simple, receptive field problem, arch really important?}
{motivation behind archs given $\rightarrow$ what do they look like}
%\todo[inline]{architectures, visualization, receptive field (explanation, images, x/y resolution)}
\threadtodo
{show and explain both archs}
{we know why we need them but what do they look like}
{visualization of archs, explain LeNet and why other arch was chosen that way}
{both archs known $\rightarrow$ what about the other inputs/hyperparameters}
%\todo[inline]{hyperparameters, LR, eta, epochs, latent space size (hyper param search), semi labels}
\threadtodo
{give overview of hyperparameters}
{deepsad arch known, other hyperparameters?}
{LR, eta, epochs, latent space size (hyper param search), semi labels}
{everything that goes into training known $\rightarrow$ what experiments were actually done?}
\newsubsubsectionNoTOC{Baseline methods (Isolation Forest, one-class SVM) and feature extraction via the encoder}
\threadtodo
{what methods were evaluated}
{we know what testing/training was implemented for deepsad, but what is it compared to}
{isoforest, ocsvm adapted, for ocsvm only dim reduced feasible (ae from deepsad)}
{compared methods known $\rightarrow$ what methods were used}
To contextualize the performance of DeepSAD, we compare against two widely used baselines: Isolation Forest and One-Class SVM (OCSVM). Both are included in the original DeepSAD codebase and the associated paper, and they represent well-understood but conceptually different families of anomaly detection. In our setting, the raw input dimensionality ($2048 \times 32$ per frame) is too high for a direct OCSVM fit, so we reuse the DeepSAD autoencoder's \emph{encoder} as a learned dimensionality reduction (to the same latent size as DeepSAD). This choice is motivated by practicality (compute) and inductive bias: the encoder captures non-linear, domain-specific structure of the LiDAR range images, which linear methods like PCA may miss. Together, these two baselines cover complementary perspectives: tree-based partitioning (Isolation Forest) and kernel-based boundary learning (OCSVM), providing a broad and well-established basis for comparison.
Isolation Forest is an ensemble method for anomaly detection that builds on the principle that anomalies are easier to separate from the rest of the data. It constructs many binary decision trees, each by recursively splitting the data at randomly chosen features and thresholds. In this process, the ``training'' step consists of building the forest of trees: each tree captures different random partitions of the input space, and together they form a diverse set of perspectives on how easily individual samples can be isolated.
Once trained, the method assigns an anomaly score to new samples by measuring their average path length through the trees. Normal samples, being surrounded by other similar samples, typically require many recursive splits and thus end up deep in the trees. Anomalies, by contrast, stand out in one or more features, which means they can be separated much earlier and end up closer to the root. The shorter the average path length, the more anomalous the sample is considered. This makes Isolation Forest highly scalable and robust: training is efficient since no explicit density estimation is required, and the resulting model is fast to apply to new data. In our setup, we apply Isolation Forest directly to the LiDAR input representation, providing a strong non-neural baseline for comparison against DeepSAD.
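A minimal sketch of this baseline with scikit-learn is given below; the random arrays stand in for the flattened $32 \times 2048$ projections produced by our data loader, and the hyperparameters are illustrative defaults rather than the values used in our experiments.
\begin{verbatim}
# Isolation Forest baseline sketch (scikit-learn); the random arrays are
# placeholders for the flattened LiDAR projections.
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.default_rng(0)
X_train = rng.random((100, 32 * 2048), dtype=np.float32)  # placeholder frames
X_test = rng.random((20, 32 * 2048), dtype=np.float32)

forest = IsolationForest(n_estimators=100, random_state=0)
forest.fit(X_train)

# score_samples is higher for normal samples; negate it so that larger
# values indicate more anomalous frames
anomaly_scores = -forest.score_samples(X_test)
\end{verbatim}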
While Isolation Forest relies on random partitioning of the input space, OCSVM takes a very different approach by learning a flexible boundary around normal samples. OCSVM is trained only on data assumed to be normal, with the goal of enclosing the majority of these samples in such a way that new points lying outside this boundary can be identified as anomalies.
The boundary itself is learned using the support vector machine framework. In essence, OCSVM looks for a hyperplane in some feature space that maximizes the separation between the bulk of the data and the origin. To make this possible even when the normal data has a complex, curved shape, OCSVM uses a kernel function such as the radial basis function (RBF). The kernel implicitly maps the input data into a higher-dimensional space, where the cluster of normal samples becomes easier to separate with a simple hyperplane. When this separation is mapped back to the original input space, it corresponds to a flexible, nonlinear boundary that can adapt to the structure of the data.
During training, the algorithm balances two competing objectives: capturing as many of the normal samples as possible inside the boundary, while keeping the region compact enough to exclude potential outliers. Once this boundary is established, applying OCSVM is straightforward: any new data point is checked against the learned boundary, with points inside considered normal and those outside flagged as anomalous.
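For reference, this trade-off corresponds to the standard $\nu$-parameterized one-class SVM objective
\begin{equation}
\min_{w,\,\xi,\,\rho} \;\; \frac{1}{2}\|w\|^{2} + \frac{1}{\nu n}\sum_{i=1}^{n}\xi_{i} - \rho
\quad \mathrm{s.t.} \quad \langle w, \Phi(x_{i})\rangle \geq \rho - \xi_{i}, \quad \xi_{i} \geq 0,
\end{equation}
where $\Phi$ is the feature map induced by the kernel, $n$ the number of training samples, and $\nu \in (0,1]$ upper-bounds the fraction of training samples allowed to fall outside the boundary. A new point $x$ is scored by the signed distance $\langle w, \Phi(x)\rangle - \rho$, with negative values indicating points outside the boundary.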
We adapted the baseline implementations to our data loader and input format \todo[inline]{briefly describe file layout / preprocessing}, and added support for multiple evaluation targets per frame (two labels per data point), reporting both results per experiment. For OCSVM, the dimensionality reduction step is \emph{always} performed with the corresponding DeepSAD encoder and its autoencoder pretraining weights that match the evaluated setting (i.e., same latent size and backbone). Both baselines, like DeepSAD, output continuous anomaly scores. This allows us to evaluate them directly without committing to a fixed threshold.
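The following sketch illustrates how such an OCSVM baseline on encoder features can be set up with scikit-learn. The linear placeholder encoder and the random tensors stand in for the pretrained DeepSAD encoder and the actual data loader output, and the hyperparameters are illustrative only.
\begin{verbatim}
# OCSVM-on-latent-features sketch; the linear "encoder" and random frames
# are placeholders for the pretrained DeepSAD encoder and real data.
import torch
import torch.nn as nn
from sklearn.svm import OneClassSVM

latent_dim = 64
encoder = nn.Sequential(nn.Flatten(), nn.Linear(32 * 2048, latent_dim))  # placeholder

def encode(frames):
    encoder.eval()
    with torch.no_grad():
        return encoder(frames).numpy()

train_frames = torch.rand(100, 1, 32, 2048)  # placeholder (assumed normal)
test_frames = torch.rand(20, 1, 32, 2048)

ocsvm = OneClassSVM(kernel="rbf", nu=0.1, gamma="scale")
ocsvm.fit(encode(train_frames))

# decision_function is positive inside the learned boundary; negate it so
# that larger values mean more anomalous
anomaly_scores = -ocsvm.decision_function(encode(test_frames))
\end{verbatim}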
\section{Experiment Matrix \& Computational Environment}
% Combines: Experiment Matrix + Hardware & Runtimes
% Goals: clearly enumerate each experiment configuration and give practical runtime details
\newsubsubsectionNoTOC{Table of experiment variants (architectures, hyperparameters, data splits)}
\threadtodo
{give overview of experiments and their motivations}
{training setup clear, but not what was trained/tested}
{explanation of what was searched for (ae latent space first), other hyperparams and why}
{all experiments known $\rightarrow$ how long do they take to train}
\newsubsubsectionNoTOC{Hardware specifications (GPU/CPU, memory), software versions, typical training/inference runtimes}
\threadtodo
{give overview about hardware setup and how long things take to train}
{we know what we trained but not how long that takes}
{table of hardware and of how long different trainings took}
{experiment setup understood $\rightarrow$ what were the experiments' results}
\newchapter{results_discussion}{Results and Discussion}
\newsection{results}{Results}
\todo[inline]{some results, ROC curves, for both global and local}
\newsection{hyperparameter_analysis}{Hyperparameter Analysis}
\todo[inline]{result for different amounts of labeled data}
\newchapter{conclusion_future_work}{Conclusion and Future Work}
\newsection{conclusion}{Conclusion}
\todo[inline]{summarize what has been achieved}
\newsection{future_work}{Future Work}
\todo[inline]{confirm results with real smoke data}
% end mainmatter
% **************************************************************************************************
\appendix
\ifthenelse{\equal{\DocumentType}{thesis}}
{
\setcounter{mypageno}{\value{page}}
\frontmatter \pagestyle{plain} \pagenumbering{Roman}
\setcounter{page}{\value{mypageno}}
}{}
\printbibliography
\listoffigures
\listoftables
\printglossary[type=\acronymtype]
% **************************************************************************************************
% place all floats and create label on last page
\FloatBarrier\label{end-of-document}
\end{document}