Compare commits


3 Commits

Author         SHA1        Message                  Date
Jan Kowalczyk  545b65d3d5  feedback WIP             2025-10-11 15:58:44 +02:00
Jan Kowalczyk  8db244901e  feedback wip             2025-10-11 15:21:53 +02:00
Jan Kowalczyk  72afe9ebdc  nicer looking abstract   2025-10-11 13:38:39 +02:00
5 changed files with 491 additions and 145 deletions

View File

@@ -275,6 +275,180 @@
\endverb
\keyw{outlier detection,Anomaly detection}
\endentry
\entry{bg_svm}{article}{}{}
\name{author}{2}{}{%
{{hash=17acda211a651e90e228f1776ee07818}{%
family={Cortes},
familyi={C\bibinitperiod},
given={Corinna},
giveni={C\bibinitperiod}}}%
{{hash=c2b3e05872463585b4be6aab10d10d63}{%
family={Vapnik},
familyi={V\bibinitperiod},
given={Vladimir},
giveni={V\bibinitperiod}}}%
}
\list{publisher}{1}{%
{Springer}%
}
\strng{namehash}{4c67d5268f413e83454c8adc14ab43c3}
\strng{fullhash}{4c67d5268f413e83454c8adc14ab43c3}
\strng{fullhashraw}{4c67d5268f413e83454c8adc14ab43c3}
\strng{bibnamehash}{4c67d5268f413e83454c8adc14ab43c3}
\strng{authorbibnamehash}{4c67d5268f413e83454c8adc14ab43c3}
\strng{authornamehash}{4c67d5268f413e83454c8adc14ab43c3}
\strng{authorfullhash}{4c67d5268f413e83454c8adc14ab43c3}
\strng{authorfullhashraw}{4c67d5268f413e83454c8adc14ab43c3}
\field{sortinit}{8}
\field{sortinithash}{a231b008ebf0ecbe0b4d96dcc159445f}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{journaltitle}{Machine learning}
\field{number}{3}
\field{title}{Support-vector networks}
\field{volume}{20}
\field{year}{1995}
\field{pages}{273\bibrangedash 297}
\range{pages}{25}
\endentry
\entry{bg_kmeans}{article}{}{}
\name{author}{1}{}{%
{{hash=e6326ee35fdec69f1c1ef364c98e6216}{%
family={Lloyd},
familyi={L\bibinitperiod},
given={S.},
giveni={S\bibinitperiod}}}%
}
\strng{namehash}{e6326ee35fdec69f1c1ef364c98e6216}
\strng{fullhash}{e6326ee35fdec69f1c1ef364c98e6216}
\strng{fullhashraw}{e6326ee35fdec69f1c1ef364c98e6216}
\strng{bibnamehash}{e6326ee35fdec69f1c1ef364c98e6216}
\strng{authorbibnamehash}{e6326ee35fdec69f1c1ef364c98e6216}
\strng{authornamehash}{e6326ee35fdec69f1c1ef364c98e6216}
\strng{authorfullhash}{e6326ee35fdec69f1c1ef364c98e6216}
\strng{authorfullhashraw}{e6326ee35fdec69f1c1ef364c98e6216}
\field{sortinit}{9}
\field{sortinithash}{0a5ebc79d83c96b6579069544c73c7d4}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{journaltitle}{IEEE Transactions on Information Theory}
\field{number}{2}
\field{title}{Least squares quantization in PCM}
\field{volume}{28}
\field{year}{1982}
\field{pages}{129\bibrangedash 137}
\range{pages}{9}
\verb{doi}
\verb 10.1109/TIT.1982.1056489
\endverb
\keyw{Noise;Quantization (signal);Voltage;Receivers;Pulse modulation;Sufficient conditions;Stochastic processes;Probabilistic logic;Urban areas;Q measurement}
\endentry
\entry{bg_dbscan}{inproceedings}{}{}
\name{author}{4}{}{%
{{hash=2c062e64ed26aacc08a62155e7944f04}{%
family={Ester},
familyi={E\bibinitperiod},
given={Martin},
giveni={M\bibinitperiod}}}%
{{hash=9559fe65ed2c0877cf14a66fe1f8e9b3}{%
family={Kriegel},
familyi={K\bibinitperiod},
given={Hans-Peter},
giveni={H\bibinithyphendelim P\bibinitperiod}}}%
{{hash=802157026f850823b2027c2100cb359a}{%
family={Sander},
familyi={S\bibinitperiod},
given={Jörg},
giveni={J\bibinitperiod}}}%
{{hash=2dda16c0a5d50fc830d0d4a3787937fa}{%
family={Xu},
familyi={X\bibinitperiod},
given={Xiaowei},
giveni={X\bibinitperiod}}}%
}
\name{editor}{3}{}{%
{{hash=ebe3c105175ad500b489b3be8fab0279}{%
family={Simoudis},
familyi={S\bibinitperiod},
given={Evangelos},
giveni={E\bibinitperiod}}}%
{{hash=7cacfe272c4d395c979d6aecd2f5ec9c}{%
family={Han},
familyi={H\bibinitperiod},
given={Jiawei},
giveni={J\bibinitperiod}}}%
{{hash=d72660528ebbfc30c6661be74afda5c2}{%
family={Fayyad},
familyi={F\bibinitperiod},
given={Usama\bibnamedelima M.},
giveni={U\bibinitperiod\bibinitdelim M\bibinitperiod}}}%
}
\list{publisher}{1}{%
{AAAI Press}%
}
\strng{namehash}{9158a41d23cb4e154e78366d59c05728}
\strng{fullhash}{3270dfaa31e8210b3bd04b1bcf4a29a3}
\strng{fullhashraw}{3270dfaa31e8210b3bd04b1bcf4a29a3}
\strng{bibnamehash}{3270dfaa31e8210b3bd04b1bcf4a29a3}
\strng{authorbibnamehash}{3270dfaa31e8210b3bd04b1bcf4a29a3}
\strng{authornamehash}{9158a41d23cb4e154e78366d59c05728}
\strng{authorfullhash}{3270dfaa31e8210b3bd04b1bcf4a29a3}
\strng{authorfullhashraw}{3270dfaa31e8210b3bd04b1bcf4a29a3}
\strng{editorbibnamehash}{939413ab4a7ec18b5cc72dff25105ef5}
\strng{editornamehash}{f04653518ea0c78cffc4312148d46893}
\strng{editorfullhash}{939413ab4a7ec18b5cc72dff25105ef5}
\strng{editorfullhashraw}{939413ab4a7ec18b5cc72dff25105ef5}
\field{sortinit}{1}
\field{sortinithash}{4f6aaa89bab872aa0999fec09ff8e98a}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{booktitle}{KDD}
\field{isbn}{1-57735-004-9}
\field{title}{A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise.}
\field{year}{1996}
\field{pages}{226\bibrangedash 231}
\range{pages}{6}
\verb{urlraw}
\verb http://dblp.uni-trier.de/db/conf/kdd/kdd96.html#EsterKSX96
\endverb
\verb{url}
\verb http://dblp.uni-trier.de/db/conf/kdd/kdd96.html#EsterKSX96
\endverb
\endentry
\entry{bg_pca}{article}{}{}
\name{author}{1}{}{%
{{hash=716c197f50c5e070b09b67f32636d3e7}{%
family={F.R.S.},
familyi={F\bibinitperiod},
given={Karl\bibnamedelima Pearson},
giveni={K\bibinitperiod\bibinitdelim P\bibinitperiod}}}%
}
\list{publisher}{1}{%
{Taylor \& Francis}%
}
\strng{namehash}{716c197f50c5e070b09b67f32636d3e7}
\strng{fullhash}{716c197f50c5e070b09b67f32636d3e7}
\strng{fullhashraw}{716c197f50c5e070b09b67f32636d3e7}
\strng{bibnamehash}{716c197f50c5e070b09b67f32636d3e7}
\strng{authorbibnamehash}{716c197f50c5e070b09b67f32636d3e7}
\strng{authornamehash}{716c197f50c5e070b09b67f32636d3e7}
\strng{authorfullhash}{716c197f50c5e070b09b67f32636d3e7}
\strng{authorfullhashraw}{716c197f50c5e070b09b67f32636d3e7}
\field{sortinit}{1}
\field{sortinithash}{4f6aaa89bab872aa0999fec09ff8e98a}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{journaltitle}{The London, Edinburgh, and Dublin Philosophical Magazine and Journal of Science}
\field{number}{11}
\field{title}{LIII. On lines and planes of closest fit to systems of points in space}
\field{volume}{2}
\field{year}{1901}
\field{pages}{559\bibrangedash 572}
\range{pages}{14}
\verb{doi}
\verb 10.1080/14786440109462720
\endverb
\endentry
\entry{deepsad}{article}{}{}
\name{author}{7}{}{%
{{hash=002c037bd5c44a3c55a7523254ff0522}{%
@@ -322,8 +496,8 @@
\strng{authorfullhash}{b6771072ca1bb3c6a1aad2b4043727e6}
\strng{authorfullhashraw}{b6771072ca1bb3c6a1aad2b4043727e6}
\field{extraname}{1}
\field{sortinit}{8}
\field{sortinithash}{a231b008ebf0ecbe0b4d96dcc159445f}
\field{sortinit}{1}
\field{sortinithash}{4f6aaa89bab872aa0999fec09ff8e98a}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{eprinttype}{arXiv}
@@ -574,8 +748,8 @@
\strng{authornamehash}{1f3a901804f6733643aff983bcb44e58}
\strng{authorfullhash}{103d5e118395cff5749e9050a3f9888e}
\strng{authorfullhashraw}{103d5e118395cff5749e9050a3f9888e}
\field{sortinit}{1}
\field{sortinithash}{4f6aaa89bab872aa0999fec09ff8e98a}
\field{sortinit}{2}
\field{sortinithash}{8b555b3791beccb63322c22f3320aa9a}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{issn}{0950-7051}
@@ -667,8 +841,8 @@
\strng{editorfullhash}{83be554d58af5be1788b5c3616f0e92a}
\strng{editorfullhashraw}{83be554d58af5be1788b5c3616f0e92a}
\field{extraname}{2}
\field{sortinit}{1}
\field{sortinithash}{4f6aaa89bab872aa0999fec09ff8e98a}
\field{sortinit}{2}
\field{sortinithash}{8b555b3791beccb63322c22f3320aa9a}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{abstract}{Despite the great advances made by deep learning in many machine learning problems, there is a relative dearth of deep learning approaches for anomaly detection. Those approaches which do exist involve networks trained to perform a task other than anomaly detection, namely generative models or compression, which are in turn adapted for use in anomaly detection; they are not trained on an anomaly detection based objective. In this paper we introduce a new anomaly detection method—Deep Support Vector Data Description—, which is trained on an anomaly detection based objective. The adaptation to the deep regime necessitates that our neural network and training procedure satisfy certain properties, which we demonstrate theoretically. We show the effectiveness of our method on MNIST and CIFAR-10 image benchmark datasets as well as on the detection of adversarial examples of GTSRB stop signs.}
@@ -720,6 +894,38 @@
\verb https://lilianweng.github.io/posts/2018-08-12-vae/
\endverb
\endentry
\entry{bg_infomax}{article}{}{}
\name{author}{1}{}{%
{{hash=9bf3bf02cd4427c0d9eab547e61fc6ff}{%
family={Linsker},
familyi={L\bibinitperiod},
given={R.},
giveni={R\bibinitperiod}}}%
}
\strng{namehash}{9bf3bf02cd4427c0d9eab547e61fc6ff}
\strng{fullhash}{9bf3bf02cd4427c0d9eab547e61fc6ff}
\strng{fullhashraw}{9bf3bf02cd4427c0d9eab547e61fc6ff}
\strng{bibnamehash}{9bf3bf02cd4427c0d9eab547e61fc6ff}
\strng{authorbibnamehash}{9bf3bf02cd4427c0d9eab547e61fc6ff}
\strng{authornamehash}{9bf3bf02cd4427c0d9eab547e61fc6ff}
\strng{authorfullhash}{9bf3bf02cd4427c0d9eab547e61fc6ff}
\strng{authorfullhashraw}{9bf3bf02cd4427c0d9eab547e61fc6ff}
\field{sortinit}{2}
\field{sortinithash}{8b555b3791beccb63322c22f3320aa9a}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{journaltitle}{Computer}
\field{number}{3}
\field{title}{Self-organization in a perceptual network}
\field{volume}{21}
\field{year}{1988}
\field{pages}{105\bibrangedash 117}
\range{pages}{13}
\verb{doi}
\verb 10.1109/2.36
\endverb
\keyw{Intelligent networks;Biological information theory;Circuits;Biology computing;Animal structures;Neuroscience;Genetics;System testing;Neural networks;Constraint theory}
\endentry
\entry{bg_autoencoder_ad}{inbook}{}{}
\name{author}{4}{}{%
{{hash=976ff3d638254bc84287783be910c8ab}{%
@@ -958,6 +1164,53 @@
\field{title}{1D MEMS Micro-Scanning LiDAR}
\field{year}{2018}
\endentry
\entry{bg_slam}{article}{}{}
\name{author}{2}{}{%
{{hash=f20739d463254c239085b0098114da44}{%
family={Smith},
familyi={S\bibinitperiod},
given={Randall\bibnamedelima C.},
giveni={R\bibinitperiod\bibinitdelim C\bibinitperiod}}}%
{{hash=9ec288d3d1be96333e0fae9796707e68}{%
family={Cheeseman},
familyi={C\bibinitperiod},
given={Peter},
giveni={P\bibinitperiod}}}%
}
\list{publisher}{1}{%
{SAGE Publications}%
}
\strng{namehash}{7031c0ebfd4f9d2d33ef0ddcb231c367}
\strng{fullhash}{7031c0ebfd4f9d2d33ef0ddcb231c367}
\strng{fullhashraw}{7031c0ebfd4f9d2d33ef0ddcb231c367}
\strng{bibnamehash}{7031c0ebfd4f9d2d33ef0ddcb231c367}
\strng{authorbibnamehash}{7031c0ebfd4f9d2d33ef0ddcb231c367}
\strng{authornamehash}{7031c0ebfd4f9d2d33ef0ddcb231c367}
\strng{authorfullhash}{7031c0ebfd4f9d2d33ef0ddcb231c367}
\strng{authorfullhashraw}{7031c0ebfd4f9d2d33ef0ddcb231c367}
\field{sortinit}{3}
\field{sortinithash}{ad6fe7482ffbd7b9f99c9e8b5dccd3d7}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{issn}{1741-3176}
\field{journaltitle}{The International Journal of Robotics Research}
\field{month}{12}
\field{number}{4}
\field{title}{On the Representation and Estimation of Spatial Uncertainty}
\field{volume}{5}
\field{year}{1986}
\field{pages}{56\bibrangedash 68}
\range{pages}{13}
\verb{doi}
\verb 10.1177/027836498600500404
\endverb
\verb{urlraw}
\verb http://dx.doi.org/10.1177/027836498600500404
\endverb
\verb{url}
\verb http://dx.doi.org/10.1177/027836498600500404
\endverb
\endentry
\entry{lidar_denoising_survey}{article}{}{}
\name{author}{4}{}{%
{{hash=30663aad72dc59a49b7023f9c332b58a}{%
@@ -1207,8 +1460,8 @@
\strng{authornamehash}{d17e6557c5836d2d978179999ea1037f}
\strng{authorfullhash}{3ae53fe582e8a815b118d26947eaa326}
\strng{authorfullhashraw}{3ae53fe582e8a815b118d26947eaa326}
\field{sortinit}{5}
\field{sortinithash}{20e9b4b0b173788c5dace24730f47d8c}
\field{sortinit}{4}
\field{sortinithash}{9381316451d1b9788675a07e972a12a7}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{note}{\url{http://www.deeplearningbook.org}}
@@ -1246,8 +1499,8 @@
\strng{authornamehash}{e9af9fcd8483f077f0dcdbd95213a56e}
\strng{authorfullhash}{8179a2c222d1565711a7f216e4da6e56}
\strng{authorfullhashraw}{8179a2c222d1565711a7f216e4da6e56}
\field{sortinit}{5}
\field{sortinithash}{20e9b4b0b173788c5dace24730f47d8c}
\field{sortinit}{4}
\field{sortinithash}{9381316451d1b9788675a07e972a12a7}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{month}{05}
@@ -1288,8 +1541,8 @@
\strng{authornamehash}{01a32420f9995c8592740c3ad622e775}
\strng{authorfullhash}{c0310d5b84b91b546714624d9baf92c2}
\strng{authorfullhashraw}{c0310d5b84b91b546714624d9baf92c2}
\field{sortinit}{5}
\field{sortinithash}{20e9b4b0b173788c5dace24730f47d8c}
\field{sortinit}{4}
\field{sortinithash}{9381316451d1b9788675a07e972a12a7}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{issn}{1424-8220}
@@ -1481,8 +1734,8 @@
\strng{authornamehash}{1eed07a9c59db157d86a149850002efb}
\strng{authorfullhash}{5cd0fc84a08d52373df410079c09015c}
\strng{authorfullhashraw}{5cd0fc84a08d52373df410079c09015c}
\field{sortinit}{5}
\field{sortinithash}{20e9b4b0b173788c5dace24730f47d8c}
\field{sortinit}{4}
\field{sortinithash}{9381316451d1b9788675a07e972a12a7}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{issn}{1941-0468}
@@ -1547,8 +1800,8 @@
\strng{authorfullhash}{31c8cde264eb0da1d45f468f719f7a54}
\strng{authorfullhashraw}{31c8cde264eb0da1d45f468f719f7a54}
\field{extraname}{2}
\field{sortinit}{5}
\field{sortinithash}{20e9b4b0b173788c5dace24730f47d8c}
\field{sortinit}{4}
\field{sortinithash}{9381316451d1b9788675a07e972a12a7}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{booktitle}{2023 31st Mediterranean Conference on Control and Automation (MED)}
@@ -1596,8 +1849,8 @@
\strng{authornamehash}{ea684bebf6033a20ad34a33644ec89fc}
\strng{authorfullhash}{d6ad1c32e8f7738554f79d65d954b4f9}
\strng{authorfullhashraw}{d6ad1c32e8f7738554f79d65d954b4f9}
\field{sortinit}{6}
\field{sortinithash}{b33bc299efb3c36abec520a4c896a66d}
\field{sortinit}{5}
\field{sortinithash}{20e9b4b0b173788c5dace24730f47d8c}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{issn}{1556-4967}
@@ -1645,8 +1898,8 @@
\strng{authornamehash}{5e0b9f9cab8ce61be5266767752c12dc}
\strng{authorfullhash}{d932d7249aa0617596765b2fc72a8152}
\strng{authorfullhashraw}{d932d7249aa0617596765b2fc72a8152}
\field{sortinit}{6}
\field{sortinithash}{b33bc299efb3c36abec520a4c896a66d}
\field{sortinit}{5}
\field{sortinithash}{20e9b4b0b173788c5dace24730f47d8c}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{abstract}{Autoencoder is an unsupervised learning model, which can automatically learn data features from a large number of samples and can act as a dimensionality reduction method. With the development of deep learning technology, autoencoder has attracted the attention of many scholars. Researchers have proposed several improved versions of autoencoder based on different application fields. First, this paper explains the principle of a conventional autoencoder and investigates the primary development process of an autoencoder. Second, We proposed a taxonomy of autoencoders according to their structures and principles. The related autoencoder models are comprehensively analyzed and discussed. This paper introduces the application progress of autoencoders in different fields, such as image classification and natural language processing, etc. Finally, the shortcomings of the current autoencoder algorithm are summarized, and prospected for its future development directions are addressed.}
@@ -1687,8 +1940,8 @@
\strng{authornamehash}{c4d64624ede10e1baa66843e963d7c13}
\strng{authorfullhash}{c4d64624ede10e1baa66843e963d7c13}
\strng{authorfullhashraw}{c4d64624ede10e1baa66843e963d7c13}
\field{sortinit}{7}
\field{sortinithash}{108d0be1b1bee9773a1173443802c0a3}
\field{sortinit}{6}
\field{sortinithash}{b33bc299efb3c36abec520a4c896a66d}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{title}{ODDS Library}
@@ -1735,8 +1988,8 @@
\strng{authornamehash}{dd2ddc978fe083bcff1aa1379cd19643}
\strng{authorfullhash}{4dd3ca3cdc8023700c28169734d6ad61}
\strng{authorfullhashraw}{4dd3ca3cdc8023700c28169734d6ad61}
\field{sortinit}{7}
\field{sortinithash}{108d0be1b1bee9773a1173443802c0a3}
\field{sortinit}{6}
\field{sortinithash}{b33bc299efb3c36abec520a4c896a66d}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{issn}{0018-9219}
@@ -1811,8 +2064,8 @@
\strng{authornamehash}{0fca66725a9966a967fc7893b180ddef}
\strng{authorfullhash}{0e37676c60146890b0c3819a1c8e441b}
\strng{authorfullhashraw}{0e37676c60146890b0c3819a1c8e441b}
\field{sortinit}{7}
\field{sortinithash}{108d0be1b1bee9773a1173443802c0a3}
\field{sortinit}{6}
\field{sortinithash}{b33bc299efb3c36abec520a4c896a66d}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{issn}{2296-7745}
@@ -1885,8 +2138,8 @@
\strng{authornamehash}{e1fc6cab9b6009340e110518e53868c4}
\strng{authorfullhash}{cffcf38c642164887a370768f5701b8e}
\strng{authorfullhashraw}{cffcf38c642164887a370768f5701b8e}
\field{sortinit}{7}
\field{sortinithash}{108d0be1b1bee9773a1173443802c0a3}
\field{sortinit}{6}
\field{sortinithash}{b33bc299efb3c36abec520a4c896a66d}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{title}{MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications}
@@ -1937,8 +2190,8 @@
\strng{authorfullhash}{2b7b29fe45fee2bd5ddb1dd1cbbff521}
\strng{authorfullhashraw}{2b7b29fe45fee2bd5ddb1dd1cbbff521}
\field{extraname}{2}
\field{sortinit}{7}
\field{sortinithash}{108d0be1b1bee9773a1173443802c0a3}
\field{sortinit}{6}
\field{sortinithash}{b33bc299efb3c36abec520a4c896a66d}
\field{labelnamesource}{author}
\field{labeltitlesource}{title}
\field{booktitle}{2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition}

Binary file not shown.

View File

@@ -91,6 +91,8 @@
\makecell[l]{#1 \\ \emph{#2}}
}
\newcommand\rev[1]{\colorbox{yellow}{#1}}
% correct bad hyphenation
\hyphenation{}
@@ -112,7 +114,7 @@
% information fields
% general
\newcommand{\DocumentTitle}{Lidar Degradation Quantification for Robot Navigation in Hazy Environments}
\newcommand{\DocumentTitle}{\rev{LiDAR} Degradation Quantification for Robot Navigation in Hazy Environments}
%\newcommand{\DocumentSubtitle}{}
\newcommand{\ShortTitle}{} % used in headers (keep short!)
% for thesis: Firstname Surname, current university degree (e.g. BSc)
@@ -198,31 +200,31 @@
\newchapter{introduction}{Introduction}
Autonomous robots are increasingly used in search and rescue (SAR) missions, as they can take on dangerous tasks without putting additional human lives at risk. These tasks include navigating hazardous environments such as collapsed structures, identifying and locating victims, and assessing whether the environment is safe for human rescue teams. To perceive their surroundings, robots rely on multiple sensor systems such as lidar, radar, time-of-flight (ToF), ultrasound, optical cameras, or infrared cameras. Among these, lidar is the most widely used due to its high accuracy. The data from these sensors allows robots to map their environments, navigate through them, and make decisions such as which paths to prioritize. Many of the underlying algorithms are based on deep learning, trained on large datasets from which the models learn characteristic patterns.
Autonomous robots are increasingly used in search and rescue (SAR) missions, as they can take on dangerous tasks without putting additional human lives at risk. These tasks include navigating hazardous environments such as collapsed structures, identifying and locating victims, and assessing whether the environment is safe for human rescue teams. To perceive their surroundings, robots rely on multiple sensor systems such as \rev{LiDAR}, radar, time-of-flight (ToF), ultrasound, optical cameras, or infrared cameras. Among these, \rev{LiDAR} is the most widely used due to its high accuracy. The data from these sensors allows robots to map their environments, navigate through them, and make decisions such as which paths to prioritize. Many of the underlying algorithms are based on deep learning, trained on large datasets from which the models learn characteristic patterns.
Search and rescue environments pose difficult conditions for sensor systems to produce reliable data. A prominent challenge is the presence of aerosol particles such as smoke and dust, which can obstruct visibility and cause sensors to generate erroneous data. If such degraded conditions were not represented in the training data of the robot's algorithms, these errors may lead to unexpected outputs and potentially endanger both the robot and human rescue targets. This is especially critical for autonomous robots, whose decisions rely entirely on sensor data without human oversight. To mitigate these risks, robots must be able to assess the trustworthiness of their sensor data.
For remote controlled robots a human operator can make these decisions but many search and rescue missions do not allow remote control due to environmental factors, such as radio signal attenuation or the search area's size and therefore demand autonomous robots. Therefore, during the design for such robots we arrive at the following critical question:
For remote-controlled robots a human operator can make these decisions, but many search and rescue missions do not allow remote control due to environmental factors such as radio signal attenuation or the search area's size, and therefore demand autonomous robots. During the design of such robots we thus arrive at the following critical \rev{research} question:
\begin{quote} Can autonomous robots quantify the reliability of lidar sensor data in hazardous environments to make more informed decisions? \end{quote}
\begin{quote} Can autonomous robots quantify the reliability of \rev{LiDAR} sensor data in hazardous environments to make more informed decisions? \end{quote}
In this thesis we aim to answer this question by assessing a deep learning-based anomaly detection method and its performance when quantifying the sensor data's degradation. The employed algorithm is a semi-supervised anomaly detection algorithm which uses manually labeled training data to improve its performance over unsupervised methods. We compare the method's performance with common baseline methods from the same class of algorithms. The model's output is an anomaly score which quantifies the data reliability and can be used by algorithms that rely on the sensor data. These reliant algorithms may decide, for example, to slow down the robot to collect more data, choose alternative routes, signal for help, or rely more heavily on other sensors' input data.
Our experiments demonstrate that anomaly detection methods are indeed applicable to this task, allowing lidar data degradation to be quantified on subterranean datasets representative of SAR environments. Among the tested approaches, the semi-supervised method consistently outperformed established baselines. At the same time, the lack of suitable training data—and in particular the scarcity of reliable evaluation labels—proved to be a major limitation, constraining the extent to which the expected real-world performance of these methods could be assessed.
Our experiments demonstrate that anomaly detection methods are indeed applicable to this task, allowing \rev{LiDAR} data degradation to be quantified on subterranean datasets representative of SAR environments. Among the tested approaches, the semi-supervised method consistently outperformed established baselines. At the same time, the lack of suitable training data—and in particular the scarcity of reliable evaluation labels—proved to be a major limitation, constraining the extent to which the expected real-world performance of these methods could be assessed.
\newsection{scope_research}{Scope of Research}
In this thesis, we focus our research on the unique challenges faced by autonomous rescue robots, specifically the degradation of sensor data caused by airborne particles. While degradation in sensor data can also arise from adverse weather, material properties, or dynamic elements such as moving leaves, these factors are considered less relevant to the rescue scenarios targeted by our study and are therefore excluded. Although our method is versatile enough to quantify various types of degradation, our evaluation is limited to degradation from airborne particles, as this is the most prevalent issue in the operational environments of autonomous rescue robots.
While robotic computer vision systems often incorporate a variety of sensors—such as time-of-flight cameras, infrared cameras, and ultrasound sensors—we found that autonomous rescue robots primarily depend on lidar data for mapping and navigation. Lidar sensors offer high accuracy, high resolution, and an extensive field of view (often a full 360° horizontally and a substantial vertical coverage), which are essential for constructing comprehensive environmental maps in challenging scenarios. Furthermore, the cost of lidar sensors has decreased significantly in recent decades, driven by their widespread adoption in autonomous driving, drones, and robotics, as well as manufacturing advancements like microelectromechanical systems (MEMS). For these reasons, our research is focused exclusively on lidar sensor data—specifically, the point clouds generated within a defined coordinate system. Although sensor fusion techniques are commonly used to enhance data accuracy and confidence, incorporating fused data would not only add significant complexity to our study but also limit our analysis to platforms equipped with all the sensor types involved. Consequently, we concentrate on quantifying sensor degradation solely through lidar data.
While robotic computer vision systems often incorporate a variety of sensors—such as time-of-flight cameras, infrared cameras, and ultrasound sensors—we found that autonomous rescue robots primarily depend on \rev{LiDAR} data for mapping and navigation. \rev{LiDAR} sensors offer high accuracy, high resolution, and an extensive field of view (often a full 360° \rev{horizontal} and a substantial vertical coverage), which are essential for constructing comprehensive environmental maps in challenging scenarios. Furthermore, the cost of \rev{LiDAR} sensors has decreased significantly in recent decades, driven by their widespread adoption in autonomous driving, drones, and robotics, as well as manufacturing advancements like microelectromechanical systems (MEMS). For these reasons, our research is focused exclusively on \rev{LiDAR} sensor data—specifically, the point clouds generated within a defined coordinate system. Although sensor fusion techniques are commonly used to enhance data accuracy and confidence, incorporating fused data would not only add significant complexity to our study but also limit our analysis to platforms equipped with all the sensor types involved. Consequently, we concentrate on quantifying sensor degradation solely through \rev{LiDAR} data.
The method we employ produces an analog score that reflects the confidence in the sensor data, with lower confidence indicating higher degradation. Although we do not investigate the direct applications of this score, potential uses include simple thresholding to decide whether to proceed with a given action as well as dynamically adjusting the robot's speed based on data quality to collect additional data when confidence is low. Importantly, this output score is a snapshot for each lidar scan and does not incorporate temporal information. While many lidar sensors capture multiple scans per second—enabling the possibility of time-series analyses such as running averages or more advanced statistical evaluations—we focus solely on individual scans without examining the differences between successive scans.
The method we employ produces an analog score that reflects the confidence in the sensor data, with lower confidence indicating higher degradation. Although we do not investigate the direct applications of this score, potential uses include simple thresholding to decide whether to proceed with a given action as well as dynamically adjusting the robot's speed based on data quality to collect additional data when confidence is low. Importantly, this output score is a snapshot for each \rev{LiDAR} scan and does not incorporate temporal information. While many \rev{LiDAR} sensors capture multiple scans per second—enabling the possibility of time-series analyses such as running averages or more advanced statistical evaluations—we focus solely on individual scans without examining the differences between successive scans.
\newsection{thesis_structure}{Structure of the Thesis}
The remainder of this thesis is organized as follows.
Chapter~\ref{chp:background} introduces the theoretical background and related work, covering anomaly detection methods, semi-supervised learning algorithms, autoencoders, and the fundamentals of lidar sensing.
Chapter~\ref{chp:background} introduces the theoretical background and related work, covering anomaly detection methods, semi-supervised learning algorithms, autoencoders, and the fundamentals of \rev{LiDAR} sensing.
Chapter~\ref{chp:deepsad} presents the DeepSAD algorithm in detail, including its optimization objective, network architecture, and hyperparameters.
In Chapter~\ref{chp:data_preprocessing}, we describe the dataset, the preprocessing pipeline, and our labeling strategies.
Chapter~\ref{chp:experimental_setup} outlines the experimental design, implementation details, and evaluation protocol.
@@ -236,9 +238,9 @@ Finally, Chapter~\ref{chp:conclusion_future_work} concludes the thesis by summar
This thesis tackles a broad, interdisciplinary challenge at the intersection of robotics, computer vision, and data science. In this chapter, we introduce the background of anomaly detection, the framework in which we formulate our degradation quantification problem. Anomaly detection has its roots in statistical analysis and has been successfully applied in various domains. Recently, the incorporation of learning-based techniques, particularly deep learning, has enabled more efficient and effective analysis of large datasets.
Because anomalies are, by nature, often unpredictable in form and structure, unsupervised learning methods are widely used since they do not require pre-assigned labels—a significant advantage when dealing with unforeseen data patterns. However, these methods can be further refined through the integration of a small amount of labeled data, giving rise to semi-supervised approaches. The method evaluated in this thesis, DeepSAD, is a semi-supervised deep learning approach that also leverages an autoencoder architecture in its design. Autoencoders have gained widespread adoption in deep learning for their ability to extract features from unlabeled data, which is particularly useful for handling complex data types such as lidar scans.
Because anomalies are, by nature, often unpredictable in form and structure, unsupervised learning methods are widely used since they do not require pre-assigned labels—a significant advantage when dealing with unforeseen data patterns. However, these methods can be further refined through the integration of a small amount of labeled data, giving rise to semi-supervised approaches. The method evaluated in this thesis, DeepSAD, is a semi-supervised deep learning approach that also leverages an autoencoder architecture in its design. Autoencoders have gained widespread adoption in deep learning for their ability to extract features from unlabeled data, which is particularly useful for handling complex data types such as \rev{LiDAR} scans.
Lidar sensors function by projecting lasers in multiple directions near-simultaneously, measuring the time it takes for each reflected ray to return. Using the angles and travel times, the sensor constructs a point cloud that is often accurate enough to map the sensor's surroundings. In the following sections, we will delve into these technologies, review how they work, how they are generally used, how we employ them in this thesis and explore related work from these backgrounds.
\rev{LiDAR} sensors function by projecting lasers in multiple directions near-simultaneously, measuring the time it takes for each reflected ray to return. Using the angles and travel times, the sensor constructs a point cloud that is often accurate enough to map the sensor's surroundings. In the following sections, we will delve into these technologies, review how they work, how they are generally used, how we employ them in this thesis and explore related work from these backgrounds.
\newsection{anomaly_detection}{Anomaly Detection}
@@ -248,25 +250,25 @@ Anomaly detection refers to the process of detecting unexpected patterns of data
Figure~\ref{fig:anomaly_detection_overview} depicts a simple but illustrative example of data which can be classified as either normal or anomalous and shows the problem anomaly detection methods generally try to solve. A successful anomaly detection method would somehow learn to differentiate normal from anomalous data, for example by learning the boundaries around the available normal data and classifying it as either normal or anomalous based on its location inside or outside of those boundaries. Another possible approach could calculate an analog value which correlates with the likelihood of a sample being anomalous, for example by using the sample's distance from the closest normal data cluster's center.
\figc{anomaly_detection_overview}{figures/anomaly_detection_overview}{An illustrative example of anomalous and normal data containing 2-dimensional data with clusters of normal data $N_1$ and $N_2$ as well as two single anomalies $o_1$ and $o_2$ and a cluster of anomalies $O_3$. Reproduced from~\cite{anomaly_detection_survey}}{width=0.5\textwidth}
\figc{anomaly_detection_overview}{figures/anomaly_detection_overview}{An illustrative example of anomalous and normal data containing 2-dimensional data with clusters of normal data $N_1$ and $N_2$ as well as two single anomalies $o_1$ and $o_2$ and a cluster of anomalies $O_3$. Reproduced from~\cite{anomaly_detection_survey}\rev{.}}{width=0.5\textwidth}
By their very nature, anomalies are rare occurrences and oftentimes unpredictable, which makes it hard to define all possible anomalies in any system. It also makes it very challenging to create an algorithm which is capable of detecting anomalies which may have never occurred before and may not have been known to exist during the creation of the detection algorithm. There are many possible approaches to this problem, though they can be roughly grouped into six distinct categories based on the techniques used~\cite{anomaly_detection_survey}:
\begin{enumerate}
\item \textbf{Classification Based} - A classification technique such as an Support Vector Machine (SVM) or a fitting neural network is used to classify samples as either normal or anomalous based on labeled training data. Alternatively, if not enough labeled training data is available a one-class classification algorithm can be employed. In that case, the algorithm assumes all training samples to be normal and then learns a boundary around the normal samples to differentiate them from anomalous samples.
\item \textbf{Clustering Based} - Clustering techniques such as K-Means clustering or DBSCAN aim to group similar data together into clusters, differentiating it from dissimilar data which may belong to another or no cluster at all. Anomaly detection methods from this category employ such a technique, with the assumption that normal data will assemble into one or more clusters due to their similar properties, while anomalies may create their own smaller clusters, not belong to any cluster at all or at least be an appreciable distance from the closest normal cluster's center.
\item \textbf{Nearest Neighbor Based} - Similar to the clustering based category, these techniques assume normal data is more closely clustered than anomalies and therefore utilize either a sample's distance to their $k^{th}$ nearest neighbor or the density of their local neighborhood, to judge wether a sample is anomalous.
\item \textbf{Statistical} - These methods try to fit a statistical model of the normal behaviour to the data. After the distribution from which normal data originates is defined, samples can be found to be normal or anomalous based on their likelihood to arise from said distribution.
\item \textbf{Information Theoretic} - The main assumption for information theoretic anomaly detection methods, is that anomalies differ somehow in their information content from anomalous data. An information theoretic measure is therefore used to determine iregularities in the data's information content, enabling the detection of anomalous samples.
\item \textbf{Spectral} - Spectral approaches assume the possibility to map data into a lower-dimensional space, where normal data appears significantly different from anomalous data. To this end a dimensionality reduction technique such as Principal Component Analysis (PCA) is used to embed the data into a lower dimensional subspace. Although it may be easier to differentiate normal and anomalous data in that subspace, it is still necessary to employ a technique capable of this feat, so spectral methods are oftentimes used as a pre-processing step followed by an anomaly detection method operating on the data's subspace.
\item \textbf{Classification Based} \\ A classification technique such as \rev{a Support Vector Machine (SVM)~\cite{bg_svm}} is used to classify samples as either normal or anomalous based on labeled training data. Alternatively, if not enough labeled training data is available, a one-class classification algorithm can be employed. In that case, the algorithm assumes all training samples to be normal and then learns a boundary around the normal samples to differentiate them from anomalous samples.
\item \textbf{Clustering Based} \\ Clustering techniques such as \rev{K-Means~\cite{bg_kmeans}} or DBSCAN\rev{~\cite{bg_dbscan}} aim to group similar \rev{data into} clusters, differentiating it from dissimilar data which may belong to another or no cluster at all. Anomaly detection methods from this category employ such a technique, with the assumption that normal data will assemble into one or more clusters due to its similar properties, while anomalies may form their own smaller clusters, \rev{belong to no} cluster at all, or at least lie \rev{at} an appreciable distance from the closest normal cluster's center \rev{(a minimal scoring sketch follows this list)}.
\item \textbf{Nearest Neighbor Based} \\ Similar to the clustering based category, these techniques assume normal data is more closely clustered than anomalies and therefore utilize either a sample's distance to its $k^{th}$ nearest neighbor or the density of its local neighborhood to judge whether a sample is anomalous.
\item \textbf{Statistical} \\ These methods try to fit a statistical model of the normal behaviour to the data. After the distribution from which normal data originates is defined, samples can be found to be normal or anomalous based on their likelihood \rev{of arising from that} distribution.
\item \textbf{Information Theoretic} \\ The main assumption for information theoretic anomaly detection methods is that anomalies somehow differ in their information content from normal data. An information theoretic measure is therefore used to determine \rev{irregularities} in the data's information content, enabling the detection of anomalous samples.
\item \textbf{Spectral} \\ Spectral approaches assume that data can be mapped into a lower-dimensional space where normal data appears significantly different from anomalous data. To this end, a dimensionality reduction technique such as Principal Component Analysis (PCA)\rev{~\cite{bg_pca}} is used to embed the data into a lower dimensional \rev{subspace. Spectral} methods are oftentimes used as a pre-processing step followed by another anomaly detection method operating on the data's subspace.
\end{enumerate}
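To make the clustering based idea above more concrete, the following minimal sketch computes such an analog anomaly score as the distance to the nearest cluster center. The synthetic data, the cluster count and the use of scikit-learn are assumptions made purely for illustration; this is not the method evaluated in this thesis.
\begin{verbatim}
# Illustrative sketch only: clustering-based anomaly scoring with K-Means.
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
normal_data = rng.normal(0.0, 1.0, size=(500, 2))  # stand-in for normal samples
anomalies = rng.normal(6.0, 1.0, size=(10, 2))      # stand-in for anomalous samples

# Fit clusters on data assumed to be normal.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(normal_data)

def anomaly_score(x):
    # Analog score: distance of each sample to its closest cluster center.
    return kmeans.transform(x).min(axis=1)

print(anomaly_score(normal_data).mean())  # small distances -> likely normal
print(anomaly_score(anomalies).mean())    # large distances -> likely anomalous
\end{verbatim}
The same distance-to-center intuition underlies DeepSAD which, as described below, learns both the lower-dimensional embedding and the center with a neural network.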
In this thesis we used an anomaly detection method, namely \citetitle{deepsad}~\cite{deepsad} (DeepSAD) to model our problem -how to quantify the degradation of lidar sensor data- as an anomaly detection problem. We do this by classifying good quality data as normal and degraded data as anomalous and rely on a method which can express each samples likelihood of being anomalous as an analog anomaly score, which enables us to interpret it as the datas degradation quantification value.
In this thesis we use an anomaly detection method, namely \citetitle{deepsad}\rev{~(DeepSAD)~\cite{deepsad}}, to model our problem \rev{of quantifying} the degradation of \rev{LiDAR} sensor data as an anomaly detection problem. We do this by classifying good quality data as normal and degraded data as anomalous and rely on a method which can express each sample's likelihood of being anomalous as an analog anomaly score, which enables us to interpret it as the \rev{data's} degradation quantification value.
Chapter~\ref{chp:deepsad} describes DeepSAD in more detail, which shows that it is a clustering based approach with a spectral pre-processing component, in that it uses a neural network to reduce the inputs dimensionality while simultaneously clustering normal data closely around a given centroid. It then produces an anomaly score by calculating the geometric distance between a data sample and the aforementioned cluster centroid, assuming the distance is shorter for normal than for anomalous data. Since our data is high dimensional it makes sense to use a spectral method to reduce the datas dimensionality and an approach which results in an analog value rather than a binary classification is useful for our use case since we want to quantify not only classify the data degradation.
Chapter~\ref{chp:deepsad} describes DeepSAD in more detail, showing that it is a clustering based approach with a spectral pre-processing component, in that it uses a neural network to reduce the input's dimensionality while simultaneously clustering normal data closely around a given centroid. It then produces an anomaly score by calculating the geometric distance between a data sample and the aforementioned cluster centroid, assuming the distance is shorter for normal than for anomalous data. Since our data is high dimensional it makes sense to use a spectral method to reduce \rev{its} dimensionality\rev{. Furthermore,} an approach which results in an analog value rather than a binary classification is useful for our use case, since we want to quantify, not only classify, the data degradation.
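In simplified notation (the exact formulation follows in Chapter~\ref{chp:deepsad}), this anomaly score can be sketched as
\[
    s(\mathbf{x}) = \left\| \phi(\mathbf{x}; \mathcal{W}) - \mathbf{c} \right\|^{2},
\]
where $\phi(\cdot\,; \mathcal{W})$ denotes the neural network that maps a sample into the lower-dimensional space and $\mathbf{c}$ the cluster centroid; larger distances indicate stronger degradation.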
There is a wide array of problems in domains similar to the one we research in this paper, for which modeling them as anomaly detection problems has been proven successful. The degradation of pointclouds, produced by an industrial 3D sensor, has been modeled as an anomaly detection task in \citetitle{bg_ad_pointclouds_scans}~\cite{bg_ad_pointclouds_scans}. \citeauthor{bg_ad_pointclouds_scans} propose a student-teacher model capable of infering a pointwise anomaly score for degradation in point clouds. The teacher network is trained on an anomaly-free dataset to extract dense features of the point clouds' local geometries, after which an identical student network is trained to emulate the teacher networks' outputs. For degraded pointclouds the regression between the teacher's and student's outputs is calculated and interpreted as the anomaly score, with the rationalization that the student network has not observed features produced by anomalous geometries during training, leaving it incapable of producing a similar output as the teacher for those regions. Another example would be \citetitle{bg_ad_pointclouds_poles}~\cite{bg_ad_pointclouds_poles}, which proposes a method to detect and classify pole-like objects in urban point cloud data, to differentiate between natural and man-made objects such as street signs, for autonomous driving purposes. An anomaly detection method was used to identify the vertical pole-like objects in the point clouds and then the preprocessed objects were grouped by similarity using a clustering algorithm to then classify them as either trees or man-made poles.
There is a wide \rev{set} of problems in domains similar to the one we research in this \rev{thesis} for which modeling them as anomaly detection problems has proven successful. The degradation of pointclouds, produced by an industrial 3D sensor, has been modeled as an anomaly detection task in \rev{\cite{bg_ad_pointclouds_scans}}. \citeauthor{bg_ad_pointclouds_scans} propose a student-teacher model capable of inferring a pointwise anomaly score for degradation in point clouds. The teacher network is trained on an anomaly-free dataset to extract dense features of the point clouds' local geometries, after which an identical student network is trained to emulate the teacher network's outputs. For degraded pointclouds the regression error between the teacher's and student's outputs is calculated and interpreted as the anomaly score, with the rationalization that the student network has not observed features produced by anomalous geometries during training, leaving it incapable of producing an output similar to the teacher's for those regions. Another example would be \rev{\cite{bg_ad_pointclouds_poles}}, which proposes a method to detect and classify pole-like objects in urban point cloud data, to differentiate between natural and man-made objects such as street signs, for autonomous driving purposes. An anomaly detection method was used to identify the vertical pole-like objects in the point clouds, after which the preprocessed objects were grouped by similarity using a clustering algorithm and classified as either trees or man-made poles.
As already briefly mentioned at the beginning of this section, anomaly detection methods and their usage are oftentimes challenged by the limited availability of anomalous data, owing to the very nature of anomalies, which are rare occurrences. Oftentimes the intended use case is even to find unknown anomalies in a given dataset which have not yet been identified. In addition, it can be challenging to classify anomalies correctly for complex data, since the very definition of an anomaly is dependent on many factors such as the type of data, the intended use case or even how the data evolves over time. For these reasons most types of anomaly detection approaches limit their reliance on anomalous data during training and many of them do not differentiate between normal and anomalous data at all. DeepSAD is a semi-supervised method which is characterized by using a mixture of labeled and unlabeled data.
@@ -276,7 +278,7 @@ As already shortly mentioned at the beginning of this section, anomaly detection
Machine learning refers to algorithms capable of learning patterns from existing data to perform tasks on previously unseen data, without being explicitly programmed to do so~\cite{machine_learning_first_definition}. Central to many approaches is the definition of an objective function that measures how well the model is performing. The model's parameters are then adjusted to optimize this objective. By leveraging these data-driven methods, machine learning can handle complex tasks across a wide range of domains.
Among the techniques employed in machine learning algorithms, neural networks have become especially prominent over the past few decades due to their ability to achieve state-of-the-art results across a wide variety of domains. They are most commonly composed of layers of interconnected artificial neurons. Each neuron computes a weighted sum of its inputs, adds a bias term, and then applies a nonlinear activation function, enabling them to model complex non-linear relationships. These layers are typically organized into three types:
Among the techniques employed in machine \rev{learning,} neural networks have become especially prominent over the past few decades due to their ability to achieve state-of-the-art results across a wide variety of domains. They are most commonly composed of layers of interconnected artificial neurons. Each neuron computes a weighted sum of its inputs, adds a bias term, and then applies a nonlinear activation function, enabling the network to model complex non-linear relationships. These layers are typically organized into three types:
\begin{itemize}
\item Input layer, which receives raw data.
@@ -284,70 +286,70 @@ Among the techniques employed in machine learning algorithms, neural networks ha
\item Output layer, which produces the network's final prediction.
\end{itemize}
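For illustration, the neuron computation described above can be written compactly as (symbols introduced here only for this sketch)
\[
    y = \sigma\left(\sum_{i} w_i x_i + b\right),
\]
where $x_i$ are the neuron's inputs, $w_i$ the learned weights, $b$ the bias term and $\sigma$ a nonlinear activation function.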
As outlined above, neural network training is formulated as an optimization problem: we define an objective function that measures how well the model is achieving its task and then we adjust the networks parameters to optimize that objective. The most common approach is stochastic gradient descent (SGD) or one of its variants (e.g., Adam). In each training iteration, the network first performs a forward pass to compute its outputs and evaluate the objective, then a backward pass—known as backpropagation—to calculate gradients of the objective with respect to every weight in the network. These gradients indicate the direction in which each weight should change to improve performance, and the weights are updated accordingly. Repeating this process over many iterations (also called epochs) allows the network to progressively refine its parameters and better fulfill its task.
As outlined above, neural network training is formulated as an optimization problem: we define an objective function that measures how well the model is achieving its task and then we adjust the network's parameters to optimize that objective. The most common approach is stochastic gradient descent (SGD) or one of its \rev{variants.} In each training iteration, the network first performs a forward pass to compute its outputs and evaluate the objective, then a backward pass—known as backpropagation—to calculate gradients of the objective with respect to every weight in the network. These gradients indicate the direction in which each weight should change to improve performance, and the weights are updated accordingly. Repeating this process over many iterations (full passes over the training data are called epochs) allows the network to progressively refine its parameters and better fulfill its task.
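As a sketch of a single SGD step (notation introduced here only for illustration), the parameters $\theta$ are updated with learning rate $\eta$ according to
\[
    \theta \leftarrow \theta - \eta \, \nabla_{\theta} \mathcal{L}(\theta),
\]
where $\mathcal{L}$ is the objective function and the gradient $\nabla_{\theta} \mathcal{L}$ is computed by backpropagation during the backward pass.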
Aside from the underlying technique, one can also categorize machine learning algorithms by the type of feedback provided during learning. Broadly speaking, three main categories—supervised, unsupervised and reinforcement learning—exist, although many other approaches do not exactly fit any of these categories and have spawned less common categories like semi-supervised or self-supervised learning.
In supervised learning, each input sample is paired with a “ground-truth” label representing the desired output. During training, the model makes a prediction and a loss function quantifies the difference between the prediction and the truth label. The learning algorithm then adjusts its parameters to minimize this loss, improving its performance over time. Labels are typically categorical (used for classification tasks, such as distinguishing “cat” from “dog”) or continuous (used for regression tasks, like predicting a temperature or distance). Figure~\ref{fig:ml_learning_schema_concept}~b) illustrates this principle with a classification example, where labelled data is used to learn a boundary between two classes.
In supervised learning, each input sample is paired with a “ground-truth” label representing the desired output. During training, the model makes a prediction and a loss function quantifies the difference between the prediction and the truth label. The learning algorithm then adjusts its parameters to minimize this loss, improving its performance over time. Labels are typically categorical (used for classification tasks, such as distinguishing “cat” from “dog”) or continuous (used for regression tasks, like predicting a temperature or distance). Figure~\ref{fig:ml_learning_schema_concept}~\rev{(b)} illustrates this principle with a classification example, where labelled data is used to learn a boundary between two classes.
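As an illustrative sketch (the symbols are generic and not tied to a specific model), such a supervised objective over $N$ labeled samples $(\mathbf{x}_i, y_i)$ can be written as
\[
    \mathcal{L} = \frac{1}{N} \sum_{i=1}^{N} \ell\left(f(\mathbf{x}_i), y_i\right),
\]
where $f$ denotes the model and $\ell$ a per-sample loss, for example the squared error $\ell(\hat{y}, y) = (\hat{y} - y)^2$ for regression.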
\figc{ml_learning_schema_concept}{figures/ml_learning_schema_concept.png}{Conceptual illustration of unsupervised (a) and supervised (b) learning. In (a), the inputs are two-dimensional data without labels, and the algorithm groups them into clusters without external guidance. In (b), the inputs have class labels (colors), which serve as training signals for learning a boundary between the two classes. Reproduced from~\cite{ml_supervised_unsupervised_figure_source}.}{width=0.6\textwidth}
In unsupervised learning, models work directly with raw data, without any ground-truth labels to guide the learning process. Instead, they optimize an objective that reflects the discovery of useful structure—whether that is grouping similar data points together or finding a compact representation of the data. For example, cluster analysis partitions the dataset into groups so that points within the same cluster are more similar to each other (according to a chosen similarity metric) than to points in other clusters, which can be seen in the toy example in figure~\ref{fig:ml_learning_schema_concept}~a). Dimensionality reduction methods, on the other hand, project high-dimensional data into a lower-dimensional space, optimizing for minimal loss of the original datas meaningful information.
In unsupervised learning, models work directly with raw data, without any ground-truth labels to guide the learning process. Instead, they optimize an objective that reflects the discovery of useful structure—whether that is grouping similar data points together or finding a compact representation of the data. For example, cluster analysis partitions the dataset into groups so that points within the same cluster are more similar to each other (according to a chosen similarity metric) than to points in other clusters, which can be seen in the toy example in \rev{Figure~\ref{fig:ml_learning_schema_concept}~(a)}. Dimensionality reduction methods, on the other hand, project high-dimensional data into a lower-dimensional space, optimizing for minimal loss of the original data's meaningful information.
%In reinforcement learning, the model—often called an agent—learns by interacting with an environment, that provides feedback in the form of rewards or penalties. At each step, the agent observes the environments state, selects an action, and an interpreter judges the action's outcome based on how the environment changed, providing a scalar reward or penalty that reflects the desirability of that outcome. The agents objective is to adjust its decision-making strategy to maximize the cumulative reward over time, balancing exploration of new actions with exploitation of known high-reward behaviors. This trial-and-error approach is well suited to sequential decision problems in complex settings, such as autonomous navigation or robotic control, where each choice affects both the immediate state and future possibilities.
In reinforcement learning, an agent learns by trial and error while interacting with an environment. After each action, it receives feedback in the form of rewards or penalties and adapts its strategy to maximize the total reward over time. This makes reinforcement learning particularly suited for sequential decision-making tasks such as robotics or game playing.
Semi-Supervised learning algorithms are an inbetween category of supervised and unsupervised algorithms, in that they use a mixture of labeled and unlabeled data. Typically vastly more unlabeled data is used during training of such algorithms than labeled data, due to the effort and expertise required to label large quantities of data correctly. Semi-supervised methods are oftentimes an effort to improve a machine learning algorithm belonging to either the supervised or unsupervised category. Supervised methods such as classification tasks are enhanced by using large amounts of unlabeled data to augment the supervised training without additional need of labeling work. Alternatively, unsupervised methods like clustering algorithms may not only use unlabeled data but improve their performance by considering some hand-labeled data during training.
Semi-supervised learning algorithms are an \rev{in-between} category of supervised and unsupervised algorithms, in that they use a mixture of labeled and unlabeled data. Typically vastly more unlabeled data is used during training of such algorithms than labeled data, due to the effort and expertise required to label large quantities of data correctly. Semi-supervised methods are oftentimes an effort to improve a machine learning algorithm belonging to either the supervised or unsupervised category. Supervised methods such as classification tasks are enhanced by using large amounts of unlabeled data to augment the supervised training without the need for additional labeling work. Alternatively, unsupervised methods like clustering algorithms may not only use unlabeled data but improve their performance by considering some hand-labeled data during training.
Machine learning based anomaly detection methods can utilize techniques from all of the aforementioned categories, although their suitability varies. While supervised anomaly detection methods exist, their usability not only depends on the availability of labeled training data but also on a reasonable proportionality between normal and anomalous data. Both requirements can be challenging due to labeling often being labour intensive and anomalies' intrinsic property to occur rarely when compared to normal data, making capture of enough anomalous behaviour a hard problem. Semi-Supervised anomaly detection methods are of special interest in that they may overcome these difficulties inherently present in many anomaly detection tasks~\cite{semi_ad_survey}. These methods typically have the same goal as unsupervised anomaly detection methods which is to model the normal class behaviour and delimitate it from anomalies, but they can incorporate some hand-labeled examples of normal and/or anomalous behaviour to improve their perfomance over fully unsupervised methods. DeepSAD is a semi-supervised method which extends its unsupervised predecessor Deep SVDD~\cite{deep_svdd} by including some labeled samples during training. Both, DeepSAD and Deep SVDD also utilize an autoencoder in a pre-training step, a machine learning architecture, frequently grouped with unsupervised algorithms, even though that definition can be contested when scrutinizing it in more detail, which we will do next.
Machine learning based anomaly detection methods can utilize techniques from all of the aforementioned categories, although their suitability varies. While supervised anomaly detection methods exist, their usability depends not only on the availability of labeled training data but also on a reasonable proportion between normal and anomalous data. Both requirements can be challenging to meet, because labeling is often labour-intensive and anomalies, by definition, occur rarely compared to normal data, making it hard to capture enough anomalous behaviour. Semi-supervised anomaly detection methods are of special interest in that they may overcome these difficulties inherently present in many anomaly detection tasks~\cite{semi_ad_survey}. These methods typically share the goal of unsupervised anomaly detection methods, which is to model the normal class behaviour and delimit it from anomalies, but they can incorporate some hand-labeled examples of normal and/or anomalous behaviour to improve their performance over fully unsupervised methods. DeepSAD is a semi-supervised method which extends its unsupervised predecessor Deep SVDD~\cite{deep_svdd} by including some labeled samples during training. Both DeepSAD and Deep SVDD also utilize an autoencoder in a pre-training step, a machine learning architecture\rev{, which we will look at next}.
\newsection{autoencoder}{Autoencoder}
Autoencoders are a type of neural network architecture, whose main goal is learning to encode input data into a representative state, from which the same input can be reconstructed, hence the name. They typically consist of two functions, an encoder and a decoder with a latent space inbetween them as depicted in the toy example in figure~\ref{fig:autoencoder_general}. The encoder learns to extract the most significant features from the input and to convert them into the input's latent space representation. The reconstruction goal ensures that the most prominent features of the input get retained during the encoding phase, due to the inherent inability to reconstruct the input if too much relevant information is missing. The decoder simultaneously learns to reconstruct the original input from its encoded latent space representation, by minimizing the error between the input sample and the autoencoder's output. This optimization goal complicates the categorization of autoencoders as unsupervised methods. Although they do not require labeled data, they still compute an error against a known target—the input itself. For this reason, some authors describe them as a form of self-supervised learning, where the data provides its own supervisory signal without requiring expert labeling.
Autoencoders are a type of neural network architecture whose main goal is learning to encode input data into a representative state from which the same input can be reconstructed, hence the name. They typically consist of two functions, an encoder and a decoder, with a latent space \rev{in between} them, as depicted in the toy example in \rev{Figure}~\ref{fig:autoencoder_general}. The encoder learns to extract the most significant features from the input and to convert them into the input's latent space representation. The reconstruction goal ensures that the most prominent features of the input are retained during the encoding phase, since reconstruction becomes impossible if too much relevant information is missing. The decoder simultaneously learns to reconstruct the original input from its encoded latent space representation by minimizing the error between the input sample and the autoencoder's output. This optimization goal complicates the categorization of autoencoders as unsupervised methods. Although they do not require labeled data, they still compute an error against a known target—the input itself. For this reason, some authors describe them as a form of self-supervised learning, where the data provides its own supervisory signal without requiring expert labeling.
\fig{autoencoder_general}{figures/autoencoder_principle.png}{Illustration of an autoencoder's working principle. The encoder $\mathbf{g_\phi}$ compresses the input into a lower-dimensional bottleneck representation $\mathbf{z}$, which is then reconstructed by the decoder $\mathbf{f_\theta}$. During training, the difference between input and output serves as the loss signal to optimize both the encoder's feature extraction and the decoder's reconstruction. Reproduced from~\cite{ml_autoencoder_figure_source}.
}
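To make the working principle above concrete, the following minimal PyTorch sketch implements an encoder $g_\phi$, a decoder $f_\theta$ and the reconstruction loss; the layer sizes and the flattened input dimensionality are illustrative assumptions and do not correspond to the architecture used later in this work.
\begin{verbatim}
import torch
import torch.nn as nn

# Minimal sketch of an autoencoder: g_phi maps the input to the bottleneck z,
# f_theta reconstructs the input from z. All sizes are illustrative only.
class ToyAutoencoder(nn.Module):
    def __init__(self, input_dim=784, latent_dim=32):
        super().__init__()
        self.encoder = nn.Sequential(          # g_phi
            nn.Linear(input_dim, 128), nn.ReLU(),
            nn.Linear(128, latent_dim),
        )
        self.decoder = nn.Sequential(          # f_theta
            nn.Linear(latent_dim, 128), nn.ReLU(),
            nn.Linear(128, input_dim),
        )

    def forward(self, x):
        z = self.encoder(x)
        return self.decoder(z), z

model = ToyAutoencoder()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
x = torch.rand(64, 784)                  # stand-in batch of flattened inputs
x_hat, z = model(x)
loss = nn.functional.mse_loss(x_hat, x)  # the input itself is the target
loss.backward()
optimizer.step()
\end{verbatim}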
One key use case of autoencoders is to employ them as a dimensionality reduction technique. In that case, the latent space inbetween the encoder and decoder is of a lower dimensionality than the input data itself. Due to the aforementioned reconstruction goal, the shared information between the input data and its latent space representation is maximized, which is known as following the Infomax principle. After training such an autoencoder, it may be used to generate lower-dimensional representations of the given datatype, enabling more performant computations which may have been infeasible to achieve on the original data. DeepSAD uses an autoencoder in a pre-training step to achieve this goal among others.
One key use case of autoencoders is to employ them as a dimensionality reduction technique. In that case, the latent space \rev{in between} the encoder and decoder is of a lower dimensionality than the input data itself. Due to the aforementioned reconstruction goal, the shared information between the input data and its latent space representation is maximized, which is known as following the Infomax principle\rev{~\cite{bg_infomax}}. After training such an autoencoder, it may be used to generate lower-dimensional representations of the given data type, enabling computations that may have been infeasible on the original data. DeepSAD uses an autoencoder in a pre-training step to achieve this goal, among others.
Autoencoders have been shown to be useful in the anomaly detection domain by assuming that autoencoders trained on more normal than anomalous data are better at reconstructing normal behaviour than anomalous one. This assumption allows methods to utilize the reconstruction error as an anomaly score. Examples of this are the method in \citetitle{bg_autoencoder_ad}~\cite{bg_autoencoder_ad} or the one in \citetitle{bg_autoencoder_ad_2}~\cite{bg_autoencoder_ad_2} which both employ an autoencoder and the aforementioned assumption. Autoencoders have also been shown to be a suitable dimensionality reduction technique for lidar data, which is oftentimes high-dimensional and sparse, making feature extraction and dimensionality reduction popular preprocessing steps. As an example, \citetitle{bg_autoencoder_lidar}~\cite{bg_autoencoder_lidar} shows the feasibility and advantages of using an autoencoder architecture to reduce lidar-orthophoto fused feature's dimensionality for their building detection method, which can recognize buildings in visual data taken from an airplane. Similarly, we can make use of the dimensionality reduction in DeepSAD's pre-training step, since our method is intended to work with high-dimensional lidar data.
Autoencoders have been shown to be useful in the anomaly detection domain by assuming that autoencoders trained on more normal than anomalous data are better at reconstructing normal behaviour than anomalous behaviour. This assumption allows methods to utilize the reconstruction error as an anomaly score. Examples of this are the methods in \rev{\cite{bg_autoencoder_ad} or \cite{bg_autoencoder_ad_2}}, which both employ an autoencoder and the aforementioned assumption. Autoencoders have also been shown to be a suitable dimensionality reduction technique for \rev{LiDAR} data, which is oftentimes high-dimensional and sparse, making feature extraction and dimensionality reduction popular preprocessing steps. As an example, \rev{\cite{bg_autoencoder_lidar}} shows the feasibility and advantages of using an autoencoder architecture to reduce the dimensionality of fused \rev{LiDAR}-orthophoto features for their building detection method, which can recognize buildings in visual data taken from an airplane. Similarly, we can make use of the dimensionality reduction in DeepSAD's pre-training step, since our method is intended to work with high-dimensional \rev{LiDAR} data.
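As a hedged illustration of how such a reconstruction error can be turned into an anomaly decision (the numbers below are synthetic stand-ins, not values from any cited method): errors collected on held-out, predominantly normal data can calibrate a threshold, and new samples whose error exceeds it are flagged as anomalous.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
# Stand-in reconstruction errors of held-out, predominantly normal samples.
val_errors_normal = rng.gamma(shape=2.0, scale=0.05, size=1000)
threshold = np.percentile(val_errors_normal, 95)   # illustrative choice

new_errors = np.array([0.08, 0.12, 0.55])   # errors of unseen samples
anomaly_scores = new_errors                 # the error itself is the score
is_anomalous = new_errors > threshold       # optional binary decision
\end{verbatim}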
\newsection{lidar_related_work}{Lidar - Light Detection and Ranging}
\newsection{lidar_related_work}{\rev{LiDAR} - Light Detection and Ranging}
Lidar (Light Detection and Ranging) measures distance by emitting short laser pulses and timing how long they take to return, an approach many may be familiar with from the more commonly known radar technology, which uses radio-frequency pulses and measures their return time to gauge an object's range. Unlike radar, however, lidar operates at much shorter wavelengths and can fire millions of pulses per second, achieving millimeter-level precision and dense, high-resolution 3D point clouds. This fine granularity makes lidar ideal for applications such as detailed obstacle mapping, surface reconstruction, and autonomous navigation in complex environments.
\rev{LiDAR} (Light Detection and Ranging) measures distance by emitting short laser pulses and timing how long they take to return, an approach many may be familiar with from the more commonly known radar technology, which uses radio-frequency pulses and measures their return time to gauge an object's range. Unlike radar, however, \rev{LiDAR} operates at much shorter wavelengths and can fire millions of pulses per second, achieving millimeter-level precision and dense, high-resolution 3D point clouds. This fine granularity makes \rev{LiDAR} ideal for applications such as detailed obstacle mapping, surface reconstruction, and autonomous navigation in complex environments.
Because the speed of light in air is effectively constant, multiplying half the roundtrip time by that speed gives the distance between the lidar sensor and the reflecting object, as can be seen visualized in figure~\ref{fig:lidar_working_principle}. Modern spinning multibeam lidar systems emit up to millions of these pulses every second. Each pulse is sent at a known combination of horizontal and vertical angles, creating a regular grid of measurements: for example, 32 vertical channels swept through 360° horizontally at a fixed angular spacing. While newer solid-state designs (flash, MEMS, phased-array) are emerging, spinning multi-beam lidar remains the most commonly seen type in autonomous vehicles and robotics because of its proven range, reliability, and mature manufacturing base.
Because the speed of light in air is effectively constant, multiplying half the round-trip time by that speed gives the distance between the \rev{LiDAR} sensor and the reflecting object, as visualized in \rev{Figure}~\ref{fig:lidar_working_principle}. Modern spinning multi-beam \rev{LiDAR} systems emit up to millions of these pulses every second. Each pulse is sent at a known combination of horizontal and vertical angles, creating a regular grid of measurements: for example, 32 vertical channels swept through 360° horizontally at a fixed angular spacing. While newer solid-state designs (flash, MEMS, phased-array) are emerging, spinning multi-beam \rev{LiDAR} remains the most commonly seen type in autonomous vehicles and robotics because of its proven range, reliability, and mature manufacturing base.
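As a brief numerical illustration (the value of $\Delta t$ is chosen for readability and not taken from any particular sensor): a pulse whose echo arrives $\Delta t = 400\,\mathrm{ns}$ after emission corresponds to a target at
\[
d = \frac{c \cdot \Delta t}{2} \approx \frac{3 \times 10^{8}\,\mathrm{m/s} \cdot 4 \times 10^{-7}\,\mathrm{s}}{2} = 60\,\mathrm{m}.
\]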
\figc{lidar_working_principle}{figures/bg_lidar_principle.png}{Illustration of the working principle of a lidar sensor. The emitter sends out an optical signal that is reflected by objects in the scene and captured by the receiver. The system controller measures the time delay $\Delta t$ between emission and reception to calculate distance using $d = c \cdot \Delta t / 2$. By repeating this process across many directions—either with multiple emitter/receiver pairs or sequentially in a spinning lidar—the sensor obtains a dense set of distances that, combined with their emission angles, form a 3D point cloud of the environment. Reproduced from~\cite{bg_lidar_figure_source}.
\figc{lidar_working_principle}{figures/bg_lidar_principle.png}{Illustration of the working principle of a \rev{LiDAR} sensor. The emitter sends out an optical signal that is reflected by objects in the scene and captured by the receiver. The system controller measures the time delay $\Delta t$ between emission and reception to calculate distance using $d = c \cdot \Delta t / 2$. By repeating this process across many directions—either with multiple emitter/receiver pairs or sequentially in a spinning \rev{LiDAR}—the sensor obtains a dense set of distances that, combined with their emission angles, form a 3D point cloud of the environment. Reproduced from~\cite{bg_lidar_figure_source}.
}{width=.8\textwidth}
Each instance a lidar emits and receives a laser pulse, it can use the ray's direction and the calculated distance to produce a single three-dimensional point. By collecting up to millions of such points each second, the sensor constructs a “point cloud”—a dense set of 3D coordinates relative to the lidars own position. In addition to X, Y, and Z, many lidars also record the intensity or reflectivity of each return, providing extra information about the surface properties of the object hit by the pulse.
\rev{Each time} a \rev{LiDAR} emits and receives a laser pulse, it can use the ray's direction and the calculated distance to produce a single three-dimensional point. By collecting up to millions of such points each second, the sensor constructs a “point cloud”—a dense set of 3D coordinates relative to the \rev{LiDAR}'s own position. In addition to \rev{$X$, $Y$, and $Z$}, many \rev{LiDAR}s also record the intensity or reflectivity of each return, providing extra information about the surface properties of the object hit by the pulse.
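A minimal sketch of this conversion from one range measurement and its emission angles to a 3D point is given below (sensor frame, angles in radians; the numeric values are illustrative and not taken from the dataset used later in this work).
\begin{verbatim}
import numpy as np

def polar_to_xyz(r, azimuth, elevation):
    """Convert range plus horizontal/vertical emission angles to X, Y, Z."""
    x = r * np.cos(elevation) * np.cos(azimuth)
    y = r * np.cos(elevation) * np.sin(azimuth)
    z = r * np.sin(elevation)
    return np.array([x, y, z])

# One return at 12.3 m range, 45 degrees azimuth, -5 degrees elevation.
point = polar_to_xyz(12.3, np.deg2rad(45.0), np.deg2rad(-5.0))
\end{verbatim}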
Lidars high accuracy, long range, and full-circle field of view make it indispensable for tasks like obstacle detection, simultaneous localization and mapping (SLAM), and terrain modeling in autonomous driving and mobile robotics. While complementary sensors—such as time-of-flight cameras, ultrasonic sensors, and RGB cameras—have their strengths at short range or in particular lighting, only lidar delivers the combination of precise 3D measurements over medium to long distances, consistent performance regardless of illumination, and the pointcloud density needed for safe navigation. Lidar systems do exhibit intrinsic noise (e.g., range quantization or occasional multi-return ambiguities), but in most robotic applications these effects are minor compared to environmental degradation.
\rev{LiDAR}'s high accuracy, long range, and full-circle field of view make it indispensable for tasks like obstacle detection, simultaneous localization and mapping~(SLAM)~\rev{\cite{bg_slam}}, and terrain modeling in autonomous driving and mobile robotics. While complementary sensors—such as time-of-flight cameras, ultrasonic sensors, and RGB cameras—have their strengths at short range or in particular lighting, only \rev{LiDAR} delivers the combination of precise 3D measurements over medium to long distances, consistent performance regardless of illumination, and the point cloud density needed for safe navigation. \rev{LiDAR} systems do exhibit intrinsic noise (e.g., range quantization or occasional multi-return ambiguities), but in most robotic applications these effects are minor compared to environmental degradation.
In subterranean and rescue domain scenarios, the dominant challenge is airborne particles: dust kicked up by debris or smoke from fires. These aerosols create early returns that can mask real obstacles and cause missing data behind particle clouds, undermining SLAM and perception algorithms designed for cleaner data. This degradation is a type of atmospheric scattering, which can be caused by any kind of airborne particulates (e.g., snowflakes) or liquids (e.g., water droplets). Other kinds of environmental noise exist as well, such as specular reflections caused by smooth surfaces, beam occlusion due to close objects blocking the sensor's field of view, or thermal drift, where temperature affects the sensor's circuits and mechanics and introduces biases in the measurements.
All of these may create unwanted noise in the point cloud created by the lidar, making this domain an important research topic. \citetitle{lidar_denoising_survey}~\cite{lidar_denoising_survey} gives an overview about the current state of research into denoising methods for lidar in adverse environments, categorizes them according to their approach (distance-, intensity- or learning-based) and concludes that all approaches have merits but also open challenges to solve, for autonomous systems to safely navigate these adverse environments. The current research is heavily focused on the automotive domain, which can be observed by the vastly higher number of methods filtering noise from adverse weather effects-environmental scattering from rain, snow and fog-than from dust, smoke or other particles occuring rarely in the automotive domain.
All of these may create unwanted noise in the point cloud created by the \rev{LiDAR}, making this domain an important research topic. \rev{In \cite{lidar_denoising_survey} an overview} of the current state of research into denoising methods for \rev{LiDAR} in adverse environments \rev{is given. It} categorizes them according to their approach (distance-, intensity-, or learning-based) and concludes that all approaches have merit but also open challenges to solve before autonomous systems can safely navigate these adverse environments. The current research is heavily focused on the automotive domain, which can be observed in the vastly higher number of methods filtering noise from adverse weather effects\rev{--}environmental scattering from rain, snow and fog--than from dust, smoke or other particles occurring rarely in the automotive domain.
A learning-based method to filter dust-caused degradation from lidar is introduced in \citetitle{lidar_denoising_dust}~\cite{lidar_denoising_dust}. The authors employ a convultional neural network to classify dust particles in lidar point clouds as such, enabling the filtering of those points and compare their methods to more conservative approaches, such as various outlier removal algorithms. Another relevant example would be the filtering method proposed in \citetitle{lidar_subt_dust_removal}~\cite{lidar_subt_dust_removal}, which enables the filtration of pointclouds degraded by smoke or dust in subterranean environments, with a focus on the search and rescue domain. To achieve this, they formulated a filtration framework that relies on dynamic onboard statistical cluster outlier removal, to classify and remove dust particles in point clouds.
A learning-based method to filter dust-caused degradation from \rev{LiDAR} is introduced in \rev{\cite{lidar_denoising_dust}}. The authors employ a convolutional neural network to classify dust particles in \rev{LiDAR} point clouds as such, enabling the filtering of those points, and compare their method to more conservative approaches, such as various outlier removal algorithms. Another relevant example is the filtering method proposed in \rev{\cite{lidar_subt_dust_removal}}, which enables the filtering of point clouds degraded by smoke or dust in subterranean environments, with a focus on the search and rescue domain. To achieve this, the authors formulated a filtration framework that relies on dynamic onboard statistical cluster outlier removal to classify and remove dust particles in point clouds.
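For context, the sketch below shows a conventional statistical outlier removal step of the kind such filtering methods are typically compared against; it is a simplified baseline, not the pipeline of either cited work, and it assumes the Open3D library is available.
\begin{verbatim}
import numpy as np
import open3d as o3d

points = np.random.rand(2048, 3) * 10.0        # stand-in point cloud
pcd = o3d.geometry.PointCloud()
pcd.points = o3d.utility.Vector3dVector(points)

# Points whose mean distance to their 20 nearest neighbours deviates by more
# than 2 standard deviations from the global average are removed as outliers.
filtered, kept_idx = pcd.remove_statistical_outlier(nb_neighbors=20,
                                                    std_ratio=2.0)
\end{verbatim}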
Our method does not aim to remove the noise or degraded points in the lidar data, but quantify its degradation to inform other systems of the autonomous robot about the data's quality, enabling more informed decisions. One such approach, though from the autonomous driving and not from the search and rescue domain can be found in \citetitle{degradation_quantification_rain}~\cite{degradation_quantification_rain}. A learning-based method to quantify the lidar sensor data degradation caused by adverse weather-effects was proposed, implemented by posing the problem as an anomaly detection task and utilizing DeepSAD to learn degraded data to be an anomaly and high quality data to be normal behaviour. DeepSAD's anomaly score was used as the degradation's quantification score. From this example we decided to imitate this method and adapt it for the search and rescue domain, although this proved challenging due to the more limited data availability. Since it was effective for this closely related use case, we also employed DeepSAD, whose detailed workings we present in the following chapter.
Our method does not aim to remove the noise or degraded points in the \rev{LiDAR} data, but to quantify its degradation to inform other systems of the autonomous robot about the data's quality, enabling more informed decisions. One such approach, though from the autonomous driving rather than the search and rescue domain, can be found in \rev{\cite{degradation_quantification_rain}, where a} learning-based method to quantify the \rev{LiDAR} sensor data degradation caused by adverse weather effects was proposed. \rev{They posed} the problem as an anomaly detection task and \rev{utilized} DeepSAD to learn that degraded data is anomalous and high-quality data is normal behaviour. DeepSAD's anomaly score was used as the degradation's quantification score. We decided to adopt this approach and adapt it for the search and rescue domain, although this proved challenging due to the more limited data availability. Since it was effective for this closely related use case, we also employed DeepSAD, whose detailed workings we present in the following chapter.
\newchapter{deepsad}{DeepSAD: Semi-Supervised Anomaly Detection}
In this chapter, we explore the method \citetitle{deepsad}~(DeepSAD)~\cite{deepsad}, which we employ to quantify the degradation of lidar scans caused by airborne particles in the form of artificially introduced water vapor from a theater smoke machine. A similar approach—modeling degradation quantification as an anomaly detection task—was successfully applied in \citetitle{degradation_quantification_rain}~\cite{degradation_quantification_rain} to assess the impact of adverse weather conditions on lidar data for autonomous driving applications. DeepSAD leverages deep learning to capture complex anomalous patterns that classical statistical methods might miss. Furthermore, by incorporating a limited amount of hand-labeled data (both normal and anomalous), it can more effectively differentiate between known anomalies and normal data compared to purely unsupervised methods, which typically learn only the most prevalent patterns in the dataset~\cite{deepsad}.
In this chapter, we explore the method \rev{DeepSAD}~\cite{deepsad}, which we employ to quantify the degradation of \rev{LiDAR} scans caused by airborne particles in the form of artificially introduced water vapor from a theater smoke machine. A similar approach—modeling degradation quantification as an anomaly detection task—was successfully applied in \rev{\cite{degradation_quantification_rain}} to assess the impact of adverse weather conditions on \rev{LiDAR} data for autonomous driving applications. DeepSAD leverages deep learning to capture complex anomalous patterns that classical statistical methods might miss. Furthermore, by incorporating a limited amount of hand-labeled data (both normal and anomalous), it can more effectively differentiate between known anomalies and normal data compared to purely unsupervised methods, which typically learn only the most prevalent patterns in the dataset~\cite{deepsad}.
\newsection{algorithm_description}{Algorithm Description}
DeepSAD's overall mechanics are similar to clustering-based anomaly detection methods, which according to \citetitle{anomaly_detection_survey}~\cite{anomaly_detection_survey} typically follow a two-step approach. First, a clustering algorithm groups data points around a centroid; then, the distances of individual data points from this centroid are calculated and used as anomaly scores. In DeepSAD, these concepts are implemented by employing a neural network, which is jointly trained to map input data onto a latent space and to minimize the volume of an data-encompassing hypersphere, whose center is the aforementioned centroid. The data's geometric distance in the latent space to the hypersphere center is used as the anomaly score, where a larger distance between data and centroid corresponds to a higher probability of a sample being anomalous. This is achieved by shrinking the data-encompassing hypersphere during training, proportionally to all training data, of which is required that there is significantly more normal than anomalous data present. The outcome of this approach is that normal data gets clustered more closely around the centroid, while anomalies appear further away from it as can be seen in the toy example depicted in figure~\ref{fig:deep_svdd_transformation}.
DeepSAD's overall mechanics are similar to clustering-based anomaly detection methods, which according to \rev{\cite{anomaly_detection_survey}} typically follow a two-step approach. First, a clustering algorithm groups data points around a centroid; then, the distances of individual data points from this centroid are calculated and used as anomaly scores. In DeepSAD, these concepts are implemented by employing a neural network, which is jointly trained to map input data onto a latent space and to minimize the volume of a data-encompassing hypersphere, whose center is the aforementioned centroid. The data's geometric distance in the latent space to the hypersphere center is used as the anomaly score, where a larger distance between data and centroid corresponds to a higher probability of a sample being anomalous. This is achieved by shrinking the data-encompassing hypersphere during training, proportionally to all training data, which is required to contain significantly more normal than anomalous samples. The outcome of this approach is that normal data gets clustered more closely around the centroid, while anomalies appear further away from it, as can be seen in the toy example depicted in \rev{Figure}~\ref{fig:deep_svdd_transformation}.
\fig{deep_svdd_transformation}{figures/deep_svdd_transformation}{DeepSAD teaches a neural network to transform data into a latent space and minimize the volume of a data-encompassing hypersphere centered around a predetermined centroid $\textbf{c}$. \\Reproduced from~\cite{deep_svdd}.}
Before DeepSAD's training can begin, a pre-training step is required, during which an autoencoder is trained on all available input data. One of DeepSAD's goals is to map input data onto a lower dimensional latent space, in which the separation between normal and anomalous data can be achieved. To this end DeepSAD and its predecessor Deep SVDD make use of the autoencoder's reconstruction goal, whose successful training ensures confidence in the encoder architecture's suitability for extracting the input datas' most prominent information to the latent space inbetween the encoder and decoder. DeepSAD goes on to use just the encoder as its main network architecture, discarding the decoder at this step, since reconstruction of the input is unnecessary.
Before DeepSAD's training can begin, a pre-training step is required, during which an autoencoder is trained on all available input data. One of DeepSAD's goals is to map input data onto a lower-dimensional latent space, in which the separation between normal and anomalous data can be achieved. To this end, DeepSAD and its predecessor Deep SVDD make use of the autoencoder's reconstruction goal, whose successful training ensures confidence in the encoder architecture's suitability for extracting the input data's most prominent information into the latent space \rev{in between} the encoder and decoder. DeepSAD goes on to use just the encoder as its main network architecture, discarding the decoder at this point, since reconstruction of the input is unnecessary.
The pre-training results are used in two more key ways. First, the encoder weights obtained from the autoencoder pre-training initialize DeepSAD's network for the main training phase. Second, we perform an initial forward pass through the encoder on all training samples, and the mean of these latent representations is set as the hypersphere center, $\mathbf{c}$. According to \citeauthor{deepsad}, this initialization method leads to faster convergence during the main training phase compared to using a randomly selected centroid. An alternative would be to compute $\mathbf{c}$ using only the labeled normal examples, which would prevent the center from being influenced by anomalous samples; however, this requires a sufficient number of labeled normal samples. Once defined, the hypersphere center $\mathbf{c}$ remains fixed, as allowing it to be optimized freely could, in the unsupervised case, lead to a hypersphere collapse, a trivial solution where the network learns to map all inputs directly onto the centroid $\mathbf{c}$.
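A minimal sketch of this centre initialization is shown below; \texttt{encoder} and \texttt{train\_loader} are placeholders for the pre-trained encoder and an iterator over the training data, and the small offset applied to near-zero coordinates is a commonly used safeguard rather than part of the description above.
\begin{verbatim}
import torch

def init_center(encoder, train_loader, eps=0.1):
    """Set c to the mean latent representation of all training samples."""
    zs = []
    with torch.no_grad():
        for x in train_loader:
            zs.append(encoder(x))
    c = torch.cat(zs).mean(dim=0)
    # Push coordinates that are almost exactly zero away from zero, so that a
    # trivial all-zero mapping cannot coincide with the centre.
    c[(c.abs() < eps) & (c < 0)] = -eps
    c[(c.abs() < eps) & (c >= 0)] = eps
    return c
\end{verbatim}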
@@ -357,13 +359,13 @@ In the main training step, DeepSAD's network is trained using SGD backpropagatio
\fig{deepsad_procedure}{diagrams/deepsad_procedure/deepsad_procedure}{Overview of the DeepSAD workflow. Training starts with unlabeled data and optional labeled samples, which are used to pre-train an autoencoder, compute the hypersphere center, and then perform main training with adjustable weighting of labeled versus unlabeled data. During inference, new samples are encoded and their distance to the hypersphere center is used as an anomaly score, with larger distances indicating stronger anomalies.}
To infer if a previously unknown data sample is normal or anomalous, the sample is fed in a forward-pass through the fully trained network. During inference, the centroid $\mathbf{c}$ needs to be known, to calculate the geometric distance between the samples latent representation and $\mathbf{c}$. This distance is tantamount to an anomaly score, which correlates with the likelihood of the sample being anomalous. Due to differences in input data type, training success and latent space dimensionality, the anomaly score's magnitude has to be judged on an individual basis for each trained network. This means, scores produced by one network that signify normal data, may very well clearly indicate an anomaly for another network. The geometric distance between two points in space is a scalar analog value, therefore post-processing of the score is necessary to achieve a binary classification of normal and anomalous if desired.
To infer if a previously unknown data sample is normal or anomalous, the sample is fed in a forward pass through the fully trained network. During inference, the centroid $\mathbf{c}$ needs to be known to calculate the geometric distance between the sample's latent representation and $\mathbf{c}$. This distance \rev{serves as} an anomaly score, which correlates with the likelihood of the sample being anomalous. Due to differences in input data type, training success and latent space dimensionality, the anomaly score's magnitude has to be judged on an individual basis for each trained network. This means that scores which signify normal data for one network may clearly indicate an anomaly for another network. The geometric distance between two points in space is a scalar analog value; therefore, post-processing of the score is necessary to achieve a binary classification into normal and anomalous if desired.
DeepSAD's full training and inference procedure is visualized in figure~\ref{fig:deepsad_procedure}, which gives a comprehensive overview of the dataflows, tuneable hyperparameters and individual steps involved.
DeepSAD's full training and inference procedure is visualized in \rev{Figure}~\ref{fig:deepsad_procedure}, which gives a comprehensive overview of the dataflows, tuneable hyperparameters and individual steps involved.
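The inference step described above can be summarized in a few lines; \texttt{encoder}, \texttt{c} and \texttt{threshold} are placeholders for a trained network, its fixed centre and a network-specific cut-off chosen after training. The sketch uses the squared Euclidean distance, which ranks samples the same way as the plain distance.
\begin{verbatim}
import torch

def anomaly_score(encoder, c, x):
    """Squared Euclidean distance between the latent representation and c."""
    with torch.no_grad():
        z = encoder(x)
    return torch.sum((z - c) ** 2, dim=1)

# Optional post-processing into a binary decision:
# labels = anomaly_score(encoder, c, x_new) > threshold
\end{verbatim}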
\newsection{algorithm_details}{Algorithm Details and Hyperparameters}
Since DeepSAD is heavily based on its predecessor \citetitle{deep_svdd}~(Deep SVDD)~\cite{deep_svdd} it is helpful to first understand Deep SVDD's optimization objective, so we start with explaining it here. For input space $\mathcal{X} \subseteq \mathbb{R}^D$, output space $\mathcal{Z} \subseteq \mathbb{R}^d$ and a neural network $\phi(\wc; \mathcal{W}) : \mathcal{X} \to \mathcal{Z}$ where $\mathcal{W}$ depicts the neural networks' weights with $L$ layers $\{\mathbf{W}_1, \dots, \mathbf{W}_L\}$, $n$ the number of unlabeled training samples $\{\mathbf{x}_1, \dots, \mathbf{x}_n\}$, $\mathbf{c}$ the center of the hypersphere in the latent space, Deep SVDD teaches the neural network to cluster normal data closely together in the latent space by defining its optimization objective as seen in equation~\ref{eq:deepsvdd_optimization_objective}.
Since DeepSAD is heavily based on its predecessor \rev{Deep SVDD}~\cite{deep_svdd}, it is helpful to first understand Deep SVDD's optimization objective, so we start by explaining it here. For input space $\mathcal{X} \subseteq \mathbb{R}^D$, output space $\mathcal{Z} \subseteq \mathbb{R}^d$ and a neural network $\phi(\wc; \mathcal{W}) : \mathcal{X} \to \mathcal{Z}$, where $\mathcal{W}$ denotes the neural network's weights with $L$ layers $\{\mathbf{W}_1, \dots, \mathbf{W}_L\}$, $n$ the number of unlabeled training samples $\{\mathbf{x}_1, \dots, \mathbf{x}_n\}$, and $\mathbf{c}$ the center of the hypersphere in the latent space, Deep SVDD teaches the neural network to cluster normal data closely together in the latent space by defining its optimization objective as \rev{follows.}
\begin{equation}
\label{eq:deepsvdd_optimization_objective}
@@ -376,7 +378,9 @@ Deep SVDD is an unsupervised method which does not rely on labeled data to train
\citeauthor{deepsad} argue that the pre-training step employing an autoencoder—originally introduced in Deep SVDD—not only allows a geometric interpretation of the method as minimum volume estimation, i.e., the shrinking of the data-encompassing hypersphere, but also a probabilistic one as entropy minimization over the latent distribution. The autoencoding objective during pre-training implicitly maximizes the mutual information between the data and its latent representation, aligning the approach with the Infomax principle while encouraging a latent space with minimal entropy. This insight enabled \citeauthor{deepsad} to introduce an additional term in DeepSAD's objective, beyond that of its predecessor Deep SVDD, which incorporates labeled data to better capture the characteristics of normal and anomalous data. They demonstrate that DeepSAD's objective effectively models the latent distribution of normal data as having low entropy, while that of anomalous data is characterized by higher entropy. In this framework, anomalies are interpreted as being generated from an infinite mixture of distributions that differ from the normal data distribution. The introduction of this term in DeepSAD's objective allows it to learn in a semi-supervised way, which helps the model better position known normal samples near the hypersphere center and push known anomalies farther away, thereby enhancing its ability to differentiate between normal and anomalous data.
From equation~\ref{eq:deepsvdd_optimization_objective} it is easy to understand DeepSAD's optimization objective seen in equation~\ref{eq:deepsad_optimization_objective} which additionally defines $m$ number of labeled data samples $\{(\mathbf{\tilde{x}}_1, \tilde{y}_1), \dots, (\mathbf{\tilde{x}}_m, \tilde{y}_1)\} \in \mathcal{X} \times \mathcal{Y}$ and $\mathcal{Y} = \{-1,+1\}$ for which $\tilde{y} = +1$ denotes normal and $\tilde{y} = -1$ anomalous samples as well as a new hyperparameter $\eta > 0$ which can be used to balance the strength with which labeled and unlabeled samples contribute to the training.
From \rev{Equation}~\ref{eq:deepsvdd_optimization_objective} it is easy to understand DeepSAD's optimization objective seen in \rev{Equation}~\ref{eq:deepsad_optimization_objective}, which additionally \rev{uses} $m$ labeled data samples $\{(\mathbf{\tilde{x}}_1, \tilde{y}_1), \dots, (\mathbf{\tilde{x}}_m, \tilde{y}_m)\} \in \mathcal{X} \times \mathcal{Y}$ with $\mathcal{Y} = \{-1,+1\}$, for which $\tilde{y} = +1$ denotes normal and $\tilde{y} = -1$ anomalous samples, as well as a new hyperparameter $\eta > 0$ which can be used to balance the strength with which labeled and unlabeled samples contribute to the training.
\rev{The objective is}
\begin{equation}
\label{eq:deepsad_optimization_objective}
@@ -386,7 +390,7 @@ From equation~\ref{eq:deepsvdd_optimization_objective} it is easy to understand
+\frac{\lambda}{2}\sum_{\ell=1}^{L}\|\mathbf{W}^{\ell}\|_{F}^{2}.
\end{equation}
The first term of equation~\ref{eq:deepsad_optimization_objective} stays mostly the same, differing only in its consideration of the introduced $m$ labeled datasamples for its proportionality. The second term is newly introduced to incorporate the labeled data samples with hyperparameter $\eta$'s strength, by either minimizing or maximizing the distance between the samples latent represenation and $\mathbf{c}$ depending on each data samples label $\tilde{y}$. The standard L2 regularization is kept identical to Deep SVDD's optimization objective. It can also be observed that in case of $m = 0$ labeled samples, DeepSAD falls back to Deep SVDD's optimization objective and can therefore be used in a completely unsupervised fashion as well.
The first term of \rev{Equation}~\ref{eq:deepsad_optimization_objective} stays \rev{almost} the same, differing only in that the $m$ labeled data samples are included in its normalization. The second term is newly introduced to incorporate the labeled data samples, weighted by the hyperparameter $\eta$, by either minimizing or maximizing the distance between a sample's latent representation and $\mathbf{c}$ depending on that sample's label $\tilde{y}$. The standard L2 regularization is kept identical to Deep SVDD's optimization objective. It can also be observed that in the case of $m = 0$ labeled samples, DeepSAD falls back to Deep SVDD's optimization objective and can therefore be used in a completely unsupervised fashion as well.
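The following sketch translates Equation~\ref{eq:deepsad_optimization_objective} into per-batch PyTorch code, omitting the weight-decay term (typically delegated to the optimizer); the small constant \texttt{eps} is an assumption for numerical stability and not part of the equation.
\begin{verbatim}
import torch

def deepsad_loss(z, z_lab, y, c, eta, eps=1e-6):
    """z: latent reps of unlabeled samples; z_lab, y: labeled reps and labels."""
    n, m = z.shape[0], z_lab.shape[0]
    unlabeled_sum = torch.sum((z - c) ** 2, dim=1).sum()
    dist_lab = torch.sum((z_lab - c) ** 2, dim=1)
    # (dist^2)^{+1} pulls labeled normal samples towards c,
    # (dist^2)^{-1} pushes labeled anomalies away from it.
    labeled_sum = torch.where(y == 1, dist_lab, 1.0 / (dist_lab + eps)).sum()
    return (unlabeled_sum + eta * labeled_sum) / (n + m)
\end{verbatim}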
\paragraph{Hyperparameters}
@@ -398,7 +402,7 @@ DeepSAD relies on several tuneable hyperparameters that influence different stag
\item \textbf{Latent space dimensionality $\mathbb{R}^d$} \\
The size of the latent bottleneck is a critical parameter. If $\mathbb{R}^d$ is too small, the network cannot encode all relevant information, leading to information loss and weak representations. If $\mathbb{R}^d$ is too large, the network risks overfitting by encoding irrelevant detail, while also increasing computational cost. These insights stem from autoencoder literature \cite{deep_learning_book}, but it is unclear whether they apply directly to DeepSAD: here the autoencoder serves only for pretraining, and the encoder is subsequently fine-tuned with a different objective. Thus, the optimal choice of $\mathbb{R}^d$ may not coincide with the value that would be ideal for autoencoder reconstruction alone.
\item \textbf{Label weighting $\eta$} \\
The parameter $\eta$ controls the relative contribution of labeled versus unlabeled data in the DeepSAD objective. With $\eta = 1$, both groups contribute equally (normalized by their sample counts). Larger values of $\eta$ emphasize the labeled data, pulling labeled normals closer to the center and pushing labeled anomalies further away. Smaller values emphasize the unlabeled data, effectively reducing the influence of labels. Its impact depends not only on its numerical value but also on the quantity and quality of available labels.
The parameter $\eta$ controls the relative contribution of labeled versus unlabeled data in the DeepSAD objective. With $\eta = 1$, both groups contribute equally (normalized by their sample counts). Larger values of $\eta$ emphasize the labeled data, pulling labeled \rev{normal data} closer to the center and pushing labeled anomalies further away. Smaller values emphasize the unlabeled data, effectively reducing the influence of labels. Its impact depends not only on its numerical value but also on the quantity and quality of available labels.
\item \textbf{Learning rates $L_A$ and $L_M$} \\
Two learning rates are defined: $L_A$ for the autoencoder pretraining and $L_M$ for the main DeepSAD training. The learning rate sets the step size used during gradient descent updates and thereby controls the stability and speed of training. If it is too high, the optimization may diverge or oscillate; if too low, convergence becomes excessively slow and may get stuck in poor local minima. Schemes with adaptive learning rates such as ADAM may be applied to prevent poor choices.
\item \textbf{Number of epochs $E_A$ and $E_M$} \\
@@ -420,13 +424,13 @@ To ensure our chosen dataset meets the needs of reliable degradation quantificat
\begin{enumerate}
\item \textbf{Data Modalities:}\\
The dataset must include lidar sensor data, since we decided to train and evaluate our method on what should be the most universally used sensor type in the given domain. To keep our method as generalized as possible, we chose to only require range-based point cloud data and forego sensor-specific data such as intensity or reflectivity, though it may be of interest for future work. It is also desirable to have complementary visual data such as camera images, for better context, manual verification and understanding of the data.
The dataset must include \rev{LiDAR} sensor data, since we decided to train and evaluate our method on the sensor type that should be the most universally used in the given domain. To keep our method as generalized as possible, we chose to only require range-based point cloud data and \rev{forgo} sensor-specific data such as intensity or reflectivity, though it may be of interest for future work. It is also desirable to have complementary visual data such as camera images, for better context, manual verification and understanding of the data.
\item \textbf{Context \& Collection Method:}\\
To mirror the real-world conditions of autonomous rescue robots, the data should originate from locations such as subterranean environments (tunnels, caves, collapsed structures), which closely reflect what would be encountered during rescue missions. Ideally, it should be captured from a ground-based, self-driving robot platform in motion instead of aerial, handheld, or stationary collection, to ensure similar circumstances to the target domain.
\item \textbf{Degradation Characteristics:}\\
Because our goal is to quantify the degradation of lidar data encountered by rescue robots, the dataset must exhibit significant degradation of lidar returns from aerosols (i.e., dust or smoke particles), which should be the most frequent and challenging degradation encountered. This requirement is key to evaluating how well our method detects and measures the severity of such challenging conditions.
Because our goal is to quantify the degradation of \rev{LiDAR} data encountered by rescue robots, the dataset must exhibit significant degradation of \rev{LiDAR} returns from aerosols (i.e., dust or smoke particles), which should be the most frequent and challenging degradation encountered. This requirement is key to evaluating how well our method detects and measures the severity of such challenging conditions.
\item \textbf{Volume \& Class Balance:}\\
The dataset must be large enough to train deep learning models effectively. Since our semi-supervised approach depends on learning a robust model of “normal” data, the majority of samples should be high-quality, degradation-free scans. Simultaneously, there must be a sufficient number of degraded (anomalous) scans to permit a comprehensive evaluation of quantification performance.
@@ -438,18 +442,18 @@ To ensure our chosen dataset meets the needs of reliable degradation quantificat
Quantitative benchmarking of degradation quantification requires a degradation label for every scan. Ideally that label would be a continuous degradation score, although a binary label would still enable meaningful comparison. As the rest of this section shows, producing any reliable label is already challenging and assigning meaningful analog scores may not be feasible at all. Compounding the problem, no public search-and-rescue (SAR) lidar data set offers such ground truth as far as we know. To understand the challenges around labeling lidar data degradation, we will look at what constitutes degradation in this context.
Quantitative benchmarking of degradation quantification requires a degradation label for every scan. Ideally that label would be a continuous degradation score, although a binary label would still enable meaningful comparison. As the rest of this section shows, producing any reliable label is already challenging, and assigning meaningful analog scores may not be feasible at all. Compounding the problem, no public search-and-rescue (SAR) \rev{LiDAR} dataset offers such ground truth, as far as we know. To understand the challenges around labeling \rev{LiDAR} data degradation, we will look at what constitutes degradation in this context.
In section~\ref{sec:lidar_related_work} we discussed some internal and environmental error causes of lidar sensors, such as multi-return ambiguities or atmospheric scattering respectively. While we are aware of research into singular failure modes, such as \citetitle{lidar_errormodel_particles}~\cite{lidar_errormodel_particles} or research trying to model the totality of error souces occuring in other domains, such as .\citetitle{lidar_errormodel_automotive}~\cite{lidar_errormodel_automotive}, there appears to be no such model for the search and rescue domain and its unique environmental circumstances. Although, scientific consensus appears to be, that airborne particles are the biggest contributor to degradation in SAR~\cite{lidar_errormodel_consensus}, we think that a more versatile definition is required to ensure confidence during critical SAR missions, which are often of a volatile nature. We are left with an ambiguous definition of what constitutes lidar point cloud degradation in the SAR domain.
In \rev{Section}~\ref{sec:lidar_related_work} we discussed some internal and environmental error causes of \rev{LiDAR} sensors, such as multi-return ambiguities and atmospheric scattering, respectively. While we are aware of research into singular failure \rev{modes~\cite{lidar_errormodel_particles}} and of research trying to model the totality of error sources occurring in other \rev{domains~\cite{lidar_errormodel_automotive}}, there appears to be no such model for the search and rescue domain and its unique environmental circumstances. Although scientific consensus appears to be that airborne particles are the biggest contributor to degradation in SAR~\cite{lidar_errormodel_consensus}, we think that a more versatile definition is required to ensure confidence during critical SAR missions, which are often of a volatile nature. We are left with an ambiguous definition of what constitutes \rev{LiDAR} point cloud degradation in the SAR domain.
We considered which types of objective measurements may be available to produce ground-truth labels, such as particulate matter sensors, lidar point clouds' inherent properties such as range-dropout rate and others, but we fear that using purely objective measures to label the data, would limit our learning based method to imitating the labels' sources instead of differentiating all possible degradation patterns from high quality data. Due to the incomplete error model in this domain, there may be novel or compound error sources that would not be captured using such an approach. As an example, we did observe dense smoke reflecting enough rays to produce phantom objects, which may fool SLAM algorithms. Such a case may even be labeleled incorrectly as normal by one of the aforementioned objective measurement labeling options, if the surroundings do not exhibit enough dispersed smoke particles already.
We considered which types of objective measurements may be available to produce ground-truth labels, such as particulate matter sensors or the \rev{LiDAR} point clouds' inherent properties such as the range-dropout rate, but we fear that using purely objective measures to label the data would limit our learning-based method to imitating the labels' sources instead of differentiating all possible degradation patterns from high-quality data. Due to the incomplete error model in this domain, there may be novel or compound error sources that would not be captured using such an approach. As an example, we did observe dense smoke reflecting enough rays to produce phantom objects, which may fool SLAM algorithms. Such a case may even be labeled incorrectly as normal by one of the aforementioned objective measurement labeling options, if the surroundings do not exhibit enough dispersed smoke particles already.
To mitigate the aforementioned risks we adopt a human-centric, binary labelling strategy. We judged analog and multi-level discrete rating scales to be too subjective for human consideration, which only left us with the simplistic, but hopefully more reliable binary choice. We used two labeling approaches, producing two evaluation sets, whose motivation and details will be discussed in more detail in section~\ref{sec:preprocessing}. Rationale for the exact labeling procedures requires knowledge of the actual dataset we ended up choosing, which we will present in the next section.
To mitigate the aforementioned risks we adopt a human-centric, binary labeling strategy. We judged analog and multi-level discrete rating scales to be too subjective for human annotators, which left us only with the simplistic, but hopefully more reliable, binary choice. We used two labeling approaches, producing two evaluation sets, whose motivation and details will be discussed in more detail in \rev{Section}~\ref{sec:preprocessing}. The rationale for the exact labeling procedures requires knowledge of the actual dataset we ended up choosing, which we will present in the next section.
\newsection{data_dataset}{Chosen Dataset}
\newsection{data_dataset}{\rev{Dataset}}
Based on the previously discussed requirements and the challenges of obtaining reliable labels, we selected the \citetitle{subter}~\cite{subter} for training and evaluation. This dataset comprises multimodal sensor data collected from a robotic platform navigating tunnels and rooms in a subterranean environment, an underground tunnel in Luleå, Sweden. Notably, some experiments incorporated an artificial smoke machine to simulate heavy degradation from aerosol particles, making the dataset particularly well-suited to our use case. A Pioneer 3-AT2 robotic platform, which can be seen in figure~\ref{fig:subter_platform_photo}, was used to mount a multitude of sensors that are described in table~\ref{tab:subter-sensors} and whose mounting locations are depicted in figure~\ref{fig:subter_platform_sketch}.
Based on the previously discussed requirements and the challenges of obtaining reliable labels, we selected the \citetitle{subter}~\cite{subter} for training and evaluation. This dataset comprises multimodal sensor data collected from a robotic platform navigating tunnels and rooms in a subterranean environment, an underground tunnel in Luleå, Sweden. Notably, some experiments incorporated an artificial smoke machine to simulate heavy degradation from aerosol particles, making the dataset particularly well-suited to our use case. A Pioneer 3-AT2 robotic platform, which can be seen in \rev{Figure}~\ref{fig:subter_platform_photo}, was used to mount a multitude of sensors that are described in \rev{Table}~\ref{tab:subter-sensors} and whose mounting locations are depicted in \rev{Figure}~\ref{fig:subter_platform_sketch}.
%-------------------------------------------------
% Compact sensor overview (row numbers follow Fig.~\ref{fig:subter_platform})
@@ -464,11 +468,11 @@ Based on the previously discussed requirements and the challenges of obtaining r
\scriptsize
\begin{tabular}{cp{4cm}p{4.5cm}p{5.5cm}}
\textbf{\#} & \textbf{Sensor} & \textbf{Recorded Data} & \textbf{Key Specs} \\
1 & \sensorcell{Spinning 3-D lidar}{Ouster OS1-32} & 3-D cloud, reflectivity & 10 Hz, 32 ch, 360° × 42.4°, $\leq$ 120 m \rule{0pt}{2.6ex} \\
1 & \sensorcell{Spinning 3-D \rev{LiDAR}}{Ouster OS1-32} & 3-D cloud, reflectivity & 10 Hz, 32 ch, 360° × 42.4°, $\leq$ 120 m \rule{0pt}{2.6ex} \\
2 & \sensorcell{mm-wave RADAR (×4)}{TI IWR6843AoP} & 4 × 60° RADAR point clouds & 30 Hz, 60 GHz, 9 m max, 0.05 m res. \\
3 & \sensorcell{Solid-state lidar}{Velodyne Velarray M1600} & Forward lidar cloud & 10 Hz, 160 ch, 120° × 32°, 0.130 m \\
4 & \sensorcell{RGB-D / stereo cam}{Luxonis OAK-D Pro} & stereo b/w images, depth map & 15 fps, 75 mm baseline, active IR 930 nm \\
5 & \sensorcell{LED flood-light}{RS PRO WL28R} & Illumination for stereo cam & 7 W, 650 lm (no data stream) \\
3 & \sensorcell{Solid-state \rev{LiDAR}}{Velodyne Velarray M1600} & Forward \rev{LiDAR} cloud & 10 Hz, 160 ch, 120° × 32°, 0.130 m \\
4 & \sensorcell{RGB-D / stereo cam}{Luxonis OAK-D Pro} & \rev{Stereo} b/w images, depth map & 15 fps, 75 mm baseline, active IR 930 nm \\
5 & \sensorcell{LED flood-light}{RS PRO WL28R} & Illumination for stereo \rev{camera} & 7 W, 650 lm (no data stream) \\
6 & \sensorcell{IMU}{Pixhawk 2.1 Cube Orange} & Accel, gyro, mag, baro & 190 Hz, 9-DoF, vibration-damped \\
7 & \sensorcell{On-board PC}{Intel NUC i7} & Time-synced logging & Quad-core i7, 16 GB RAM, 500 GB SSD \\
\end{tabular}
@@ -476,7 +480,7 @@ Based on the previously discussed requirements and the challenges of obtaining r
\end{table}
We use data from the \emph{Ouster OS1-32} lidar sensor, which was configured to capture 10 frames per second with a resolution of 32 vertical channels and 2048 measurements per channel. These settings yield equiangular measurements across a vertical field of view of 42.4° and a complete 360° horizontal field of view. Consequently, every lidar scan can generate up to 65,536 points. Each point contains the \emph{X}, \emph{Y}, and \emph{Z} coordinates (in meters, with the sensor location as the origin) along with values for \emph{range}, \emph{intensity}, and \emph{reflectivity}—typical metrics measured by lidar sensors. The datasets' point clouds are saved in a dense format, meaning each of the 65,536 measurements is present in the data, although fields for missing measurements contain zeroes.
We use data from the \emph{Ouster OS1-32} \rev{LiDAR} sensor, which was configured to capture 10 frames per second with a resolution of 32 vertical channels and 2048 measurements per channel. These settings yield equiangular measurements across a vertical field of view of 42.4° and a complete 360° horizontal field of view. Consequently, every \rev{LiDAR} scan can generate up to 65,536 points. Each point contains the \emph{X}, \emph{Y}, and \emph{Z} coordinates (in meters, with the sensor location as the origin) along with values for \emph{range}, \emph{intensity}, and \emph{reflectivity}—typical metrics measured by \rev{LiDAR} sensors. The dataset's point clouds are saved in a dense format, meaning each of the 65,536 measurements is present in the data, although fields for missing measurements contain zeroes.
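Because missing measurements are stored as zeroes in this dense layout, simple per-scan statistics such as a range-dropout rate can be computed directly; the sketch below uses a synthetic stand-in array instead of an actual scan from the dataset.
\begin{verbatim}
import numpy as np

ranges = np.random.rand(32, 2048) * 100.0  # stand-in for one dense scan (metres)
ranges[:, 500:700] = 0.0                   # pretend one sector returned nothing

num_valid_points = np.count_nonzero(ranges)   # at most 32 * 2048 = 65,536
dropout_rate = np.mean(ranges == 0.0)         # fraction of missing returns
\end{verbatim}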
%-------------------------------------------------
% Platform photographs (a) Pioneer base, (b) numbered sensor layout
{\includegraphics[width=0.45\textwidth]{figures/data_subter_platform_photo.jpg}
\label{fig:subter_platform_photo}}
\hfill
\subfigure[Sensor layout and numbering. Components: 1 OS1-32 \rev{LiDAR}, 2 mm-wave RADARs, 3 M1600 \rev{LiDAR}, 4 OAK-D Pro camera, 5 LED flood-light, 6 IMU, 7 Intel NUC. See Table~\ref{tab:subter-sensors} for detailed specifications.]
{\includegraphics[width=0.45\textwidth]{figures/data_subter_platform_sketch.png}
\label{fig:subter_platform_sketch}}
\caption{Robotic platform and sensor configuration used to record the dataset.}
\end{figure}
During the measurement campaign, a total of 14 experiments were conducted—10 prior to operating the artificial smoke machine (hereafter referred to as normal experiments) and 4 after it had already been running for some time (anomalous experiments). In 13 of these experiments, the sensor platform was in near-constant motion (either translating at roughly 1~m/s or rotating), with only one anomalous experiment conducted while the platform remained stationary. Although this means we do not have two stationary experiments from the same exact position for a direct comparison between normal and anomalous conditions, the overall experiments are similar enough to allow for meaningful comparisons. In addition to the presence of water vapor from the smoke machine, the experiments vary in illumination conditions, the presence of humans on the measurement grounds, and additional static artifacts. For our purposes, only the artificial smoke is relevant; differences in lighting or incidental static objects do not affect our analysis. Regardless of illumination, the \rev{LiDAR} sensor consistently produces comparable point clouds, and the presence of static objects does not influence our quantification of point cloud degradation.
In the anomalous experiments, the artificial smoke machine appears to have been running for some time before data collection began, as evidenced by both camera images and \rev{LiDAR} data showing an even distribution of water vapor around the machine. The stationary experiment is particularly unique: the smoke machine was positioned very close to the sensor platform and was actively generating new, dense smoke, to the extent that the \rev{LiDAR} registered the surface of the fresh water vapor as if it were a solid object.
\rev{Figures}~\ref{fig:data_screenshot_pointcloud} and~\ref{fig:data_screenshot_camera} show a representative depiction of the experiments' environment: the point cloud produced by the OS1 \rev{LiDAR} sensor and an image from the IR camera, both captured at practically the same time.
\fig{data_screenshot_pointcloud}{figures/data_screenshot_pointcloud.png}{Screenshot of a 3D rendering of an experiment's point cloud produced by the OS1-32 \rev{LiDAR} sensor without smoke and with illumination (same frame and roughly same alignment as \rev{Figure}~\ref{fig:data_screenshot_camera}). Point color corresponds to measurement range and the axis in the center of the figure marks the \rev{LiDAR}'s position.}
\fig{data_screenshot_camera}{figures/data_screenshot_camera.png}{Screenshot of IR camera output of an experiment without smoke and with illumination (same frame and roughly same alignment as \rev{Figure}~\ref{fig:data_screenshot_pointcloud})}
Regarding the dataset volume, the 10 normal experiments ranged from 88.7 to 363.1 seconds, with an average duration of 157.65 seconds. At a capture rate of 10 frames per second, these experiments yield 15,765 non-degraded point clouds. In contrast, the 4 anomalous experiments, including one stationary experiment lasting 11.7 seconds and another extending to 62.1 seconds, averaged 47.33 seconds, resulting in 1,893 degraded point clouds. In total, the dataset comprises 17,658 point clouds, with approximately 89.28\% classified as non-degraded (normal) and 10.72\% as degraded (anomalous). The distribution of experimental data is visualized in \rev{Figure}~\ref{fig:data_points_pie}.
\fig{data_points_pie}{figures/data_points_pie.png}{Pie chart visualizing the amount and distribution of normal and anomalous point clouds in \cite{subter}}
The artificial smoke introduces measurable changes that clearly separate the \textit{anomalous} runs from the \textit{normal} baseline. One change is a larger share of missing points per scan: smoke particles scatter or absorb the laser beam before it reaches a solid target, so the sensor reports an error instead of a distance. Figure~\ref{fig:data_missing_points} shows the resulting right-shift of the missing-point histogram, a known effect for \rev{LiDAR} sensors in aerosol-filled environments. Another demonstrative effect is the appearance of many spurious returns very close to the sensor; these near-field points arise when back-scatter from the aerosol itself is mistaken for a surface echo. The box plot in \rev{Figure}~\ref{fig:particles_near_sensor} confirms a pronounced increase in sub-50 cm hits under smoke, a range at which we do not expect any non-erroneous measurements. Both effects are consistent with the behaviour reported in \citetitle{when_the_dust_settles}~\cite{when_the_dust_settles}.
\fig{data_missing_points}{figures/data_missing_points.png}{Density histogram showing the percentage of missing measurements per scan for normal experiments without degradation and anomalous experiments with artificial smoke introduced as degradation.}
\fig{particles_near_sensor}{figures/particles_near_sensor_boxplot_zoomed_500.png}{Box diagram depicting the percentage of measurements closer than 50 centimeters to the sensor for normal and anomalous experiments.}
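To make these two indicators concrete, the following minimal Python sketch computes both statistics for a single dense scan. It assumes the dataset convention described above, namely that missing measurements are stored as all-zero points; the function name is illustrative, and the 50 cm threshold is the one used in our analysis.
\begin{verbatim}
import numpy as np

def degradation_indicators(points, near_range=0.5):
    """Scan-level degradation indicators.

    points: (N, 3) array of X, Y, Z values of one dense scan, with
            missing measurements encoded as all-zero rows.
    Returns the percentage of missing points and the percentage of
    returns closer than `near_range` meters to the sensor.
    """
    ranges = np.linalg.norm(points, axis=1)
    missing = ranges == 0.0                        # dropped returns
    near = (ranges > 0.0) & (ranges < near_range)  # spurious near-field hits
    n = len(points)
    return 100.0 * missing.sum() / n, 100.0 * near.sum() / n
\end{verbatim}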
Taken together, the percentage of missing points and the proportion of near-sensor returns provide a concise indication of how strongly the smoke degrades our scans—capturing the two most prominent aerosol effects, drop-outs and back-scatter spikes. They do not, however, reveal the full error landscape discussed earlier (compound errors, temperature drift, multipath, \dots), so they should be read as an easily computed synopsis rather than an exhaustive measure of \rev{LiDAR} quality. Next, we discuss how the \rev{LiDAR} scans were preprocessed before use and how we assigned ground-truth labels to each scan so that we could train and evaluate our degradation quantification methods.
\newsection{preprocessing}{Preprocessing Steps and Labeling}
As described in Section~\ref{sec:algorithm_description}, the method under evaluation is data type agnostic and can be adapted to work with any kind of data by choosing a suitable autoencoder architecture. In our case, the input data are point clouds produced by a \rev{LiDAR} sensor. Each point cloud contains up to 65,536 points, with each point represented by its \emph{X}, \emph{Y}, and \emph{Z} coordinates. To tailor the DeepSAD architecture to this specific data type, we would need to design an autoencoder suitable for processing three-dimensional point clouds. Although autoencoders can be developed for various data types, \citetitle{autoencoder_survey}~\cite{autoencoder_survey} noted that over 60\% of recent research on autoencoders focuses on two-dimensional image classification and reconstruction. Consequently, there is a more established understanding of autoencoder architectures for images compared to those for three-dimensional point clouds.
For this reason and to simplify the architecture, we converted the point clouds into two-dimensional grayscale images using a spherical projection. This approach—proven successful in related work~\cite{degradation_quantification_rain}—encodes each \rev{LiDAR} measurement as a single pixel, where the pixel's grayscale value is determined by the reciprocal range, calculated as $v = \frac{1}{\sqrt{\emph{X}^2 + \emph{Y}^2 + \emph{Z}^2}}$. Given the \rev{LiDAR} sensor's configuration, the resulting images have a resolution of 2048 pixels in width and 32 pixels in height. Missing measurements in the point cloud are mapped to pixels with a brightness value of $v = 0$.
To create this mapping, we leveraged the available measurement indices and channel information inherent in the dense point clouds, which are ordered from 0 to 65,535 in a horizontally ascending, channel-by-channel manner. For sparse point clouds without such indices, one would need to rely on the pitch and yaw angles relative to the sensor's origin to correctly map each point to its corresponding pixel, although this often leads to ambiguous mappings due to numerical errors in angle estimation.
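Because the dense point clouds are already ordered horizontally ascending, channel by channel, the projection essentially reduces to computing the reciprocal range and reshaping. A minimal sketch under that assumption is shown below; function and variable names are illustrative.
\begin{verbatim}
import numpy as np

H, W = 32, 2048  # vertical channels x horizontal measurements per channel

def project_to_range_image(points):
    """Project one dense, ordered point cloud to an H x W reciprocal-range image.

    points: (H*W, 3) array of X, Y, Z values ordered horizontally ascending,
            channel by channel, with missing measurements as all-zero rows.
    """
    rng = np.linalg.norm(points, axis=1)   # sqrt(X^2 + Y^2 + Z^2)
    v = np.zeros_like(rng)
    valid = rng > 0.0
    v[valid] = 1.0 / rng[valid]            # reciprocal range; missing stays 0
    return v.reshape(H, W).astype(np.float32)
\end{verbatim}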
Figure~\ref{fig:data_projections} displays two examples of \rev{LiDAR} point cloud projections to aid the reader's understanding. Although the original point clouds were converted into grayscale images with a resolution of 2048×32 pixels, these raw images can be challenging to interpret. To enhance human readability, we applied the viridis colormap and vertically stretched the images so that each measurement occupies multiple pixels in height. The top projection is derived from a scan without artificial smoke—and therefore minimal degradation—while the lower projection comes from an experiment where artificial smoke introduced significant degradation.
\fig{data_projections}{figures/data_2d_projections.png}{Two-dimensional projections of two point clouds, one from an experiment without degradation and one from an experiment with artificial smoke as degradation. To aid the reader's perception, the images are vertically stretched and a colormap has been applied to the pixels' reciprocal range values, while the actual training data is grayscale.}
The remaining challenge was labeling a sufficiently large portion of the dataset in a reasonably accurate manner; the associated difficulties and our general approach were described in \rev{Section}~\ref{sec:data_req}. Since, to our knowledge, neither our chosen dataset nor any other publicly available one provides objective labels for \rev{LiDAR} data degradation in the SAR domain, we had to define our own labeling approach. With objective measures of degradation unavailable, we explored alternative labeling methods—such as using the data's statistical properties, like the number of missing measurements per point cloud or the higher incidence of erroneous measurements near the sensor described in \rev{Section}~\ref{sec:data_req}. Ultimately, we were concerned that these statistical approaches might lead the method to simply mimic the statistical evaluation rather than to quantify degradation in a generalized and robust manner. After considering these options, we decided to label all point clouds from experiments with artificial smoke as anomalies, while point clouds from experiments without smoke were labeled as normal data. This labeling strategy—based on the presence or absence of smoke—is fundamentally an environmental indicator, independent of the intrinsic data properties recorded during the experiments.
The simplicity of this labeling approach has both advantages and disadvantages. On the positive side, it is easy to implement and creates a clear distinction between normal and anomalous data. However, its simplicity is also its drawback: some point clouds from experiments with artificial smoke do not exhibit perceptible degradation, yet they are still labeled as anomalies. The reason for this is that during the three non-static anomalous experiments the sensor platform starts recording in a tunnel roughly 20 meters from the smoke machine's location. It begins by approaching the smoke machine, navigates close to the machine for some time, and then leaves its perimeter once again. Since the artificial smoke's density is far higher near the machine it originates from, the time the sensor platform spent close to it produced highly degraded point clouds, whereas the beginnings and ends of the anomalous experiments capture point clouds which are subjectively not degraded and appear similar to ones from the normal experiments. This effect is clearly illustrated by the degradation indicators discussed earlier (the proportion of missing points and the number of erroneous points close to the sensor per point cloud), as can be seen in \rev{Figure}~\ref{fig:data_anomalies_timeline}.
\fig{data_anomalies_timeline}{figures/data_combined_anomalies_timeline.png}{Missing points and points with a measured range smaller than 50 cm per point cloud over a normalized timeline of the individual experiments. This illustrates the rise, plateau, and fall of degradation intensity during the anomalous experiments, owed to the spatial proximity to the degradation source (smoke machine). One of the normal experiments (without artificial smoke) is included as a baseline.}
\item \textbf{Manually-defined labels:} Only unequivocally degraded scans are marked anomalous, producing near-ideal separation in many cases.
\end{enumerate}
Under both evaluation schemes all frames from normal experiments were marked as normal, since they appear to have produced high-quality data throughout. A visualization of how the two evaluation schemes compare in terms of the number of samples per class can be seen in \rev{Figure}~\ref{fig:data_eval_labels}.
\fig{data_eval_labels}{figures/data_eval_labels.png}{Pie charts visualizing the number of normal and anomalous labels applied to the dataset per labeling scheme. A large part of the experiment-based anomalous labels had to be removed for the manually-defined scheme, since the corresponding frames were subjectively either clearly not degraded or only possibly degraded.}
\section{Framework \& Data Preparation}
DeepSAD's PyTorch implementation—our starting point—includes implementations for training on standardized datasets such as MNIST, CIFAR-10 and datasets from \citetitle{odds}~\cite{odds}. The framework can train and test DeepSAD as well as a number of baseline algorithms, namely SSAD, OCSVM, Isolation Forest, KDE and SemiDGM, with the loaded data and evaluate their performance by calculating the Receiver Operating Characteristic (ROC) and its Area Under the Curve (AUC) for all given algorithms. We adapted this implementation, originally developed for Python 3.7, to work with Python 3.12, changed and added functionality for loading our chosen dataset, added DeepSAD models that work with the \rev{LiDAR} projection data type, added further evaluation methods, and added an inference module.
The raw SubTER dataset is provided as one ROS bag file per experiment, each containing a dense 3D point cloud from the Ouster OS1-32 \rev{LiDAR}. To streamline training and avoid repeated heavy computation, we project these point clouds offline into 2D “range images” as described in \rev{Section}~\ref{sec:preprocessing} and export them to files as NumPy arrays. Storing precomputed projections allows rapid data loading during training and evaluation. Many modern \rev{LiDARs} can be configured to output range images directly, which would bypass the need for post-hoc projection. When available, such native range-image streams can further simplify preprocessing or even allow skipping this step completely.
We extended the DeepSAD framework's PyTorch \texttt{DataLoader} by implementing a custom \texttt{Dataset} class that ingests our precomputed NumPy range-image files and attaches appropriate evaluation labels. Each experiment's frames are stored as a single \texttt{.npy} file of shape \((\text{Number of Frames}, H, W)\), containing the point clouds' reciprocal range values. Our \texttt{Dataset} initializer scans a directory of these files, loads the NumPy arrays from file into memory, transforms them into PyTorch tensors and assigns evaluation and training labels accordingly.
\end{cases}
\]
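A minimal sketch of such a \texttt{Dataset} is given below; it only illustrates the loading and experiment-based labeling logic, while the actual implementation additionally attaches the manually-defined evaluation labels and the semi-supervised training labels. Class and variable names are illustrative.
\begin{verbatim}
import glob, os
import numpy as np
import torch
from torch.utils.data import Dataset

class RangeImageDataset(Dataset):
    """Loads precomputed range-image .npy files (one file per experiment)."""

    def __init__(self, root, anomalous_experiments):
        frames, labels = [], []
        for path in sorted(glob.glob(os.path.join(root, "*.npy"))):
            data = np.load(path)                  # shape: (num_frames, H, W)
            label = 1 if os.path.basename(path) in anomalous_experiments else 0
            frames.append(torch.from_numpy(data).float().unsqueeze(1))
            labels.append(torch.full((len(data),), label, dtype=torch.long))
        self.frames = torch.cat(frames)           # (N, 1, H, W)
        self.labels = torch.cat(labels)

    def __len__(self):
        return len(self.frames)

    def __getitem__(self, idx):
        return self.frames[idx], self.labels[idx], idx
\end{verbatim}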
We pass instances of this \texttt{Dataset} to PyTorch's \texttt{DataLoader}, enabling batch sampling, shuffling, and multi-worker loading. The dataloader returns the preprocessed \rev{LiDAR} projection, both evaluation labels and a semi-supervised training label.
To control the supervision of DeepSAD's training, our custom PyTorch \texttt{Dataset} accepts two integer parameters, \texttt{num\_labelled\_normal} and \texttt{num\_labelled\_anomalous}, which specify how many samples of each class should retain their labels during training. We begin with the manually-defined evaluation labels so that mislabeled anomalous frames are not used for the semi-supervision. Then, we randomly un-label (set to 0) samples of each class until exactly \texttt{num\_labelled\_normal} normals and \texttt{num\_labelled\_anomalous} anomalies remain labeled.
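The un-labeling step can be sketched as follows, assuming the DeepSAD convention of $+1$ for labeled normal samples, $-1$ for labeled anomalies and $0$ for unlabeled samples; the function name is illustrative.
\begin{verbatim}
import numpy as np

def make_training_labels(eval_labels, num_labelled_normal,
                         num_labelled_anomalous, seed=0):
    """Keep only the requested number of labels per class, un-label the rest.

    eval_labels: array with +1 (normal) and -1 (anomalous), based on the
    manually-defined labels so no mislabeled anomalies are retained.
    """
    rng = np.random.default_rng(seed)
    train = eval_labels.copy()
    for cls, keep in ((+1, num_labelled_normal), (-1, num_labelled_anomalous)):
        idx = np.flatnonzero(train == cls)
        drop = rng.choice(idx, size=max(len(idx) - keep, 0), replace=False)
        train[drop] = 0                    # 0 = unlabeled during training
    return train
\end{verbatim}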
\section{Model Configuration \& Evaluation Protocol}
Since the neural network architecture trained in the DeepSAD method is not fixed, as described in \rev{Section}~\ref{sec:algorithm_details}, but rather chosen based on the input data, we also had to choose an autoencoder architecture befitting our preprocessed \rev{LiDAR} data projections. Since \citetitle{degradation_quantification_rain}~\cite{degradation_quantification_rain} reported success in training DeepSAD on similar data, we first adapted the network architecture utilized there for our use case, which is based on the simple and well-understood LeNet architecture~\cite{lenet}. Additionally, we were interested in evaluating the importance and impact of a well-suited network architecture for DeepSAD's performance and therefore designed a second network architecture, henceforth referred to as the ``efficient architecture'', incorporating a few modern techniques befitting our use case.
The LeNet-inspired autoencoder can be split into an encoder network (\rev{Figure}~\ref{fig:setup_arch_lenet_encoder}) and a decoder network (\rev{Figure}~\ref{fig:setup_arch_lenet_decoder}) with a latent space \rev{in between} the two parts. Such an arrangement is typical for autoencoder architectures as we discussed in \rev{Section}~\ref{sec:autoencoder}. The encoder network is simultaneously DeepSAD's main training architecture which is used to infer the degradation quantification in our use case, once trained.
\figc{setup_arch_lenet_encoder}{diagrams/arch_lenet_encoder}{
Architecture of the LeNet-inspired encoder. The input is a \rev{LiDAR} range image of size
$1\times 2048\times 32$ (channels $\times$ width $\times$ height). The first block (Conv1) applies a
$5\times 5$ convolution with 8 output channels, followed by batch normalization, LeakyReLU activation,
and $2\times 2$ max pooling, resulting in a feature map of size $8\times 1024\times 16$.
representation used by DeepSAD for anomaly detection.
}{width=.8\textwidth}
The LeNet-inspired encoder network (see \rev{Figure}~\ref{fig:setup_arch_lenet_encoder}) is a compact convolutional neural network that reduces image data into a lower-dimensional latent space. It consists of two stages of convolution, normalization, non-linear activation, and pooling, followed by a dense layer that defines the latent representation. Conceptually, the convolutional layers learn small filters that detect visual patterns in the input (such as edges or textures). Batch normalization ensures that these learned signals remain numerically stable during training, while a LeakyReLU activation introduces non-linearity, allowing the network to capture more complex relationships. Pooling operations then downsample the feature maps, which reduces the spatial size of the data and emphasizes the most important features. Finally, a dense layer transforms the extracted feature maps into the latent space.
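A compact PyTorch sketch of this encoder is shown below. It assumes ``same'' padding for the $5\times 5$ convolutions and the channel counts (8, then 4) implied by the figure descriptions and the decoder's $4\times 512\times 8$ input; the exact hyperparameters of our implementation may differ.
\begin{verbatim}
import torch
import torch.nn as nn

class LeNetEncoder(nn.Module):
    """LeNet-inspired encoder sketch: (N, 1, 32, 2048) -> latent (N, d)."""

    def __init__(self, latent_dim=32):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 8, kernel_size=5, padding=2, bias=False),  # Conv1
            nn.BatchNorm2d(8),
            nn.LeakyReLU(),
            nn.MaxPool2d(2),                                 # -> (8, 16, 1024)
            nn.Conv2d(8, 4, kernel_size=5, padding=2, bias=False),  # Conv2
            nn.BatchNorm2d(4),
            nn.LeakyReLU(),
            nn.MaxPool2d(2),                                 # -> (4, 8, 512)
        )
        self.fc = nn.Linear(4 * 8 * 512, latent_dim)         # dense -> latent

    def forward(self, x):
        return self.fc(self.features(x).flatten(start_dim=1))

print(LeNetEncoder(32)(torch.zeros(2, 1, 32, 2048)).shape)   # (2, 32)
\end{verbatim}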
\figc{setup_arch_lenet_decoder}{diagrams/arch_lenet_decoder}{
Architecture of the LeNet-inspired decoder. The input is a latent vector of dimension $d$,
for the autoencoding objective.
}{width=.8\textwidth}
The decoder network (see \rev{Figure}~\ref{fig:setup_arch_lenet_decoder}) mirrors the encoder and reconstructs the input from its latent representation. A dense layer first expands the latent vector into a feature map of shape $4\times 512\times 8$, which is then upsampled and refined in two successive stages. Each stage consists of an interpolation step that doubles the spatial resolution, followed by a transpose convolution that learns how to add structural detail. The first stage operates on 4 channels, and the second on 8 channels, with the final transpose convolution reducing the output to a single channel. The result is a reconstructed output of size $1\times 2048\times 32$, matching the original input dimensionality required for the autoencoding objective.
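A matching decoder sketch follows; the kernel sizes are assumed to mirror the encoder's $5\times 5$ convolutions, and nearest-neighbor interpolation is an assumption, since the text above only specifies an interpolation step that doubles the resolution.
\begin{verbatim}
import torch
import torch.nn as nn

class LeNetDecoder(nn.Module):
    """LeNet-inspired decoder sketch: latent (N, d) -> output (N, 1, 32, 2048)."""

    def __init__(self, latent_dim=32):
        super().__init__()
        self.fc = nn.Linear(latent_dim, 4 * 8 * 512)   # expand to (4, 8, 512)
        self.up = nn.Sequential(
            nn.Upsample(scale_factor=2),               # -> (4, 16, 1024)
            nn.ConvTranspose2d(4, 8, kernel_size=5, padding=2),
            nn.LeakyReLU(),
            nn.Upsample(scale_factor=2),               # -> (8, 32, 2048)
            nn.ConvTranspose2d(8, 1, kernel_size=5, padding=2),
        )

    def forward(self, z):
        return self.up(self.fc(z).view(-1, 4, 8, 512))

print(LeNetDecoder(32)(torch.zeros(2, 32)).shape)      # (2, 1, 32, 2048)
\end{verbatim}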
Even though the LeNet-inspired encoder proved capable of achieving our degradation quantification objective in initial experiments, we identified several shortcomings that motivated the design of a second, more efficient architecture. The most important issue concerns the shape of the CNN's receptive field (RF) which describes the region of the input that influences a single output activation. Its size and aspect ratio determine which structures the network can effectively capture: if the RF is too small, larger patterns cannot be detected, while an excessively large RF may hinder the network from learning to recognize fine details. For standard image data, the RF is often expressed as a symmetric $n \times n$ region, but in principle it can be computed independently per axis.
\figc{setup_ef_concept}{figures/setup_ef_concept}{Receptive fields in a CNN. Each output activation aggregates information from a region of the input; stacking layers expands this region, while kernel size, stride, and padding control how quickly it grows and what shape it takes. (A) illustrates slower, fine-grained growth; (B) shows faster expansion, producing a larger—potentially anisotropic—receptive field and highlighting the trade-off between detail and context. Reproduced from~\cite{ef_concept_source}}{width=.6\textwidth}
The RF shape issue arises from the fact that spinning multi-beam \rev{LiDARs} often produce point clouds possessing dense horizontal but limited vertical resolution. In our case, this results in a pixel-per-degree resolution of approximately $5.69\,\sfrac{pixel}{deg}$ vertically and $1.01\,\sfrac{pixel}{deg}$ horizontally. Consequently, the LeNet-inspired encoder's calculated receptive field of $16 \times 16$ pixels translates to an angular size of $15.88^{\circ} \times 2.81^{\circ}$, which is highly rectangular in angular space. Such a mismatch risks limiting the network's ability to capture degradation patterns that extend differently across the two axes.
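The receptive field itself can be computed layer by layer from the kernel sizes and strides. The short sketch below reproduces the stated 16-pixel RF of the LeNet-inspired encoder, assuming two $5\times 5$ convolutions with stride 1, each followed by $2\times 2$ max pooling with stride 2; for non-square kernels the same recurrence is simply evaluated per axis.
\begin{verbatim}
def receptive_field(layers):
    """Receptive field of stacked layers given (kernel, stride) pairs.

    Standard recurrence: rf += (kernel - 1) * jump; jump *= stride.
    """
    rf, jump = 1, 1
    for kernel, stride in layers:
        rf += (kernel - 1) * jump
        jump *= stride
    return rf

# LeNet-inspired encoder: conv 5x5 (s=1), pool 2x2 (s=2), conv 5x5, pool 2x2
print(receptive_field([(5, 1), (2, 2), (5, 1), (2, 2)]))  # -> 16
\end{verbatim}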
To adjust for this, we decided to modify the network architecture and included further modifications to improve the method's performance. The encoder (see \rev{Figure}~\ref{fig:setup_arch_ef_encoder}) follows the same general idea as the LeNet-inspired encoder, but incorporates the following modifications:
\begin{itemize}
\item \textbf{Non-square convolution kernels.} Depthwise-separable convolutions with kernel size $3 \times 17$ are used instead of square kernels, resulting in an RF of $10 \times 52$ pixels, corresponding to $9.93^{\circ} \times 9.14^{\circ}$, substantially more balanced than the LeNet-inspired network's RF.
\item \textbf{Circular padding along azimuth.} The horizontal axis is circularly padded to respect the wrap-around of $360^{\circ}$ \rev{LiDAR} data, preventing artificial seams at the image boundaries (a minimal sketch of this padding scheme is shown after this list).
\item \textbf{Aggressive horizontal pooling.} A $1 \times 4$ pooling operation is applied early in the network, which reduces the over-sampled horizontal resolution (2048~px to 512~px) while keeping vertical detail intact.
\item \textbf{Depthwise-separable convolutions with channel shuffle.} Inspired by MobileNet~\cite{mobilenet} and ShuffleNet~\cite{shufflenet}, this reduces the number of parameters and computations while retaining representational capacity, making the network more suitable for embedded platforms, while simultaneously allowing more learnable channels without increasing computational demand.
\item \textbf{Max pooling.} Standard max pooling is used instead of average pooling, since it preserves sharp activations that are often indicative of localized degradation.
\end{itemize}
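The following sketch illustrates how a depthwise-separable $3\times 17$ convolution with circular padding along the azimuth can be realized in PyTorch; the channel counts are illustrative and the channel shuffle is omitted for brevity.
\begin{verbatim}
import torch
import torch.nn as nn
import torch.nn.functional as F

class AzimuthSeparableConv(nn.Module):
    """Depthwise-separable 3x17 convolution, circularly padded along azimuth.

    Input layout is (N, C, H, W), where W is the 360-degree azimuth axis.
    """

    def __init__(self, in_ch, out_ch, kernel=(3, 17)):
        super().__init__()
        self.pad_h, self.pad_w = kernel[0] // 2, kernel[1] // 2
        self.depthwise = nn.Conv2d(in_ch, in_ch, kernel, groups=in_ch, bias=False)
        self.pointwise = nn.Conv2d(in_ch, out_ch, kernel_size=1, bias=False)

    def forward(self, x):
        x = F.pad(x, (self.pad_w, self.pad_w, 0, 0), mode="circular")  # wrap azimuth
        x = F.pad(x, (0, 0, self.pad_h, self.pad_h))                   # zero-pad elevation
        return self.pointwise(self.depthwise(x))

print(AzimuthSeparableConv(8, 16)(torch.zeros(1, 8, 32, 2048)).shape)
# -> (1, 16, 32, 2048)
\end{verbatim}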
\fig{setup_arch_ef_encoder}{diagrams/arch_ef_encoder}{
Architecture of the Efficient encoder. The input is a \rev{LiDAR} range image of size
$1 \times 2048 \times 32$ (channels $\times$ width $\times$ height).
The first block (\textbf{Conv1}) applies a depthwiseseparable $3 \times 17$ convolution
with circular padding along the azimuth, followed by batch normalization, LeakyReLU,
\paragraph{Decoder.}
The decoder (see \rev{Figure}~\ref{fig:setup_arch_ef_decoder}) mirrors the encoder's structure but introduces changes to improve reconstruction stability:
\begin{itemize}
\item \textbf{Nearest-neighbor upsampling followed by convolution.} Instead of relying solely on transposed convolutions, each upsampling stage first enlarges the feature map using parameter-free nearest-neighbor interpolation, followed by a depthwise-separable convolution. This strategy reduces the risk of checkerboard artifacts while still allowing the network to learn fine detail.
\item \textbf{Asymmetric upsampling schedule.} Horizontal resolution is restored more aggressively (e.g., scale factor $1 \times 4$) to reflect the anisotropic downsampling performed in the encoder.
\item \textbf{Final convolution with circular padding.} The output is generated using a $(3 \times 17)$ convolution with circular padding along the azimuth similar to the new encoder, ensuring consistent treatment of the 360° \rev{LiDAR} input.
\end{itemize}
\fig{setup_arch_ef_decoder}{diagrams/arch_ef_decoder}{
}
To compare the computational efficiency of the two architectures, we show the number of trainable parameters and the number of multiply-accumulate operations (MACs) for different latent space sizes used in our experiments in \rev{Table}~\ref{tab:params_lenet_vs_efficient}. Even though the efficient architecture employs more layers and channels, which allows the network to learn to recognize more types of patterns than the LeNet-inspired one, the encoders' MACs are quite similar. The more complex decoder design of the efficient network appears to contribute far more MACs, which leads to the longer pretraining times we report in \rev{Section}~\ref{sec:setup_experiments_environment}.
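Trainable parameters can be counted directly in PyTorch with a one-line helper, sketched below; MAC counts require a separate profiling tool and are not shown here.
\begin{verbatim}
import torch.nn as nn

def count_trainable_parameters(model: nn.Module) -> int:
    """Number of trainable parameters of a PyTorch module."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# e.g. applied to the encoder sketch from above:
# count_trainable_parameters(LeNetEncoder(latent_dim=32))
\end{verbatim}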
\begin{table}[!ht]
\centering
Isolation Forest is an ensemble method for anomaly detection that builds on the principle that anomalies are easier to separate from the rest of the data. It constructs many binary decision trees, each by recursively splitting the data at randomly chosen features and thresholds. In this process, the “training” step consists of building the forest of trees: each tree captures different random partitions of the input space, and together they form a diverse set of perspectives on how easily individual samples can be isolated.
Once trained, the method assigns an anomaly score to new samples by measuring their average path length through the trees. Normal samples, being surrounded by other similar samples, typically require many recursive splits and thus end up deep in the trees. Anomalies, by contrast, stand out in one or more features, which means they can be separated much earlier and end up closer to the root. The shorter the average path length, the more anomalous the sample is considered. This makes Isolation Forest highly scalable and robust: training is efficient and the resulting model is fast to apply to new data. In our setup, we apply Isolation Forest directly to the \rev{LiDAR} input representation, providing a strong non-neural baseline for comparison against DeepSAD.
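As an illustration, applying such a baseline to flattened range images is straightforward with scikit-learn; the hyperparameters and placeholder arrays below are illustrative and not those used in our experiments.
\begin{verbatim}
import numpy as np
from sklearn.ensemble import IsolationForest

# placeholder data: range images flattened to vectors of length 32 * 2048
X_train = np.random.rand(256, 32 * 2048)
X_test = np.random.rand(64, 32 * 2048)

iso = IsolationForest(n_estimators=100, random_state=0).fit(X_train)
# score_samples is higher for more normal samples, so negate it to obtain
# an anomaly score where higher means more anomalous
anomaly_scores = -iso.score_samples(X_test)
\end{verbatim}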
OCSVM takes a very different approach by learning a flexible boundary around normal samples. It assumes all training data to be normal, with the goal of enclosing the majority of these samples in such a way that new points lying outside this boundary can be identified as anomalies.
\label{tab:exp_grid}
\end{table}
These experiments were run on a computational environment for which we summarize the hardware and software stack in \rev{Table}~\ref{tab:system_setup}.
\begin{table}[p]
\centering
The precision--recall curves (Figure~\ref{fig:prc_representative}) illustrate these effects more clearly. For DeepSAD, precision stays close to 1 until about 0.5 recall, after which it drops off sharply. This plateau corresponds to the fraction of truly degraded frames in the anomalous set. Once recall moves beyond this point, the evaluation demands that the model also “find” the mislabeled anomalies near the run boundaries. To do so, the decision threshold must be lowered so far that many normal frames are also flagged, which causes precision to collapse. The baselines behave differently: OCSVM shows a smooth but weaker decline without a strong high-precision plateau, while Isolation Forest collapses to near-random performance. These operational differences are hidden in a single AP number but are important for judging how the methods would behave in deployment.
Taken together, the two evaluation schemes provide complementary insights. The experiment-based labels offer a noisy but realistic setting that shows how methods cope with ambiguous data, while the manually-defined labels confirm that DeepSAD can achieve nearly perfect separation when the ground truth is clean. The combination of both evaluations makes clear that (i) DeepSAD is stronger than the baselines under both conditions, (ii) the apparent performance limits under experiment-based labels are mainly due to label noise, and (iii) interpreting results requires care, since performance drops in the curves often reflect mislabeled samples rather than model failures. At the same time, both schemes remain binary classifications and therefore cannot directly evaluate the central question of whether anomaly scores can serve as a continuous measure of degradation. For this reason, we extend the analysis in Section~\ref{sec:results_inference}, where inference on entire unseen experiments is used to provide a more intuitive demonstration of the method's potential for quantifying \rev{LiDAR} degradation in practice.
\fig{prc_representative}{figures/results_prc.png}{Representative precision--recall curves over all latent dimensionalities for semi-labeling regime 0/0 from experiment-based evaluation labels. DeepSAD maintains a large high-precision operating region before collapsing; OCSVM declines more smoothly but exhibits high standard deviation between folds; IsoForest collapses quickly and remains flat. DeepSAD's fall-off is at least partly due to known mislabeled evaluation targets.}
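Both the precision--recall curves and the AP values can be computed from per-frame anomaly scores with scikit-learn; a minimal sketch with placeholder labels and scores is shown below.
\begin{verbatim}
import numpy as np
from sklearn.metrics import average_precision_score, precision_recall_curve

# y_true: 1 for degraded (anomalous) frames, 0 for normal frames (placeholders)
y_true = np.array([0, 0, 0, 1, 1])
# per-frame anomaly scores, higher = more anomalous (placeholders)
scores = np.array([0.10, 0.40, 0.35, 0.80, 0.70])

precision, recall, _ = precision_recall_curve(y_true, scores)
ap = average_precision_score(y_true, scores)
print(f"AP = {ap:.3f}")
\end{verbatim}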
In addition to the evaluation of average precision and precision--recall curves obtained from $k$-fold cross-validation with varying hyperparameters, we also examine the behavior of the fully trained methods when applied to previously unseen, held-out experiments.
While the prior analysis provided valuable insights into the classification capabilities of the methods, it was limited by two factors: first, the binary ground-truth labels were of uneven quality due to the aforementioned mislabeling of frames, and second, the binary formulation does not reflect our overarching goal of quantifying sensor degradation on a continuous scale.
To provide a more intuitive understanding of how the methods might perform in real-world applications, we therefore present results from running inference sequentially on entire experiments.
These frame-by-frame time-axis plots simulate online inference and illustrate how anomaly scores evolve as data is captured, thereby serving as a candidate metric for quantifying the degree of \rev{LiDAR} degradation during operation.
\fig{results_inference_normal_vs_degraded}{figures/results_inference_normal_vs_degraded.png}{Comparison of anomaly detection methods with statistical indicators across clean (dashed) and degraded (solid) experiments. Each subplot shows one method (DeepSAD--LeNet, DeepSAD--Efficient, OCSVM, Isolation Forest). Red curves denote method anomaly scores normalized to the clean experiment; blue and green curves denote the percentage of missing \rev{LiDAR} points and near-sensor particle hits, respectively. Clear separation between clean and degraded runs is observed for the DeepSAD variants and, to a lesser degree, for OCSVM, while Isolation Forest produces high scores even in the clean experiment. Latent Space Dimensionality was 32 and semi-supervised labeling regime was 0 normal and 0 anomalous samples during training.}
The plots in Figure~\ref{fig:results_inference_normal_vs_degraded} highlight important differences in how well the tested methods distinguish between normal and degraded sensor conditions.
Among the four approaches, the strongest separation is achieved by DeepSAD (Efficient), followed by DeepSAD (LeNet), then OCSVM.
This thesis set out to answer the research question stated in Chapter~\ref{chp:introduction}:
\begin{quote}
Can autonomous robots quantify the reliability of \rev{LiDAR} sensor data in hazardous environments to make more informed decisions?
\end{quote}
Our results indicate a qualified “yes.” Using anomaly detection (AD)—in particular DeepSAD—we can obtain scores that (i) separate clearly normal from clearly degraded scans and (ii) track degradation trends over time on held-out experiments (see Sections~\ref{sec:results_deepsad} and \ref{sec:results_inference}). At the same time, the absence of robust ground truth limits how confidently we can assess \emph{continuous} quantification quality and complicates cross-method comparisons. The remainder of this chapter summarizes what we contribute, what we learned, and what is still missing.
\paragraph{Main contributions.}
\begin{itemize}
\item \textbf{Empirical comparison for \rev{LiDAR} degradation.} A systematic evaluation of DeepSAD against Isolation Forest and OCSVM across latent sizes and labeling regimes, showing that DeepSAD consistently outperforms the baselines under both evaluation schemes (Section~\ref{sec:results_deepsad}).
\item \textbf{Latent dimensionality insight.}
Autoencoder pretraining loss decreases with larger latent spaces, but DeepSAD performance shows the opposite trend: compact bottlenecks (32--128) achieve the highest AP. This contrast demonstrates that pretraining performance does not directly predict DeepSAD performance—latent dimensionality cannot be tuned via autoencoder loss alone, even though it remains useful for comparing architectures.
@@ -1088,21 +1092,21 @@ For settings similar to ours, we recommend:
\medskip
We now turn to the main limiting factor that emerged throughout this work: the lack of robust, expressive ground truth for lidar degradation and its downstream impact.
We now turn to the main limiting factor that emerged throughout this work: the lack of robust, expressive ground truth for \rev{LiDAR} degradation and its downstream impact.
\newsection{conclusion_data}{Missing Ground Truth as an Obstacle}
The most significant obstacle identified in this work is the absence of robust and comprehensive ground truth for lidar degradation. As discussed in Chapter~\ref{chp:data_preprocessing}, it is not trivial to define what “degradation” precisely means in practice. Although error models for lidar and theoretical descriptions of how airborne particles affect laser returns exist, these models typically quantify errors at the level of individual points (e.g., missing returns, spurious near-range hits). Such metrics, however, may not be sufficient to assess the impact of degraded data on downstream perception. For example, a point cloud with relatively few but highly localized errors—such as those caused by a dense smoke cloud—may cause a SLAM algorithm to misinterpret the region as a solid obstacle. In contrast, a point cloud with a greater number of dispersed errors might be easier to filter and thus cause little or no disruption in mapping. Consequently, the notion of “degradation” must extend beyond point-level error statistics to include how different error patterns propagate to downstream modules.
The most significant obstacle identified in this work is the absence of robust and comprehensive ground truth for \rev{LiDAR} degradation. As discussed in Chapter~\ref{chp:data_preprocessing}, it is not trivial to define what “degradation” precisely means in practice. Although error models for \rev{LiDAR} and theoretical descriptions of how airborne particles affect laser returns exist, these models typically quantify errors at the level of individual points (e.g., missing returns, spurious near-range hits). Such metrics, however, may not be sufficient to assess the impact of degraded data on downstream perception. For example, a point cloud with relatively few but highly localized errors—such as those caused by a dense smoke cloud—may cause a SLAM algorithm to misinterpret the region as a solid obstacle. In contrast, a point cloud with a greater number of dispersed errors might be easier to filter and thus cause little or no disruption in mapping. Consequently, the notion of “degradation” must extend beyond point-level error statistics to include how different error patterns propagate to downstream modules.
To our knowledge, no public datasets with explicit ground truth for lidar degradation exist. Even if such data were collected, for example with additional smoke sensors, it is unclear whether this would provide a usable ground truth. A smoke sensor measures only at a single point in space, while lidar observes many points across the environment from a distance, so the two do not directly translate. In our dataset, we relied on the fact that clean and degraded experiments were clearly separated: data from degraded runs was collected only after artificial smoke had been released. However, the degree of degradation varied strongly within each run. Because the smoke originated from a single machine in the middle of the sensor platform's traversal path, early and late frames were often nearly as clear as those from clean experiments. This led to mislabeled frames at the run boundaries and limited the reliability of experiment-based evaluation. As shown in Section~\ref{sec:results_deepsad}, this effect capped achievable AP scores even for strong models. The underlying difficulty is not only label noise, but also the challenge of collecting labeled subsets that are representative of the full range of anomalies.
To our knowledge, no public datasets with explicit ground truth for \rev{LiDAR} degradation exist. Even if such data were collected, for example with additional smoke sensors, it is unclear whether this would provide a usable ground truth. A smoke sensor measures only at a single point in space, while \rev{LiDAR} observes many points across the environment from a distance, so the two do not directly translate. In our dataset, we relied on the fact that clean and degraded experiments were clearly separated: data from degraded runs was collected only after artificial smoke had been released. However, the degree of degradation varied strongly within each run. Because the smoke originated from a single machine in the middle of the sensor platform's traversal path, early and late frames were often nearly as clear as those from clean experiments. This led to mislabeled frames at the run boundaries and limited the reliability of experiment-based evaluation. As shown in Section~\ref{sec:results_deepsad}, this effect capped achievable AP scores even for strong models. The underlying difficulty is not only label noise, but also the challenge of collecting labeled subsets that are representative of the full range of anomalies.
One promising direction is to evaluate degradation not directly on raw lidar frames but via its downstream impact. For example, future work could assess degradation based on discrepancies between a previously mapped 3D environment and the output of a SLAM algorithm operating under degraded conditions. In such a setup, subjective labeling may still be required in special cases (e.g., dense smoke clouds treated as solid obstacles by SLAM), but it would anchor evaluation more closely to the ultimate users of the data.
One promising direction is to evaluate degradation not directly on raw \rev{LiDAR} frames but via its downstream impact. For example, future work could assess degradation based on discrepancies between a previously mapped 3D environment and the output of a SLAM algorithm operating under degraded conditions. In such a setup, subjective labeling may still be required in special cases (e.g., dense smoke clouds treated as solid obstacles by SLAM), but it would anchor evaluation more closely to the ultimate users of the data.
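A minimal sketch of such a downstream-impact measure, assuming the reference map and the SLAM output are both available as (N, 3) point arrays (an assumption; this was not evaluated in the thesis), could be:

import numpy as np
from scipy.spatial import cKDTree

def map_discrepancy(slam_points, reference_points):
    # Mean distance from each point of the SLAM output under degraded
    # conditions to its nearest neighbour in the previously mapped
    # reference cloud; larger values indicate stronger downstream impact.
    tree = cKDTree(reference_points)
    distances, _ = tree.query(slam_points, k=1)
    return float(np.mean(distances))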
Finally, the binary ground truth employed here is insufficient for the quantification goal. As shown in Section~\ref{sec:results_inference}, DeepSAD's anomaly scores appear suitable not only for classification but also for expressing intermediate levels of degradation. Analog evaluation targets would therefore be highly valuable, as they would allow testing whether anomaly scores increase consistently with degradation severity, rather than only separating “normal” from “degraded.”
\newsection{conclusion_ad}{Insights into DeepSAD and AD for Degradation Quantification}
This work has shown that the DeepSAD principle is applicable to lidar degradation in hazardous environments and yields promising detection performance as well as runtime feasibility (see Sections~\ref{sec:results_deepsad} and~\ref{sec:setup_experiments_environment}). Compared to simpler baselines such as Isolation Forest and OCSVM, DeepSAD achieved much stronger separation between clean and degraded data. While OCSVM showed smoother but weaker separation and Isolation Forest produced high false positives even in clean runs, both DeepSAD variants maintained large high-precision regions before collapsing under mislabeled evaluation targets.
This work has shown that the DeepSAD principle is applicable to \rev{LiDAR} degradation in hazardous environments and yields promising detection performance as well as runtime feasibility (see Sections~\ref{sec:results_deepsad} and~\ref{sec:setup_experiments_environment}). Compared to simpler baselines such as Isolation Forest and OCSVM, DeepSAD achieved much stronger separation between clean and degraded data. While OCSVM showed smoother but weaker separation and Isolation Forest produced high false-positive rates even in clean runs, both DeepSAD variants maintained large high-precision regions before collapsing under mislabeled evaluation targets.
However, the semi-supervised component of DeepSAD did not improve results in our setting. In fact, adding a small number of labels often reduced performance due to overfitting to narrow subsets of anomalies; larger labeled sets stabilized training but still did not surpass the unsupervised regime (see Section~\ref{sec:results_deepsad}). This suggests that without representative and diverse labeled anomalies, unsupervised training remains the safer choice.
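For orientation, the semi-supervised component enters through the labeled term of the DeepSAD objective of Ruff et al.; in the standard notation, with \(n\) unlabeled samples \(\mathbf{x}_i\), \(m\) labeled samples \(\tilde{\mathbf{x}}_j\) with labels \(\tilde{y}_j \in \{-1,+1\}\), encoder \(\phi(\,\cdot\,;\mathcal{W})\) and hypersphere center \(\mathbf{c}\), it reads
\[
\min_{\mathcal{W}}\; \frac{1}{n+m}\sum_{i=1}^{n}\bigl\lVert \phi(\mathbf{x}_i;\mathcal{W})-\mathbf{c}\bigr\rVert^{2}
\;+\; \frac{\eta}{n+m}\sum_{j=1}^{m}\Bigl(\bigl\lVert \phi(\tilde{\mathbf{x}}_j;\mathcal{W})-\mathbf{c}\bigr\rVert^{2}\Bigr)^{\tilde{y}_j}
\;+\; \frac{\lambda}{2}\sum_{\ell=1}^{L}\bigl\lVert \mathbf{W}^{\ell}\bigr\rVert_{F}^{2},
\]
so labeled normals (\(\tilde{y}_j=+1\)) are pulled toward \(\mathbf{c}\) and labeled anomalies (\(\tilde{y}_j=-1\)) are pushed away, while the unlabeled regime keeps only the first and last terms. Overfitting to a small labeled set then amounts to pushing only a few specific anomaly modes away from the center.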
@@ -1116,13 +1120,13 @@ Finally, inference experiments showed that DeepSADs anomaly scores can track
Several promising avenues remain open for future exploration:
\begin{itemize}
\item \textbf{Temporal modeling:} Instead of treating frames independently, future methods could directly model the difference between temporally consecutive frames to capture dynamic aspects of degradation.
\item \textbf{Lidar intensity:} Lidar typically save an intensity value per point, indicating the strength of the reflected optical signal, which could be incorporated to improve degradation quantification.
\item \textbf{Sensor fusion:} Combining lidar with complementary sensors (e.g., ultrasonic sensors that penetrate dense clouds) could mitigate blind spots inherent to single-sensor evaluation.
\item \textbf{Input segmentation:} The DeepSAD architecture tested here processed full 360° lidar scans. This may obscure localized degradations. Segmenting point clouds into angular sectors and computing anomaly scores per sector could provide more fine-grained quantification. Preliminary tests in this direction were promising, but were not pursued further in this thesis.
\item \textbf{Cross-sensor generalization:} Current experiments assume identical sensor resolution. Extending the method to work across different lidar types, including those with varying angular resolutions, remains an open question and would enhance applicability in heterogeneous robotic fleets and allow the incorporation of more datasets during training.
\item \textbf{\rev{LiDAR} intensity:} \rev{LiDAR} sensors typically record an intensity value per point, indicating the strength of the reflected optical signal, which could be incorporated to improve degradation quantification.
\item \textbf{Sensor fusion:} Combining \rev{LiDAR} with complementary sensors (e.g., ultrasonic sensors that penetrate dense clouds) could mitigate blind spots inherent to single-sensor evaluation.
\item \textbf{Input segmentation:} The DeepSAD architecture tested here processed full 360° \rev{LiDAR} scans. This may obscure localized degradations. Segmenting point clouds into angular sectors and computing anomaly scores per sector (see the sketch after this list) could provide more fine-grained quantification. Preliminary tests in this direction were promising, but were not pursued further in this thesis.
\item \textbf{Cross-sensor generalization:} Current experiments assume identical sensor resolution. Extending the method to work across different \rev{LiDAR} types, including those with varying angular resolutions, remains an open question and would enhance applicability in heterogeneous robotic fleets and allow the incorporation of more datasets during training.
\end{itemize}
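To make the input-segmentation item above concrete, a minimal sketch (assuming scans as a range image of shape (rings, azimuth bins) and some already trained scorer, here a placeholder score_fn; both are assumptions) could be:

import numpy as np

def sector_scores(range_image, score_fn, n_sectors=8):
    # Split a 360° range image along the azimuth axis into equal angular
    # sectors and return one anomaly score per sector; score_fn stands in
    # for any trained per-input scorer, e.g. a DeepSAD forward pass.
    sectors = np.array_split(range_image, n_sectors, axis=1)
    return [float(score_fn(sector)) for sector in sectors]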
In summary, while this thesis demonstrates the feasibility of using anomaly detection for lidar degradation quantification, significant challenges remain. Chief among them are the definition and collection of ground truth, the development of analog evaluation targets, and architectural adaptations for more complex real-world scenarios. Addressing these challenges will be critical for moving from proof-of-concept to practical deployment in rescue robotics and beyond.
In summary, while this thesis demonstrates the feasibility of using anomaly detection for \rev{LiDAR} degradation quantification, significant challenges remain. Chief among them are the definition and collection of ground truth, the development of analog evaluation targets, and architectural adaptations for more complex real-world scenarios. Addressing these challenges will be critical for moving from proof-of-concept to practical deployment in rescue robotics and beyond.
% end mainmatter
% **************************************************************************************************

View File

@@ -595,9 +595,96 @@
author = {Zhang, Xiangyu and Zhou, Xinyu and Lin, Mengxiao and Sun, Jian},
year = {2018},
month = jun,
},
@article{bg_svm,
title = {Support-vector networks},
author = {Cortes, Corinna and Vapnik, Vladimir},
journal = {Machine learning},
volume = {20},
number = {3},
pages = {273--297},
year = {1995},
publisher = {Springer},
},
@article{bg_kmeans,
author = {Lloyd, S.},
journal = {IEEE Transactions on Information Theory},
title = {Least squares quantization in PCM},
year = {1982},
volume = {28},
number = {2},
pages = {129--137},
keywords = {Noise;Quantization (signal);Voltage;Receivers;Pulse
modulation;Sufficient conditions;Stochastic processes;Probabilistic
logic;Urban areas;Q measurement},
doi = {10.1109/TIT.1982.1056489},
},
@inproceedings{bg_dbscan,
author = {Ester, Martin and Kriegel, Hans-Peter and Sander, Jörg and Xu, Xiaowei},
editor = {Simoudis, Evangelos and Han, Jiawei and Fayyad, Usama M.},
booktitle = {KDD},
title = {A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise},
publisher = {AAAI Press},
isbn = {1-57735-004-9},
pages = {226--231},
url = {http://www.aaai.org/Library/KDD/1996/kdd96-037.php},
year = {1996},
},
@article{bg_pca,
author = {Pearson, Karl},
title = {LIII. On lines and planes of closest fit to systems of points in space},
journal = {The London, Edinburgh, and Dublin Philosophical Magazine and Journal of Science},
volume = {2},
number = {11},
pages = {559--572},
year = {1901},
publisher = {Taylor \& Francis},
doi = {10.1080/14786440109462720},
},
@article{bg_infomax,
author = {Linsker, R.},
journal = {Computer},
title = {Self-organization in a perceptual network},
year = {1988},
volume = {21},
number = {3},
pages = {105--117},
keywords = {Intelligent networks;Biological information
theory;Circuits;Biology computing;Animal
structures;Neuroscience;Genetics;System testing;Neural
networks;Constraint theory},
doi = {10.1109/2.36},
},
@article{bg_slam,
title = {On the Representation and Estimation of Spatial Uncertainty},
volume = {5},
issn = {1741-3176},
doi = {10.1177/027836498600500404},
number = {4},
journal = {The International Journal of Robotics Research},
publisher = {SAGE Publications},
author = {Smith, Randall C. and Cheeseman, Peter},
year = {1986},
month = dec,
pages = {56--68},
}

View File

@@ -1,7 +1,9 @@
\addcontentsline{toc}{chapter}{Abstract}
\begin{center}\Large\bfseries Abstract\end{center}\vspace*{1cm}\noindent
Autonomous robots are increasingly used in search and rescue (SAR) missions. In these missions, lidar sensors are often the most important source of environmental data. However, lidar data can degrade under hazardous conditions, especially when airborne particles such as smoke or dust are present. This degradation can lead to errors in mapping and navigation and may endanger both the robot and humans. Robots therefore need a way to estimate the reliability of their lidar data, so they can make better informed decisions.
Autonomous robots are increasingly used in search and rescue (SAR) missions. In these missions, lidar sensors are often the most important source of environmental data. However, lidar data can degrade under hazardous conditions, especially when airborne particles such as smoke or dust are present. This degradation can lead to errors in mapping and navigation and may endanger both the robot and humans. Robots therefore need a way to estimate the reliability of their lidar data, so \rev{that} they can make better informed decisions.
\newline
This thesis investigates whether anomaly detection methods can be used to quantify lidar data degradation. We apply a semi-supervised deep learning approach called DeepSAD which produces an anomaly score for each lidar scan, serving as a measure of data reliability.
This thesis investigates whether anomaly detection methods can be used to quantify lidar data degradation \rev{caused by airborne particles such as smoke and dust}. We apply a semi-supervised deep learning approach called DeepSAD, which produces an anomaly score for each lidar scan, serving as a measure of data reliability.
\newline
We evaluate this method against baseline methods on a subterranean dataset that includes lidar scans degraded by artificial smoke. Our results show that DeepSAD consistently outperforms the baselines and can clearly distinguish degraded from normal scans. At the same time, we find that the limited availability of labeled data and the lack of robust ground truth remain major challenges. Despite these limitations, our work demonstrates that anomaly detection methods are a promising tool for lidar degradation quantification in SAR scenarios.