Compare commits: 5287f2c557...5aca00ad67 (3 commits)

Commits in this range: 5aca00ad67, 374420727b, 8697c07c0f
BIN thesis/Main.pdf (binary file not shown)
@@ -53,9 +53,9 @@
 % **************************************************************************************************
 % template setup -- do not change these unless you know what you are doing!
-\input{./base/documentclass_\DocumentType}
+\input{./base/documentclass_thesis}
 \input{./base/packages}
-\input{./base/layout_\DocumentType}
+\input{./base/layout_thesis}
 \input{./base/macros}

 % **************************************************************************************************
@@ -156,26 +156,27 @@

 % variable for page numbering
 \newcounter{mypageno}
-% **************************************************************************************************
-\begin{document}
-% **************************************************************************************************
 \input{./base/syntax_formatting}

 % for thesis: switch to frontmatter (Roman numbering, etc.)
-\ifthenelse{\equal{\DocumentType}{thesis}}
+\ifthenelse{\equal{thesis}{thesis}}
 {
 \frontmatter \pagestyle{plain} \pagenumbering{Roman}
 }{}
+% **************************************************************************************************
+\begin{document}
+% **************************************************************************************************

 %title
-\input{./base/titlepage_\DocumentType}
+\input{./base/titlepage_thesis}

 % for thesis: abstract, kurzfassung, affidavit and statutory declaration
-\ifthenelse{\equal{\DocumentType}{thesis}}
+\ifthenelse{\equal{thesis}{thesis}}
 {
 \emptydoublepage
 \addcontentsline{toc}{chapter}{Statutory Declaration}
-\input{./base/declaration_\DocumentLanguage}
+\input{./base/declaration_en}
 \emptydoublepage
 \input{thesis_preamble/acknowledgements}
 \emptydoublepage
@@ -187,7 +188,7 @@

 \tableofcontents

-\ifthenelse{\equal{\DocumentType}{thesis}}
+\ifthenelse{\equal{thesis}{thesis}}
 {
 \emptydoublepage
 \setcounter{mypageno}{\value{page}}
@@ -608,7 +609,7 @@ To obtain robust performance estimates on our relatively small dataset, we imple

 For inference (i.e.\ model validation on held-out experiments), we provide a second \texttt{Dataset} class that loads a single experiment's NumPy file (no k-fold splitting) and neither assigns labels to the frames nor shuffles them, preserving temporal order. This setup enables seamless, frame-by-frame scoring of complete runs, which is crucial for analyzing degradation dynamics over an entire experiment.

-\section{Model Configuration \& Evaluation Protocol}
+\section{Model Configuration}

 Since the neural network architecture trained in the \rev{DeepSAD} method is not fixed as described in \rev{Section}~\ref{sec:algorithm_details} but rather chosen based on the input data, we also had to choose an autoencoder architecture befitting our preprocessed \rev{LiDAR} data projections. Since \rev{\cite{degradation_quantification_rain}} reported success in training DeepSAD on similar data, we first adapted the network architecture they utilized to our use case, which is based on the simple and well-understood LeNet architecture~\cite{lenet}. Additionally, we were interested in evaluating the importance and impact of a well-suited network architecture on DeepSAD's performance and therefore designed a second network architecture, henceforth \rev{referred} to as the "efficient architecture", which incorporates a few modern techniques befitting our use case.

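
A minimal PyTorch sketch of the inference-time Dataset class described in the hunk above (one experiment's NumPy file, no labels, no shuffling, frames kept in temporal order). The class name, file name, and frame shape are assumptions for illustration, not the thesis codebase:

# Hypothetical sketch: load a single experiment's NumPy file and serve frames
# in temporal order, without labels and without shuffling.
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

class InferenceDataset(Dataset):
    def __init__(self, npy_path: str):
        # e.g. an array of shape (num_frames, 32, 2048) range-image projections
        self.frames = np.load(npy_path)

    def __len__(self):
        return len(self.frames)

    def __getitem__(self, idx):
        # no label is assigned; the index doubles as the frame's position in the run
        return torch.from_numpy(self.frames[idx]).float(), idx

dataset = InferenceDataset("experiment_042.npy")            # hypothetical file name
loader = DataLoader(dataset, batch_size=32, shuffle=False)  # shuffle=False preserves temporal order
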
@@ -743,22 +744,24 @@ To compare the computational efficiency of the two architectures we show the num

 \FloatBarrier

-\paragraph{Baseline methods (Isolation Forest, OCSVM)}
+\newsection{setup_baselines_evaluation}{Baseline Methods \& Evaluation Metrics}

+To contextualize the performance of DeepSAD, we compare against two widely used baselines: Isolation Forest and OCSVM. Both are included in the original DeepSAD codebase and associated paper, and they represent well-understood yet conceptually distinct families of anomaly detection. Together, these baselines provide complementary perspectives: raw input tree-based partitioning (Isolation Forest) and dimensionality-reduced kernel-based boundary learning (OCSVM), offering a broad and well-established basis for comparison.

-To contextualize the performance of DeepSAD, we compare against two widely used baselines: Isolation Forest and OCSVM. Both are included in the original DeepSAD codebase and the associated paper, and they represent well-understood but conceptually different families of anomaly detection. In our setting, the raw input dimensionality ($2048 \times 32$ per frame) is too high for a direct OCSVM fit, so we reuse the DeepSAD autoencoder’s \emph{encoder} as a learned dimensionality reduction (to the same latent size as DeepSAD), to allow OCSVM training on this latent space. Together, these two baselines cover complementary perspectives: raw input tree-based partitioning (Isolation Forest) and dimensionality reduced kernel-based boundary learning (OCSVM), providing a broad and well-established basis for comparison.
+\paragraph{Isolation Forest} is an ensemble method for anomaly detection that builds on the principle that anomalies are easier to separate from the rest of the data. It constructs many binary decision trees, each by recursively splitting the data at randomly chosen features and thresholds. In this process, the “training” step consists of building the forest of trees: each tree captures different random partitions of the input space, and together they form a diverse set of perspectives on how easily individual samples can be isolated.

-Isolation Forest is an ensemble method for anomaly detection that builds on the principle that anomalies are easier to separate from the rest of the data. It constructs many binary decision trees, each by recursively splitting the data at randomly chosen features and thresholds. In this process, the “training” step consists of building the forest of trees: each tree captures different random partitions of the input space, and together they form a diverse set of perspectives on how easily individual samples can be isolated.

 Once trained, the method assigns an anomaly score to new samples by measuring their average path length through the trees. Normal samples, being surrounded by other similar samples, typically require many recursive splits and thus end up deep in the trees. Anomalies, by contrast, stand out in one or more features, which means they can be separated much earlier and end up closer to the root. The shorter the average path length, the more anomalous the sample is considered. This makes Isolation Forest highly scalable and robust: training is efficient and the resulting model is fast to apply to new data. In our setup, we apply Isolation Forest directly to the \rev{LiDAR} input representation, providing a strong non-neural baseline for comparison against DeepSAD.
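
The Isolation Forest baseline described in the two paragraphs above can be illustrated with scikit-learn; the file names, the flattened frame shape, and the hyperparameters are assumptions, and only the scoring convention (shorter average path length means more anomalous) follows the text:

# Illustrative only: Isolation Forest applied directly to flattened LiDAR frames.
import numpy as np
from sklearn.ensemble import IsolationForest

# hypothetical training/test frames, flattened to (num_frames, 32 * 2048)
X_train = np.load("train_frames.npy").reshape(-1, 32 * 2048)
X_test = np.load("test_frames.npy").reshape(-1, 32 * 2048)

forest = IsolationForest(n_estimators=100, random_state=0)
forest.fit(X_train)  # "training" = building the forest of random trees

# score_samples is higher for normal points; negate so that larger values mean
# more anomalous (i.e. shorter average path length through the trees)
anomaly_score = -forest.score_samples(X_test)
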

-OCSVM takes a very different approach by learning a flexible boundary around normal samples. It assumes all training data to be normal, with the goal of enclosing the majority of these samples in such a way that new points lying outside this boundary can be identified as anomalies.
+\paragraph{OCSVM} takes a very different approach by learning a flexible boundary around normal samples. It assumes all training data to be normal, with the goal of enclosing the majority of these samples in such a way that new points lying outside this boundary can be identified as anomalies. The boundary itself is learned using the support vector machine framework. In essence, OCSVM looks for a hyperplane in some feature space that maximizes the separation between the bulk of the data and the origin. To make this possible even when the normal data has a complex, curved shape, OCSVM uses a kernel function such as the radial basis function (RBF). The kernel implicitly maps the input data into a higher-dimensional space, where the cluster of normal samples becomes easier to separate with a simple hyperplane. When this separation is mapped back to the original input space, it corresponds to a flexible, nonlinear boundary that can adapt to the structure of the data.

-The boundary itself is learned using the support vector machine framework. In essence, OCSVM looks for a hyperplane in some feature space that maximizes the separation between the bulk of the data and the origin. To make this possible even when the normal data has a complex, curved shape, OCSVM uses a kernel function such as the radial basis function (RBF). The kernel implicitly maps the input data into a higher-dimensional space, where the cluster of normal samples becomes easier to separate with a simple hyperplane. When this separation is mapped back to the original input space, it corresponds to a flexible, nonlinear boundary that can adapt to the structure of the data.

 During training, the algorithm balances two competing objectives: capturing as many of the normal samples as possible inside the boundary, while keeping the region compact enough to exclude potential outliers. Once this boundary is established, applying OCSVM is straightforward — any new data point is checked against the learned boundary, with points inside considered normal and those outside flagged as anomalous.

-We adapted the baseline implementations to our data loader and input format, and added support for multiple evaluation targets per frame (two labels per data point), reporting both results per experiment. For OCSVM, the dimensionality reduction step is \emph{always} performed with the corresponding DeepSAD encoder and its autoencoder pretraining weights that match the evaluated setting (i.e., same latent size and backbone). Both baselines, like DeepSAD, output continuous anomaly scores. This allows us to evaluate them directly without committing to a fixed threshold.
+In our setting, the raw input dimensionality ($2048 \times 32$ per frame) is too high for a direct OCSVM fit, so we reuse the autoencoder’s encoder from DeepSAD's pretraining as a learned dimensionality reduction (to the same latent size as DeepSAD) to allow OCSVM training on this latent space. The dimensionality reduction step is always performed with the corresponding DeepSAD encoder and its autoencoder pretraining weights that match the evaluated setting (i.e., same latent size and backbone).
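
A sketch of the OCSVM-on-latent-space setup described above, with a stand-in linear encoder in place of the pretrained DeepSAD encoder; the latent size, file names, and hyperparameters are illustrative assumptions:

# Illustrative only: encode frames into a low-dimensional latent space, then fit
# an RBF one-class SVM around the (assumed normal) training data.
import numpy as np
import torch
import torch.nn as nn
from sklearn.svm import OneClassSVM

# stand-in for the pretrained DeepSAD encoder (the real one is a conv net)
encoder = nn.Sequential(nn.Flatten(), nn.Linear(32 * 2048, 128))

@torch.no_grad()
def encode(frames: np.ndarray) -> np.ndarray:
    # map (N, 32, 2048) frames to the latent space, here (N, 128)
    return encoder(torch.from_numpy(frames).float()).cpu().numpy()

Z_train = encode(np.load("train_frames.npy"))  # assumed-normal training frames
Z_test = encode(np.load("test_frames.npy"))

ocsvm = OneClassSVM(kernel="rbf", nu=0.1, gamma="scale")  # RBF boundary around the normal cluster
ocsvm.fit(Z_train)

# decision_function is positive inside the learned boundary; negate so that
# larger values mean "more anomalous", matching the other methods' scores
anomaly_score = -ocsvm.decision_function(Z_test)
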

+We adapted the baseline implementations to our data loader and input format and added support for multiple evaluation targets per frame (two labels per data point), reporting both results per experiment. Both baselines, like DeepSAD, output continuous anomaly scores, which allows us to evaluate them directly without committing to a fixed threshold.

+TODO: transition to evaluation metrics. Discuss typical ones such as F1 scores (single threshold), then move on to ROC AUC, which is well known but can suffer under class imbalance (especially pronounced in our case); possibly include the calculation and an example. Note that we saw exactly these problems in our results, so we decided to report mAP, which is similar to ROC AUC but less sensitive to class imbalance (show with a formula why it is not). Then explain that it is essentially the AUC of PRCs, which are better-suited curves for our use case due to the same stability under class imbalance (like mAP) but across multiple thresholds (unlike F1), and whose shape can also give more insight than mAP alone.
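
The metrics listed in the TODO note above (ROC AUC, average precision / mAP, and the precision-recall curve) can all be computed from continuous anomaly scores without fixing a threshold; the labels and scores below are toy values chosen only to show the scikit-learn calls:

# Toy example: compare ROC AUC and average precision on an imbalanced label set.
import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve

y_true = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])  # heavy class imbalance
scores = np.array([0.1, 0.2, 0.15, 0.3, 0.05, 0.4, 0.25, 0.35, 0.9, 0.45])

print("ROC AUC:", roc_auc_score(y_true, scores))
print("average precision (area under the PR curve):", average_precision_score(y_true, scores))

# the PR curve itself, whose shape shows how precision degrades across thresholds
precision, recall, thresholds = precision_recall_curve(y_true, scores)
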

 \newsection{setup_experiments_environment}{Experiment Overview \& Computational Environment}

@@ -1146,7 +1149,7 @@ In summary, while this thesis demonstrates the feasibility of using anomaly dete
 % **************************************************************************************************

 \appendix
-\ifthenelse{\equal{\DocumentType}{thesis}}
+\ifthenelse{\equal{thesis}{thesis}}
 {
 \setcounter{mypageno}{\value{page}}
 \frontmatter \pagestyle{plain} \pagenumbering{Roman}
@@ -24,15 +24,12 @@
 not used other than the declared sources/resources, and that I have
 explicitly indicated all material which has been quoted either
 literally or by content from the sources used.
-\ifthenelse{\equal{\ThesisTitle}{master's thesis} \or
-\equal{\ThesisTitle}{diploma thesis} \or
-\equal{\ThesisTitle}{doctoral thesis}}
-{The text document uploaded to TUGRAZonline is identical to the present \ThesisTitle.}{\reminder{TODO: fix \textbackslash ThesisTitle}}
+The text document uploaded to TUGRAZonline is identical to the present \ThesisTitle.


 \par\vspace*{4cm}
 \centerline{
 \begin{tabular}{m{1.5cm}cm{1.5cm}m{3cm}m{1.5cm}cm{1.5cm}}
 \cline{1-3} \cline{5-7}
-& date & & & & (signature) &\\
+& date & & & & (signature) & \\
 \end{tabular}}
thesis/filters/drop-images.lua (new file, 11 lines)
@@ -0,0 +1,11 @@
+-- drop-images.lua
+-- Replaces all images (figures, graphics) with a short placeholder.
+function Image(el) return pandoc.Str("[image omitted]") end
+
+-- For LaTeX figures that are still raw
+function RawBlock(el)
+  if el.format == "tex" and el.text:match("\\begin%s*{%s*figure%s*}") then
+    return pandoc.Plain({pandoc.Str("[figure omitted]")})
+  end
+end
thesis/filters/drop-tables.lua (new file, 11 lines)
@@ -0,0 +1,11 @@
+-- drop-tables.lua
+-- Removes LaTeX tabular and tabularx environments (and their contents).
+function RawBlock(el)
+  if el.format == "tex" then
+    -- Check for tabular or tabularx environment
+    if el.text:match("\\begin%s*{%s*tabularx?%s*}") then
+      return pandoc.Plain({pandoc.Str("[table omitted]")})
+    end
+  end
+end
thesis/filters/keep-citations.lua (new file, 43 lines)
@@ -0,0 +1,43 @@
+-- keep-citations.lua
+-- Replace citations with a placeholder and eat any preceding space.
+local PH = "[citation]"
+
+-- Pandoc-native citations (if the reader produced Cite nodes)
+function Cite(el) return pandoc.Str(PH) end
+
+-- Raw LaTeX \cite-like macros (when not parsed as Cite)
+function RawInline(el)
+  if el.format and el.format:match("tex") and el.text:match("\\%a-*cite%*?") then
+    return pandoc.Str(PH)
+  end
+end
+
+-- Remove a single leading Space before our placeholder
+local function squash_spaces(inlines)
+  local out = {}
+  local i = 1
+  while i <= #inlines do
+    local cur = inlines[i]
+    local nxt = inlines[i + 1]
+    if cur and cur.t == "Space" and nxt and nxt.t == "Str" and nxt.text == PH then
+      table.insert(out, nxt)
+      i = i + 2
+    else
+      table.insert(out, cur)
+      i = i + 1
+    end
+  end
+  return out
+end
+
+function Para(el)
+  el.content = squash_spaces(el.content)
+  return el
+end
+
+function Plain(el)
+  el.content = squash_spaces(el.content)
+  return el
+end
thesis/filters/math-omit.lua (new file, 48 lines)
@@ -0,0 +1,48 @@
+-- math-omit.lua
+-- Replace any math with a placeholder and ensure a space before it when appropriate.
+local PH = "[math omitted]"
+
+function Math(el)
+  -- Emit the placeholder as a Str; spacing is fixed in Para/Plain below.
+  return pandoc.Str(PH)
+end
+
+local function ensure_space_before_ph(inlines)
+  local out = {}
+  for i = 1, #inlines do
+    local cur = inlines[i]
+    if cur.t == "Str" and cur.text == PH then
+      local prev = out[#out]
+      local need_space = true
+
+      -- No space if it's the first token in the block
+      if not prev then
+        need_space = false
+      elseif prev.t == "Space" then
+        need_space = false
+      elseif prev.t == "Str" then
+        -- If previous char is an opening bracket/paren/slash/hyphen or whitespace, skip
+        local last = prev.text:sub(-1)
+        if last:match("[%(%[%{%/%-]") or last:match("%s") then
+          need_space = false
+        end
+      end
+
+      if need_space then table.insert(out, pandoc.Space()) end
+      table.insert(out, cur)
+    else
+      table.insert(out, cur)
+    end
+  end
+  return out
+end
+
+function Para(el)
+  el.content = ensure_space_before_ph(el.content)
+  return el
+end
+
+function Plain(el)
+  el.content = ensure_space_before_ph(el.content)
+  return el
+end
@@ -28,7 +28,10 @@
 zathura
 wmctrl
 python312
+pandoc
+pandoc-lua-filters
 ];
+filtersPath = "${pkgs.pandoc-lua-filters}/share/pandoc/filters";
 in
 {
 devShell = pkgs.mkShell {
@@ -39,6 +42,28 @@
 ];
 };

+shellHook = ''
+  set -eu
+  # local folder in your repo to reference in commands
+  link_target="pandoc-filters"
+  # refresh symlink each time you enter the shell
+  ln -sfn ${filtersPath} "$link_target"
+  echo "Linked $link_target -> ${filtersPath}"
+
+  # (optional) write a defaults file that uses the relative symlink
+  if [ ! -f pandoc.defaults.yaml ]; then
+    cat > pandoc.defaults.yaml <<'YAML'
+from: latex
+to: plain
+wrap: none
+lua-filter:
+- pandoc-filters/latex-hyphen.lua
+- pandoc-filters/pandoc-quotes.lua
+YAML
+    echo "Wrote pandoc.defaults.yaml"
+  fi
+'';
+
 }
 );
}
thesis/tex2plaintext.sh (new executable file, 61 lines)
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Usage:
+#   ./tex2plaintext.sh [INPUT_TEX] [OUT_BASENAME]
+#
+# Defaults:
+#   INPUT_TEX    = Main.tex (your original file name)
+#   OUT_BASENAME = thesis (produces thesis.txt, thesis_part1.txt, thesis_part2.txt)
+
+INPUT_TEX="${1:-Main.tex}"
+OUT_BASE="${2:-thesis}"
+
+FLAT_TEX="flat.tex"
+NO_TABLES_TEX="flat_notables.tex"
+PLAIN_TXT="${OUT_BASE}.txt"
+PART1_TXT="${OUT_BASE}_part1.txt"
+PART2_TXT="${OUT_BASE}_part2.txt"
+MARKER="Data and Preprocessing"
+
+echo "[1/5] Flattening with latexpand -> ${FLAT_TEX}"
+latexpand "${INPUT_TEX}" > "${FLAT_TEX}"
+
+echo "[2/5] Removing tabular/tabularx environments -> ${NO_TABLES_TEX}"
+# Replace entire tabular / tabularx environments with a placeholder
+perl -0777 -pe 's/\\begin\{(tabularx?)\}.*?\\end\{\1\}/[table omitted]/gs' \
+  "${FLAT_TEX}" > "${NO_TABLES_TEX}"
+
+echo "[3/5] Converting to plain text with pandoc -> ${PLAIN_TXT}"
+pandoc -f latex -t plain --wrap=none \
+  --lua-filter=filters/keep-citations.lua \
+  --lua-filter=filters/math-omit.lua \
+  "${NO_TABLES_TEX}" -o "${PLAIN_TXT}"
+
+echo "[4/5] Replacing [] placeholders with [figure]"
+sed -i 's/\[\]/[figure]/g' "${PLAIN_TXT}"
+
+echo "[5/5] Splitting ${PLAIN_TXT} before the marker line: \"${MARKER}\""
+
+# Ensure the marker exists exactly on its own line
+if ! grep -xq "${MARKER}" "${PLAIN_TXT}"; then
+  echo "ERROR: Marker line not found exactly as \"${MARKER}\" in ${PLAIN_TXT}."
+  echo "       (It must be the only content on that line.)"
+  exit 1
+fi
+
+# Clean previous outputs if present
+rm -f -- "${PART1_TXT}" "${PART2_TXT}"
+
+# Split so the marker line becomes the FIRST line of part 2
+awk -v marker="${MARKER}" -v out1="${PART1_TXT}" -v out2="${PART2_TXT}" '
+  BEGIN { current = out1 }
+  $0 == marker { current = out2; print $0 > current; next }
+  { print $0 > current }
+' "${PLAIN_TXT}"
+
+echo "Done."
+echo " - ${PLAIN_TXT}"
+echo " - ${PART1_TXT}"
+echo " - ${PART2_TXT}"