cleanup for raw txt (grammar check)

This commit is contained in:
Jan Kowalczyk
2025-10-18 12:19:26 +02:00
parent 8697c07c0f
commit 374420727b
7 changed files with 162 additions and 19 deletions

View File

@@ -53,9 +53,9 @@
% ************************************************************************************************** % **************************************************************************************************
% template setup -- do not change these unless you know what you are doing! % template setup -- do not change these unless you know what you are doing!
\input{./base/documentclass_\DocumentType} \input{./base/documentclass_thesis}
\input{./base/packages} \input{./base/packages}
\input{./base/layout_\DocumentType} \input{./base/layout_thesis}
\input{./base/macros} \input{./base/macros}
% ************************************************************************************************** % **************************************************************************************************
@@ -156,26 +156,27 @@
% variable for page numbering % variable for page numbering
\newcounter{mypageno} \newcounter{mypageno}
% **************************************************************************************************
\begin{document}
% **************************************************************************************************
\input{./base/syntax_formatting} \input{./base/syntax_formatting}
% for thesis: switch to frontmatter (Roman numbering, etc.) % for thesis: switch to frontmatter (Roman numbering, etc.)
\ifthenelse{\equal{\DocumentType}{thesis}} \ifthenelse{\equal{thesis}{thesis}}
{ {
\frontmatter \pagestyle{plain} \pagenumbering{Roman} \frontmatter \pagestyle{plain} \pagenumbering{Roman}
}{} }{}
% **************************************************************************************************
\begin{document}
% **************************************************************************************************
%title %title
\input{./base/titlepage_\DocumentType} \input{./base/titlepage_thesis}
% for thesis: abstract, kurzfassung, affidavit and statutory declaration % for thesis: abstract, kurzfassung, affidavit and statutory declaration
\ifthenelse{\equal{\DocumentType}{thesis}} \ifthenelse{\equal{thesis}{thesis}}
{ {
\emptydoublepage \emptydoublepage
\addcontentsline{toc}{chapter}{Statutory Declaration} \addcontentsline{toc}{chapter}{Statutory Declaration}
\input{./base/declaration_\DocumentLanguage} \input{./base/declaration_en}
\emptydoublepage \emptydoublepage
\input{thesis_preamble/acknowledgements} \input{thesis_preamble/acknowledgements}
\emptydoublepage \emptydoublepage
@@ -187,7 +188,7 @@
\tableofcontents \tableofcontents
\ifthenelse{\equal{\DocumentType}{thesis}} \ifthenelse{\equal{thesis}{thesis}}
{ {
\emptydoublepage \emptydoublepage
\setcounter{mypageno}{\value{page}} \setcounter{mypageno}{\value{page}}
@@ -1148,7 +1149,7 @@ In summary, while this thesis demonstrates the feasibility of using anomaly dete
% ************************************************************************************************** % **************************************************************************************************
\appendix \appendix
\ifthenelse{\equal{\DocumentType}{thesis}} \ifthenelse{\equal{thesis}{thesis}}
{ {
\setcounter{mypageno}{\value{page}} \setcounter{mypageno}{\value{page}}
\frontmatter \pagestyle{plain} \pagenumbering{Roman} \frontmatter \pagestyle{plain} \pagenumbering{Roman}

View File

@@ -24,15 +24,12 @@
not used other than the declared sources/resources, and that I have not used other than the declared sources/resources, and that I have
explicitly indicated all material which has been quoted either explicitly indicated all material which has been quoted either
literally or by content from the sources used. literally or by content from the sources used.
\ifthenelse{\equal{\ThesisTitle}{master's thesis} \or The text document uploaded to TUGRAZonline is identical to the present \ThesisTitle.
\equal{\ThesisTitle}{diploma thesis} \or
\equal{\ThesisTitle}{doctoral thesis}}
{The text document uploaded to TUGRAZonline is identical to the present \ThesisTitle.}{\reminder{TODO: fix \textbackslash ThesisTitle}}
\par\vspace*{4cm} \par\vspace*{4cm}
\centerline{ \centerline{
\begin{tabular}{m{1.5cm}cm{1.5cm}m{3cm}m{1.5cm}cm{1.5cm}} \begin{tabular}{m{1.5cm}cm{1.5cm}m{3cm}m{1.5cm}cm{1.5cm}}
\cline{1-3} \cline{5-7} \cline{1-3} \cline{5-7}
& date & & & & (signature) &\\ & date & & & & (signature) & \\
\end{tabular}} \end{tabular}}

11
thesis/drop-images.lua Normal file
View File

@@ -0,0 +1,11 @@
-- drop-images.lua
-- Pandoc Lua filter: replaces images and raw LaTeX figure environments with a
-- short text placeholder.

-- Raw TeX can be tagged "tex" or "latex" (NOTE(review): which tag is used
-- depends on the pandoc reader -- confirm against the pandoc version in use).
-- Accept both so that raw figures are not silently passed through.
local function is_tex_format(format)
  return format == "tex" or format == "latex"
end

-- Every Image inline becomes the literal text "[image omitted]".
function Image(el)
  return pandoc.Str("[image omitted]")
end

-- For LaTeX figures that are still raw: a raw TeX block whose text contains
-- \begin{figure} (or the starred \begin{figure*}) is replaced, as a whole, by
-- a Plain block holding "[figure omitted]".
-- Returning nothing leaves every other raw block unchanged.
function RawBlock(el)
  if is_tex_format(el.format)
      and el.text:match("\\begin%s*{%s*figure%*?%s*}") then
    return pandoc.Plain({pandoc.Str("[figure omitted]")})
  end
end

11
thesis/drop-tables.lua Normal file
View File

@@ -0,0 +1,11 @@
-- drop-tables.lua
-- Pandoc Lua filter: replaces any raw TeX block that contains a tabular or
-- tabularx environment with the placeholder "[table omitted]". The whole raw
-- block is replaced, not just the environment inside it.
function RawBlock(el)
  -- Raw TeX can be tagged "tex" or "latex" (NOTE(review): which tag is used
  -- depends on the pandoc reader -- confirm against the pandoc version in
  -- use). Accept both; anything else is left untouched.
  if el.format ~= "tex" and el.format ~= "latex" then
    return nil
  end
  -- "tabularx?" covers both \begin{tabular} and \begin{tabularx}.
  if el.text:match("\\begin%s*{%s*tabularx?%s*}") then
    return pandoc.Plain({pandoc.Str("[table omitted]")})
  end
end

View File

@@ -28,7 +28,10 @@
zathura zathura
wmctrl wmctrl
python312 python312
pandoc
pandoc-lua-filters
]; ];
filtersPath = "${pkgs.pandoc-lua-filters}/share/pandoc/filters";
in in
{ {
devShell = pkgs.mkShell { devShell = pkgs.mkShell {
@@ -39,6 +42,28 @@
]; ];
}; };
# Dev-shell hook: symlinks the pandoc-lua-filters directory into the working
# directory and, on first entry, writes a pandoc defaults file that refers to
# it. NOTE(review): confirm this attribute sits inside the pkgs.mkShell
# argument set; the surrounding braces are not fully visible in this hunk.
shellHook = ''
  # The hook is evaluated in the interactive shell itself, so a bare
  # `set -eu` would stay active afterwards and make the shell exit on the
  # first failing command or unset variable. Confine it to a subshell.
  (
    set -eu

    # local folder in your repo to reference in commands
    link_target="pandoc-filters"

    # refresh symlink each time you enter the shell
    ln -sfn "${filtersPath}" "$link_target"
    echo "Linked $link_target -> ${filtersPath}"

    # (optional) write a defaults file that uses the relative symlink.
    # printf is used instead of a heredoc so the result does not depend on
    # how Nix strips indentation around the heredoc terminator.
    if [ ! -f pandoc.defaults.yaml ]; then
      printf '%s\n' \
        'from: latex' \
        'to: plain' \
        'wrap: none' \
        'lua-filter:' \
        '  - pandoc-filters/latex-hyphen.lua' \
        '  - pandoc-filters/pandoc-quotes.lua' \
        > pandoc.defaults.yaml
      echo "Wrote pandoc.defaults.yaml"
    fi
  ) || echo "warning: pandoc filter setup failed; continuing without it" >&2
'';
} }
); );
} }

43
thesis/keep-citations.lua Normal file
View File

@@ -0,0 +1,43 @@
-- keep-citations.lua
-- Pandoc Lua filter: replaces citations with a placeholder and eats the single
-- Space that directly precedes each placeholder.
local PH = "[citation]"

-- Pandoc-native citations (if the reader produced Cite nodes).
function Cite(el)
  return pandoc.Str(PH)
end

-- Raw LaTeX \cite-like macros (when not parsed as Cite), e.g. \cite, \citep,
-- \autocite, \textcite and their starred forms.
--
-- The pattern is a backslash, any run of letters, then "cite". The previous
-- pattern "\\%a-*cite%*?" put a "*" after the already-quantified "%a-"; Lua
-- reads that "*" as a literal asterisk, so ordinary \cite{...} never matched.
function RawInline(el)
  if el.format and el.format:match("tex") and el.text:match("\\%a*cite") then
    return pandoc.Str(PH)
  end
  -- Returning nothing leaves other raw inlines unchanged.
end

-- Returns a new list of inlines in which every Space immediately followed by
-- the placeholder Str has been dropped. All other elements are kept in order.
local function squash_spaces(inlines)
  local out = {}
  local i = 1
  while i <= #inlines do
    local cur = inlines[i]
    local nxt = inlines[i + 1]
    local space_before_placeholder =
      cur and cur.t == "Space" and nxt and nxt.t == "Str" and nxt.text == PH
    if space_before_placeholder then
      -- Skip the Space, keep the placeholder, and move past both.
      table.insert(out, nxt)
      i = i + 2
    else
      table.insert(out, cur)
      i = i + 1
    end
  end
  return out
end

-- Block-level handlers: squash spaces in paragraph and plain-text content.
-- NOTE(review): this relies on pandoc running the inline handlers above before
-- these block handlers, so the placeholders already exist -- confirm against
-- the traversal order of the pandoc version in use.
function Para(el)
  el.content = squash_spaces(el.content)
  return el
end

function Plain(el)
  el.content = squash_spaces(el.content)
  return el
end

55
thesis/tex2plaintext.sh Executable file
View File

@@ -0,0 +1,55 @@
#!/usr/bin/env bash
#
# tex2plaintext.sh -- flatten a LaTeX document, replace its tabular/tabularx
# environments with a placeholder, convert the result to plain text with
# pandoc, and split that text into two parts at a marker line.
#
# Usage:
#   ./tex2plaintext.sh [INPUT_TEX] [OUT_BASENAME]
#
# Defaults:
#   INPUT_TEX    = Main.tex
#   OUT_BASENAME = thesis (produces thesis.txt, thesis_part1.txt, thesis_part2.txt)
#
# Uses: latexpand, perl, pandoc, grep, awk.
# The intermediate files flat.tex and flat_notables.tex are written to the
# current directory and are not removed afterwards.
#
# Exit status: 1 if the input file is missing or the marker line is not found;
# otherwise the first failing command aborts the script (set -e).
set -euo pipefail

INPUT_TEX="${1:-Main.tex}"
OUT_BASE="${2:-thesis}"

FLAT_TEX="flat.tex"
NO_TABLES_TEX="flat_notables.tex"
PLAIN_TXT="${OUT_BASE}.txt"
PART1_TXT="${OUT_BASE}_part1.txt"
PART2_TXT="${OUT_BASE}_part2.txt"
# The plain-text output is split immediately before the line equal to this.
MARKER="Data and Preprocessing"

# Fail early with a clear message instead of a tool-specific error later on.
if [[ ! -f "${INPUT_TEX}" ]]; then
  echo "ERROR: Input file not found: ${INPUT_TEX}" >&2
  exit 1
fi

echo "[1/4] Flattening with latexpand -> ${FLAT_TEX}"
latexpand "${INPUT_TEX}" > "${FLAT_TEX}"

echo "[2/4] Removing tabular/tabularx environments -> ${NO_TABLES_TEX}"
# Replace entire tabular / tabularx environments with a placeholder.
# -0777 slurps the whole file so the non-greedy match can span lines.
perl -0777 -pe 's/\\begin\{(tabularx?)\}.*?\\end\{\1\}/[table omitted]/gs' \
  "${FLAT_TEX}" > "${NO_TABLES_TEX}"

echo "[3/4] Converting to plain text with pandoc -> ${PLAIN_TXT}"
pandoc -f latex -t plain --wrap=none "${NO_TABLES_TEX}" -o "${PLAIN_TXT}"

echo "[4/4] Splitting ${PLAIN_TXT} before the marker line: \"${MARKER}\""
# Ensure the marker exists exactly on its own line.
# -F treats the marker as a literal string rather than a regular expression;
# -x requires it to match the whole line.
if ! grep -Fxq -- "${MARKER}" "${PLAIN_TXT}"; then
  echo "ERROR: Marker line not found exactly as \"${MARKER}\" in ${PLAIN_TXT}." >&2
  echo "       (It must be the only content on that line.)" >&2
  exit 1
fi

# Clean previous outputs if present
rm -f -- "${PART1_TXT}" "${PART2_TXT}"

# Split so the marker line becomes the FIRST line of part 2
awk -v marker="${MARKER}" -v out1="${PART1_TXT}" -v out2="${PART2_TXT}" '
  BEGIN { current = out1 }
  $0 == marker { current = out2; print $0 > current; next }
  { print $0 > current }
' "${PLAIN_TXT}"

echo "Done."
echo " - ${PLAIN_TXT}"
echo " - ${PART1_TXT}"
echo " - ${PART2_TXT}"