cleanup for raw txt (grammar check)

2025-10-18 12:19:26 +02:00
parent 8697c07c0f
commit 374420727b
7 changed files with 162 additions and 19 deletions
--- a/thesis/Main.tex
+++ b/thesis/Main.tex
@@ -53,9 +53,9 @@
 % **************************************************************************************************
 % template setup -- do not change these unless you know what you are doing!
-\input{./base/documentclass_\DocumentType}
+\input{./base/documentclass_thesis}
 \input{./base/packages}
-\input{./base/layout_\DocumentType}
+\input{./base/layout_thesis}
 \input{./base/macros}
 % **************************************************************************************************
@@ -156,26 +156,27 @@
 % variable for page numbering
 \newcounter{mypageno}
-% **************************************************************************************************
+
 \begin{document}
 % **************************************************************************************************
 \input{./base/syntax_formatting}
 % for thesis: switch to frontmatter (Roman numbering, etc.)
-\ifthenelse{\equal{\DocumentType}{thesis}}
+\ifthenelse{\equal{thesis}{thesis}}
 {
 	\frontmatter \pagestyle{plain} \pagenumbering{Roman}
 }{}
 % **************************************************************************************************
 \begin{document}
 % **************************************************************************************************
 %title
-\input{./base/titlepage_\DocumentType}
+\input{./base/titlepage_thesis}
 % for thesis: abstract, kurzfassung, affidavit and statutory declaration
-\ifthenelse{\equal{\DocumentType}{thesis}}
+\ifthenelse{\equal{thesis}{thesis}}
 {
 	\emptydoublepage
 	\addcontentsline{toc}{chapter}{Statutory Declaration}
-	\input{./base/declaration_\DocumentLanguage}
+	\input{./base/declaration_en}
 	\emptydoublepage
 	\input{thesis_preamble/acknowledgements}
 	\emptydoublepage
@@ -187,7 +188,7 @@
 \tableofcontents
-\ifthenelse{\equal{\DocumentType}{thesis}}
+\ifthenelse{\equal{thesis}{thesis}}
 {
 	\emptydoublepage
 	\setcounter{mypageno}{\value{page}}
@@ -1148,7 +1149,7 @@ In summary, while this thesis demonstrates the feasibility of using anomaly dete
 % **************************************************************************************************
 \appendix
-\ifthenelse{\equal{\DocumentType}{thesis}}
+\ifthenelse{\equal{thesis}{thesis}}
 {
 	\setcounter{mypageno}{\value{page}}
 	\frontmatter \pagestyle{plain} \pagenumbering{Roman}
--- a/thesis/base/declaration_en.tex
+++ b/thesis/base/declaration_en.tex
@@ -24,10 +24,7 @@
 not used other than the declared sources/resources, and that I have
 explicitly indicated all material which has been quoted either
 literally or by content from the sources used.
-\ifthenelse{\equal{\ThesisTitle}{master's thesis} \or
+The text document uploaded to TUGRAZonline is identical to the present \ThesisTitle.
 	\equal{\ThesisTitle}{diploma thesis} \or
 	\equal{\ThesisTitle}{doctoral thesis}}
 {The text document uploaded to TUGRAZonline is identical to the present \ThesisTitle.}{\reminder{TODO: fix \textbackslash ThesisTitle}}
 \par\vspace*{4cm}
--- a/thesis/drop-images.lua
+++ b/thesis/drop-images.lua
@@ -0,0 +1,11 @@
 -- drop-images.lua
 -- Replaces all images (figures, graphics) with a short placeholder.
 -- Image filter: every image node is swapped for a fixed placeholder string,
 -- regardless of its source or caption.
 function Image(el)
    local placeholder = pandoc.Str("[image omitted]")
    return placeholder
 end
 -- For LaTeX figures that are still raw.
 -- Replaces a raw TeX/LaTeX block containing a figure (or figure*)
 -- environment with a "[figure omitted]" placeholder; for any other block
 -- nothing is returned, so the element is left as it is.
 function RawBlock(el)
    -- Raw TeX blocks may be tagged "tex" or "latex" depending on the reader.
    local is_tex = el.format == "tex" or el.format == "latex"
    -- "%*?" also accepts the starred two-column variant \begin{figure*}.
    if is_tex and el.text:match("\\begin%s*{%s*figure%*?%s*}") then
        return pandoc.Plain({pandoc.Str("[figure omitted]")})
    end
 end
--- a/thesis/drop-tables.lua
+++ b/thesis/drop-tables.lua
@@ -0,0 +1,11 @@
 -- drop-tables.lua
 -- Replaces LaTeX tabular and tabularx environments (and their contents) with a short placeholder.
 -- Swaps a raw TeX/LaTeX block that contains a tabular or tabularx
 -- environment for a "[table omitted]" placeholder; for any other block
 -- nothing is returned, so the element is left as it is.
 function RawBlock(el)
    -- Raw TeX blocks may be tagged "tex" or "latex" depending on the reader.
    if el.format ~= "tex" and el.format ~= "latex" then
        return nil
    end
    -- Check for tabular or tabularx environment
    if el.text:match("\\begin%s*{%s*tabularx?%s*}") then
        return pandoc.Plain({pandoc.Str("[table omitted]")})
    end
 end
--- a/thesis/flake.nix
+++ b/thesis/flake.nix
@@ -28,7 +28,10 @@
          zathura
          wmctrl
          python312
          pandoc
          pandoc-lua-filters
        ];
        filtersPath = "${pkgs.pandoc-lua-filters}/share/pandoc/filters";
      in
      {
        devShell = pkgs.mkShell {
@@ -39,6 +42,28 @@
          ];
        };
        # NOTE(review): shellHook only runs when it is an argument of
        # pkgs.mkShell; the lines between the diff hunks are not visible here,
        # so confirm it is not merely a sibling attribute of devShell.
        shellHook = ''
          # "set -eu" is deliberately not used: the hook runs in the shell the
          # user keeps working in, where those options would stay active and
          # end the session on the first failing command or unset variable.
          # local folder in your repo to reference in commands
          link_target="pandoc-filters"
          # refresh symlink each time you enter the shell
          if ln -sfn "${filtersPath}" "$link_target"; then
            echo "Linked $link_target -> ${filtersPath}"
          else
            echo "WARNING: could not link $link_target -> ${filtersPath}" >&2
          fi
          # (optional) write a defaults file that uses the relative symlink.
          # printf replaces the here-document: its closing delimiter stayed
          # indented after Nix strips the common indentation, so the shell
          # would never have recognised the end of the document.
          if [ ! -f pandoc.defaults.yaml ]; then
            printf '%s\n' \
              'from: latex' \
              'to: plain' \
              'wrap: none' \
              'lua-filter:' \
              '  - pandoc-filters/latex-hyphen.lua' \
              '  - pandoc-filters/pandoc-quotes.lua' \
              > pandoc.defaults.yaml
            echo "Wrote pandoc.defaults.yaml"
          fi
        '';
      }
    );
 }
--- a/thesis/keep-citations.lua
+++ b/thesis/keep-citations.lua
@@ -0,0 +1,43 @@
 -- keep-citations.lua
 -- Replace citations with a placeholder and eat any preceding space.
 -- Placeholder text that stands in for every citation.
 local PH = "[citation]"
 -- Pandoc-native citations (if the reader produced Cite nodes):
 -- the whole Cite node is exchanged for a Str carrying the placeholder.
 function Cite(el)
    local replacement = pandoc.Str(PH)
    return replacement
 end
 -- Raw LaTeX \cite-like macros (when not parsed as Cite).
 -- Returns the placeholder Str for raw TeX inlines whose text contains a
 -- cite-style command (\cite, \citep, \autocite, \cite*, ...); otherwise
 -- nothing is returned, so the element is left as it is.
 function RawInline(el)
    if not (el.format and el.format:match("tex")) then
        return nil
    end
    -- Pattern: backslash, optional letter prefix, "cite", optional letter
    -- suffix, optional star. The quantifier has to be "%a*": in a Lua
    -- pattern a "*" placed right after "%a-" is a literal asterisk, which
    -- would prevent a plain \cite from ever matching.
    if el.text:match("\\%a*cite%a*%*?") then
        return pandoc.Str(PH)
    end
 end
 -- Remove a single leading Space before our placeholder.
 -- Walks the inline list once and builds a new list in which every
 -- (Space, Str == PH) pair is reduced to just the Str; all other
 -- elements are copied over in their original order.
 local function squash_spaces(inlines)
    local result = {}
    local idx = 1
    while idx <= #inlines do
        local item = inlines[idx]
        local following = inlines[idx + 1]
        local space_before_placeholder = item and item.t == "Space"
            and following and following.t == "Str" and following.text == PH
        if space_before_placeholder then
            -- Drop the Space, keep the placeholder, skip past both.
            table.insert(result, following)
            idx = idx + 2
        else
            table.insert(result, item)
            idx = idx + 1
        end
    end
    return result
 end
 -- Paragraph handler: rewrites the paragraph's inline content through
 -- squash_spaces and hands the same element back.
 function Para(el)
    local cleaned = squash_spaces(el.content)
    el.content = cleaned
    return el
 end
 -- Plain-block handler: rewrites the block's inline content through
 -- squash_spaces and hands the same element back.
 function Plain(el)
    local cleaned = squash_spaces(el.content)
    el.content = cleaned
    return el
 end
--- a/thesis/tex2plaintext.sh
+++ b/thesis/tex2plaintext.sh
@@ -0,0 +1,55 @@
 #!/usr/bin/env bash
 set -euo pipefail
 # Flattens a LaTeX document, strips its tables, converts it to plain text
 # with pandoc and splits the result into two parts at a marker line.
 #
 # Usage:
 #   ./tex2plaintext.sh [INPUT_TEX] [OUT_BASENAME]
 #
 # Defaults:
 #   INPUT_TEX     = Main.tex     (your original file name)
 #   OUT_BASENAME  = thesis       (produces thesis.txt, thesis_part1.txt, thesis_part2.txt)
 #
 # Intermediate files (flat.tex, flat_notables.tex) are left in the
 # current directory.
 INPUT_TEX="${1:-Main.tex}"
 OUT_BASE="${2:-thesis}"
 FLAT_TEX="flat.tex"
 NO_TABLES_TEX="flat_notables.tex"
 PLAIN_TXT="${OUT_BASE}.txt"
 PART1_TXT="${OUT_BASE}_part1.txt"
 PART2_TXT="${OUT_BASE}_part2.txt"
 MARKER="Data and Preprocessing"
 # Fail early with a clear message instead of leaving an empty flat.tex behind
 if [ ! -f "${INPUT_TEX}" ]; then
  echo "ERROR: Input file not found: ${INPUT_TEX}" >&2
  exit 1
 fi
 echo "[1/4] Flattening with latexpand -> ${FLAT_TEX}"
 latexpand "${INPUT_TEX}" > "${FLAT_TEX}"
 echo "[2/4] Removing tabular/tabularx environments -> ${NO_TABLES_TEX}"
 # Replace entire tabular / tabularx environments with a placeholder
 # (-0777 slurps the whole file so the match can span several lines)
 perl -0777 -pe 's/\\begin\{(tabularx?)\}.*?\\end\{\1\}/[table omitted]/gs' \
  "${FLAT_TEX}" > "${NO_TABLES_TEX}"
 echo "[3/4] Converting to plain text with pandoc -> ${PLAIN_TXT}"
 pandoc -f latex -t plain --wrap=none "${NO_TABLES_TEX}" -o "${PLAIN_TXT}"
 echo "[4/4] Splitting ${PLAIN_TXT} before the marker line: \"${MARKER}\""
 # Ensure the marker exists exactly on its own line.
 # -F compares the marker as a fixed string, not as a regular expression.
 if ! grep -Fxq -- "${MARKER}" "${PLAIN_TXT}"; then
  echo "ERROR: Marker line not found exactly as \"${MARKER}\" in ${PLAIN_TXT}." >&2
  echo "       (It must be the only content on that line.)" >&2
  exit 1
 fi
 # Clean previous outputs if present
 rm -f -- "${PART1_TXT}" "${PART2_TXT}"
 # Split so the marker line becomes the FIRST line of part 2
 awk -v marker="${MARKER}" -v out1="${PART1_TXT}" -v out2="${PART2_TXT}" '
 BEGIN { current = out1 }
 $0 == marker { current = out2; print $0 > current; next }
 { print $0 > current }
 ' "${PLAIN_TXT}"
 echo "Done."
 echo "  - ${PLAIN_TXT}"
 echo "  - ${PART1_TXT}"
 echo "  - ${PART2_TXT}"