cleanup for raw txt (grammar check)
This commit is contained in:
@@ -53,9 +53,9 @@
|
|||||||
|
|
||||||
% **************************************************************************************************
|
% **************************************************************************************************
|
||||||
% template setup -- do not change these unless you know what you are doing!
|
% template setup -- do not change these unless you know what you are doing!
|
||||||
\input{./base/documentclass_\DocumentType}
|
\input{./base/documentclass_thesis}
|
||||||
\input{./base/packages}
|
\input{./base/packages}
|
||||||
\input{./base/layout_\DocumentType}
|
\input{./base/layout_thesis}
|
||||||
\input{./base/macros}
|
\input{./base/macros}
|
||||||
|
|
||||||
% **************************************************************************************************
|
% **************************************************************************************************
|
||||||
@@ -156,26 +156,27 @@
|
|||||||
|
|
||||||
% variable for page numbering
|
% variable for page numbering
|
||||||
\newcounter{mypageno}
|
\newcounter{mypageno}
|
||||||
% **************************************************************************************************
|
|
||||||
\begin{document}
|
|
||||||
% **************************************************************************************************
|
|
||||||
\input{./base/syntax_formatting}
|
\input{./base/syntax_formatting}
|
||||||
|
|
||||||
% for thesis: switch to frontmatter (Roman numbering, etc.)
|
% for thesis: switch to frontmatter (Roman numbering, etc.)
|
||||||
\ifthenelse{\equal{\DocumentType}{thesis}}
|
\ifthenelse{\equal{thesis}{thesis}}
|
||||||
{
|
{
|
||||||
\frontmatter \pagestyle{plain} \pagenumbering{Roman}
|
\frontmatter \pagestyle{plain} \pagenumbering{Roman}
|
||||||
}{}
|
}{}
|
||||||
|
% **************************************************************************************************
|
||||||
|
\begin{document}
|
||||||
|
% **************************************************************************************************
|
||||||
|
|
||||||
%title
|
%title
|
||||||
\input{./base/titlepage_\DocumentType}
|
\input{./base/titlepage_thesis}
|
||||||
|
|
||||||
% for thesis: abstract, kurzfassung, affidavit and statutory declaration
|
% for thesis: abstract, kurzfassung, affidavit and statutory declaration
|
||||||
\ifthenelse{\equal{\DocumentType}{thesis}}
|
\ifthenelse{\equal{thesis}{thesis}}
|
||||||
{
|
{
|
||||||
\emptydoublepage
|
\emptydoublepage
|
||||||
\addcontentsline{toc}{chapter}{Statutory Declaration}
|
\addcontentsline{toc}{chapter}{Statutory Declaration}
|
||||||
\input{./base/declaration_\DocumentLanguage}
|
\input{./base/declaration_en}
|
||||||
\emptydoublepage
|
\emptydoublepage
|
||||||
\input{thesis_preamble/acknowledgements}
|
\input{thesis_preamble/acknowledgements}
|
||||||
\emptydoublepage
|
\emptydoublepage
|
||||||
@@ -187,7 +188,7 @@
|
|||||||
|
|
||||||
\tableofcontents
|
\tableofcontents
|
||||||
|
|
||||||
\ifthenelse{\equal{\DocumentType}{thesis}}
|
\ifthenelse{\equal{thesis}{thesis}}
|
||||||
{
|
{
|
||||||
\emptydoublepage
|
\emptydoublepage
|
||||||
\setcounter{mypageno}{\value{page}}
|
\setcounter{mypageno}{\value{page}}
|
||||||
@@ -1148,7 +1149,7 @@ In summary, while this thesis demonstrates the feasibility of using anomaly dete
|
|||||||
% **************************************************************************************************
|
% **************************************************************************************************
|
||||||
|
|
||||||
\appendix
|
\appendix
|
||||||
\ifthenelse{\equal{\DocumentType}{thesis}}
|
\ifthenelse{\equal{thesis}{thesis}}
|
||||||
{
|
{
|
||||||
\setcounter{mypageno}{\value{page}}
|
\setcounter{mypageno}{\value{page}}
|
||||||
\frontmatter \pagestyle{plain} \pagenumbering{Roman}
|
\frontmatter \pagestyle{plain} \pagenumbering{Roman}
|
||||||
|
|||||||
@@ -24,10 +24,7 @@
|
|||||||
not used other than the declared sources/resources, and that I have
|
not used other than the declared sources/resources, and that I have
|
||||||
explicitly indicated all material which has been quoted either
|
explicitly indicated all material which has been quoted either
|
||||||
literally or by content from the sources used.
|
literally or by content from the sources used.
|
||||||
\ifthenelse{\equal{\ThesisTitle}{master's thesis} \or
|
The text document uploaded to TUGRAZonline is identical to the present \ThesisTitle.
|
||||||
\equal{\ThesisTitle}{diploma thesis} \or
|
|
||||||
\equal{\ThesisTitle}{doctoral thesis}}
|
|
||||||
{The text document uploaded to TUGRAZonline is identical to the present \ThesisTitle.}{\reminder{TODO: fix \textbackslash ThesisTitle}}
|
|
||||||
|
|
||||||
|
|
||||||
\par\vspace*{4cm}
|
\par\vspace*{4cm}
|
||||||
|
|||||||
11
thesis/drop-images.lua
Normal file
11
thesis/drop-images.lua
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
-- drop-images.lua
|
||||||
|
-- Replaces all images (figures, graphics) with a short placeholder.
|
||||||
|
function Image(el) return pandoc.Str("[image omitted]") end
|
||||||
|
|
||||||
|
-- For LaTeX figures that are still raw: swap any raw figure / figure* environment for a placeholder.
|
||||||
|
function RawBlock(el)
|
||||||
|
if (el.format == "tex" or el.format == "latex") and el.text:match("\\begin%s*{%s*figure%*?%s*}") then -- pandoc's LaTeX reader labels raw blocks "latex"; "tex" is kept for other readers
|
||||||
|
return pandoc.Plain({pandoc.Str("[figure omitted]")})
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
11
thesis/drop-tables.lua
Normal file
11
thesis/drop-tables.lua
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
-- drop-tables.lua
|
||||||
|
-- Replaces raw LaTeX tabular and tabularx environments (and their contents) with a short placeholder.
|
||||||
|
function RawBlock(el)
|
||||||
|
if el.format == "tex" or el.format == "latex" then -- raw blocks from pandoc's LaTeX reader carry format "latex"; "tex" is kept for other readers
|
||||||
|
-- Replace the whole raw block when it opens a tabular or tabularx environment
|
||||||
|
if el.text:match("\\begin%s*{%s*tabularx?%s*}") then
|
||||||
|
return pandoc.Plain({pandoc.Str("[table omitted]")})
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
@@ -28,7 +28,10 @@
|
|||||||
zathura
|
zathura
|
||||||
wmctrl
|
wmctrl
|
||||||
python312
|
python312
|
||||||
|
pandoc
|
||||||
|
pandoc-lua-filters
|
||||||
];
|
];
|
||||||
|
filtersPath = "${pkgs.pandoc-lua-filters}/share/pandoc/filters";
|
||||||
in
|
in
|
||||||
{
|
{
|
||||||
devShell = pkgs.mkShell {
|
devShell = pkgs.mkShell {
|
||||||
@@ -39,6 +42,28 @@
|
|||||||
];
|
];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
shellHook = ''
|
||||||
|
set -eu
|
||||||
|
# local folder in your repo to reference in commands
|
||||||
|
link_target="pandoc-filters"
|
||||||
|
# refresh symlink each time you enter the shell
|
||||||
|
ln -sfn ${filtersPath} "$link_target"
|
||||||
|
echo "Linked $link_target -> ${filtersPath}"
|
||||||
|
|
||||||
|
# (optional) write a defaults file that uses the relative symlink
|
||||||
|
if [ ! -f pandoc.defaults.yaml ]; then
|
||||||
|
cat > pandoc.defaults.yaml <<'YAML'
|
||||||
|
from: latex
|
||||||
|
to: plain
|
||||||
|
wrap: none
|
||||||
|
lua-filter:
|
||||||
|
- pandoc-filters/latex-hyphen.lua
|
||||||
|
- pandoc-filters/pandoc-quotes.lua
|
||||||
|
YAML
|
||||||
|
echo "Wrote pandoc.defaults.yaml"
|
||||||
|
fi
|
||||||
|
'';
|
||||||
|
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
43
thesis/keep-citations.lua
Normal file
43
thesis/keep-citations.lua
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
-- keep-citations.lua
|
||||||
|
-- Replace citations with a placeholder and eat any preceding space.
|
||||||
|
local PH = "[citation]"
|
||||||
|
|
||||||
|
-- Pandoc-native citations (if the reader produced Cite nodes)
|
||||||
|
function Cite(el) return pandoc.Str(PH) end
|
||||||
|
|
||||||
|
-- Raw LaTeX \cite-like macros (\cite, \citep, \autocite, \cite*, ...) when not parsed as Cite: replace with the placeholder.
|
||||||
|
function RawInline(el)
|
||||||
|
if el.format and el.format:match("tex") and el.text:match("\\%a*cite%a*%*?") then -- letters may precede/follow "cite"; a "*" right after "%a-" would be a literal asterisk in Lua patterns
|
||||||
|
return pandoc.Str(PH)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
-- Remove a single leading Space before our placeholder
|
||||||
|
local function squash_spaces(inlines)
|
||||||
|
local out = {}
|
||||||
|
local i = 1
|
||||||
|
while i <= #inlines do
|
||||||
|
local cur = inlines[i]
|
||||||
|
local nxt = inlines[i + 1]
|
||||||
|
if cur and cur.t == "Space" and nxt and nxt.t == "Str" and nxt.text ==
|
||||||
|
PH then
|
||||||
|
table.insert(out, nxt)
|
||||||
|
i = i + 2
|
||||||
|
else
|
||||||
|
table.insert(out, cur)
|
||||||
|
i = i + 1
|
||||||
|
end
|
||||||
|
end
|
||||||
|
return out
|
||||||
|
end
|
||||||
|
|
||||||
|
function Para(el)
|
||||||
|
el.content = squash_spaces(el.content)
|
||||||
|
return el
|
||||||
|
end
|
||||||
|
|
||||||
|
function Plain(el)
|
||||||
|
el.content = squash_spaces(el.content)
|
||||||
|
return el
|
||||||
|
end
|
||||||
|
|
||||||
55
thesis/tex2plaintext.sh
Executable file
55
thesis/tex2plaintext.sh
Executable file
@@ -0,0 +1,55 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Usage:
|
||||||
|
# ./tex2plaintext.sh [INPUT_TEX] [OUT_BASENAME]
|
||||||
|
#
|
||||||
|
# Defaults:
|
||||||
|
# INPUT_TEX = Main.tex (your original file name)
|
||||||
|
# OUT_BASENAME = thesis (produces thesis.txt, thesis_part1.txt, thesis_part2.txt)
|
||||||
|
|
||||||
|
INPUT_TEX="${1:-Main.tex}"
|
||||||
|
OUT_BASE="${2:-thesis}"
|
||||||
|
|
||||||
|
FLAT_TEX="flat.tex"
|
||||||
|
NO_TABLES_TEX="flat_notables.tex"
|
||||||
|
PLAIN_TXT="${OUT_BASE}.txt"
|
||||||
|
PART1_TXT="${OUT_BASE}_part1.txt"
|
||||||
|
PART2_TXT="${OUT_BASE}_part2.txt"
|
||||||
|
MARKER="Data and Preprocessing"
|
||||||
|
|
||||||
|
echo "[1/4] Flattening with latexpand -> ${FLAT_TEX}"
|
||||||
|
latexpand "${INPUT_TEX}" > "${FLAT_TEX}"
|
||||||
|
|
||||||
|
echo "[2/4] Removing tabular/tabularx environments -> ${NO_TABLES_TEX}"
|
||||||
|
# Replace entire tabular / tabularx environments with a placeholder
|
||||||
|
perl -0777 -pe 's/\\begin\{(tabularx?)\}.*?\\end\{\1\}/[table omitted]/gs' \
|
||||||
|
"${FLAT_TEX}" > "${NO_TABLES_TEX}"
|
||||||
|
|
||||||
|
echo "[3/4] Converting to plain text with pandoc -> ${PLAIN_TXT}"
|
||||||
|
pandoc -f latex -t plain --wrap=none "${NO_TABLES_TEX}" -o "${PLAIN_TXT}"
|
||||||
|
|
||||||
|
echo "[4/4] Splitting ${PLAIN_TXT} before the marker line: \"${MARKER}\""
|
||||||
|
|
||||||
|
# Ensure the marker exists exactly on its own line
|
||||||
|
if ! grep -xq "${MARKER}" "${PLAIN_TXT}"; then
|
||||||
|
echo "ERROR: Marker line not found exactly as \"${MARKER}\" in ${PLAIN_TXT}."
|
||||||
|
echo " (It must be the only content on that line.)"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Clean previous outputs if present
|
||||||
|
rm -f -- "${PART1_TXT}" "${PART2_TXT}"
|
||||||
|
|
||||||
|
# Split so the marker line becomes the FIRST line of part 2
|
||||||
|
awk -v marker="${MARKER}" -v out1="${PART1_TXT}" -v out2="${PART2_TXT}" '
|
||||||
|
BEGIN { current = out1 }
|
||||||
|
$0 == marker { current = out2; print $0 > current; next }
|
||||||
|
{ print $0 > current }
|
||||||
|
' "${PLAIN_TXT}"
|
||||||
|
|
||||||
|
echo "Done."
|
||||||
|
echo " - ${PLAIN_TXT}"
|
||||||
|
echo " - ${PART1_TXT}"
|
||||||
|
echo " - ${PART2_TXT}"
|
||||||
|
|
||||||
Reference in New Issue
Block a user