Better Grammarly prep

This commit is contained in:
Jan Kowalczyk
2025-10-18 12:47:16 +02:00
parent 374420727b
commit 5aca00ad67
5 changed files with 59 additions and 5 deletions

View File

@@ -18,18 +18,24 @@ PART1_TXT="${OUT_BASE}_part1.txt"
PART2_TXT="${OUT_BASE}_part2.txt"
MARKER="Data and Preprocessing"
echo "[1/4] Flattening with latexpand -> ${FLAT_TEX}"
echo "[1/5] Flattening with latexpand -> ${FLAT_TEX}"
latexpand "${INPUT_TEX}" > "${FLAT_TEX}"
echo "[2/4] Removing tabular/tabularx environments -> ${NO_TABLES_TEX}"
echo "[2/5] Removing tabular/tabularx environments -> ${NO_TABLES_TEX}"
# Replace entire tabular / tabularx environments with a placeholder
perl -0777 -pe 's/\\begin\{(tabularx?)\}.*?\\end\{\1\}/[table omitted]/gs' \
"${FLAT_TEX}" > "${NO_TABLES_TEX}"
echo "[3/4] Converting to plain text with pandoc -> ${PLAIN_TXT}"
pandoc -f latex -t plain --wrap=none "${NO_TABLES_TEX}" -o "${PLAIN_TXT}"
echo "[3/5] Converting to plain text with pandoc -> ${PLAIN_TXT}"
pandoc -f latex -t plain --wrap=none \
--lua-filter=filters/keep-citations.lua \
--lua-filter=filters/math-omit.lua \
"${NO_TABLES_TEX}" -o "${PLAIN_TXT}"
echo "[4/4] Splitting ${PLAIN_TXT} before the marker line: \"${MARKER}\""
echo "[4/5] Replacing [] placeholders with [figure]"
sed -i 's/\[\]/[figure]/g' "${PLAIN_TXT}"
echo "[5/5] Splitting ${PLAIN_TXT} before the marker line: \"${MARKER}\""
# Ensure the marker exists exactly on its own line
if ! grep -xq "${MARKER}" "${PLAIN_TXT}"; then