#!/usr/bin/env bash
#
# tex2plaintext.sh - flatten a LaTeX document, convert it to plain text and
# split the result into two parts at a marker line.
#
# Usage:
#   ./tex2plaintext.sh [INPUT_TEX] [OUT_BASENAME]
#
# Defaults:
#   INPUT_TEX    = Main.tex
#   OUT_BASENAME = thesis  (produces thesis.txt, thesis_part1.txt,
#                           thesis_part2.txt)
#
# Requires on PATH: latexpand, perl, pandoc (plus grep and awk).
# pandoc is invoked with the Lua filters filters/keep-citations.lua and
# filters/math-omit.lua.
#
# Files written in the current directory: flat.tex, flat_notables.tex,
# <OUT_BASENAME>.txt, <OUT_BASENAME>_part1.txt, <OUT_BASENAME>_part2.txt.
#
# Exit status: 0 on success; non-zero if a required tool is missing, any
# pipeline step fails, or the marker line is not found in the plain text.

set -euo pipefail

INPUT_TEX="${1:-Main.tex}"
OUT_BASE="${2:-thesis}"

readonly FLAT_TEX="flat.tex"
readonly NO_TABLES_TEX="flat_notables.tex"
readonly PLAIN_TXT="${OUT_BASE}.txt"
readonly PART1_TXT="${OUT_BASE}_part1.txt"
readonly PART2_TXT="${OUT_BASE}_part2.txt"

# Line at which the plain text is split; it must match a whole line exactly.
readonly MARKER="Data and Preprocessing"

# Print every argument on its own line to stderr, then exit with status 1.
die() {
  printf '%s\n' "$@" >&2
  exit 1
}

# Fail before any output file is created or truncated if a tool is missing.
for tool in latexpand perl pandoc; do
  command -v "${tool}" >/dev/null 2>&1 \
    || die "ERROR: required tool not found on PATH: ${tool}"
done

echo "[1/5] Flattening with latexpand -> ${FLAT_TEX}"
latexpand "${INPUT_TEX}" > "${FLAT_TEX}"

echo "[2/5] Removing tabular/tabularx environments -> ${NO_TABLES_TEX}"
# Replace entire tabular / tabularx environments with a placeholder.
# -0777 slurps the whole file so a match can span lines (together with /s);
# \1 makes \end{...} name the same environment that \begin{...} opened.
perl -0777 -pe 's/\\begin\{(tabularx?)\}.*?\\end\{\1\}/[table omitted]/gs' \
  "${FLAT_TEX}" > "${NO_TABLES_TEX}"

echo "[3/5] Converting to plain text with pandoc -> ${PLAIN_TXT}"
pandoc -f latex -t plain --wrap=none \
  --lua-filter=filters/keep-citations.lua \
  --lua-filter=filters/math-omit.lua \
  "${NO_TABLES_TEX}" -o "${PLAIN_TXT}"

echo "[4/5] Replacing [] placeholders with [figure]"
# perl -pi edits in place on both GNU and BSD systems; "sed -i" without a
# suffix argument is GNU-only. perl is already required by step 2.
perl -pi -e 's/\[\]/[figure]/g' -- "${PLAIN_TXT}"

echo "[5/5] Splitting ${PLAIN_TXT} before the marker line: \"${MARKER}\""

# Ensure the marker exists as a complete line. -F makes grep compare it as a
# literal string, which is how the awk split below compares it.
if ! grep -Fxq -- "${MARKER}" "${PLAIN_TXT}"; then
  die "ERROR: Marker line not found exactly as \"${MARKER}\" in ${PLAIN_TXT}." \
      " (It must be the only content on that line.)"
fi

# Clean previous outputs if present.
rm -f -- "${PART1_TXT}" "${PART2_TXT}"

# awk only creates a file once it prints to it, so pre-create part 1: it is
# legitimately empty when the marker is the very first line.
: > "${PART1_TXT}"

# Split so the marker line becomes the FIRST line of part 2. Everything before
# the first marker line goes to part 1; the marker and everything after it
# goes to part 2. Input arrives on stdin so the file name is never parsed by
# awk as a var=value operand.
awk -v marker="${MARKER}" -v out1="${PART1_TXT}" -v out2="${PART2_TXT}" '
  BEGIN { current = out1 }
  $0 == marker { current = out2 }
  { print > current }
' < "${PLAIN_TXT}"

echo "Done."
echo " - ${PLAIN_TXT}"
echo " - ${PART1_TXT}"
echo " - ${PART2_TXT}"