cleanup for raw txt (grammar check)
This commit is contained in:
55
thesis/tex2plaintext.sh
Executable file
55
thesis/tex2plaintext.sh
Executable file
@@ -0,0 +1,55 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# Usage:
|
||||
# ./tex2plaintext.sh [INPUT_TEX] [OUT_BASENAME]
|
||||
#
|
||||
# Defaults:
|
||||
# INPUT_TEX = Main.tex (your original file name)
|
||||
# OUT_BASENAME = thesis (produces thesis.txt, thesis_part1.txt, thesis_part2.txt)
|
||||
|
||||
# Positional arguments; both are optional and fall back to these defaults.
INPUT_TEX="${1:-Main.tex}"
OUT_BASE="${2:-thesis}"

# Intermediate files, written to the current directory and left in place.
readonly FLAT_TEX="flat.tex"
readonly NO_TABLES_TEX="flat_notables.tex"

# Final outputs, all derived from OUT_BASE.
readonly PLAIN_TXT="${OUT_BASE}.txt"
readonly PART1_TXT="${OUT_BASE}_part1.txt"
readonly PART2_TXT="${OUT_BASE}_part2.txt"

# Line at which the plain text is split; it must match a whole line literally.
readonly MARKER="Data and Preprocessing"

# Fail early, before any output file is created, if a required tool is
# missing. Otherwise the '>' redirections below would leave empty files behind.
for tool in latexpand perl pandoc; do
  if ! command -v "${tool}" > /dev/null 2>&1; then
    echo "ERROR: Required tool not found in PATH: ${tool}" >&2
    exit 1
  fi
done

echo "[1/4] Flattening with latexpand -> ${FLAT_TEX}"
latexpand "${INPUT_TEX}" > "${FLAT_TEX}"

echo "[2/4] Removing tabular/tabularx environments -> ${NO_TABLES_TEX}"
# Replace entire tabular / tabularx environments with a placeholder.
# -0777 slurps the whole file so that .*? (with /s) can span multiple lines;
# the \1 backreference makes \end{...} match the same environment name.
perl -0777 -pe 's/\\begin\{(tabularx?)\}.*?\\end\{\1\}/[table omitted]/gs' \
  "${FLAT_TEX}" > "${NO_TABLES_TEX}"

echo "[3/4] Converting to plain text with pandoc -> ${PLAIN_TXT}"
pandoc -f latex -t plain --wrap=none "${NO_TABLES_TEX}" -o "${PLAIN_TXT}"

echo "[4/4] Splitting ${PLAIN_TXT} before the marker line: \"${MARKER}\""

# Ensure the marker exists exactly on its own line.
# -F matches the marker as a fixed string (not a regex) so this check agrees
# with the literal comparison awk performs below; -x requires a whole-line
# match; '--' protects against a marker that starts with '-'.
if ! grep -Fxq -- "${MARKER}" "${PLAIN_TXT}"; then
  echo "ERROR: Marker line not found exactly as \"${MARKER}\" in ${PLAIN_TXT}." >&2
  echo " (It must be the only content on that line.)" >&2
  exit 1
fi

# Clean previous outputs if present
rm -f -- "${PART1_TXT}" "${PART2_TXT}"

# Always create part 1: awk only opens a file when it first prints to it, so
# if the marker is the very first line part 1 would otherwise not exist even
# though it is listed in the summary below.
: > "${PART1_TXT}"

# Split so the marker line becomes the FIRST line of part 2.
# Lines are written to out1 until the first line equal to the marker; that
# line and everything after it go to out2.
awk -v marker="${MARKER}" -v out1="${PART1_TXT}" -v out2="${PART2_TXT}" '
  BEGIN { current = out1 }
  $0 == marker { current = out2; print $0 > current; next }
  { print $0 > current }
' "${PLAIN_TXT}"

echo "Done."
echo " - ${PLAIN_TXT}"
echo " - ${PART1_TXT}"
echo " - ${PART2_TXT}"
|
||||
|
||||
Reference in New Issue
Block a user