#!/usr/bin/env bash
set -euo pipefail

# Usage:
#   ./tex2plaintext.sh [INPUT_TEX] [OUT_BASENAME]
#
# Defaults:
#   INPUT_TEX     = Main.tex     (your original file name)
#   OUT_BASENAME  = thesis       (produces thesis.txt, thesis_part1.txt, thesis_part2.txt)

# Positional arguments; each falls back to its default when omitted or empty.
INPUT_TEX=${1:-Main.tex}
OUT_BASE=${2:-thesis}

# Intermediate LaTeX files (relative names, so they land in the working dir).
FLAT_TEX='flat.tex'
NO_TABLES_TEX='flat_notables.tex'

# Output files, all named after OUT_BASE.
PLAIN_TXT="${OUT_BASE}.txt"
PART1_TXT="${OUT_BASE}_part1.txt"
PART2_TXT="${OUT_BASE}_part2.txt"

# Line of text at which the plain-text output is split into two parts.
MARKER='Data and Preprocessing'

# Step 1: run latexpand on the input and capture its output as one flat file.
printf '%s\n' "[1/5] Flattening with latexpand -> ${FLAT_TEX}"
latexpand "${INPUT_TEX}" >"${FLAT_TEX}"

# Step 2: swap every tabular / tabularx environment for a fixed placeholder.
# -0777 makes perl read the whole file as a single string and the /s flag lets
# "." match newlines, so a multi-line environment is replaced as one unit.
# The \1 backreference requires \end{...} to name the same environment that
# the matching \begin{...} opened.
printf '%s\n' "[2/5] Removing tabular/tabularx environments -> ${NO_TABLES_TEX}"
perl -0777 -pe 's/\\begin\{(tabularx?)\}.*?\\end\{\1\}/[table omitted]/gs' \
  "${FLAT_TEX}" >"${NO_TABLES_TEX}"

# Step 3: convert the table-free LaTeX to unwrapped plain text, applying the
# two project Lua filters in the order listed.
printf '%s\n' "[3/5] Converting to plain text with pandoc -> ${PLAIN_TXT}"
pandoc_args=(
  -f latex
  -t plain
  --wrap=none
  --lua-filter=filters/keep-citations.lua
  --lua-filter=filters/math-omit.lua
  "${NO_TABLES_TEX}"
  -o "${PLAIN_TXT}"
)
pandoc "${pandoc_args[@]}"

# Step 4: rewrite every literal "[]" in the plain-text file as "[figure]".
echo "[4/5] Replacing [] placeholders with [figure]"
# Edit in place with perl rather than `sed -i`: GNU sed accepts a bare -i, but
# BSD/macOS sed requires a backup-suffix argument after -i, so the bare form
# fails there. perl is already required by the table-stripping step, and its
# -i switch behaves the same on both platforms.
perl -pi -e 's/\[\]/[figure]/g' "${PLAIN_TXT}"

# Step 5: split the plain-text file into two parts at the marker line.
echo "[5/5] Splitting ${PLAIN_TXT} before the marker line: \"${MARKER}\""

# Require at least one line whose entire content is the marker.
#   -F  compare the marker as a literal string, the same way the awk test
#       `$0 == marker` below does (without -F grep would treat it as a regex
#       and the two checks could disagree)
#   -x  match whole lines only
#   --  keep a marker starting with "-" from being parsed as an option
if ! grep -Fxq -- "${MARKER}" "${PLAIN_TXT}"; then
  # Diagnostics go to stderr so they never mix with captured stdout.
  echo "ERROR: Marker line not found exactly as \"${MARKER}\" in ${PLAIN_TXT}." >&2
  echo "       (It must be the only content on that line.)" >&2
  exit 1
fi

# Clean previous outputs if present
rm -f -- "${PART1_TXT}" "${PART2_TXT}"

# Pre-create part 1 as an empty file. awk only creates a file when it first
# prints to it, so if the marker is the very first line nothing would ever be
# written to part 1 and the file would be missing afterwards.
: >"${PART1_TXT}"

# Split so the marker line becomes the FIRST line of part 2.
# Lines are written to out1 until a line equal to the marker is seen; that
# line and everything after it (including any later marker lines) go to out2.
awk -v marker="${MARKER}" -v out1="${PART1_TXT}" -v out2="${PART2_TXT}" '
BEGIN { current = out1 }
$0 == marker { current = out2; print $0 > current; next }
{ print $0 > current }
' "${PLAIN_TXT}"

# Final report: a "Done." line followed by one bulleted line per output path.
printf '%s\n' "Done."
# printf reuses its format for each remaining argument, giving one line each.
printf '  - %s\n' "${PLAIN_TXT}" "${PART1_TXT}" "${PART2_TXT}"