diff --git a/thesis/drop-images.lua b/thesis/filters/drop-images.lua
similarity index 100%
rename from thesis/drop-images.lua
rename to thesis/filters/drop-images.lua
diff --git a/thesis/drop-tables.lua b/thesis/filters/drop-tables.lua
similarity index 100%
rename from thesis/drop-tables.lua
rename to thesis/filters/drop-tables.lua
diff --git a/thesis/keep-citations.lua b/thesis/filters/keep-citations.lua
similarity index 100%
rename from thesis/keep-citations.lua
rename to thesis/filters/keep-citations.lua
diff --git a/thesis/filters/math-omit.lua b/thesis/filters/math-omit.lua
new file mode 100644
index 0000000..8ef5486
--- /dev/null
+++ b/thesis/filters/math-omit.lua
@@ -0,0 +1,62 @@
+-- math-omit.lua
+-- Pandoc Lua filter: replace every Math element with a placeholder and ensure
+-- a space before it when appropriate.
+local PH = "[math omitted]"
+
+-- UTF-8 bytes of U+00A0 (no-break space). Lua's %s class works on single
+-- bytes, so a Str ending in this sequence needs an explicit check.
+local NBSP = "\194\160"
+
+-- Inline element types that already separate the placeholder from what
+-- precedes it.
+local SEPARATORS = { Space = true, SoftBreak = true, LineBreak = true }
+
+-- Replace a Math element with the placeholder.
+-- Emitted as a Str; spacing is fixed in Para/Plain below.
+function Math(el)
+  return pandoc.Str(PH)
+end
+
+-- Decide whether a Space must be inserted between `prev` and a placeholder.
+-- `prev` is the inline already emitted just before it, or nil if none.
+local function needs_space_after(prev)
+  -- No space if the placeholder is the first token in the block.
+  if not prev then return false end
+  -- A space or line break is already there.
+  if SEPARATORS[prev.t] then return false end
+  if prev.t == "Str" then
+    local text = prev.text
+    local last = text:sub(-1)
+    -- Previous char is an opening bracket/paren/brace, slash, hyphen or
+    -- whitespace: the placeholder may sit directly against it.
+    if last:match("[%(%[%{%/%-]") or last:match("%s") then return false end
+    if text:sub(-2) == NBSP then return false end
+  end
+  return true
+end
+
+-- Return a copy of `inlines` with a Space inserted before every placeholder
+-- Str that would otherwise touch the preceding element.
+local function ensure_space_before_ph(inlines)
+  local out = {}
+  for i = 1, #inlines do
+    local cur = inlines[i]
+    if cur.t == "Str" and cur.text == PH and needs_space_after(out[#out]) then
+      out[#out + 1] = pandoc.Space()
+    end
+    out[#out + 1] = cur
+  end
+  return out
+end
+
+-- Fix placeholder spacing in paragraph content.
+function Para(el)
+  el.content = ensure_space_before_ph(el.content)
+  return el
+end
+
+-- Fix placeholder spacing in Plain block content.
+function Plain(el)
+  el.content = ensure_space_before_ph(el.content)
+  return el
+end
diff --git a/thesis/tex2plaintext.sh b/thesis/tex2plaintext.sh
index b009770..39a7122 100755
--- a/thesis/tex2plaintext.sh
+++ 
b/thesis/tex2plaintext.sh @@ -18,18 +18,24 @@ PART1_TXT="${OUT_BASE}_part1.txt" PART2_TXT="${OUT_BASE}_part2.txt" MARKER="Data and Preprocessing" -echo "[1/4] Flattening with latexpand -> ${FLAT_TEX}" +echo "[1/5] Flattening with latexpand -> ${FLAT_TEX}" latexpand "${INPUT_TEX}" > "${FLAT_TEX}" -echo "[2/4] Removing tabular/tabularx environments -> ${NO_TABLES_TEX}" +echo "[2/5] Removing tabular/tabularx environments -> ${NO_TABLES_TEX}" # Replace entire tabular / tabularx environments with a placeholder perl -0777 -pe 's/\\begin\{(tabularx?)\}.*?\\end\{\1\}/[table omitted]/gs' \ "${FLAT_TEX}" > "${NO_TABLES_TEX}" -echo "[3/4] Converting to plain text with pandoc -> ${PLAIN_TXT}" -pandoc -f latex -t plain --wrap=none "${NO_TABLES_TEX}" -o "${PLAIN_TXT}" +echo "[3/5] Converting to plain text with pandoc -> ${PLAIN_TXT}" +pandoc -f latex -t plain --wrap=none \ + --lua-filter=filters/keep-citations.lua \ + --lua-filter=filters/math-omit.lua \ + "${NO_TABLES_TEX}" -o "${PLAIN_TXT}" -echo "[4/4] Splitting ${PLAIN_TXT} before the marker line: \"${MARKER}\"" +echo "[4/5] Replacing [] placeholders with [figure]" +sed -i 's/\[\]/[figure]/g' "${PLAIN_TXT}" + +echo "[5/5] Splitting ${PLAIN_TXT} before the marker line: \"${MARKER}\"" # Ensure the marker exists exactly on its own line if ! grep -xq "${MARKER}" "${PLAIN_TXT}"; then