better grammarly prep

This commit is contained in:
Jan Kowalczyk
2025-10-18 12:47:16 +02:00
parent 374420727b
commit 5aca00ad67
5 changed files with 59 additions and 5 deletions

View File

@@ -0,0 +1,48 @@
-- math-omit.lua
-- Replace any math with a placeholder and ensure a space before it when appropriate.
local PH = "[math omitted]"
function Math(el)
-- Emit the placeholder as a Str; spacing is fixed in Para/Plain below.
return pandoc.Str(PH)
end
local function ensure_space_before_ph(inlines)
local out = {}
for i = 1, #inlines do
local cur = inlines[i]
if cur.t == "Str" and cur.text == PH then
local prev = out[#out]
local need_space = true
-- No space if it's the first token in the block
if not prev then
need_space = false
elseif prev.t == "Space" then
need_space = false
elseif prev.t == "Str" then
-- If previous char is an opening bracket/paren/slash/hyphen or whitespace, skip
local last = prev.text:sub(-1)
if last:match("[%(%[%{%/%-]") or last:match("%s") then
need_space = false
end
end
if need_space then table.insert(out, pandoc.Space()) end
table.insert(out, cur)
else
table.insert(out, cur)
end
end
return out
end
function Para(el)
el.content = ensure_space_before_ph(el.content)
return el
end
function Plain(el)
el.content = ensure_space_before_ph(el.content)
return el
end

View File

@@ -18,18 +18,24 @@ PART1_TXT="${OUT_BASE}_part1.txt"
PART2_TXT="${OUT_BASE}_part2.txt" PART2_TXT="${OUT_BASE}_part2.txt"
MARKER="Data and Preprocessing" MARKER="Data and Preprocessing"
echo "[1/4] Flattening with latexpand -> ${FLAT_TEX}" echo "[1/5] Flattening with latexpand -> ${FLAT_TEX}"
latexpand "${INPUT_TEX}" > "${FLAT_TEX}" latexpand "${INPUT_TEX}" > "${FLAT_TEX}"
echo "[2/4] Removing tabular/tabularx environments -> ${NO_TABLES_TEX}" echo "[2/5] Removing tabular/tabularx environments -> ${NO_TABLES_TEX}"
# Replace entire tabular / tabularx environments with a placeholder # Replace entire tabular / tabularx environments with a placeholder
perl -0777 -pe 's/\\begin\{(tabularx?)\}.*?\\end\{\1\}/[table omitted]/gs' \ perl -0777 -pe 's/\\begin\{(tabularx?)\}.*?\\end\{\1\}/[table omitted]/gs' \
"${FLAT_TEX}" > "${NO_TABLES_TEX}" "${FLAT_TEX}" > "${NO_TABLES_TEX}"
echo "[3/4] Converting to plain text with pandoc -> ${PLAIN_TXT}" echo "[3/5] Converting to plain text with pandoc -> ${PLAIN_TXT}"
pandoc -f latex -t plain --wrap=none "${NO_TABLES_TEX}" -o "${PLAIN_TXT}" pandoc -f latex -t plain --wrap=none \
--lua-filter=filters/keep-citations.lua \
--lua-filter=filters/math-omit.lua \
"${NO_TABLES_TEX}" -o "${PLAIN_TXT}"
echo "[4/4] Splitting ${PLAIN_TXT} before the marker line: \"${MARKER}\"" echo "[4/5] Replacing [] placeholders with [figure]"
sed -i 's/\[\]/[figure]/g' "${PLAIN_TXT}"
echo "[5/5] Splitting ${PLAIN_TXT} before the marker line: \"${MARKER}\""
# Ensure the marker exists exactly on its own line # Ensure the marker exists exactly on its own line
if ! grep -xq "${MARKER}" "${PLAIN_TXT}"; then if ! grep -xq "${MARKER}" "${PLAIN_TXT}"; then