#!/usr/bin/env lua5.3

-- Lua filter for converting (a subset of) LaTeX into HTML5 with MathJax.
-- Basic idea: leave the math to MathJax and search & replace simple commands.
-- Note that this does not really parse the LaTeX source; it only performs
-- line-based pattern matching on a well-behaved subset of LaTeX that the
-- input has to stick to for this filter to work.
--
-- The LaTeX source is read from stdin and the HTML is written to stdout.

-- read whole input file into a string
whole_file = io.read("*all")

-- extract title and other data
-- The title is looked for at the very beginning of the input first; as a
-- fallback, a \title{...} that starts any later line is accepted as well
-- (the \title command itself is turned into a <h1> wherever it appears).
metadata_title = whole_file:match("^\\title{(.-)}")
	or whole_file:match("\n\\title{(.-)}")
metadata_maths = false -- set later, when the text contains a '$'

-- table of global substitutions (could be implemented in sed)
--
-- Each entry is {pattern, replacement}, in the syntax of string.gsub, and the
-- entries are applied to the whole text one after the other, in this order.
-- The order matters: the HTML escaping of & < > must come first, so that the
-- tags inserted by the later rules are not escaped themselves.
global_substitutions = {

	-- change a few character entities
	{"&", "&amp;"}, -- note: this affects LaTeX tables, see below
	{"<", "&lt;"},
	{">", "&gt;"},
	{"~", "\194\160"}, -- UTF-8 non-breaking space, equivalent to &nbsp;

	-- htmlize SCRIPT tags: wrap each "%SCRIPT ..." line into an html comment,
	-- with its leading % already escaped, then merge consecutive comments
	{"\n%%(SCRIPT[^\n]+)", "\n<!--\n&#37;%1\n-->"},
	{"\n%-%->\n<!%-%-", ""},

	-- do not remove comments, just escape % to &#37;
	{"%%", "&#37;"},

	-- change \emph{...} to <i>...</i> (and similar inline stuff)
	{"\\emph{(.-)}", "<i>%1</i>"},
	{"\\textbf{(.-)}", "<b>%1</b>"},
	{"\\textit{(.-)}", "<i>%1</i>"},
	{"\\texttt{(.-)}", "<code>%1</code>"},
	{"{\\it%s(.-)}", "<i>%1</i>"},
	{"{\\bf%s(.-)}", "<b>%1</b>"},
	{"{\\tt%s(.-)}", "<code>%1</code>"},

	-- change \includegraphics{...} to <img src="...">
	{"\\includegraphics{(.-)}", '<img src="%1" alt="image">'},

	-- change \href{link}{text} to <a href="link">text</a>
	{"\\href{(.-)}{(.-)}", "<a href=\"%1\">%2</a>"},
	{"\\url{(.-)}", "<a href=\"%1\">%1</a>"},

	-- change \newline to <br>
	{"\\newline", "<br>"},
	{"\\bigskip", "<p>"},
	{"\\medskip", "<p>"},
	{"\\smallskip", ""},

	-- remove latex-only stuff
	{"\\clearpage", ""},
	{"\\thispagestyle{(.-)}", ""},
	{"\\setcounter{(.-)}{(.-)}", ""},
	{"\\vfill", ""},
	{"\\mbox{(.-)}", '<span>%1</span>'},
	{"\\fbox{(.-)}", '<span style="border:1px solid">%1</span>'},

	-- change \sections to <headers>
	{"\\title{(.-)}", "<h1>%1</h1>\n"},
	{"\\section{(.-)}", "<h2>%1</h2>\n"},
	{"\\subsection{(.-)}", "<h3>%1</h3>\n"},
	{"\\paragraph{(.-)}", "<p>\n<b>%1</b>"},

	-- list environments (\item[label] is handled by the last rule)
	{"\\item ", "<li>" },
	{"\\begin{enumerate}", "<ol>"},
	{"\\end{enumerate}", "</ol>"},
	{"\\begin{itemize}", "<ul>"},
	{"\\end{itemize}", "</ul>"},
	{"\\begin{verbatim}", "<pre>"},
	{"\\end{verbatim}", "</pre>"},
	{"\\item%[(.-)%] ", "<li style='list-style-type:none'>%1 "},

	-- change \verb+...+ to <code>...</code>
	{"\\verb%+([^+]*)%+", "<code>%1</code>"},
	{"\\verb_([^_]*)_", "<code>%1</code>"},

	-- quotation environments
	{"\\begin{quotation}", "<blockquote>"},
	{"\\end{quotation}", "</blockquote>"},
	{"\\begin{quote}", "<blockquote>"},
	{"\\end{quote}", "</blockquote>"},

	-- gallery environment
	{"\\begin{gallery}", "<div class=\"gallery\" style=\"height:400px\"><ul class=\"index\">\n"},
	{"\\end{gallery}", "</ul></div>"},
	{"\\galleryline{(.-)}", '<li><a href="%1">%1<span><img src="%1" alt=""></span></a>\n'},

	-- theorem environments (TODO: different class for each type)
	{"\\begin{proof}", "<p>\n<i>Proof.</i>"},
	{"\\end{proof}", "<span style=\"float:right\">∎</span>"},
	-- to avoid repetitive code, the other environments are added in a loop
	-- below

	-- abstract environment (wherever it appears)
	-- (the doubled %% is a literal % in a gsub replacement string)
	{"\\begin{abstract}(.-)\\end{abstract}",
		'<blockquote style="font-size:80%%"><b>Abstract.</b>%1</blockquote>'
	},

	-- tabular (TODO: recover vertical bars)
	-- XXX: the order of these pattern replacements is important
	{"%s*\\\\(%s*\\end{tabular})", "%1"},
	{"\\begin{tabular}{.-}", "<table><tr><td>"},
	{"\\end{tabular}", "</table>"},

	-- surround \newcommands by math so that mathjax sees it
	{"\n(\\newcommand[^\n]+)", "$%1$\n"},

	-- hack to remove "i/" and "o/" from displayed image and text filenames
	-- (names right after a double quote, i.e. inside attributes, are kept)
	{'([^"])i/([%a.-]-%.tif)', "%1%2"},
	{'([^"])i/([%a.-]-%.png)', "%1%2"},
	{'([^"])i/([%a.-]-%.jpg)', "%1%2"},
	{'([^"])i/([%a.-]-%.txt)', "%1%2"},
	{'([^"])o/([%a.-]-%.tif)', "%1%2"},
	{'([^"])o/([%a.-]-%.png)', "%1%2"},
	{'([^"])o/([%a.-]-%.jpg)', "%1%2"},
	{'([^"])o/([%a.-]-%.txt)', "%1%2"},
}

-- add theorem-like environments to the list of substitutions
-- Each name X below is the displayed heading; its lower-case form is the name
-- of the LaTeX environment that is matched.
thm_envs = {"Theorem", "Definition", "Remark", "Lemma", "Proposition",
	"Exercice", "Axiom"}
for _, X in ipairs(thm_envs) do
	local x = X:lower()
	-- \begin{x}[title] has to be tried before the plain \begin{x}, which
	-- would otherwise consume its prefix and leave the "[title]" behind
	local with_title = {"\\begin{"..x.."}%[(.-)%]",
		'<p>\n<div class="thm"><b>'..X..'</b> (%1). <em>'}
	local plain = {"\\begin{"..x.."}", '<p>\n<div class="thm"><b>'..X.."</b>. <em>"}
	local closing = {"\\end{"..x.."}", "</em></div>"}
	table.insert(global_substitutions, with_title)
	table.insert(global_substitutions, plain)
	table.insert(global_substitutions, closing)
end

-- perform the global substitutions, in table order
for _, s in ipairs(global_substitutions) do
	-- plain assignment keeps only the first result of gsub (the new string)
	whole_file = whole_file:gsub(s[1], s[2])
end

-- process the galleries
-- Each entry is {first image file, its height in pixels, gallery height in em}.
list_of_galleries = {}

-- quote a string so that the shell passes it on as one literal argument
local function shell_quote(s)
	return "'" .. (s:gsub("'", "'\\''")) .. "'"
end

-- g traverses all gallery blocks
for g in whole_file:gmatch('(<div class="gallery" .-</ul></div>)') do
	local f = g:match('<img src="(.-)"') -- f=first image file in the gallery
	if f then -- a gallery without images keeps its default size
		local h = "" -- h=image height, as printed by the external tool
		local pipe = io.popen("imprintf %h " .. shell_quote(f))
		if pipe then
			h = pipe:read("*all") or ""
			pipe:close()
		end
		h = h:match("^%s*(.-)%s*$") -- drop the trailing newline, if any
		local _, n = g:gsub("<li>", "") -- n=total number of images in the gallery
		-- when the height could not be obtained, the gallery is left alone
		-- rather than given an invalid "min-height:px"
		if h ~= "" then
			table.insert(list_of_galleries, {f, h, 2 * n})
		end
	end
end

-- adapt the size of each gallery
for _, gallery in ipairs(list_of_galleries) do
	-- filename, height in pixels, height in em
	local f, h, n = gallery[1], gallery[2], gallery[3]
	-- escape every punctuation character so that the filename is matched
	-- literally inside the pattern
	local f_pattern = f:gsub("%p", "%%%0")
	local p = 'gallery" style="height:400px">(.-<a href=")' .. f_pattern
	local opening = 'gallery" style="min-height:' .. h .. 'px;height:' .. n .. 'em">'
	-- a function replacement is used so that a '%' in the filename is harmless
	whole_file = whole_file:gsub(p, function(between)
		return opening .. between .. f
	end)
end

-- process \input{file} and \VerbatimInput{file}

-- Return the whole content of the file named `path`.
-- Raises an error that names the file when it cannot be opened (instead of an
-- opaque "attempt to index a nil value"), and closes the handle after reading.
local function read_whole_file(path)
	local handle, err = io.open(path, "r")
	if not handle then
		error("cannot open included file: " .. tostring(err), 0)
	end
	local content = handle:read("*all") or ""
	handle:close()
	return content
end

-- replacement for \input{l}: the raw content of file l
putfile = function(l) return read_whole_file(l) end

-- replacement for \VerbatimInput{l}: the content of file l inside <pre>,
-- with & < > escaped so that it is displayed literally, like the text of an
-- inline verbatim environment
putfile_verb = function(l)
	local text = read_whole_file(l)
	text = text:gsub("&", "&amp;"):gsub("<", "&lt;"):gsub(">", "&gt;")
	return "<pre>\n" .. text .. "</pre>\n"
end
-- expand the inclusion commands: the file name captured inside the braces is
-- handed to the function, and the string it returns replaces the command
whole_file = whole_file:gsub("\\input{(.-)}", putfile)
whole_file = whole_file:gsub("\\VerbatimInput{(.-)}", putfile_verb)


-- skeleton of the html header; the upper-case words are placeholders that are
-- filled in below
output_template = [[
<!doctype html>
<meta charset="utf-8" />
<title>TITLE</title>
CSSCODE
JAXCODE
LITCODE
]]

-- notice for literate programs
-- (this text is used as a gsub replacement string, hence the doubled %%,
-- which comes out as a single %)
boilerplate_lit = [[
<!--
	This file is a literate program.
	The experiments are run by applying the following filter:

	grep ^%%SCRIPT | sed 's/&amp;/\&/g' | cut -c9- | sh
-->
]]

boilerplate_css = [[
	body { max-width:90ex; }
	pre { background:lightgray; width:80ch; }
	table, td { border:1px solid black; border-collapse:collapse; }
	table td { padding:7px; border-spacing:0px; }
	.thm em i { font-style: normal;}]]
-- TODO: only add the styling if the concerned elements are present

-- extra styling, appended only when the document contains a gallery
css_gallery = [[
	.gallery{position:relative;width:auto;height:400px}
	.gallery .index{padding:0;margin:0;width:9em;list-style:none}
	.gallery .index li{margin:0;padding:0}
	.gallery .index a{display:block;background-color:#eee;border:1px solid #fff;text-decoration:none;width:11em;padding:5px}
	.gallery .index a span{display:block;position:absolute;left:-9999px;top:0;padding-left:2em}
	.gallery .index li:first-child a span{left:10em;z-index:99}
	.gallery .index a:hover{ border: 1px solid #888888;}
	.gallery .index a:hover span{left:10em;z-index:100}
	.gallery .index a span img{ }
	.gallery .index a span { white-space:nowrap; }
]]

-- MathJax loader; JAXURL and JAXOPT are placeholders filled in below
boilerplate_mathjax = [[
<script type="text/x-mathjax-config">
	MathJax.Hub.Config({
		tex2jax: {
			inlineMath: [ ['$','$'], ['\\(','\\)'] ],
			processEscapes: true
		}
	});
</script>
<script async src="JAXURL?JAXOPT">
</script>
]]

-- same MathJax release as before (2.7.1), but served from cdnjs: the RawGit
-- CDN used previously has been shut down
JAXURL = "https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js"
JAXOPT = "config=TeX-AMS_CHTML-full"

-- MathJax is only loaded when the text contains a dollar sign
if whole_file:find("$", 1, true) then -- TODO: perform a more intelligent check here
	metadata_maths = true
end
if not metadata_maths then boilerplate_mathjax = "" end

-- the literate-program notice is only kept when there are SCRIPT lines;
-- their leading % is expected in its escaped form "&#37;" at this point
-- (plain search: no pattern characters are involved)
if not whole_file:find("\n&#37;SCRIPT", 1, true) then boilerplate_lit = "" end

if whole_file:find('<div class="gallery"', 1, true) then
	boilerplate_css = boilerplate_css .. css_gallery
end

boilerplate_mathjax = boilerplate_mathjax:gsub("JAXURL", JAXURL)
boilerplate_mathjax = boilerplate_mathjax:gsub("JAXOPT", JAXOPT)

if not metadata_title then metadata_title = "" end

boilerplate_css = "<style>\n"..boilerplate_css.."\n</style>"
-- the title comes straight from the input and may contain a '%' (e.g. "\%"),
-- which is special in a gsub replacement string and has to be doubled
local title_replacement = metadata_title:gsub("%%", "%%%%")
output_template = output_template:gsub("TITLE", title_replacement)
output_template = output_template:gsub("CSSCODE", boilerplate_css)
output_template = output_template:gsub("JAXCODE", boilerplate_mathjax)
output_template = output_template:gsub("LITCODE", boilerplate_lit)

-- start building output file
output_lines = { output_template }
table.insert(output_lines, "\n")


-- initialize counters
local counter_sec = 0 -- number of the current section
local counter_ssec = 0 -- number of the current subsection (reset by sections)
local counter_thm = 0 -- running number of the theorem-like blocks
local table_depth = 0 -- positive while inside a table
local toggle_pre = false -- true while inside a <pre> block

-- Tell whether line u opens a new paragraph (only asked after a blank line):
-- it has to begin with a letter, with one of a few inline tags, or with
-- inline math, and must not be a \newcommand wrapped in dollars.
local function starts_paragraph(u)
	if u:find("^%$\\newcommand") then return false end
	return u:find("^%a")
		or u:find("^<img")
		or u:find("^<span")
		or u:find("^<b>")
		or u:find("^<i>")
		or u:find("^<em>")
		or u:find("^%$[^%$]")
end

-- the line iterator below only yields newline-terminated lines; terminate the
-- last line if needed, so that it is not silently dropped
local text = whole_file
if text ~= "" and text:sub(-1) ~= "\n" then
	text = text .. "\n"
end

local o = "" -- the previous line (as read, before any rewriting)
for l in text:gmatch("[^\n]-\n") do -- traverse line by line
	local u = l -- line to add to the output (the same, by default)

	-- if a paragraph starts, put <p>
	if o == "\n" and not toggle_pre and starts_paragraph(u) then
		u = "<p>\n"..u
	end

	-- build a table context
	-- (the opening is looked for in the previous line, so that the line
	-- holding the <table> tag itself is not rewritten)
	if o:match("<table" ) then table_depth = table_depth + 1 end
	if u:match("</table>") then table_depth = table_depth - 1 end

	-- if inside table, change & and \\ to <td> and <tr>, remove \hline
	-- (the & of the source has been escaped to &amp; by now)
	if table_depth > 0 then
		u = u:gsub("&amp;", "<td>")
		u = u:gsub("\\\\", "<tr><td>")
		u = u:gsub("\\hline", "")
	end

	-- section counters (TODO: add attr name="sec-n" for easier navigation)
	if u:find("^<h2>") then
		counter_sec = counter_sec + 1
		counter_ssec = 0
		u = u:gsub("^<h2>", "<h2>"..counter_sec..". ")
	end

	-- subsection counters
	if u:find("^<h3>") then
		counter_ssec = counter_ssec + 1
		u = u:gsub("^<h3>","<h3>"..counter_sec.."."..counter_ssec..". ")
	end

	-- theorem counters: the number goes right after the heading, that is,
	-- before the first </b> only (the title may contain bold text as well)
	if u:match("^<div class=\"thm\">.*%</b>") then
		counter_thm = counter_thm + 1
		u = u:gsub("%</b>", " "..counter_thm.."</b>", 1)
	end

	-- <pre> book-keeping (the tag may be preceded by blanks)
	if u:match("^%s*<pre") then toggle_pre = true end
	if u:match("</pre>") then toggle_pre = false end

	-- post-processing replacements
	if toggle_pre then -- if inside <pre> :
		u = u:gsub("&#37;", "%%") -- un-escape html entity for %
	end
	-- a SCRIPT line starts with the escaped form of its % ; the unescaped
	-- form is accepted too
	if u:match("^&#37;SCRIPT ") or u:match("^%%SCRIPT ") then
		u = u:gsub("&lt;", "<") -- un-escape html entity for <
		u = u:gsub("&gt;", ">") -- un-escape html entity for >
		u = u:gsub("&#37;", "%%") -- un-escape html entity for %
	elseif not toggle_pre then -- deal with comments
		u = u:gsub("^&#37;[^\n]*\n", "") -- remove whole line comment
		u = u:gsub("([^\\])&#37;[^\n]*\n", "%1\n") -- midline comment
		u = u:gsub("\\&#37;", "\\%%") -- escaped % symbols
	end

	if o ~= "\n" or u ~= "\n" then -- uniq-ify blank lines
		table.insert(output_lines, u)
	end
	o = l
end

-- page footer
table.insert(output_lines, "\n<hr>\n\n")
--table.insert(output_lines, '\n<!-- yes i know, wanna fight about it ? -->\n')
--tracker = string.format('<img src="%s?a=%s" width="1" height="1" alt="">\n\n',
--	"http://boucantrin.ovh.hw.ipol.im:7743/white_pixel.png",
--	metadata_title:gsub("%s", "_")
--	)
--table.insert(output_lines, tracker)
table.insert(output_lines, "<!-- enric meinhardt-llopis, 2019 -->\n\n")

-- dump output lines (joined first, so that stdout is written only once)
io.stdout:write(table.concat(output_lines))