Skip to content

Commit deae482

Browse files
committed
add python paper muncher package
1 parent cd0f588 commit deae482

File tree

26 files changed

+1632
-153
lines changed

26 files changed

+1632
-153
lines changed
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
"""Minimal example: render HTML to PDF with the asynchronous Paper Muncher API."""
import asyncio

# The package module is ``asynchronous`` (the original import misspelled it
# as ``asynchrous``, which fails with ModuleNotFoundError).
from paper_muncher.asynchronous import render


html = """
<h1>Hello, Paper Muncher!</h1>
<p>This is a simple example of using Paper Muncher in an asynchronous context.</p>
"""


async def main():
    """Render the sample HTML in print mode and save it as output_async.pdf."""
    pdf_bytes = await render(html, mode="print")
    with open("output_async.pdf", "wb") as f:
        f.write(pdf_bytes)
    print("PDF generated and saved as output_async.pdf")


if __name__ == "__main__":
    asyncio.run(main())
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from paper_muncher.frameworks.flask import register_paper_muncher
from flask import Flask, Response


# Build the application, then hand it to Paper Muncher's Flask integration.
# NOTE(review): ``app.run_paper_muncher`` (used below) is presumably attached
# by ``register_paper_muncher`` -- confirm against the integration module.
app = Flask(__name__)
register_paper_muncher(app)


@app.route("/")
def index():
    """Return a small HTML snippet rendered in print mode as a PDF response."""
    document = app.run_paper_muncher(
        "<h1>Hello, Paper Muncher with Flask!</h1>",
        mode="print",
    )
    return Response(document, mimetype="application/pdf")


if __name__ == "__main__":
    app.run(debug=True)
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
from paper_muncher.synchronous import render


html = """
<h1>Hello, Paper Muncher!</h1>
<p>This is a simple example of using Paper Muncher in a synchronous context.</p>
"""


def main():
    """Render the sample HTML in print mode and write the result to output.pdf."""
    document = render(html, mode="print")
    # Render first, open the file second: a failed render leaves no empty file.
    with open("output.pdf", "wb") as pdf_file:
        pdf_file.write(document)
    print("PDF generated and saved as output.pdf")


if __name__ == "__main__":
    main()
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
2+
3+
import logging
4+
import os
5+
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
"""The :mod:`paper_muncher.asynchronous` module
2+
provides the core functionality for rendering documents
3+
using the Paper Muncher engine.
4+
It includes the main rendering functions and utilities
5+
for managing the rendering process.
6+
"""
7+
8+
from .interface import rendered, render
9+
from ..binary import can_use_paper_muncher
Lines changed: 278 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,278 @@
1+
"""
2+
The :mod:`paper_muncher.asynchronous.interface` module provides
3+
utilities for interacting with Paper Muncher, a subprocess used to render
4+
HTML content into PDF or image format.
5+
"""
6+
7+
8+
import logging
9+
from asyncio import wait_for, TimeoutError as AsyncTimeoutError
10+
from asyncio.subprocess import PIPE as APIPE
11+
from datetime import datetime, timezone
12+
from contextlib import asynccontextmanager
13+
from collections.abc import AsyncGenerator, Generator
14+
from email.utils import format_datetime
15+
from io import BytesIO
16+
from itertools import count
17+
from typing import BinaryIO, Optional
18+
19+
from .request import (
20+
consume_paper_muncher_request,
21+
read_paper_muncher_request,
22+
)
23+
from .io_with_timeout import (
24+
read_all_with_timeout,
25+
write_with_timeout,
26+
)
27+
from .popen import Popen
28+
from ..binary import get_paper_muncher_binary, can_use_paper_muncher
29+
30+
from ..typing import AsyncRunner
31+
32+
_logger = logging.getLogger(__name__)
33+
34+
# Rendering modes accepted by this module.
AUTHORIZED_MODE = {'print', 'render'}

# Default I/O settings: timeouts are in seconds, the chunk size is in bytes.
DEFAULT_READ_TIMEOUT = 60
DEFAULT_READLINE_TIMEOUT = 15 * 60  # 15 minutes, to cover the put request
DEFAULT_WRITE_TIMEOUT = 30
DEFAULT_CHUNK_SIZE = 4096
DEFAULT_WAIT_TIMEOUT = 5

# Keyword options consumed by this module itself; they are skipped when the
# remaining options are turned into Paper Muncher command-line arguments.
NOT_RENDERABLE_OPTIONS = {
    'read_timeout', 'readline_timeout', 'write_timeout',
    'chunk_size', 'wait_timeout',
}

# Value sent in the ``Server`` header of the responses written to the process.
SERVER_SOFTWARE = b'Paper Muncher (Fully Asynchronous Engine)'
48+
49+
50+
@asynccontextmanager
async def rendered(
    content: str,
    mode: str = "print",
    runner: Optional[AsyncRunner] = None,
    **options,
) -> AsyncGenerator[tuple[BinaryIO, BinaryIO], None]:
    """Async context manager to render HTML content using Paper Muncher.

    The HTML is piped to a Paper Muncher subprocess over an HTTP-like
    exchange on its stdin/stdout, in three phases: the document itself is
    sent, asset requests are answered through ``runner``, and a final
    ``200 OK`` is written before stdin is closed. The caller then reads the
    rendered output from the yielded streams.

    :param content: The HTML content to render, as a string (it is encoded
        with :meth:`str.encode` before being written to the process).
    :param mode: The rendering mode, either 'print' or 'render'.
    :param runner: Optional AsyncRunner function to handle asset requests.
        It is awaited with the requested path and must return an iterable
        of byte chunks. It is only required if Paper Muncher actually
        requests an asset.
    :param options: Additional options. ``readline_timeout``,
        ``write_timeout`` and ``wait_timeout`` tune this function; names
        listed in ``NOT_RENDERABLE_OPTIONS`` are never forwarded; every
        other option is passed to Paper Muncher as ``--<name> <value>``.
    :return: An async generator yielding the ``(stdout, stderr)`` streams
        of the Paper Muncher process.
    :raises RuntimeError: If Paper Muncher is not available, terminates
        prematurely, crashes, or requests an asset while no ``runner``
        was given.
    :raises ValueError: If an invalid mode is specified.
    :raises EOFError: If the request stream ends while waiting for an
        asset request (re-raised from the request reader).
    :raises TimeoutError: If reading an asset request times out
        (re-raised from the request reader).
    """

    if not can_use_paper_muncher():
        raise RuntimeError(
            "Paper Muncher is not available in the current session. "
            "Ensure it is installed and available in the system PATH."
        )

    if mode not in AUTHORIZED_MODE:
        raise ValueError(
            f"Invalid mode '{mode}', must be one of {AUTHORIZED_MODE}"
        )

    readline_timeout = options.get(
        'readline_timeout',
        DEFAULT_READLINE_TIMEOUT,
    )
    write_timeout = options.get('write_timeout', DEFAULT_WRITE_TIMEOUT)
    wait_timeout = options.get('wait_timeout', DEFAULT_WAIT_TIMEOUT)

    # Everything that is not an I/O tuning knob becomes a CLI argument.
    extra_args = []
    for option, value in options.items():
        if option in NOT_RENDERABLE_OPTIONS:
            continue
        extra_args.extend([
            f'--{option}', str(value),
        ])

    if not (binary := get_paper_muncher_binary()):
        raise RuntimeError(
            "Paper Muncher binary not found or not usable. "
            "Ensure it is installed and available in the system PATH."
        )

    # Encode once, before spawning anything: the same bytes are used for
    # both the Content-Length header and the body.
    body = content.encode()

    async with Popen(
        [binary, mode, "pipe:", '-o', "pipe:"] + extra_args,
        stdin=APIPE,
        stdout=APIPE,
        stderr=APIPE,
    ) as process:
        # Phase 1: send HTML content headers and body
        try:
            await consume_paper_muncher_request(
                process.stdout,
                timeout=readline_timeout,
            )
        except EOFError as early_eof:
            raise RuntimeError(
                "Paper Muncher terminated prematurely (phase 1)"
            ) from early_eof

        if process.returncode is not None:
            raise RuntimeError(
                "Paper Muncher crashed before receiving content")

        now = datetime.now(timezone.utc)
        response_headers = (
            b"HTTP/1.1 200 OK\r\n"
            b"Content-Length: %(length)d\r\n"
            b"Content-Type: text/html\r\n"
            b"Date: %(date)s\r\n"
            b"Server: %(server)s\r\n"
            b"\r\n"
        ) % {
            b'length': len(body),
            b'date': format_datetime(now, usegmt=True).encode(),
            b'server': SERVER_SOFTWARE,
        }

        await write_with_timeout(
            process.stdin,
            response_headers,
            timeout=write_timeout,
        )
        await write_with_timeout(
            process.stdin,
            body,
            timeout=write_timeout,
        )

        if process.returncode is not None:
            raise RuntimeError(
                "Paper Muncher crashed while sending HTML content")

        # Phase 2: serve asset requests until the rendered content is ready
        for request_no in count(start=1):
            try:
                path = await read_paper_muncher_request(
                    process.stdout,
                    timeout=readline_timeout,
                )
            # asyncio.TimeoutError only became an alias of the builtin
            # TimeoutError in Python 3.11; catch both so the process is
            # killed on older interpreters too.
            except (EOFError, TimeoutError, AsyncTimeoutError):
                process.kill()
                await process.wait()
                raise

            # A ``None`` path ends the asset phase.
            if path is None:
                break

            if runner is None:
                # Without a runner there is nothing to answer with; stop
                # the process instead of failing on ``None(path)``.
                process.kill()
                await process.wait()
                raise RuntimeError(
                    f"Paper Muncher requested asset {request_no} ({path})"
                    " but no runner was provided to serve it"
                )

            for chunk in await runner(path):
                await write_with_timeout(
                    process.stdin,
                    chunk,
                    timeout=write_timeout
                )

            if process.returncode is not None:
                raise RuntimeError(
                    "Paper Muncher crashed while serving asset"
                    f" {request_no}: {path}"
                )

        # Phase 3: send final OK and close the process
        now = datetime.now(timezone.utc)
        final_response = (
            b"HTTP/1.1 200 OK\r\n"
            b"Date: %(date)s\r\n"
            b"Server: %(server)s\r\n"
            b"\r\n"
        ) % {
            b'date': format_datetime(now, usegmt=True).encode(),
            b'server': SERVER_SOFTWARE,
        }

        await write_with_timeout(
            process.stdin,
            final_response,
            timeout=write_timeout,
        )
        try:
            process.stdin.write_eof()
        except (NotImplementedError, AttributeError):
            # Fall back to a full close when half-closing is unsupported.
            process.stdin.close()
            await process.stdin.wait_closed()

        if process.returncode is not None:
            raise RuntimeError(
                "Paper Muncher crashed before returning the rendered content"
            )

        try:
            yield process.stdout, process.stderr
        finally:
            # Give the process a chance to exit on its own, then force it.
            try:
                await wait_for(
                    process.wait(),
                    timeout=wait_timeout,
                )
            except AsyncTimeoutError:
                process.kill()
                await process.wait()
                _logger.warning(
                    "Paper Muncher did not terminate in time, "
                    "forcefully killed it"
                )

            if process.returncode != 0:
                _logger.warning(
                    "Paper Muncher exited with code %d",
                    process.returncode,
                )
228+
229+
230+
async def render(
    content: str,
    mode: str = "print",
    runner: Optional[AsyncRunner] = None,
    **options,
) -> bytes:
    """Render HTML content using Paper Muncher and return the rendered output.

    :param content: The HTML content to render, as a string.
    :param mode: The rendering mode, either 'print' or 'render'.
    :param runner: Optional AsyncRunner function to handle asset requests.
    :param options: Additional options to pass to Paper Muncher.
        ``read_timeout`` and ``chunk_size`` control how the output streams
        are read here; all options are also handed to :func:`rendered`.
    :return: The rendered content as bytes.
    :raises RuntimeError: If Paper Muncher is not available or crashes,
        or if 'print' mode output does not start with ``%PDF-``.
    :raises ValueError: If an invalid mode is specified.
    """
    # Function-scope import so the module's import block stays untouched.
    from asyncio import gather

    read_timeout = options.get('read_timeout', DEFAULT_READ_TIMEOUT)
    chunk_size = options.get('chunk_size', DEFAULT_CHUNK_SIZE)

    async with rendered(
        content,
        mode=mode,
        runner=runner,
        **options,
    ) as (content_stream, error_stream):
        # Drain stdout and stderr concurrently: reading them one after the
        # other can stall when the process blocks on a full stderr pipe
        # before it has finished writing stdout.
        rendered_content, stderr_output = await gather(
            read_all_with_timeout(
                content_stream,
                chunk_size=chunk_size,
                timeout=read_timeout,
            ),
            read_all_with_timeout(
                error_stream,
                chunk_size=chunk_size,
                timeout=read_timeout,
            ),
        )

    if stderr_output:
        _logger.warning(
            "Paper Muncher error output: %s",
            stderr_output.decode('utf-8', errors='replace'),
        )

    # Only 'print' mode output is checked: it must carry the PDF signature.
    if mode == "print" and not rendered_content.startswith(b'%PDF-'):
        raise RuntimeError(
            "Paper Muncher did not return valid PDF content"
        )

    return rendered_content

0 commit comments

Comments
 (0)