Skip to content

Commit 356e213

Browse files
committed
fix: enhance setup instructions for Docling, PDFtoText, and OCRMyPDF processors
1 parent 4a82d26 commit 356e213

1 file changed

Lines changed: 24 additions & 10 deletions

File tree

transaction_parser/transaction_parser/utils/pdf_processor.py

Lines changed: 24 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -95,7 +95,10 @@ class DoclingPDFProcessor(PDFProcessor):
9595

9696
_converter = None
9797

98-
# TODO: Give detail of install `docling` system dependency and opencv-python-headless for OCR
98+
SETUP_URL = (
99+
"https://github.com/resilient-tech/transaction-parser#3-docling-optional"
100+
)
101+
99102
def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str:
100103
try:
101104
from docling.datamodel.base_models import ConversionStatus, DocumentStream
@@ -104,8 +107,9 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str
104107
title=_("Missing Dependency"),
105108
msg=_(
106109
"docling is not installed.<br>"
107-
"Install it with: <code>bench pip install transaction_parser[docling]</code>"
108-
),
110+
"Install it with: <code>bench pip install transaction_parser[docling]</code><br>"
111+
"See <a href='{0}'>setup instructions</a> for more details."
112+
).format(self.SETUP_URL),
109113
)
110114

111115
file = self.get_sanitized_file(file, page_limit)
@@ -147,8 +151,9 @@ def _get_converter(self):
147151
title=_("Missing Dependency"),
148152
msg=_(
149153
"docling is not installed.<br>"
150-
"Install it with: <code>bench pip install transaction_parser[docling]</code>"
151-
),
154+
"Install it with: <code>bench pip install transaction_parser[docling]</code><br>"
155+
"See <a href='{0}'>setup instructions</a> for more details."
156+
).format(self.SETUP_URL),
152157
)
153158

154159
pipeline_options = PdfPipelineOptions()
@@ -169,6 +174,10 @@ class PDFtoTextProcessor(PDFProcessor):
169174
PDF processor using pdftotext for layout-preserving text extraction.
170175
"""
171176

177+
SETUP_URL = (
178+
"https://github.com/resilient-tech/transaction-parser#1-pdftotext-default"
179+
)
180+
172181
def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str:
173182
file = self.get_sanitized_file(file, page_limit)
174183
return self.get_text(file)
@@ -183,8 +192,9 @@ def get_text(self, file: io.BytesIO) -> str:
183192
"pdftotext is not installed.<br>"
184193
"Install OS dependencies first if not already installed: "
185194
"<code>sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev</code>"
186-
"<br>Then run: <code>bench setup requirements</code>"
187-
),
195+
"<br>Then run: <code>bench setup requirements</code><br>"
196+
"See <a href='{0}'>setup instructions</a> for more details."
197+
).format(self.SETUP_URL),
188198
)
189199

190200
pdf = pdftotext.PDF(file, physical=True)
@@ -197,13 +207,16 @@ class OCRMyPDFProcessor(PDFProcessor):
197207
PDF processor using PyMuPDF for text extraction and OCRMyPDF for OCR.
198208
"""
199209

210+
SETUP_URL = (
211+
"https://github.com/resilient-tech/transaction-parser#2-ocrmypdf-optional"
212+
)
213+
200214
def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str:
201215
file = self.get_sanitized_file(file, page_limit)
202216
file = self.apply_ocr(file)
203217

204218
return self.get_text(file)
205219

206-
# TODO: Give detail of install `tesseract-ocr` system dependency
207220
def apply_ocr(self, file: io.BytesIO) -> io.BytesIO:
208221
try:
209222
import ocrmypdf
@@ -212,8 +225,9 @@ def apply_ocr(self, file: io.BytesIO) -> io.BytesIO:
212225
title=_("Missing Dependency"),
213226
msg=_(
214227
"ocrmypdf is not installed.<br>"
215-
"Install it with: <code>bench pip install transaction_parser[ocrmypdf]</code>"
216-
),
228+
"Install it with: <code>bench pip install transaction_parser[ocrmypdf]</code><br>"
229+
"See <a href='{0}'>setup instructions</a> for more details."
230+
).format(self.SETUP_URL),
217231
)
218232

219233
file.seek(0)

0 commit comments

Comments
 (0)