@@ -95,7 +95,10 @@ class DoclingPDFProcessor(PDFProcessor):
9595
9696 _converter = None
9797
98- # TODO: Give detail of install `docling` system dependency and opencv-python-headless for OCR
98+ SETUP_URL = (
99+ "https://github.com/resilient-tech/transaction-parser#3-docling-optional"
100+ )
101+
99102 def process (self , file : io .BytesIO | File , page_limit : int | None = None ) -> str :
100103 try :
101104 from docling .datamodel .base_models import ConversionStatus , DocumentStream
@@ -104,8 +107,9 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str
104107 title = _ ("Missing Dependency" ),
105108 msg = _ (
106109 "docling is not installed.<br>"
107- "Install it with: <code>bench pip install transaction_parser[docling]</code>"
108- ),
110+ "Install it with: <code>bench pip install transaction_parser[docling]</code><br>"
111+ "See <a href='{0}'>setup instructions</a> for more details."
112+ ).format (self .SETUP_URL ),
109113 )
110114
111115 file = self .get_sanitized_file (file , page_limit )
@@ -147,8 +151,9 @@ def _get_converter(self):
147151 title = _ ("Missing Dependency" ),
148152 msg = _ (
149153 "docling is not installed.<br>"
150- "Install it with: <code>bench pip install transaction_parser[docling]</code>"
151- ),
154+ "Install it with: <code>bench pip install transaction_parser[docling]</code><br>"
155+ "See <a href='{0}'>setup instructions</a> for more details."
156+ ).format (self .SETUP_URL ),
152157 )
153158
154159 pipeline_options = PdfPipelineOptions ()
@@ -169,6 +174,10 @@ class PDFtoTextProcessor(PDFProcessor):
169174 PDF processor using pdftotext for layout-preserving text extraction.
170175 """
171176
177+ SETUP_URL = (
178+ "https://github.com/resilient-tech/transaction-parser#1-pdftotext-default"
179+ )
180+
172181 def process (self , file : io .BytesIO | File , page_limit : int | None = None ) -> str :
173182 file = self .get_sanitized_file (file , page_limit )
174183 return self .get_text (file )
@@ -183,8 +192,9 @@ def get_text(self, file: io.BytesIO) -> str:
183192 "pdftotext is not installed.<br>"
184193 "Install OS dependencies first if not already installed: "
185194 "<code>sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev</code>"
186- "<br>Then run: <code>bench setup requirements</code>"
187- ),
195+ "<br>Then run: <code>bench setup requirements</code><br>"
196+ "See <a href='{0}'>setup instructions</a> for more details."
197+ ).format (self .SETUP_URL ),
188198 )
189199
190200 pdf = pdftotext .PDF (file , physical = True )
@@ -197,13 +207,16 @@ class OCRMyPDFProcessor(PDFProcessor):
197207 PDF processor using PyMuPDF for text extraction and OCRMyPDF for OCR.
198208 """
199209
210+ SETUP_URL = (
211+ "https://github.com/resilient-tech/transaction-parser#2-ocrmypdf-optional"
212+ )
213+
200214 def process (self , file : io .BytesIO | File , page_limit : int | None = None ) -> str :
201215 file = self .get_sanitized_file (file , page_limit )
202216 file = self .apply_ocr (file )
203217
204218 return self .get_text (file )
205219
206- # TODO: Give detail of install `tesseract-ocr` system dependency
207220 def apply_ocr (self , file : io .BytesIO ) -> io .BytesIO :
208221 try :
209222 import ocrmypdf
@@ -212,8 +225,9 @@ def apply_ocr(self, file: io.BytesIO) -> io.BytesIO:
212225 title = _ ("Missing Dependency" ),
213226 msg = _ (
214227 "ocrmypdf is not installed.<br>"
215- "Install it with: <code>bench pip install transaction_parser[ocrmypdf]</code>"
216- ),
228+ "Install it with: <code>bench pip install transaction_parser[ocrmypdf]</code><br>"
229+ "See <a href='{0}'>setup instructions</a> for more details."
230+ ).format (self .SETUP_URL ),
217231 )
218232
219233 file .seek (0 )
0 commit comments