fix: enhance setup instructions for Docling, PDFtoText, and OCRMyPDF processors

Abdeali099 · Abdeali099 · commit 356e21377b5a · 2026-04-16T11:34:06.000+05:30
diff --git a/transaction_parser/transaction_parser/utils/pdf_processor.py b/transaction_parser/transaction_parser/utils/pdf_processor.py
@@ -95,7 +95,10 @@ class DoclingPDFProcessor(PDFProcessor):
 
     _converter = None
 
-    # TODO: Give detail of install `docling` system dependency and opencv-python-headless for OCR
+    SETUP_URL = (
+        "https://github.com/resilient-tech/transaction-parser#3-docling-optional"
+    )
+
     def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str:
         try:
             from docling.datamodel.base_models import ConversionStatus, DocumentStream
@@ -104,8 +107,9 @@ def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str
                 title=_("Missing Dependency"),
                 msg=_(
                     "docling is not installed.<br>"
-                    "Install it with: <code>bench pip install transaction_parser[docling]</code>"
-                ),
+                    "Install it with: <code>bench pip install transaction_parser[docling]</code><br>"
+                    "See <a href='{0}'>setup instructions</a> for more details."
+                ).format(self.SETUP_URL),
             )
 
         file = self.get_sanitized_file(file, page_limit)
@@ -147,8 +151,9 @@ def _get_converter(self):
                     title=_("Missing Dependency"),
                     msg=_(
                         "docling is not installed.<br>"
-                        "Install it with: <code>bench pip install transaction_parser[docling]</code>"
-                    ),
+                        "Install it with: <code>bench pip install transaction_parser[docling]</code><br>"
+                        "See <a href='{0}'>setup instructions</a> for more details."
+                    ).format(self.SETUP_URL),
                 )
 
             pipeline_options = PdfPipelineOptions()
@@ -169,6 +174,10 @@ class PDFtoTextProcessor(PDFProcessor):
     PDF processor using pdftotext for layout-preserving text extraction.
     """
 
+    SETUP_URL = (
+        "https://github.com/resilient-tech/transaction-parser#1-pdftotext-default"
+    )
+
     def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str:
         file = self.get_sanitized_file(file, page_limit)
         return self.get_text(file)
@@ -183,8 +192,9 @@ def get_text(self, file: io.BytesIO) -> str:
                     "pdftotext is not installed.<br>"
                     "Install OS dependencies first if not already installed: "
                     "<code>sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev</code>"
-                    "<br>Then run: <code>bench setup requirements</code>"
-                ),
+                    "<br>Then run: <code>bench setup requirements</code><br>"
+                    "See <a href='{0}'>setup instructions</a> for more details."
+                ).format(self.SETUP_URL),
             )
 
         pdf = pdftotext.PDF(file, physical=True)
@@ -197,13 +207,16 @@ class OCRMyPDFProcessor(PDFProcessor):
     PDF processor using PyMuPDF for text extraction and OCRMyPDF for OCR.
     """
 
+    SETUP_URL = (
+        "https://github.com/resilient-tech/transaction-parser#2-ocrmypdf-optional"
+    )
+
     def process(self, file: io.BytesIO | File, page_limit: int | None = None) -> str:
         file = self.get_sanitized_file(file, page_limit)
         file = self.apply_ocr(file)
 
         return self.get_text(file)
 
-    # TODO: Give detail of install `tesseract-ocr` system dependency
     def apply_ocr(self, file: io.BytesIO) -> io.BytesIO:
         try:
             import ocrmypdf
@@ -212,8 +225,9 @@ def apply_ocr(self, file: io.BytesIO) -> io.BytesIO:
                 title=_("Missing Dependency"),
                 msg=_(
                     "ocrmypdf is not installed.<br>"
-                    "Install it with: <code>bench pip install transaction_parser[ocrmypdf]</code>"
-                ),
+                    "Install it with: <code>bench pip install transaction_parser[ocrmypdf]</code><br>"
+                    "See <a href='{0}'>setup instructions</a> for more details."
+                ).format(self.SETUP_URL),
             )
 
         file.seek(0)