5 changes: 5 additions & 0 deletions crawl4ai/__init__.py
@@ -10,6 +10,9 @@
LXMLWebScrapingStrategy,
WebScrapingStrategy, # Backward compatibility alias
)

from .processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy

from .async_logger import (
AsyncLoggerBase,
AsyncLogger,
@@ -128,6 +131,8 @@
"BFSDeepCrawlStrategy",
"BestFirstCrawlingStrategy",
"DFSDeepCrawlStrategy",
"PDFCrawlerStrategy",
"PDFContentScrapingStrategy",
"FilterChain",
"URLPatternFilter",
"ContentTypeFilter",
59 changes: 53 additions & 6 deletions deploy/docker/api.py
@@ -13,8 +13,12 @@
from fastapi import HTTPException, Request, status
from fastapi.background import BackgroundTasks
from fastapi.responses import JSONResponse
from fastapi.encoders import jsonable_encoder

from redis import asyncio as aioredis

from utils import is_pdf_url

from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
@@ -31,6 +35,10 @@
BM25ContentFilter,
LLMContentFilter
)

from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
# from crawl4ai.async_configs import to_serializable_dict

from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy

@@ -431,6 +439,23 @@ async def handle_crawl_request(
urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls]
browser_config = BrowserConfig.load(browser_config)
crawler_config = CrawlerRunConfig.load(crawler_config)

is_pdf_flags = await asyncio.gather(*(is_pdf_url(url) for url in urls))
is_pdf = any(is_pdf_flags)
if any(is_pdf_flags) and not all(is_pdf_flags):
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Mix of PDF and non-PDF URLs in a single request is not supported yet."
)

crawler_strategy = PDFCrawlerStrategy() if is_pdf else None

if is_pdf and crawler_config.scraping_strategy is None:
crawler_config.scraping_strategy = PDFContentScrapingStrategy(
extract_images=False,
save_images_locally=False,
batch_size=2
)

dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=config["crawler"]["memory_threshold_percent"],
@@ -440,7 +465,7 @@
)

from crawler_pool import get_crawler
crawler = await get_crawler(browser_config)
crawler = await get_crawler(browser_config, crawler_strategy)

# crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config)
# await crawler.start()
@@ -458,6 +483,7 @@
config=crawler_config,
dispatcher=dispatcher)
results = await partial_func()
results_list = results if isinstance(results, list) else [results]

# await crawler.close()

@@ -471,12 +497,14 @@

# Process results to handle PDF bytes
processed_results = []
for result in results:
for result in results_list:
result_dict = result.model_dump()
# If PDF exists, encode it to base64
if result_dict.get('pdf') is not None:
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
processed_results.append(result_dict)

# Keep response shape consistent with streaming (plain JSON-serializable dict)
processed_results.append(jsonable_encoder(result_dict))

return {
"success": True,
@@ -521,18 +549,37 @@ async def handle_stream_crawl_request(
# browser_config.verbose = True # Set to False or remove for production stress testing
browser_config.verbose = False
crawler_config = CrawlerRunConfig.load(crawler_config)
crawler_config.scraping_strategy = LXMLWebScrapingStrategy()
crawler_config.stream = True

# Normalize URLs to include scheme (match non-streaming behavior)
urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls]

is_pdf_flags = await asyncio.gather(*(is_pdf_url(url) for url in urls))
is_pdf = any(is_pdf_flags)
if any(is_pdf_flags) and not all(is_pdf_flags):
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Mix of PDF and non-PDF URLs in a single request is not supported yet."
)

crawler_strategy = PDFCrawlerStrategy() if is_pdf else None

if is_pdf and crawler_config.scraping_strategy is None:
crawler_config.scraping_strategy = PDFContentScrapingStrategy(
extract_images=False,
save_images_locally=False,
batch_size=2
)

dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=config["crawler"]["memory_threshold_percent"],
rate_limiter=RateLimiter(
base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"])
)
) if config["crawler"]["rate_limiter"]["enabled"] else None
)

from crawler_pool import get_crawler
crawler = await get_crawler(browser_config)
crawler = await get_crawler(browser_config, crawler_strategy)

# crawler = AsyncWebCrawler(config=browser_config)
# await crawler.start()
37 changes: 26 additions & 11 deletions deploy/docker/crawler_pool.py
@@ -1,11 +1,11 @@
# crawler_pool.py (new file)
import asyncio, json, hashlib, time, psutil
from contextlib import suppress
from typing import Dict
from typing import Dict, Optional
from crawl4ai import AsyncWebCrawler, BrowserConfig
from typing import Dict
from utils import load_config


CONFIG = load_config()

POOL: Dict[str, AsyncWebCrawler] = {}
@@ -15,20 +15,33 @@
MEM_LIMIT = CONFIG.get("crawler", {}).get("memory_threshold_percent", 95.0) # % RAM – refuse new browsers above this
IDLE_TTL = CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 1800) # close if unused for 30 min

def _sig(cfg: BrowserConfig) -> str:
payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",",":"))
return hashlib.sha1(payload.encode()).hexdigest()
def _sig(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> str:
"""
Generate a unique signature for a crawler based on browser config
and optional crawler strategy. This ensures that crawlers with
different strategies (e.g., PDF) are stored separately in the pool.
"""
payload = cfg.to_dict()

if crawler_strategy is not None:
payload["strategy"] = crawler_strategy.__class__.__name__

Review comment (Contributor) on lines +26 to +28:

⚠️ Potential issue | 🟠 Major
Include strategy configuration in the pool signature.

Keying pooled crawlers only by crawler_strategy.__class__.__name__ collapses all instances of the same strategy class into a single slot. If two callers supply the same strategy class with different constructor arguments (e.g., PDFCrawlerStrategy(ocr=True) vs PDFCrawlerStrategy(ocr=False)), the second caller will be handed the already-started crawler that still holds the first instance, so their requested configuration is silently ignored. That’s a functional bug for any strategy with mutable/per-instance configuration. Please extend the signature to encode the strategy’s effective settings (e.g., module-qualified name plus a stable serialization of its config) so distinct instances don’t collide.

🤖 Prompt for AI Agents
In deploy/docker/crawler_pool.py around lines 25 to 27, the pool signature only
keys strategies by crawler_strategy.__class__.__name__, causing different
instances of the same class with different constructor arguments to collide; fix
this by including a stable, deterministic encoding of the instance configuration
in the payload key: derive the module-qualified class name (e.g.,
strategy.__class__.__module__ + "." + strategy.__class__.__name__) and a
serializable representation of the strategy's effective settings (prefer a
to_dict()/asdict() method on the strategy or fall back to
vars()/inspect.getfullargspec/init args), then JSON-serialize that config with
sort_keys=True (and avoid non-serializable members) and add it to payload (e.g.,
payload["strategy"] = "<module.ClassName>", payload["strategy_config"] =
"<stable-json>") so distinct configured instances do not collide.

json_payload = json.dumps(payload, sort_keys=True, separators=(",", ":"))
return hashlib.sha256(json_payload.encode()).hexdigest()



async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
async def get_crawler(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> AsyncWebCrawler:
sig: Optional[str] = None
try:
sig = _sig(cfg)
sig = _sig(cfg, crawler_strategy=crawler_strategy)
async with LOCK:
if sig in POOL:
LAST_USED[sig] = time.time();
return POOL[sig]
if psutil.virtual_memory().percent >= MEM_LIMIT:
raise MemoryError("RAM pressure – new browser denied")
crawler = AsyncWebCrawler(config=cfg, thread_safe=False)
crawler = AsyncWebCrawler(config=cfg, thread_safe=False, crawler_strategy=crawler_strategy)
await crawler.start()
POOL[sig] = crawler; LAST_USED[sig] = time.time()
return crawler
@@ -37,13 +50,15 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
except Exception as e:
raise RuntimeError(f"Failed to start browser: {e}")
finally:
if sig in POOL:
if sig and sig in POOL:
LAST_USED[sig] = time.time()
else:
# If we failed to start the browser, we should remove it from the pool
POOL.pop(sig, None)
LAST_USED.pop(sig, None)
if sig:
POOL.pop(sig, None)
LAST_USED.pop(sig, None)
# If we failed to start the browser, we should remove it from the pool

async def close_all():
async with LOCK:
await asyncio.gather(*(c.close() for c in POOL.values()), return_exceptions=True)
36 changes: 35 additions & 1 deletion deploy/docker/utils.py
@@ -2,6 +2,7 @@
import logging
import yaml
import os
import httpx
from datetime import datetime
from enum import Enum
from pathlib import Path
@@ -124,4 +125,37 @@ def verify_email_domain(email: str) -> bool:
records = dns.resolver.resolve(domain, 'MX')
return True if records else False
except Exception as e:
return False
return False

async def is_pdf_url(url: str) -> bool:
"""
Check if a URL points to a PDF using httpx:
- Check extension
- Check Content-Type via HEAD request
- Check first 5 bytes (magic number) if needed
"""
if url.lower().endswith(".pdf"):
return True

timeout = httpx.Timeout(connect=5.0, read=10.0, write=5.0, pool=5.0)
async with httpx.AsyncClient(follow_redirects=True, timeout=timeout) as client:
# HEAD request to check Content-Type (ignore servers that reject HEAD)
try:
head_resp = await client.head(url, headers={"Accept": "*/*"})
content_type = head_resp.headers.get("content-type", "").lower()
if "application/pdf" in content_type:
return True
except httpx.HTTPError:
pass

# Fallback: GET first 5 bytes to check PDF magic number
try:
get_resp = await client.get(url, headers={"Range": "bytes=0-4", "Accept": "*/*"})
if get_resp.status_code in (200, 206): # 206 Partial Content
return get_resp.content.startswith(b"%PDF-")
except httpx.HTTPError:
return False

# Default: not a PDF (or unable to determine)
return False
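
A brief standalone usage sketch for the new helper (the example URLs come from the demo script below), mirroring how api.py gathers per-URL flags before rejecting mixed batches:

import asyncio

from utils import is_pdf_url  # deploy/docker/utils.py

async def main():
    urls = ["https://arxiv.org/pdf/2310.06825", "https://httpbin.org/html"]
    flags = await asyncio.gather(*(is_pdf_url(u) for u in urls))
    print(dict(zip(urls, flags)))  # expected: PDF URL -> True, HTML URL -> False

asyncio.run(main())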

73 changes: 71 additions & 2 deletions docs/examples/docker/demo_docker_api.py
@@ -18,15 +18,14 @@
console = Console()

# --- Configuration ---
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020")
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235")
# Target URLs
SIMPLE_URL = "https://example.com" # For demo purposes
SIMPLE_URL = "https://httpbin.org/html"
LINKS_URL = "https://httpbin.org/links/10/0"
FORMS_URL = "https://httpbin.org/forms/post" # For JS demo
BOOKS_URL = "http://books.toscrape.com/" # For CSS extraction
PYTHON_URL = "https://python.org" # For deeper crawl
PDF_URL = "https://arxiv.org/pdf/2310.06825" # For PDF demo
# Use the same sample site as deep crawl tests for consistency
DEEP_CRAWL_BASE_URL = os.getenv(
"DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/")
@@ -1261,6 +1260,73 @@ async def demo_config_dump_invalid(client: httpx.AsyncClient):
console.print(
f"[bold red]Unexpected error during invalid test:[/] {e}")

# 10. Crawl PDF

async def demo_pdf_crawl(client: httpx.AsyncClient):
payload = {
"urls": [PDF_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": "BYPASS",
"scraping_strategy": {
"type": "PDFContentScrapingStrategy",
"params": {
"extract_images": False,
"save_images_locally": False,
"batch_size": 2
}
}
}
}
}

resp = await client.post("/crawl", json=payload)
resp.raise_for_status()
data = resp.json()
print("=== Demo: PDF Crawl ===")
print("Success:", data.get("success"))
print("Number of results:", len(data.get("results", [])))
if data.get("results"):
first = data["results"][0]
text_snippet = (first.get("text") or "")[:500]
print("Extracted text (first 500 chars):")
print(text_snippet)

# 11. Crawl PDF stream

async def demo_pdf_crawl_stream(client: httpx.AsyncClient):
"""
Demo: Crawl a PDF and stream the extracted text content.
"""
payload = {
"urls": [PDF_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": True,
"cache_mode": "BYPASS",
"scraping_strategy": { # <-- Default strategy if not set
"type": "PDFContentScrapingStrategy",
"params": {
"extract_images": False,
"save_images_locally": False,
"batch_size": 2
}
}
}
}
}

await stream_request(
client,
"/crawl/stream",
payload,
"Demo PDF: Streaming PDF Crawl"
)


# --- Update Main Runner to include new demo ---
async def main_demo():
Expand Down Expand Up @@ -1294,6 +1360,9 @@ async def main_demo():
# await demo_deep_with_llm_extraction(client)
# await demo_deep_with_proxy(client) # Skips if no PROXIES env var
# await demo_deep_with_ssl(client) # Added the new demo

# await demo_pdf_crawl_stream(client)
# await demo_pdf_crawl(client)

# --- Helper endpoints ---
await demo_markdown_endpoint(client)