5 changes: 5 additions & 0 deletions crawl4ai/__init__.py
@@ -10,6 +10,9 @@
LXMLWebScrapingStrategy,
WebScrapingStrategy, # Backward compatibility alias
)

from .processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy

from .async_logger import (
AsyncLoggerBase,
AsyncLogger,
@@ -128,6 +131,8 @@
"BFSDeepCrawlStrategy",
"BestFirstCrawlingStrategy",
"DFSDeepCrawlStrategy",
"PDFCrawlerStrategy",
"PDFContentScrapingStrategy",
"FilterChain",
"URLPatternFilter",
"ContentTypeFilter",
59 changes: 53 additions & 6 deletions deploy/docker/api.py
@@ -13,8 +13,12 @@
from fastapi import HTTPException, Request, status
from fastapi.background import BackgroundTasks
from fastapi.responses import JSONResponse
from fastapi.encoders import jsonable_encoder

from redis import asyncio as aioredis

from utils import is_pdf_url

from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
@@ -31,6 +35,10 @@
BM25ContentFilter,
LLMContentFilter
)

from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
# from crawl4ai.async_configs import to_serializable_dict

from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy

@@ -431,6 +439,23 @@ async def handle_crawl_request(
urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls]
browser_config = BrowserConfig.load(browser_config)
crawler_config = CrawlerRunConfig.load(crawler_config)

is_pdf_flags = await asyncio.gather(*(is_pdf_url(url) for url in urls))
is_pdf = any(is_pdf_flags)
if any(is_pdf_flags) and not all(is_pdf_flags):
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Mix of PDF and non-PDF URLs in a single request is not supported yet."
)

crawler_strategy = PDFCrawlerStrategy() if is_pdf else None

if is_pdf and crawler_config.scraping_strategy is None:
crawler_config.scraping_strategy = PDFContentScrapingStrategy(
extract_images=False,
save_images_locally=False,
batch_size=2
)

dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=config["crawler"]["memory_threshold_percent"],
@@ -440,7 +465,7 @@
)

from crawler_pool import get_crawler
crawler = await get_crawler(browser_config)
crawler = await get_crawler(browser_config, crawler_strategy)

# crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config)
# await crawler.start()
@@ -458,6 +483,7 @@
config=crawler_config,
dispatcher=dispatcher)
results = await partial_func()
results_list = results if isinstance(results, list) else [results]

# await crawler.close()

@@ -471,12 +497,14 @@

# Process results to handle PDF bytes
processed_results = []
for result in results:
for result in results_list:
result_dict = result.model_dump()
# If PDF exists, encode it to base64
if result_dict.get('pdf') is not None:
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
processed_results.append(result_dict)

# Keep response shape consistent with streaming (plain JSON-serializable dict)
processed_results.append(jsonable_encoder(result_dict))

return {
"success": True,
@@ -521,18 +549,37 @@ async def handle_stream_crawl_request(
# browser_config.verbose = True # Set to False or remove for production stress testing
browser_config.verbose = False
crawler_config = CrawlerRunConfig.load(crawler_config)
crawler_config.scraping_strategy = LXMLWebScrapingStrategy()
crawler_config.stream = True

# Normalize URLs to include scheme (match non-streaming behavior)
urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls]

is_pdf_flags = await asyncio.gather(*(is_pdf_url(url) for url in urls))
is_pdf = any(is_pdf_flags)
if any(is_pdf_flags) and not all(is_pdf_flags):
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Mix of PDF and non-PDF URLs in a single request is not supported yet."
)

crawler_strategy = PDFCrawlerStrategy() if is_pdf else None

if is_pdf and crawler_config.scraping_strategy is None:
crawler_config.scraping_strategy = PDFContentScrapingStrategy(
extract_images=False,
save_images_locally=False,
batch_size=2
)

dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=config["crawler"]["memory_threshold_percent"],
rate_limiter=RateLimiter(
base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"])
)
) if config["crawler"]["rate_limiter"]["enabled"] else None
)

from crawler_pool import get_crawler
crawler = await get_crawler(browser_config)
crawler = await get_crawler(browser_config, crawler_strategy)

# crawler = AsyncWebCrawler(config=browser_config)
# await crawler.start()
37 changes: 26 additions & 11 deletions deploy/docker/crawler_pool.py
@@ -1,11 +1,11 @@
# crawler_pool.py (new file)
import asyncio, json, hashlib, time, psutil
from contextlib import suppress
from typing import Dict
from typing import Dict, Optional
from crawl4ai import AsyncWebCrawler, BrowserConfig
from typing import Dict
from utils import load_config


CONFIG = load_config()

POOL: Dict[str, AsyncWebCrawler] = {}
@@ -15,20 +15,33 @@
MEM_LIMIT = CONFIG.get("crawler", {}).get("memory_threshold_percent", 95.0) # % RAM – refuse new browsers above this
IDLE_TTL = CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 1800) # close if unused for 30 min

def _sig(cfg: BrowserConfig) -> str:
payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",",":"))
return hashlib.sha1(payload.encode()).hexdigest()
def _sig(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> str:
"""
Generate a unique signature for a crawler based on browser config
and optional crawler strategy. This ensures that crawlers with
different strategies (e.g., PDF) are stored separately in the pool.
"""
payload = cfg.to_dict()

if crawler_strategy is not None:
payload["strategy"] = crawler_strategy.__class__.__name__

Review comment (Contributor) on lines +26 to +28:

⚠️ Potential issue | 🟠 Major
Include strategy configuration in the pool signature.

Keying pooled crawlers only by crawler_strategy.__class__.__name__ collapses all instances of the same strategy class into a single slot. If two callers supply the same strategy class with different constructor arguments (e.g., PDFCrawlerStrategy(ocr=True) vs PDFCrawlerStrategy(ocr=False)), the second caller will be handed the already-started crawler that still holds the first instance, so their requested configuration is silently ignored. That’s a functional bug for any strategy with mutable/per-instance configuration. Please extend the signature to encode the strategy’s effective settings (e.g., module-qualified name plus a stable serialization of its config) so distinct instances don’t collide.

🤖 Prompt for AI Agents
In deploy/docker/crawler_pool.py around lines 25 to 27, the pool signature only
keys strategies by crawler_strategy.__class__.__name__, causing different
instances of the same class with different constructor arguments to collide; fix
this by including a stable, deterministic encoding of the instance configuration
in the payload key: derive the module-qualified class name (e.g.,
strategy.__class__.__module__ + "." + strategy.__class__.__name__) and a
serializable representation of the strategy's effective settings (prefer a
to_dict()/asdict() method on the strategy or fall back to
vars()/inspect.getfullargspec/init args), then JSON-serialize that config with
sort_keys=True (and avoid non-serializable members) and add it to payload (e.g.,
payload["strategy"] = "<module.ClassName>", payload["strategy_config"] =
"<stable-json>") so distinct configured instances do not collide.

json_payload = json.dumps(payload, sort_keys=True, separators=(",", ":"))
return hashlib.sha256(json_payload.encode()).hexdigest()



async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
async def get_crawler(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> AsyncWebCrawler:
sig: Optional[str] = None
try:
sig = _sig(cfg)
sig = _sig(cfg, crawler_strategy=crawler_strategy)
async with LOCK:
if sig in POOL:
LAST_USED[sig] = time.time();
return POOL[sig]
if psutil.virtual_memory().percent >= MEM_LIMIT:
raise MemoryError("RAM pressure – new browser denied")
crawler = AsyncWebCrawler(config=cfg, thread_safe=False)
crawler = AsyncWebCrawler(config=cfg, thread_safe=False, crawler_strategy=crawler_strategy)
await crawler.start()
POOL[sig] = crawler; LAST_USED[sig] = time.time()
return crawler
@@ -37,13 +50,15 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
except Exception as e:
raise RuntimeError(f"Failed to start browser: {e}")
finally:
if sig in POOL:
if sig and sig in POOL:
LAST_USED[sig] = time.time()
else:
# If we failed to start the browser, we should remove it from the pool
POOL.pop(sig, None)
LAST_USED.pop(sig, None)
if sig:
POOL.pop(sig, None)
LAST_USED.pop(sig, None)
# If we failed to start the browser, we should remove it from the pool

async def close_all():
async with LOCK:
await asyncio.gather(*(c.close() for c in POOL.values()), return_exceptions=True)
36 changes: 35 additions & 1 deletion deploy/docker/utils.py
@@ -2,6 +2,7 @@
import logging
import yaml
import os
import httpx
from datetime import datetime
from enum import Enum
from pathlib import Path
@@ -124,4 +125,37 @@ def verify_email_domain(email: str) -> bool:
records = dns.resolver.resolve(domain, 'MX')
return True if records else False
except Exception as e:
return False
return False

async def is_pdf_url(url: str) -> bool:
"""
Check if a URL points to a PDF using httpx:
- Check extension
- Check Content-Type via HEAD request
- Check first 5 bytes (magic number) if needed
"""
if url.lower().endswith(".pdf"):
return True

timeout = httpx.Timeout(connect=5.0, read=10.0, write=5.0, pool=5.0)
async with httpx.AsyncClient(follow_redirects=True, timeout=timeout) as client:
# HEAD request to check Content-Type (ignore servers that reject HEAD)
try:
head_resp = await client.head(url, headers={"Accept": "*/*"})
content_type = head_resp.headers.get("content-type", "").lower()
if "application/pdf" in content_type:
return True
except httpx.HTTPError:
pass

# Fallback: GET first 5 bytes to check PDF magic number
try:
get_resp = await client.get(url, headers={"Range": "bytes=0-4", "Accept": "*/*"})
if get_resp.status_code in (200, 206): # 206 Partial Content
return get_resp.content.startswith(b"%PDF-")
except httpx.HTTPError:
return False

# Default: not a PDF (or unable to determine)
return False
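
A brief standalone usage sketch for the new helper (the example URLs come from the demo script below), mirroring how api.py gathers per-URL flags before rejecting mixed batches:

import asyncio

from utils import is_pdf_url  # deploy/docker/utils.py

async def main():
    urls = ["https://arxiv.org/pdf/2310.06825", "https://httpbin.org/html"]
    flags = await asyncio.gather(*(is_pdf_url(u) for u in urls))
    print(dict(zip(urls, flags)))  # expected: PDF URL -> True, HTML URL -> False

asyncio.run(main())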

73 changes: 71 additions & 2 deletions docs/examples/docker/demo_docker_api.py
@@ -18,15 +18,14 @@
console = Console()

# --- Configuration ---
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020")
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235")
# Target URLs
SIMPLE_URL = "https://example.com" # For demo purposes
SIMPLE_URL = "https://httpbin.org/html"
LINKS_URL = "https://httpbin.org/links/10/0"
FORMS_URL = "https://httpbin.org/forms/post" # For JS demo
BOOKS_URL = "http://books.toscrape.com/" # For CSS extraction
PYTHON_URL = "https://python.org" # For deeper crawl
PDF_URL = "https://arxiv.org/pdf/2310.06825" # For PDF demo
# Use the same sample site as deep crawl tests for consistency
DEEP_CRAWL_BASE_URL = os.getenv(
"DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/")
@@ -1261,6 +1260,73 @@ async def demo_config_dump_invalid(client: httpx.AsyncClient):
console.print(
f"[bold red]Unexpected error during invalid test:[/] {e}")

# 10. Crawl PDF

async def demo_pdf_crawl(client: httpx.AsyncClient):
payload = {
"urls": [PDF_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": "BYPASS",
"scraping_strategy": {
"type": "PDFContentScrapingStrategy",
"params": {
"extract_images": False,
"save_images_locally": False,
"batch_size": 2
}
}
}
}
}

resp = await client.post("/crawl", json=payload)
resp.raise_for_status()
data = resp.json()
print("=== Demo: PDF Crawl ===")
print("Success:", data.get("success"))
print("Number of results:", len(data.get("results", [])))
if data.get("results"):
first = data["results"][0]
text_snippet = (first.get("text") or "")[:500]
print("Extracted text (first 500 chars):")
print(text_snippet)

# 11. Crawl PDF stream

async def demo_pdf_crawl_stream(client: httpx.AsyncClient):
"""
Demo: Crawl a PDF and stream the extracted text content.
"""
payload = {
"urls": [PDF_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": True,
"cache_mode": "BYPASS",
"scraping_strategy": { # <-- Default strategy if not set
"type": "PDFContentScrapingStrategy",
"params": {
"extract_images": False,
"save_images_locally": False,
"batch_size": 2
}
}
}
}
}

await stream_request(
client,
"/crawl/stream",
payload,
"Demo PDF: Streaming PDF Crawl"
)


# --- Update Main Runner to include new demo ---
async def main_demo():
Expand Down Expand Up @@ -1294,6 +1360,9 @@ async def main_demo():
# await demo_deep_with_llm_extraction(client)
# await demo_deep_with_proxy(client) # Skips if no PROXIES env var
# await demo_deep_with_ssl(client) # Added the new demo

# await demo_pdf_crawl_stream(client)
# await demo_pdf_crawl(client)

# --- Helper endpoints ---
await demo_markdown_endpoint(client)