Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ RUN mkdir -p /home/appuser/.cache/ms-playwright \
RUN crawl4ai-doctor

# Copy application code
COPY deploy/docker/* ${APP_HOME}/
COPY deploy/docker/ ${APP_HOME}/

# copy the playground + any future static assets
COPY deploy/docker/static ${APP_HOME}/static
Expand Down
13 changes: 9 additions & 4 deletions crawl4ai/async_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -614,11 +614,12 @@ def dump(self) -> dict:

@staticmethod
def load(data: dict) -> "BrowserConfig":
# Deserialize the object from a dictionary
if data is None:
return BrowserConfig()
config = from_serializable_dict(data)
if isinstance(config, BrowserConfig):
return config
return BrowserConfig.from_kwargs(config)
return BrowserConfig.from_kwargs(config if config is not None else {})

class VirtualScrollConfig:
"""Configuration for virtual scroll handling.
Expand Down Expand Up @@ -1426,6 +1427,9 @@ def __setattr__(self, name, value):
sig = inspect.signature(self.__init__)
all_params = sig.parameters # Dictionary of parameter names and their details

if name == 'wait_for' and value is not None and not isinstance(value, str):
raise ValueError("'wait_for' must be a string (e.g., a CSS selector or JS expression).")

if name in self._UNWANTED_PROPS and value is not all_params[name].default:
raise AttributeError(f"Setting '{name}' is deprecated. {self._UNWANTED_PROPS[name]}")

Expand Down Expand Up @@ -1549,11 +1553,12 @@ def dump(self) -> dict:

@staticmethod
def load(data: dict) -> "CrawlerRunConfig":
# Deserialize the object from a dictionary
if data is None:
return CrawlerRunConfig()
config = from_serializable_dict(data)
if isinstance(config, CrawlerRunConfig):
return config
return CrawlerRunConfig.from_kwargs(config)
return CrawlerRunConfig.from_kwargs(config if config is not None else {})

def to_dict(self):
return {
Expand Down
62 changes: 62 additions & 0 deletions deploy/docker/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Crawl4AI Environment Configuration
# Copy this file to .env in the PROJECT ROOT: cp deploy/docker/.env.example .env
# Then customize with your settings

# ──────────────────────────────────────────────────────────────────
# Port Configuration
# ──────────────────────────────────────────────────────────────────
# Host port mapping (container always runs on 11235 internally)
HOST_PORT=11235

# ──────────────────────────────────────────────────────────────────
# LLM Provider API Keys (Runtime Configuration)
# ──────────────────────────────────────────────────────────────────
# Add your API keys for the LLM providers you want to use
OPENAI_API_KEY=
DEEPSEEK_API_KEY=
ANTHROPIC_API_KEY=
GROQ_API_KEY=
TOGETHER_API_KEY=
MISTRAL_API_KEY=
GEMINI_API_TOKEN=

# Optional: Override the default LLM provider
# Examples: "openai/gpt-4.1-mini", "anthropic/claude-4-sonnet", "deepseek/chat"
# If not set, uses the provider specified in config.yml (default: openai/gpt-4o-mini)
# LLM_PROVIDER=anthropic/claude-3-opus

# ──────────────────────────────────────────────────────────────────
# Docker Image Selection (Optional)
# ──────────────────────────────────────────────────────────────────
# Use pre-built image from Docker Hub (recommended)
# IMAGE=unclecode/crawl4ai:latest
# TAG=latest

# ──────────────────────────────────────────────────────────────────
# Build Configuration (Only applies when building locally)
# ──────────────────────────────────────────────────────────────────

# INSTALL_TYPE: Feature set for the installation
# - default: Basic installation (~2-3GB image)
# Includes: JsonCssExtractionStrategy, JsonXPathExtractionStrategy,
# LLMExtractionStrategy (API-based, no local ML)
# Best for: Standard web crawling, structured extraction, LLM-based extraction
#
# - all: Full installation with ML dependencies (~6-8GB image)
# Adds: PyTorch, transformers, sentence-transformers, scikit-learn, NLTK
# Enables: CosineStrategy (semantic clustering), local transformer models
# Best for: Advanced ML-based extraction, semantic content analysis
#
# - torch: PyTorch + scikit-learn + NLTK (no transformers)
# - transformer: Transformers + sentence-transformers (no PyTorch)
#
INSTALL_TYPE=default

# ENABLE_GPU: Enable NVIDIA CUDA support for GPU acceleration
# - false: CPU-only (works on all platforms)
# - true: Adds CUDA toolkit (AMD64/x86_64 only, requires NVIDIA GPU)
#
# Note: GPU support only available on AMD64 architecture
# ARM64 (Apple Silicon) will skip GPU installation
#
ENABLE_GPU=false
13 changes: 0 additions & 13 deletions deploy/docker/.llm.env.example

This file was deleted.

83 changes: 56 additions & 27 deletions deploy/docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,11 @@ docker pull unclecode/crawl4ai:latest

#### 2. Setup Environment (API Keys)

If you plan to use LLMs, create a `.llm.env` file in your working directory:
If you plan to use LLMs, create a `.env` file in your working directory:

```bash
# Create a .llm.env file with your API keys
cat > .llm.env << EOL
# Create a .env file with your API keys
cat > .env << EOL
# OpenAI
OPENAI_API_KEY=sk-your-key

Expand All @@ -91,7 +91,7 @@ ANTHROPIC_API_KEY=your-anthropic-key
# GEMINI_API_TOKEN=your-gemini-token
EOL
```
> πŸ”‘ **Note**: Keep your API keys secure! Never commit `.llm.env` to version control.
> πŸ”‘ **Note**: Keep your API keys secure! Never commit `.env` to version control.

#### 3. Run the Container

Expand All @@ -106,15 +106,26 @@ EOL

* **With LLM support:**
```bash
# Make sure .llm.env is in the current directory
# Make sure .env is in the current directory
docker run -d \
-p 11235:11235 \
--name crawl4ai \
--env-file .llm.env \
--env-file .env \
--shm-size=1g \
unclecode/crawl4ai:0.7.0-r1
```

* **With custom host port:**
```bash
docker run -d \
-p 8080:11235 \
--name crawl4ai \
--env-file .env \
--shm-size=1g \
unclecode/crawl4ai:0.7.0-r1
```
> Access at `http://localhost:8080` (mapped to container's internal port 11235)

> The server will be available at `http://localhost:11235`. Visit `/playground` to access the interactive testing interface.

#### 4. Stopping the Container
Expand Down Expand Up @@ -143,15 +154,19 @@ git clone https://github.com/unclecode/crawl4ai.git
cd crawl4ai
```

#### 2. Environment Setup (API Keys)
#### 2. Environment Setup

If you plan to use LLMs, copy the example environment file and add your API keys. This file should be in the **project root directory**.
Crawl4AI uses a single `.env` file for all configuration:

```bash
# Make sure you are in the 'crawl4ai' root directory
cp deploy/docker/.llm.env.example .llm.env

# Now edit .llm.env and add your API keys
# Copy environment template and customize
cp .env.example .env
# Edit .env to:
# - Set HOST_PORT (default: 11235)
# - Add your API keys for LLM providers
# - Configure build options (if building locally)
```

**Flexible LLM Provider Configuration:**
Expand All @@ -161,7 +176,7 @@ The Docker setup now supports flexible LLM provider configuration through three
1. **Environment Variable** (Highest Priority): Set `LLM_PROVIDER` to override the default
```bash
export LLM_PROVIDER="anthropic/claude-3-opus"
# Or in your .llm.env file:
# Or in your .env file:
# LLM_PROVIDER=anthropic/claude-3-opus
```

Expand Down Expand Up @@ -199,12 +214,15 @@ The `docker-compose.yml` file in the project root provides a simplified approach
```bash
# Build with all features (includes torch and transformers)
INSTALL_TYPE=all docker compose up --build -d

# Build with GPU support (for AMD64 platforms)
ENABLE_GPU=true docker compose up --build -d

# Run on custom host port
HOST_PORT=8080 docker compose up -d
```

> The server will be available at `http://localhost:11235`.
> The server will be available at `http://localhost:11235` (or your custom `HOST_PORT`).

#### 4. Stopping the Service

Expand All @@ -219,7 +237,7 @@ If you prefer not to use Docker Compose for direct control over the build and ru

#### 1. Clone Repository & Setup Environment

Follow steps 1 and 2 from the Docker Compose section above (clone repo, `cd crawl4ai`, create `.llm.env` in the root).
Follow steps 1 and 2 from the Docker Compose section above (clone repo, `cd crawl4ai`, create `.env` in the root).

#### 2. Build the Image (Multi-Arch)

Expand Down Expand Up @@ -253,11 +271,11 @@ docker buildx build \

* **With LLM support:**
```bash
# Make sure .llm.env is in the current directory (project root)
# Make sure .env is in the current directory (project root)
docker run -d \
-p 11235:11235 \
--name crawl4ai-standalone \
--env-file .llm.env \
--env-file .env \
--shm-size=1g \
crawl4ai-local:latest
```
Expand All @@ -282,18 +300,23 @@ MCP is an open protocol that standardizes how applications provide context to LL

### Connecting via MCP

The Crawl4AI server exposes two MCP endpoints:
The Crawl4AI server exposes an MCP HTTP endpoint:

- **FastMCP HTTP**: `http://localhost:11235/mcp`

- **Server-Sent Events (SSE)**: `http://localhost:11235/mcp/sse`
- **WebSocket**: `ws://localhost:11235/mcp/ws`
> ⚠️ **Known limitation:** The FastMCP HTTP proxy does not yet forward JWT `Authorization`
> headers. If `security.jwt_enabled=true`, MCP tool calls will fail authentication.
> Until the auth-forwarding work lands, either
> disable JWT for MCP usage or introduce an internal-only token/header that the
> proxy can inject.

### Using with Claude Code

You can add Crawl4AI as an MCP tool provider in Claude Code with a simple command:

```bash
# Add the Crawl4AI server as an MCP provider
claude mcp add --transport sse c4ai-sse http://localhost:11235/mcp/sse
# Add the Crawl4AI server as an MCP provider (HTTP transport)
claude mcp add --transport http c4ai-http http://localhost:11235/mcp

# List all MCP providers to verify it was added
claude mcp list
Expand Down Expand Up @@ -388,19 +411,25 @@ Generates a PDF document of the specified URL.
POST /execute_js
```

Executes JavaScript snippets on the specified URL and returns the full crawl result.
Executes JavaScript snippets against a fresh instance of the target page and
returns the resulting crawl data.

```json
{
"url": "https://example.com",
"scripts": [
"return document.title",
"return Array.from(document.querySelectorAll('a')).map(a => a.href)"
"(() => { document.body.dataset.demo = 'set'; return true; })()",
"(async () => { await new Promise(r => setTimeout(r, 500)); window.snapshot = document.body.dataset.demo; })()"
]
}
```

- `scripts`: List of JavaScript snippets to execute sequentially
- `scripts`: List of JavaScript expressions (typically self-invoking
functions) that run sequentially in the page context. There is no `page`
handle; use DOM APIs such as `document` or `window`.
- Results only report success or errorsβ€”returned values are not surfaced. Run
all related snippets in a single call; each request creates and tears down a
fresh page.

---

Expand Down Expand Up @@ -685,7 +714,7 @@ app:
title: "Crawl4AI API"
version: "1.0.0" # Consider setting this to match library version, e.g., "0.5.1"
host: "0.0.0.0"
port: 8020 # NOTE: This port is used ONLY when running server.py directly. Gunicorn overrides this (see supervisord.conf).
port: 11235 # NOTE: This port is used ONLY when running server.py directly. Gunicorn overrides this (see supervisord.conf).
reload: False # Default set to False - suitable for production
timeout_keep_alive: 300

Expand Down Expand Up @@ -768,7 +797,7 @@ You can override the default `config.yml`.
# Assumes my-custom-config.yml is in the current directory
docker run -d -p 11235:11235 \
--name crawl4ai-custom-config \
--env-file .llm.env \
--env-file .env \
--shm-size=1g \
-v $(pwd)/my-custom-config.yml:/app/config.yml \
unclecode/crawl4ai:latest # Or your specific tag
Expand Down
2 changes: 2 additions & 0 deletions deploy/docker/app/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# deploy/docker/app/__init__.py
"""Application core - API handlers, schemas, and utilities."""
Loading