Skip to content

Commit cd7b695

Browse files
tw4l and ikreymer authored
Add backend support for custom behaviors + validation endpoint (#2505)
Backend support for #2151.

Adds support for specifying custom behaviors via a list of strings. When workflows are added or modified, minimal backend validation is done to ensure that all custom behavior URLs are valid URLs (after removing the git prefix and custom query arguments).

A separate `POST /crawlconfigs/validate/custom-behavior` endpoint is also added, which can be used to validate a custom behavior URL. It performs the same syntax check as above and then:
- For a URL pointing directly to a behavior file, ensures the URL resolves and returns a 2xx/3xx status code
- For Git repositories, uses `git ls-remote` to ensure they exist (and that the branch exists, if specified)

---------

Co-authored-by: Ilya Kreymer <[email protected]>
1 parent c067a0f commit cd7b695

File tree

8 files changed

+336
-11
lines changed

8 files changed

+336
-11
lines changed

backend/Dockerfile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
FROM docker.io/library/python:3.12-slim
22

3+
RUN apt-get update \
4+
&& apt-get install -y --no-install-recommends git \
5+
&& apt-get purge -y --auto-remove \
6+
&& rm -rf /var/lib/apt/lists/*
7+
38
WORKDIR /app
49

510
ADD requirements.txt /app

backend/btrixcloud/crawlconfigs.py

Lines changed: 107 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
# pylint: disable=too-many-lines
66

7-
from typing import List, Union, Optional, TYPE_CHECKING, cast
7+
from typing import List, Union, Optional, TYPE_CHECKING, cast, Dict, Tuple
88

99
import asyncio
1010
import json
@@ -15,8 +15,9 @@
1515
from uuid import UUID, uuid4
1616
import urllib.parse
1717

18-
import pymongo
18+
import aiohttp
1919
from fastapi import APIRouter, Depends, HTTPException, Query
20+
import pymongo
2021

2122
from .pagination import DEFAULT_PAGE_SIZE, paginated_format
2223
from .models import (
@@ -43,8 +44,9 @@
4344
CrawlConfigDeletedResponse,
4445
CrawlerProxy,
4546
CrawlerProxies,
47+
ValidateCustomBehavior,
4648
)
47-
from .utils import dt_now, slug_from_name, validate_regexes
49+
from .utils import dt_now, slug_from_name, validate_regexes, is_url
4850

4951
if TYPE_CHECKING:
5052
from .orgs import OrgOps
@@ -231,6 +233,10 @@ async def add_crawl_config(
231233
exclude = [exclude]
232234
validate_regexes(exclude)
233235

236+
if config_in.config.customBehaviors:
237+
for url in config_in.config.customBehaviors:
238+
self._validate_custom_behavior_url_syntax(url)
239+
234240
now = dt_now()
235241
crawlconfig = CrawlConfig(
236242
id=uuid4(),
@@ -291,6 +297,23 @@ async def add_crawl_config(
291297
execMinutesQuotaReached=exec_mins_quota_reached,
292298
)
293299

300+
def _validate_custom_behavior_url_syntax(self, url: str) -> Tuple[bool, List[str]]:
    """Check that a custom behavior reference is a syntactically valid URL

    A leading "git+" marks the reference as a git repository and is removed
    before the check. Everything from the first "?" onward is left out of
    the validity check.

    Returns a tuple of (is git repository, prefix-stripped reference split
    on "?"); the first list element is the URL that was checked.

    Raises HTTPException (400, "invalid_custom_behavior") when is_url
    rejects that URL.
    """
    is_git_repo = url.startswith("git+")
    if is_git_repo:
        url = url[len("git+") :]

    parts = url.split("?")
    if is_url(parts[0]):
        return is_git_repo, parts

    raise HTTPException(status_code=400, detail="invalid_custom_behavior")
316+
294317
def ensure_quota_page_limit(self, crawlconfig: CrawlConfig, org: Organization):
295318
"""ensure page limit is set to no greater than quota page limit, if any"""
296319
if org.quotas.maxPagesPerCrawl and org.quotas.maxPagesPerCrawl > 0:
@@ -356,6 +379,10 @@ async def update_crawl_config(
356379
exclude = [exclude]
357380
validate_regexes(exclude)
358381

382+
if update.config and update.config.customBehaviors:
383+
for url in update.config.customBehaviors:
384+
self._validate_custom_behavior_url_syntax(url)
385+
359386
# indicates if any k8s crawl config settings changed
360387
changed = False
361388
changed = changed or (
@@ -1070,6 +1097,75 @@ async def re_add_all_scheduled_cron_jobs(self):
10701097
flush=True,
10711098
)
10721099

1100+
async def _validate_behavior_git_repo(self, repo_url: str, branch: str = ""):
    """Validate git repository and branch, if specified, exist and are reachable

    Runs `git ls-remote` against repo_url and, when branch is non-empty,
    a second `git ls-remote --exit-code --heads` for refs/heads/<branch>.

    Raises HTTPException (404) with detail "custom_behavior_not_found" if
    the first command does not exit cleanly, or
    "custom_behavior_branch_not_found" if the branch lookup does not.
    """
    # repo_url and branch originate from the request, so git is invoked with
    # an argument list and no shell: shell metacharacters in either value are
    # passed to git verbatim instead of being interpreted.
    proc = await asyncio.create_subprocess_exec("git", "ls-remote", repo_url, "HEAD")
    # Without a shell in between, a child killed by a signal reports a
    # negative return code, so any non-zero value counts as failure.
    if await proc.wait() != 0:
        raise HTTPException(
            status_code=404,
            detail="custom_behavior_not_found",
        )

    if not branch:
        return

    # NOTE(review): short pause between the two remote calls, presumably to
    # avoid hitting the remote twice in quick succession -- confirm intent.
    await asyncio.sleep(0.5)
    proc = await asyncio.create_subprocess_exec(
        "git",
        "ls-remote",
        "--exit-code",
        "--heads",
        repo_url,
        f"refs/heads/{branch}",
    )
    if await proc.wait() != 0:
        raise HTTPException(
            status_code=404,
            detail="custom_behavior_branch_not_found",
        )
1121+
1122+
async def _validate_behavior_url(self, url: str):
    """Validate behavior file exists at url

    Issues a GET request with a 10 second total timeout.

    Raises HTTPException (404, "custom_behavior_not_found") when the
    response status is 400 or above, when the request fails with an
    aiohttp client error, or when the timeout expires.
    """
    try:
        timeout = aiohttp.ClientTimeout(total=10)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url) as resp:
                if resp.status >= 400:
                    raise HTTPException(
                        status_code=404,
                        detail="custom_behavior_not_found",
                    )
    # The total timeout surfaces as asyncio.TimeoutError, which is not an
    # aiohttp.ClientError; catch both so a slow host is reported as
    # "not found" rather than escaping as an unhandled server error.
    except (aiohttp.ClientError, asyncio.TimeoutError) as err:
        raise HTTPException(
            status_code=404,
            detail="custom_behavior_not_found",
        ) from err
1139+
1140+
async def validate_custom_behavior(self, url: str) -> Dict[str, bool]:
    """Validate custom behavior URL

    Implemented:
    - Ensure URL is valid (after removing custom git prefix and syntax)
    - Ensure URL returns status code < 400
    - Ensure git repository can be reached by git ls-remote and that branch
      exists, if provided

    Returns {"success": True} when every check passes; the helper methods
    called here raise HTTPException on failure.
    """
    is_git_repo, url_parts = self._validate_custom_behavior_url_syntax(url)

    if not is_git_repo:
        # For a plain URL the query string is part of the resource address,
        # so check the URL exactly as it was submitted. No "git+" prefix was
        # stripped in this branch, so `url` is still the original value.
        await self._validate_behavior_url(url)
        return {"success": True}

    repo_url = url_parts[0]
    git_branch = ""
    if len(url_parts) > 1:
        # Only the "branch" argument of the custom git query syntax is
        # needed here; a missing or blank value means no branch check.
        query_args = urllib.parse.parse_qs(url_parts[1])
        git_branch = query_args.get("branch", [""])[0]

    await self._validate_behavior_git_repo(repo_url, branch=git_branch)
    return {"success": True}
1168+
10731169

10741170
# ============================================================================
10751171
# pylint: disable=too-many-locals
@@ -1316,6 +1412,14 @@ async def re_add_all_scheduled_cron_jobs(
13161412
asyncio.create_task(ops.re_add_all_scheduled_cron_jobs())
13171413
return {"success": True}
13181414

1415+
@router.post("/validate/custom-behavior", response_model=SuccessResponse)
async def validate_custom_behavior(
    behavior: ValidateCustomBehavior,
    # org is never read in the body; it is declared so that the
    # org_crawl_dep dependency is resolved for this route.
    # pylint: disable=unused-argument
    org: Organization = Depends(org_crawl_dep),
):
    # Hand the submitted reference to the ops layer and return its result
    # unchanged as the response.
    result = await ops.validate_custom_behavior(behavior.customBehavior)
    return result
1422+
13191423
org_ops.router.include_router(router)
13201424

13211425
return ops

backend/btrixcloud/models.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,7 @@ class RawCrawlConfig(BaseModel):
333333

334334
logging: Optional[str] = None
335335
behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"
336+
customBehaviors: List[str] = []
336337

337338
userAgent: Optional[str] = None
338339

@@ -541,6 +542,8 @@ class CrawlConfigDefaults(BaseModel):
541542

542543
exclude: Optional[List[str]] = None
543544

545+
customBehaviors: List[str] = []
546+
544547

545548
# ============================================================================
546549
class CrawlConfigAddedResponse(BaseModel):
@@ -592,6 +595,13 @@ class CrawlConfigDeletedResponse(BaseModel):
592595
status: str
593596

594597

598+
# ============================================================================
599+
class ValidateCustomBehavior(BaseModel):
    """Request body for validating a custom behavior URL/Git reference"""

    # The reference to check: a direct URL, or a repository URL carrying
    # the "git+" prefix.
    customBehavior: str
603+
604+
595605
# ============================================================================
596606

597607
### CRAWLER VERSIONS ###

backend/btrixcloud/utils.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
from datetime import datetime, timezone
1313
from typing import Optional, Dict, Union, List, Any
14+
from urllib.parse import urlparse
1415
from uuid import UUID
1516

1617
from fastapi import HTTPException
@@ -100,6 +101,15 @@ def is_falsy_bool(stri: Optional[str]) -> bool:
100101
return False
101102

102103

104+
def is_url(url: str) -> bool:
    """Return True if the string parses with both a scheme and a network location"""
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. an unbalanced IPv6
        # bracket) by raising instead of returning a result.
        return False

    return bool(parsed.scheme) and bool(parsed.netloc)
111+
112+
103113
def str_list_to_bools(str_list: List[str], allow_none=True) -> List[Union[bool, None]]:
104114
"""Return version of input string list cast to bool or None, ignoring other values"""
105115
output: List[Union[bool, None]] = []

backend/test/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,7 @@ def crawler_config_id_only(_crawler_create_config_only):
326326
return _crawler_config_id
327327

328328

329-
@pytest.fixture(scope="session")
329+
@pytest.fixture(scope="function")
330330
def sample_crawl_data():
331331
return {
332332
"runNow": False,

0 commit comments

Comments
 (0)