|
4 | 4 |
|
5 | 5 | # pylint: disable=too-many-lines |
6 | 6 |
|
7 | | -from typing import List, Union, Optional, TYPE_CHECKING, cast |
| 7 | +from typing import List, Union, Optional, TYPE_CHECKING, cast, Dict, Tuple |
8 | 8 |
|
9 | 9 | import asyncio |
10 | 10 | import json |
|
15 | 15 | from uuid import UUID, uuid4 |
16 | 16 | import urllib.parse |
17 | 17 |
|
18 | | -import pymongo |
| 18 | +import aiohttp |
19 | 19 | from fastapi import APIRouter, Depends, HTTPException, Query |
| 20 | +import pymongo |
20 | 21 |
|
21 | 22 | from .pagination import DEFAULT_PAGE_SIZE, paginated_format |
22 | 23 | from .models import ( |
|
43 | 44 | CrawlConfigDeletedResponse, |
44 | 45 | CrawlerProxy, |
45 | 46 | CrawlerProxies, |
| 47 | + ValidateCustomBehavior, |
46 | 48 | ) |
47 | | -from .utils import dt_now, slug_from_name, validate_regexes |
| 49 | +from .utils import dt_now, slug_from_name, validate_regexes, is_url |
48 | 50 |
|
49 | 51 | if TYPE_CHECKING: |
50 | 52 | from .orgs import OrgOps |
@@ -231,6 +233,10 @@ async def add_crawl_config( |
231 | 233 | exclude = [exclude] |
232 | 234 | validate_regexes(exclude) |
233 | 235 |
|
| 236 | + if config_in.config.customBehaviors: |
| 237 | + for url in config_in.config.customBehaviors: |
| 238 | + self._validate_custom_behavior_url_syntax(url) |
| 239 | + |
234 | 240 | now = dt_now() |
235 | 241 | crawlconfig = CrawlConfig( |
236 | 242 | id=uuid4(), |
@@ -291,6 +297,23 @@ async def add_crawl_config( |
291 | 297 | execMinutesQuotaReached=exec_mins_quota_reached, |
292 | 298 | ) |
293 | 299 |
|
def _validate_custom_behavior_url_syntax(self, url: str) -> Tuple[bool, List[str]]:
    """Check that a custom behavior reference is a syntactically valid URL

    A leading "git+" marker is stripped first, and only the portion before
    the first "?" is checked with is_url.

    Returns a tuple of (whether the "git+" marker was present, the
    marker-less URL split on "?").

    Raises HTTPException (400, "invalid_custom_behavior") if the check fails.
    """
    git_marker = "git+"
    from_git = url.startswith(git_marker)
    candidate = url[len(git_marker) :] if from_git else url

    # segments[0] is the bare URL; any later items are the pieces that
    # followed each "?" in the input
    segments = candidate.split("?")

    if is_url(segments[0]):
        return from_git, segments

    raise HTTPException(status_code=400, detail="invalid_custom_behavior")
| 316 | + |
294 | 317 | def ensure_quota_page_limit(self, crawlconfig: CrawlConfig, org: Organization): |
295 | 318 | """ensure page limit is set to no greater than quota page limit, if any""" |
296 | 319 | if org.quotas.maxPagesPerCrawl and org.quotas.maxPagesPerCrawl > 0: |
@@ -356,6 +379,10 @@ async def update_crawl_config( |
356 | 379 | exclude = [exclude] |
357 | 380 | validate_regexes(exclude) |
358 | 381 |
|
| 382 | + if update.config and update.config.customBehaviors: |
| 383 | + for url in update.config.customBehaviors: |
| 384 | + self._validate_custom_behavior_url_syntax(url) |
| 385 | + |
359 | 386 | # indicates if any k8s crawl config settings changed |
360 | 387 | changed = False |
361 | 388 | changed = changed or ( |
@@ -1070,6 +1097,75 @@ async def re_add_all_scheduled_cron_jobs(self): |
1070 | 1097 | flush=True, |
1071 | 1098 | ) |
1072 | 1099 |
|
async def _validate_behavior_git_repo(self, repo_url: str, branch: str = ""):
    """Validate git repository and branch, if specified, exist and are reachable

    Runs `git ls-remote <repo_url> HEAD` and, when a branch is given,
    `git ls-remote --exit-code --heads <repo_url> refs/heads/<branch>`.

    Raises HTTPException (404, "custom_behavior_not_found") if the first
    command does not exit with status 0, and HTTPException (404,
    "custom_behavior_branch_not_found") if the branch lookup does not.
    """

    async def ls_remote_status(*args: str) -> int:
        """Run `git ls-remote` with the given arguments and return its exit status"""
        try:
            # Arguments are handed to git as a list, never through a shell,
            # so metacharacters in the user-supplied URL or branch cannot
            # be interpreted as shell syntax.
            proc = await asyncio.create_subprocess_exec("git", "ls-remote", *args)
        except OSError:
            # git could not be started; report it as a failed lookup, the
            # same outcome a shell's "command not found" status produced.
            return 127
        return await proc.wait()

    # Any non-zero status counts as failure, including the negative value
    # asyncio reports when the process is terminated by a signal.
    if await ls_remote_status(repo_url, "HEAD") != 0:
        raise HTTPException(
            status_code=404,
            detail="custom_behavior_not_found",
        )

    if branch:
        # Short pause between the two remote queries, kept from the
        # original code (presumably to space out requests -- confirm).
        await asyncio.sleep(0.5)
        branch_status = await ls_remote_status(
            "--exit-code", "--heads", repo_url, f"refs/heads/{branch}"
        )
        if branch_status != 0:
            raise HTTPException(
                status_code=404,
                detail="custom_behavior_branch_not_found",
            )
| 1121 | + |
async def _validate_behavior_url(self, url: str):
    """Validate behavior file exists at url

    Sends a GET request with a 10 second total timeout.

    Raises HTTPException (404, "custom_behavior_not_found") if the response
    status is 400 or higher, if the request fails with an aiohttp client
    error, or if the timeout expires.
    """
    try:
        timeout = aiohttp.ClientTimeout(total=10)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url) as resp:
                if resp.status >= 400:
                    raise HTTPException(
                        status_code=404,
                        detail="custom_behavior_not_found",
                    )
    # A timeout surfaces as asyncio.TimeoutError, which is not an
    # aiohttp.ClientError, so it has to be listed explicitly; otherwise a
    # slow host would escape as an unhandled server error.
    except (aiohttp.ClientError, asyncio.TimeoutError) as err:
        raise HTTPException(
            status_code=404,
            detail="custom_behavior_not_found",
        ) from err
| 1139 | + |
async def validate_custom_behavior(self, url: str) -> Dict[str, bool]:
    """Validate custom behavior URL

    Checks performed:
    - URL syntax is valid (after removing custom git prefix and syntax)
    - for plain URLs, the URL-existence check passes
    - for git URLs, the repository check passes, including the branch named
      by a "branch" query parameter when one is present

    Returns {"success": True}; failures surface as exceptions raised by the
    helper methods.
    """
    from_git, segments = self._validate_custom_behavior_url_syntax(url)
    target = segments[0]

    if not from_git:
        await self._validate_behavior_url(target)
        return {"success": True}

    branch_name = ""
    if len(segments) > 1:
        # Use the first "branch" value from the query string, if there is one
        branch_values = urllib.parse.parse_qs(segments[1]).get("branch")
        if branch_values:
            branch_name = branch_values[0]

    await self._validate_behavior_git_repo(target, branch=branch_name)
    return {"success": True}
| 1168 | + |
1073 | 1169 |
|
1074 | 1170 | # ============================================================================ |
1075 | 1171 | # pylint: disable=too-many-locals |
@@ -1316,6 +1412,14 @@ async def re_add_all_scheduled_cron_jobs( |
1316 | 1412 | asyncio.create_task(ops.re_add_all_scheduled_cron_jobs()) |
1317 | 1413 | return {"success": True} |
1318 | 1414 |
|
@router.post("/validate/custom-behavior", response_model=SuccessResponse)
async def validate_custom_behavior(
    behavior: ValidateCustomBehavior,
    # pylint: disable=unused-argument
    org: Organization = Depends(org_crawl_dep),
):
    """Validate the custom behavior URL supplied in the request body"""
    # org is not used in the body; it is only declared as a dependency
    result = await ops.validate_custom_behavior(behavior.customBehavior)
    return result
| 1422 | + |
1319 | 1423 | org_ops.router.include_router(router) |
1320 | 1424 |
|
1321 | 1425 | return ops |
0 commit comments