
Commit cb50c7c

Authored by ikreymer, SuaYoo, and tw4l
Pause / Resume Crawls Initial Implementation (#2572)

- Add "paused" crawl state (fixes #2567)
- Gracefully shut down crawler pods, then the redis pod, when paused
- Crawler uploads its WACZ before shutting down (depends on webrecorder/browsertrix-crawler#824, supported in 1.6.1+)
- Add "pausedAt" to the crawl spec to indicate when the crawl was paused
- Support a maximum pause time limit, after which the crawl is automatically stopped
- Add "stopped_pause_expired" state for crawls stopped because the pause expired
- Add /crawls/<id>/{pause,resume} APIs to toggle pausing on the crawl spec
- UI: add pause/resume button and a paused state (partially addresses #2568)
- UI: add derivative "pausing"/"resuming" states, shown when the crawl is running but set to pause, or paused but no longer set to pause (partially addresses #2569)
- Designed to work with crawler 1.6.1+, which supports pausing and uploading on pause

Work on #2566, fixes #2576

---------

Co-authored-by: sua yoo <[email protected]>
Co-authored-by: Tessa Walsh <[email protected]>
1 parent e995811 commit cb50c7c

File tree: 20 files changed, +321 −40 lines
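As context for the new endpoints in the diffs below, here is a minimal sketch of driving them over HTTP with Python's requests library. The base URL, token, and IDs are hypothetical placeholders; only the /pause and /resume paths and the SuccessResponse shape come from this commit.

import requests

# Hypothetical deployment URL and credentials -- not part of this commit
API = "https://btrix.example.com/api"
TOKEN = "<access-token>"
OID = "<org-uuid>"
CRAWL_ID = "<crawl-id>"

headers = {"Authorization": f"Bearer {TOKEN}"}

# Pause: the crawler uploads its WACZ, then crawler and redis pods shut down
resp = requests.post(f"{API}/orgs/{OID}/crawls/{CRAWL_ID}/pause", headers=headers)
print(resp.json())  # {"success": true} per SuccessResponse

# Resume before the pause limit (PAUSED_CRAWL_LIMIT_MINUTES) expires,
# otherwise the crawl is stopped with state "stopped_pause_expired"
resp = requests.post(f"{API}/orgs/{OID}/crawls/{CRAWL_ID}/resume", headers=headers)
print(resp.json())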

backend/btrixcloud/basecrawls.py

Lines changed: 2 additions & 2 deletions
@@ -35,7 +35,7 @@
     User,
     StorageRef,
     RUNNING_AND_WAITING_STATES,
-    SUCCESSFUL_STATES,
+    SUCCESSFUL_AND_PAUSED_STATES,
     QARun,
     UpdatedResponse,
     DeletedResponseQuota,
@@ -460,7 +460,7 @@ async def _resolve_crawl_refs(
 
         if (
             files
-            and crawl.state in SUCCESSFUL_STATES
+            and crawl.state in SUCCESSFUL_AND_PAUSED_STATES
             and isinstance(crawl, CrawlOutWithResources)
         ):
             crawl.resources = await self._files_to_resources(files, org, crawl.id)

backend/btrixcloud/crawlconfigs.py

Lines changed: 14 additions & 1 deletion
@@ -11,7 +11,7 @@
 import re
 import os
 import traceback
-from datetime import datetime
+from datetime import datetime, timedelta
 from uuid import UUID, uuid4
 import urllib.parse
 
@@ -95,6 +95,8 @@ class CrawlConfigOps:
     crawler_images_map: dict[str, str]
     crawler_image_pull_policy_map: dict[str, str]
 
+    paused_expiry_delta: timedelta
+
     def __init__(
         self,
         dbclient,
@@ -121,6 +123,10 @@ def __init__(
             "DEFAULT_CRAWLER_IMAGE_PULL_POLICY", "IfNotPresent"
         )
 
+        self.paused_expiry_delta = timedelta(
+            minutes=int(os.environ.get("PAUSED_CRAWL_LIMIT_MINUTES", "10080"))
+        )
+
         self.router = APIRouter(
             prefix="/crawlconfigs",
             tags=["crawlconfigs"],
@@ -765,6 +771,13 @@ async def _add_running_curr_crawl_stats(self, crawlconfig: CrawlConfigOut):
         crawlconfig.lastCrawlState = crawl.state
         crawlconfig.lastCrawlSize = crawl.stats.size if crawl.stats else 0
         crawlconfig.lastCrawlStopping = crawl.stopping
+        crawlconfig.lastCrawlShouldPause = crawl.shouldPause
+        crawlconfig.lastCrawlPausedAt = crawl.pausedAt
+        crawlconfig.lastCrawlPausedExpiry = None
+        if crawl.pausedAt:
+            crawlconfig.lastCrawlPausedExpiry = (
+                crawl.pausedAt + self.paused_expiry_delta
+            )
         crawlconfig.isCrawlRunning = True
 
     async def get_crawl_config_out(self, cid: UUID, org: Organization):
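The expiry surfaced as lastCrawlPausedExpiry is simply pausedAt plus the configured delta. A quick worked example with the default limit (the timestamp is made up):

from datetime import datetime, timedelta, timezone

# Default PAUSED_CRAWL_LIMIT_MINUTES is 10080 minutes, i.e. exactly 7 days
paused_expiry_delta = timedelta(minutes=10080)

paused_at = datetime(2025, 5, 1, 12, 0, tzinfo=timezone.utc)  # hypothetical
print(paused_at + paused_expiry_delta)  # 2025-05-08 12:00:00+00:00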

backend/btrixcloud/crawlmanager.py

Lines changed: 9 additions & 1 deletion
@@ -4,7 +4,7 @@
 import secrets
 
 from typing import Optional, Dict, Tuple
-from datetime import timedelta
+from datetime import datetime, timedelta
 
 from fastapi import HTTPException
 
@@ -386,6 +386,14 @@ async def shutdown_crawl(self, crawl_id: str, graceful=True) -> dict:
 
         return await self.delete_crawl_job(crawl_id)
 
+    async def pause_resume_crawl(
+        self, crawl_id: str, paused_at: Optional[datetime] = None
+    ) -> dict:
+        """pause or resume a crawl"""
+        return await self._patch_job(
+            crawl_id, {"pausedAt": date_to_str(paused_at) if paused_at else ""}
+        )
+
     async def delete_crawl_configs_for_org(self, org: str) -> None:
         """Delete all crawl configs for given org"""
         await self._delete_crawl_configs(f"btrix.org={org}")
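pause_resume_crawl() signals the operator purely through the crawl job spec: a timestamp in pausedAt means pause, an empty string clears it and resumes. A standalone sketch of the patch body shape (date_to_str is the backend's own date helper; isoformat() stands in for it here):

from datetime import datetime, timezone
from typing import Optional

def pause_patch_body(paused_at: Optional[datetime]) -> dict:
    # Timestamp set -> operator pauses the crawl;
    # "" -> clears pausedAt on the spec, resuming it
    return {"pausedAt": paused_at.isoformat() if paused_at else ""}

print(pause_patch_body(datetime.now(timezone.utc)))  # pause
print(pause_patch_body(None))  # {'pausedAt': ''} -> resume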

backend/btrixcloud/crawls.py

Lines changed: 49 additions & 0 deletions
@@ -769,6 +769,39 @@ async def get_crawl_stats(
 
         return crawls_data
 
+    async def pause_crawl(
+        self, crawl_id: str, org: Organization, pause: bool
+    ) -> Dict[str, bool]:
+        """pause or resume a crawl temporarily"""
+        crawl = await self.get_base_crawl(crawl_id, org)
+        if crawl and crawl.type != "crawl":
+            raise HTTPException(status_code=400, detail="not_a_crawl")
+
+        result = None
+
+        if pause:
+            paused_at = dt_now()
+        else:
+            paused_at = None
+
+        try:
+            result = await self.crawl_manager.pause_resume_crawl(
+                crawl_id, paused_at=paused_at
+            )
+
+            if result.get("success"):
+                await self.crawls.find_one_and_update(
+                    {"_id": crawl_id, "type": "crawl", "oid": org.id},
+                    {"$set": {"shouldPause": pause, "pausedAt": paused_at}},
+                )
+
+                return {"success": True}
+        # pylint: disable=bare-except
+        except:
+            pass
+
+        raise HTTPException(status_code=404, detail="crawl_not_found")
+
     async def shutdown_crawl(
         self, crawl_id: str, org: Organization, graceful: bool
     ) -> Dict[str, bool]:
@@ -1242,6 +1275,22 @@ async def crawl_cancel_immediately(
     async def crawl_graceful_stop(crawl_id, org: Organization = Depends(org_crawl_dep)):
         return await ops.shutdown_crawl(crawl_id, org, graceful=True)
 
+    @app.post(
+        "/orgs/{oid}/crawls/{crawl_id}/pause",
+        tags=["crawls"],
+        response_model=SuccessResponse,
+    )
+    async def pause_crawl(crawl_id, org: Organization = Depends(org_crawl_dep)):
+        return await ops.pause_crawl(crawl_id, org, pause=True)
+
+    @app.post(
+        "/orgs/{oid}/crawls/{crawl_id}/resume",
+        tags=["crawls"],
+        response_model=SuccessResponse,
+    )
+    async def resume_crawl(crawl_id, org: Organization = Depends(org_crawl_dep)):
+        return await ops.pause_crawl(crawl_id, org, pause=False)
+
     @app.post(
         "/orgs/{oid}/crawls/delete",
         tags=["crawls"],

backend/btrixcloud/main.py

Lines changed: 3 additions & 0 deletions
@@ -124,6 +124,8 @@ class SettingsResponse(BaseModel):
 
     localesEnabled: Optional[List[str]]
 
+    pausedExpiryMinutes: int
+
 
 # ============================================================================
 # pylint: disable=too-many-locals, duplicate-code
@@ -158,6 +160,7 @@ def main() -> None:
             if os.environ.get("LOCALES_ENABLED")
             else None
         ),
+        pausedExpiryMinutes=int(os.environ.get("PAUSED_CRAWL_LIMIT_MINUTES", 10080)),
     )
 
     invites = init_invites(mdb, email)

backend/btrixcloud/models.py

Lines changed: 11 additions & 1 deletion
@@ -222,7 +222,9 @@ class UserOrgInfoOut(BaseModel):
 ]
 RUNNING_STATES = get_args(TYPE_RUNNING_STATES)
 
-TYPE_WAITING_STATES = Literal["starting", "waiting_capacity", "waiting_org_limit"]
+TYPE_WAITING_STATES = Literal[
+    "starting", "waiting_capacity", "waiting_org_limit", "paused"
+]
 WAITING_STATES = get_args(TYPE_WAITING_STATES)
 
 TYPE_FAILED_STATES = Literal[
@@ -236,11 +238,13 @@ class UserOrgInfoOut(BaseModel):
 TYPE_SUCCESSFUL_STATES = Literal[
     "complete",
     "stopped_by_user",
+    "stopped_pause_expired",
     "stopped_storage_quota_reached",
     "stopped_time_quota_reached",
     "stopped_org_readonly",
 ]
 SUCCESSFUL_STATES = get_args(TYPE_SUCCESSFUL_STATES)
+SUCCESSFUL_AND_PAUSED_STATES = ["paused", *SUCCESSFUL_STATES]
 
 TYPE_RUNNING_AND_WAITING_STATES = Literal[TYPE_WAITING_STATES, TYPE_RUNNING_STATES]
 RUNNING_AND_WAITING_STATES = [*WAITING_STATES, *RUNNING_STATES]
@@ -478,6 +482,9 @@ class CrawlConfigOut(CrawlConfigCore, CrawlConfigAdditional):
     id: UUID
 
     lastCrawlStopping: Optional[bool] = False
+    lastCrawlShouldPause: Optional[bool] = False
+    lastCrawlPausedAt: Optional[datetime] = None
+    lastCrawlPausedExpiry: Optional[datetime] = None
     profileName: Optional[str] = None
     firstSeed: Optional[str] = None
     seedCount: int = 0
@@ -863,6 +870,8 @@ class CrawlOut(BaseMongoModel):
     seedCount: Optional[int] = None
     profileName: Optional[str] = None
     stopping: Optional[bool] = False
+    shouldPause: Optional[bool] = False
+    pausedAt: Optional[datetime] = None
     manual: bool = False
     cid_rev: Optional[int] = None
     scale: Scale = 1
@@ -1017,6 +1026,7 @@ class Crawl(BaseCrawl, CrawlConfigCore):
     manual: bool = False
 
     stopping: Optional[bool] = False
+    shouldPause: Optional[bool] = False
 
     qaCrawlExecSeconds: int = 0
