Commit a51f7c6

tw4l and ikreymer authored
Add behavior logs from Redis to database and add endpoint to serve (#2526)
Backend work for #2524.

This PR adds a second dedicated endpoint similar to `/errors`, as a combined log endpoint would give a false impression of being the complete crawl logs (which is far from what we're serving in Browsertrix at this point). Eventually, when we have support for streaming live crawl logs in `crawls/<id>/logs`, I'd ideally like to deprecate these two dedicated endpoints in favor of using that, but for now this seems like the best solution.

Co-authored-by: Ilya Kreymer <[email protected]>
1 parent f84f6f5 commit a51f7c6

10 files changed: +175 −24 lines
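
Before the per-file diffs, here is a rough sketch of how a client might call the new behavior logs endpoint. The path, query parameters, and response fields (`total`, `items`, `timestamp`, `context`, `message`) come from the router and test changes below; the base URL, auth token, and IDs are placeholders, so treat this as illustrative rather than canonical.

    import requests

    API_PREFIX = "https://btrix.example.com/api"          # placeholder deployment URL
    headers = {"Authorization": "Bearer <access-token>"}  # placeholder token
    org_id = "<org-uuid>"
    crawl_id = "<crawl-id>"

    # Fetch the first page of behavior logs for a crawl (50 entries per page)
    r = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/behaviorLogs",
        params={"page": 1, "pageSize": 50},
        headers=headers,
    )
    data = r.json()
    print(data["total"])
    for item in data["items"]:
        # each item is one parsed JSON-L log line
        print(item["timestamp"], item.get("context"), item.get("message"))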

.github/workflows/deploy-dev.yaml

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ jobs:

       - uses: actions/setup-python@v5
         with:
-          python-version: "3.x"
+          python-version: "3.12"
           cache: "poetry"

       - name: Install vault decryption dependencies

backend/btrixcloud/basecrawls.py

Lines changed: 2 additions & 1 deletion
@@ -156,6 +156,7 @@ async def get_crawl_out(

         files = res.pop("files", None)
         res.pop("errors", None)
+        res.pop("behaviorLogs", None)

         if not skip_resources:
             coll_ids = res.get("collectionIds")
@@ -568,7 +569,7 @@ async def list_all_base_crawls(
             {"$match": query},
             {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
             {"$set": {"firstSeed": "$firstSeedObject.url"}},
-            {"$unset": ["firstSeedObject", "errors", "config"]},
+            {"$unset": ["firstSeedObject", "errors", "behaviorLogs", "config"]},
             {"$set": {"activeQAStats": "$qa.stats"}},
             {
                 "$set": {

backend/btrixcloud/crawls.py

Lines changed: 40 additions & 5 deletions
@@ -22,7 +22,7 @@
 from .utils import (
     dt_now,
     date_to_str,
-    parse_jsonl_error_messages,
+    parse_jsonl_log_messages,
     stream_dict_list_as_csv,
     validate_regexes,
 )
@@ -49,7 +49,7 @@
     Seed,
     PaginatedCrawlOutResponse,
     PaginatedSeedResponse,
-    PaginatedCrawlErrorResponse,
+    PaginatedCrawlLogResponse,
     RUNNING_AND_WAITING_STATES,
     SUCCESSFUL_STATES,
     NON_RUNNING_STATES,
@@ -186,7 +186,7 @@ async def list_crawls(
             {"$match": query},
             {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
             {"$set": {"firstSeed": "$firstSeedObject.url"}},
-            {"$unset": ["firstSeedObject", "errors", "config"]},
+            {"$unset": ["firstSeedObject", "errors", "behaviorLogs", "config"]},
             {"$set": {"activeQAStats": "$qa.stats"}},
             {
                 "$set": {
@@ -670,6 +670,17 @@ async def add_crawl_error(
         )
         return res is not None

+    async def add_crawl_behavior_log(
+        self,
+        crawl_id: str,
+        log_line: str,
+    ) -> bool:
+        """add crawl behavior log from redis to mongodb behaviorLogs field"""
+        res = await self.crawls.find_one_and_update(
+            {"_id": crawl_id}, {"$push": {"behaviorLogs": log_line}}
+        )
+        return res is not None
+
     async def add_crawl_file(
         self, crawl_id: str, is_qa: bool, crawl_file: CrawlFile, size: int
     ) -> bool:
@@ -1595,7 +1606,7 @@ async def stream_crawl_logs(
     @app.get(
         "/orgs/{oid}/crawls/{crawl_id}/errors",
         tags=["crawls"],
-        response_model=PaginatedCrawlErrorResponse,
+        response_model=PaginatedCrawlLogResponse,
     )
     async def get_crawl_errors(
         crawl_id: str,
@@ -1609,7 +1620,31 @@ async def get_crawl_errors(
         upper_bound = skip + pageSize

         errors = crawl.errors[skip:upper_bound] if crawl.errors else []
-        parsed_errors = parse_jsonl_error_messages(errors)
+        parsed_errors = parse_jsonl_log_messages(errors)
         return paginated_format(parsed_errors, len(crawl.errors or []), page, pageSize)

+    @app.get(
+        "/orgs/{oid}/crawls/{crawl_id}/behaviorLogs",
+        tags=["crawls"],
+        response_model=PaginatedCrawlLogResponse,
+    )
+    async def get_crawl_behavior_logs(
+        crawl_id: str,
+        pageSize: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+        org: Organization = Depends(org_viewer_dep),
+    ):
+        crawl = await ops.get_crawl(crawl_id, org)
+
+        skip = (page - 1) * pageSize
+        upper_bound = skip + pageSize
+
+        behavior_logs = (
+            crawl.behaviorLogs[skip:upper_bound] if crawl.behaviorLogs else []
+        )
+        parsed_logs = parse_jsonl_log_messages(behavior_logs)
+        return paginated_format(
+            parsed_logs, len(crawl.behaviorLogs or []), page, pageSize
+        )
+
     return ops
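
Both `/errors` and the new `/behaviorLogs` page through an array stored on the crawl document rather than running a separate database query. A standalone sketch of that slicing scheme, with made-up data:

    # 25 fake stored JSON-L lines standing in for crawl.behaviorLogs
    logs = [f'{{"message": "line {i}"}}' for i in range(1, 26)]

    page, pageSize = 2, 10
    skip = (page - 1) * pageSize
    upper_bound = skip + pageSize

    page_items = logs[skip:upper_bound] if logs else []
    print(len(page_items), page_items[0])  # -> 10 {"message": "line 11"}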

backend/btrixcloud/models.py

Lines changed: 7 additions & 5 deletions
@@ -774,6 +774,7 @@ class CoreCrawlable(BaseModel):
     fileCount: int = 0

     errors: Optional[List[str]] = []
+    behaviorLogs: Optional[List[str]] = []


 # ============================================================================
@@ -845,6 +846,7 @@ class CrawlOut(BaseMongoModel):
     tags: Optional[List[str]] = []

     errors: Optional[List[str]] = []
+    behaviorLogs: Optional[List[str]] = []

     collectionIds: Optional[List[UUID]] = []

@@ -1042,8 +1044,8 @@ class CrawlScaleResponse(BaseModel):


 # ============================================================================
-class CrawlError(BaseModel):
-    """Crawl error"""
+class CrawlLogMessage(BaseModel):
+    """Crawl log message"""

     timestamp: str
     logLevel: str
@@ -2892,10 +2894,10 @@ class PaginatedWebhookNotificationResponse(PaginatedResponse):


 # ============================================================================
-class PaginatedCrawlErrorResponse(PaginatedResponse):
-    """Response model for crawl errors"""
+class PaginatedCrawlLogResponse(PaginatedResponse):
+    """Response model for crawl logs"""

-    items: List[CrawlError]
+    items: List[CrawlLogMessage]


 # ============================================================================
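
For reference, a parsed item returned by either log endpoint looks roughly like the dict below. The `timestamp` and `logLevel` fields come from the `CrawlLogMessage` hunk above, while `context`, `message`, and `details` are inferred from the tests further down, so treat the exact shape as illustrative:

    # Illustrative behavior log entry as returned by /behaviorLogs
    log_item = {
        "timestamp": "2025-01-01T12:00:00.000Z",
        "logLevel": "info",
        "context": "behaviorScriptCustom",
        "message": "test-stat",
        "details": {"behavior": "TestBehavior", "page": "https://specs.webrecorder.net/"},
    }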

backend/btrixcloud/operator/crawls.py

Lines changed: 9 additions & 0 deletions
@@ -86,6 +86,7 @@ class CrawlOperator(BaseOperator):
     done_key: str
     pages_key: str
     errors_key: str
+    behavior_logs_key: str

     fast_retry_secs: int
     log_failed_crawl_lines: int
@@ -98,6 +99,7 @@ def __init__(self, *args):
         self.done_key = "crawls-done"
         self.pages_key = "pages"
         self.errors_key = "e"
+        self.behavior_logs_key = "b"

         self.fast_retry_secs = int(os.environ.get("FAST_RETRY_SECS") or 0)

@@ -904,6 +906,13 @@ async def sync_crawl_state(
                 )
                 crawl_error = await redis.lpop(f"{crawl.id}:{self.errors_key}")

+            behavior_log = await redis.lpop(f"{crawl.id}:{self.behavior_logs_key}")
+            while behavior_log:
+                await self.crawl_ops.add_crawl_behavior_log(
+                    crawl.db_crawl_id, behavior_log
+                )
+                behavior_log = await redis.lpop(f"{crawl.id}:{self.behavior_logs_key}")
+
             # ensure filesAdded and filesAddedSize always set
             status.filesAdded = int(await redis.get("filesAdded") or 0)
             status.filesAddedSize = int(await redis.get("filesAddedSize") or 0)
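
The drain loop added above mirrors how crawl errors are already consumed: the crawler pushes JSON-L lines onto a per-crawl Redis list (key suffix `b`), and on each sync the operator pops lines until the list is empty, persisting each one. A self-contained sketch of that pattern with redis-py's asyncio client; the key name and payload here are illustrative and persistence is stubbed out:

    import asyncio
    import redis.asyncio as redis

    async def drain_behavior_logs(r: redis.Redis, crawl_id: str) -> list[str]:
        """Pop queued behavior log lines until the per-crawl list is empty."""
        drained = []
        log_line = await r.lpop(f"{crawl_id}:b")  # "b" == behavior_logs_key above
        while log_line:
            drained.append(log_line)  # the operator calls add_crawl_behavior_log() here
            log_line = await r.lpop(f"{crawl_id}:b")
        return drained

    async def main():
        r = redis.Redis(decode_responses=True)  # assumes a local Redis on the default port
        await r.rpush("crawl-1:b", '{"context": "behaviorScript", "message": "done!"}')
        print(await drain_behavior_logs(r, "crawl-1"))

    # asyncio.run(main())  # uncomment with a Redis instance running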

backend/btrixcloud/utils.py

Lines changed: 8 additions & 8 deletions
@@ -70,21 +70,21 @@ def exit_handler():
     loop.add_signal_handler(signal.SIGTERM, exit_handler)


-def parse_jsonl_error_messages(errors: list[str]) -> list[dict]:
+def parse_jsonl_log_messages(log_lines: list[str]) -> list[dict]:
     """parse json-l error strings from redis/db into json"""
-    parsed_errors = []
-    for error_line in errors:
-        if not error_line:
+    parsed_log_lines = []
+    for log_line in log_lines:
+        if not log_line:
             continue
         try:
-            result = json.loads(error_line)
-            parsed_errors.append(result)
+            result = json.loads(log_line)
+            parsed_log_lines.append(result)
         except json.JSONDecodeError as err:
             print(
-                f"Error decoding json-l error line: {error_line}. Error: {err}",
+                f"Error decoding json-l log line: {log_line}. Error: {err}",
                 flush=True,
             )
-    return parsed_errors
+    return parsed_log_lines


 def is_bool(stri: Optional[str]) -> bool:
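
A quick usage sketch of the renamed helper (assumes the backend package is importable; the input lines are made up): blank entries are skipped and malformed JSON is reported and dropped, exactly as the loop above shows.

    from btrixcloud.utils import parse_jsonl_log_messages

    lines = [
        '{"timestamp": "2025-01-01T00:00:00Z", "logLevel": "info", "context": "behavior", "message": "done!"}',
        "",           # skipped
        "not json",   # reported via print() and dropped
    ]

    parsed = parse_jsonl_log_messages(lines)
    print(len(parsed))  # 1 parsed dict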

backend/test/conftest.py

Lines changed: 36 additions & 0 deletions
@@ -511,6 +511,42 @@ def all_crawls_delete_config_id(admin_crawl_id):
     return _all_crawls_delete_config_id


+@pytest.fixture(scope="session")
+def custom_behaviors_crawl_id(admin_auth_headers, default_org_id):
+    crawl_data = {
+        "runNow": True,
+        "name": "Custom Behavior Logs",
+        "config": {
+            "seeds": [{"url": "https://specs.webrecorder.net/"}],
+            "customBehaviors": [
+                "https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom.js"
+            ],
+            "limit": 1,
+        },
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    crawl_id = data["run_now_job"]
+
+    # Wait for crawl to complete
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] in FINISHED_STATES:
+            break
+        time.sleep(5)
+
+    return crawl_id
+
+
 @pytest.fixture(scope="session")
 def url_list_config_id(crawler_auth_headers, default_org_id):
     # Start crawl.

backend/test/test_run_crawl.py

Lines changed: 64 additions & 0 deletions
@@ -1283,3 +1283,67 @@ def test_delete_crawls_org_owner(
         headers=admin_auth_headers,
     )
     assert r.status_code == 404
+
+
+def test_custom_behavior_logs(
+    custom_behaviors_crawl_id, crawler_auth_headers, default_org_id
+):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{custom_behaviors_crawl_id}/behaviorLogs",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    custom_log_line_count = 0
+
+    assert data["total"] > 0
+    for log in data["items"]:
+        assert log["timestamp"]
+        assert log["context"] in ("behavior", "behaviorScript", "behaviorScriptCustom")
+
+        if log["context"] == "behaviorScriptCustom":
+            assert log["message"] in (
+                "test-stat",
+                "done!",
+                "Using Site-Specific Behavior: TestBehavior",
+            )
+            if log["message"] in ("test-stat", "done!"):
+                assert log["details"]["behavior"] == "TestBehavior"
+                assert log["details"]["page"] == "https://specs.webrecorder.net/"
+
+            custom_log_line_count += 1
+
+    assert custom_log_line_count == 3
+
+
+def test_crawls_exclude_behavior_logs(
+    custom_behaviors_crawl_id, admin_auth_headers, default_org_id
+):
+    # Get endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{custom_behaviors_crawl_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data.get("behaviorLogs") == []
+
+    # replay.json endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{custom_behaviors_crawl_id}/replay.json",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data.get("behaviorLogs") == []
+
+    # List endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    crawls = r.json()["items"]
+    for crawl in crawls:
+        assert crawl.get("behaviorLogs") == []

backend/test/test_uploads.py

Lines changed: 7 additions & 3 deletions
@@ -607,7 +607,7 @@ def test_get_all_crawls_by_type(
     )
     assert r.status_code == 200
     data = r.json()
-    assert data["total"] == 5
+    assert data["total"] == 6
     for item in data["items"]:
         assert item["type"] == "crawl"

@@ -823,9 +823,10 @@ def test_all_crawls_search_values(
     assert r.status_code == 200
     data = r.json()

-    assert len(data["names"]) == 7
+    assert len(data["names"]) == 8
     expected_names = [
         "Crawler User Test Crawl",
+        "Custom Behavior Logs",
         "My Upload Updated",
         "test2.wacz",
         "All Crawls Test Crawl",
@@ -837,6 +838,7 @@
     assert sorted(data["descriptions"]) == ["Lorem ipsum"]
     assert sorted(data["firstSeeds"]) == [
         "https://old.webrecorder.net/",
+        "https://specs.webrecorder.net/",
         "https://webrecorder.net/",
     ]

@@ -848,19 +850,21 @@
     assert r.status_code == 200
     data = r.json()

-    assert len(data["names"]) == 4
+    assert len(data["names"]) == 5
     expected_names = [
         "Admin Test Crawl",
         "All Crawls Test Crawl",
         "Crawler User Crawl for Testing QA",
         "Crawler User Test Crawl",
+        "Custom Behavior Logs",
     ]
     for expected_name in expected_names:
         assert expected_name in data["names"]

     assert sorted(data["descriptions"]) == ["Lorem ipsum"]
     assert sorted(data["firstSeeds"]) == [
         "https://old.webrecorder.net/",
+        "https://specs.webrecorder.net/",
         "https://webrecorder.net/",
     ]


chart/templates/configmap.yaml

Lines changed: 1 addition & 1 deletion
@@ -101,7 +101,7 @@ metadata:
   namespace: {{ .Values.crawler_namespace }}

 data:
-  {{- define "btrix.crawler_args" }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --collection thecrawl --screencastPort 9037 --logErrorsToRedis --writePagesToRedis --restartsOnError --headless --screenshot view,thumbnail {{ .Values.crawler_extra_args }} {{- end }}
+  {{- define "btrix.crawler_args" }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --collection thecrawl --screencastPort 9037 --logErrorsToRedis --logBehaviorsToRedis --writePagesToRedis --restartsOnError --headless --screenshot view,thumbnail {{ .Values.crawler_extra_args }} {{- end }}

   CRAWL_ARGS: {{- include "btrix.crawler_args" . }}
