8 changes: 5 additions & 3 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -59,6 +59,7 @@
RequestHandlerError,
SessionError,
UserDefinedErrorHandlerError,
UserHandlerTimeoutError,
)
from crawlee.events._types import Event, EventCrawlerStatusData
from crawlee.http_clients import ImpitHttpClient
@@ -1134,7 +1135,7 @@ async def _handle_request_retries(
request.retry_count += 1
reduced_error = str(error).split('\n')[0]
self.log.warning(
f'Retrying request to {context.request.url} due to: {reduced_error}'
f'Retrying request to {context.request.url} due to: {reduced_error}. '
f'{get_one_line_error_summary_if_possible(error)}'
)
await self._statistics.error_tracker.add(error=error, context=context)
@@ -1222,10 +1223,11 @@ def _get_message_from_error(self, error: Exception) -> str:

if (
isinstance(error, asyncio.exceptions.TimeoutError)
and traceback_parts
and self._request_handler_timeout_text in traceback_parts[-1]
):
) or isinstance(error, UserHandlerTimeoutError):
used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
used_traceback_parts.append(traceback_parts[-1])
used_traceback_parts.extend(traceback_parts[-1:])

return ''.join(used_traceback_parts).strip('\n')

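A side note on the `append` → `extend` change above: indexing an empty list with `[-1]` raises `IndexError`, while slicing with `[-1:]` just yields an empty list, so the new form degrades gracefully when `traceback_parts` is empty. A minimal sketch of the difference:

    parts: list[str] = []
    tail: list[str] = []

    tail.extend(parts[-1:])  # extends by nothing when parts is empty
    # tail.append(parts[-1])  # would raise IndexError on an empty list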
27 changes: 23 additions & 4 deletions src/crawlee/crawlers/_basic/_logging_utils.py
@@ -2,9 +2,21 @@
import re
import traceback

import crawlee.errors


def _get_only_innermost_exception(error: BaseException) -> BaseException:
"""Get innermost exception by following __cause__ and __context__ attributes of exception."""
"""Get innermost exception by following __cause__ and __context__ attributes of exception.

If the innermost exception is UserHandlerTimeoutError, return its cause or context instead, if available.
"""
if type(error) is crawlee.errors.UserHandlerTimeoutError:
if error.__cause__:
return error.__cause__
if error.__context__:
return error.__context__
return error

if error.__cause__:
return _get_only_innermost_exception(error.__cause__)
if error.__context__:
@@ -34,7 +46,7 @@ def _strip_pep657_highlighting(traceback_part: str) -> str:


def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
timeout_error: asyncio.exceptions.TimeoutError,
timeout_error: asyncio.exceptions.TimeoutError | crawlee.errors.UserHandlerTimeoutError,
) -> list[str]:
innermost_error_traceback_parts = _get_traceback_parts_for_innermost_exception(timeout_error)
return _get_filtered_traceback_parts_for_asyncio_timeout_error(innermost_error_traceback_parts)
@@ -43,13 +55,20 @@ def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:
innermost_error = _get_only_innermost_exception(error)
return traceback.format_exception(
type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=True
type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=False
)


def get_one_line_error_summary_if_possible(error: Exception) -> str:
if isinstance(error, asyncio.exceptions.TimeoutError):
most_relevant_part = ',' + reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
most_relevant_part = (',' + relevant_part[-1]) if len(relevant_part) else ''
elif isinstance(error, crawlee.errors.UserHandlerTimeoutError):
# Error is from a user-defined handler. The first two lines should be the location of the `UserHandlerTimeoutError`
# in crawlee code, and the third line the topmost user error
traceback_parts = _get_traceback_parts_for_innermost_exception(error)
relevant_index_from_start = 3
most_relevant_part = traceback_parts[2] if len(traceback_parts) >= relevant_index_from_start else ''
elif 'playwright._impl._errors.Error' in str(error.__class__):
# Playwright autogenerated errors are often very long, so we do not try to summarize them at all, as they
# point to deep internals anyway.
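For context on what these helpers rely on: `raise ... from e` sets `__cause__` on the new exception, and an exception raised inside an `except` block records the original in `__context__`; the code above walks those attributes to the root error and formats only its traceback (`chain=False` drops the "The above exception was the direct cause..." sections). A self-contained sketch of the same idea, using a simplified loop in place of crawlee's recursive helper:

    import traceback

    def innermost(error: BaseException) -> BaseException:
        # Simplified stand-in for _get_only_innermost_exception: walk the chain to its root.
        while error.__cause__ or error.__context__:
            error = error.__cause__ or error.__context__
        return error

    try:
        try:
            1 / 0
        except ZeroDivisionError as inner:
            raise RuntimeError('wrapper') from inner
    except RuntimeError as outer:
        root = innermost(outer)  # the original ZeroDivisionError
        parts = traceback.format_exception(type(root), value=root, tb=root.__traceback__, chain=False)
        print(''.join(parts))  # only the root traceback, without the chained sections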
4 changes: 4 additions & 0 deletions src/crawlee/errors.py
@@ -29,6 +29,10 @@ class UserDefinedErrorHandlerError(Exception):
"""Wraps an exception thrown from an user-defined error handler."""


class UserHandlerTimeoutError(UserDefinedErrorHandlerError):
"""Raised when a router fails due to user raised timeout. This is different from user-defined handler timing out."""


@docs_group('Errors')
class SessionError(Exception):
"""Errors of `SessionError` type will trigger a session rotation.
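Since `UserHandlerTimeoutError` subclasses `UserDefinedErrorHandlerError`, any existing `except` clause written for the parent class also catches the new error. A quick illustration (assuming both names are importable from `crawlee.errors`, as the diff above defines them):

    from crawlee.errors import UserDefinedErrorHandlerError, UserHandlerTimeoutError

    try:
        raise UserHandlerTimeoutError('Timeout raised by user defined handler')
    except UserDefinedErrorHandlerError as caught:
        # The subclass is caught by handlers written against the parent class.
        print(type(caught).__name__)  # UserHandlerTimeoutError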
14 changes: 11 additions & 3 deletions src/crawlee/router.py
@@ -1,5 +1,6 @@
from __future__ import annotations

import asyncio
from collections.abc import Awaitable, Callable
from typing import Generic, TypeVar

@@ -8,6 +9,8 @@

__all__ = ['Router']

from crawlee.errors import UserHandlerTimeoutError

TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext)
RequestHandler = Callable[[TCrawlingContext], Awaitable[None]]

@@ -95,7 +98,12 @@ async def __call__(self, context: TCrawlingContext) -> None:
f'No handler matches label `{context.request.label}` and no default handler is configured'
)

return await self._default_handler(context)
user_defined_handler = self._default_handler
else:
user_defined_handler = self._handlers_by_label[context.request.label]

handler = self._handlers_by_label[context.request.label]
return await handler(context)
try:
return await user_defined_handler(context)
except asyncio.TimeoutError as e:
# A timeout raised in the handler, not a timeout of the handler itself.
raise UserHandlerTimeoutError('Timeout raised by user defined handler') from e
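The `try`/`except` above lets the crawler tell a timeout raised by user code inside the handler apart from the crawler-enforced handler timeout, since the original `asyncio.TimeoutError` is kept as `__cause__`. A minimal standalone sketch of the same wrapping, outside of crawlee's router:

    import asyncio

    from crawlee.errors import UserHandlerTimeoutError

    async def user_handler() -> None:
        # User code that raises its own asyncio.TimeoutError, e.g. via wait_for.
        await asyncio.wait_for(asyncio.Future(), timeout=0.01)

    async def dispatch() -> None:
        try:
            await user_handler()
        except asyncio.TimeoutError as e:
            # Re-wrap so callers can distinguish user timeouts from handler timeouts.
            raise UserHandlerTimeoutError('Timeout raised by user defined handler') from e

    asyncio.run(dispatch())  # raises UserHandlerTimeoutError with the TimeoutError as __cause__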
28 changes: 28 additions & 0 deletions tests/unit/crawlers/_basic/test_basic_crawler.py
@@ -6,8 +6,10 @@
import json
import logging
import os
import re
import sys
import time
from asyncio import Future
from collections import Counter
from dataclasses import dataclass
from datetime import timedelta
@@ -1540,6 +1542,7 @@ async def test_reduced_logs_from_timed_out_request_handler(caplog: pytest.LogCaptureFixture) -> None:
caplog.set_level(logging.INFO)
crawler = BasicCrawler(
configure_logging=False,
max_request_retries=1,
request_handler_timeout=timedelta(seconds=1),
)

@@ -1565,6 +1568,31 @@ async def handler(context: BasicCrawlingContext) -> None:
assert found_timeout_message, 'Expected log message about request handler error was not found.'


async def test_reduced_logs_from_time_out_in_request_handler(caplog: pytest.LogCaptureFixture) -> None:
crawler = BasicCrawler(configure_logging=False, max_request_retries=1)

@crawler.router.default_handler
async def default_handler(_: BasicCrawlingContext) -> None:
await asyncio.wait_for(Future(), timeout=1)

# Capture all logs from the 'crawlee' logger at INFO level or higher
with caplog.at_level(logging.INFO, logger='crawlee'):
await crawler.run([Request.from_url('https://a.placeholder.com')])

# Check for the one-line summary message
found_timeout_message = False
for record in caplog.records:
if re.match(
r'Retrying request to .* due to: Timeout raised by user defined handler\. File .*, line .*,'
r' in default_handler, await asyncio.wait_for\(Future\(\), timeout=1\)',
record.message,
):
found_timeout_message = True
break

assert found_timeout_message, 'Expected log message about request handler error was not found.'


async def test_status_message_callback() -> None:
"""Test that status message callback is called with the correct message."""
status_message_callback = AsyncMock()