diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py
index 5b3e797d41..25f887ba4b 100644
--- a/src/crawlee/crawlers/_basic/_basic_crawler.py
+++ b/src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -59,6 +59,7 @@
     RequestHandlerError,
     SessionError,
     UserDefinedErrorHandlerError,
+    UserHandlerTimeoutError,
 )
 from crawlee.events._types import Event, EventCrawlerStatusData
 from crawlee.http_clients import ImpitHttpClient
@@ -1134,7 +1135,7 @@ async def _handle_request_retries(
         request.retry_count += 1
         reduced_error = str(error).split('\n')[0]
         self.log.warning(
-            f'Retrying request to {context.request.url} due to: {reduced_error}'
+            f'Retrying request to {context.request.url} due to: {reduced_error}. '
             f'{get_one_line_error_summary_if_possible(error)}'
         )
         await self._statistics.error_tracker.add(error=error, context=context)
@@ -1222,10 +1223,11 @@ def _get_message_from_error(self, error: Exception) -> str:
         if (
             isinstance(error, asyncio.exceptions.TimeoutError)
+            and traceback_parts
             and self._request_handler_timeout_text in traceback_parts[-1]
-        ):
+        ) or isinstance(error, UserHandlerTimeoutError):
             used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
-            used_traceback_parts.append(traceback_parts[-1])
+            used_traceback_parts.extend(traceback_parts[-1:])
             return ''.join(used_traceback_parts).strip('\n')
diff --git a/src/crawlee/crawlers/_basic/_logging_utils.py b/src/crawlee/crawlers/_basic/_logging_utils.py
index f9a8306458..528d213cd6 100644
--- a/src/crawlee/crawlers/_basic/_logging_utils.py
+++ b/src/crawlee/crawlers/_basic/_logging_utils.py
@@ -2,9 +2,21 @@
 import re
 import traceback
 
+import crawlee.errors
+
 
 def _get_only_innermost_exception(error: BaseException) -> BaseException:
-    """Get innermost exception by following __cause__ and __context__ attributes of exception."""
+    """Get innermost exception by following __cause__ and __context__ attributes of exception.
+
+    If the innermost exception is `UserHandlerTimeoutError`, return the exception that caused it instead, if possible.
+    """
+    if type(error) is crawlee.errors.UserHandlerTimeoutError:
+        if error.__cause__:
+            return error.__cause__
+        if error.__context__:
+            return error.__context__
+        return error
+
     if error.__cause__:
         return _get_only_innermost_exception(error.__cause__)
     if error.__context__:
@@ -34,7 +46,7 @@ def _strip_pep657_highlighting(traceback_part: str) -> str:
 
 
 def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
-    timeout_error: asyncio.exceptions.TimeoutError,
+    timeout_error: asyncio.exceptions.TimeoutError | crawlee.errors.UserHandlerTimeoutError,
 ) -> list[str]:
     innermost_error_traceback_parts = _get_traceback_parts_for_innermost_exception(timeout_error)
     return _get_filtered_traceback_parts_for_asyncio_timeout_error(innermost_error_traceback_parts)
@@ -43,13 +55,20 @@ def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
 def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:
     innermost_error = _get_only_innermost_exception(error)
     return traceback.format_exception(
-        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=True
+        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=False
     )
 
 
 def get_one_line_error_summary_if_possible(error: Exception) -> str:
     if isinstance(error, asyncio.exceptions.TimeoutError):
-        most_relevant_part = ',' + reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
+        relevant_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
+        most_relevant_part = (',' + relevant_parts[-1]) if relevant_parts else ''
+    elif isinstance(error, crawlee.errors.UserHandlerTimeoutError):
+        # The error comes from a user-defined handler. The first two traceback lines point at the raise site of
+        # `UserHandlerTimeoutError` in crawlee code; the third line is the topmost user frame.
+        traceback_parts = _get_traceback_parts_for_innermost_exception(error)
+        relevant_index_from_start = 3
+        most_relevant_part = traceback_parts[2] if len(traceback_parts) >= relevant_index_from_start else ''
     elif 'playwright._impl._errors.Error' in str(error.__class__):
         # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway
         # point to deep internals.
diff --git a/src/crawlee/errors.py b/src/crawlee/errors.py
index 217ce31091..539bcf7711 100644
--- a/src/crawlee/errors.py
+++ b/src/crawlee/errors.py
@@ -29,6 +29,10 @@ class UserDefinedErrorHandlerError(Exception):
     """Wraps an exception thrown from an user-defined error handler."""
 
 
+class UserHandlerTimeoutError(UserDefinedErrorHandlerError):
+    """Raised when a router fails due to a user-raised timeout. This is distinct from the user-defined handler itself timing out."""
+
+
 @docs_group('Errors')
 class SessionError(Exception):
     """Errors of `SessionError` type will trigger a session rotation.
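Note for reviewers: the unwrapping special case above can be exercised in isolation. A minimal sketch, assuming `_get_only_innermost_exception` remains importable from the private module `crawlee.crawlers._basic._logging_utils` (it is internal, so this import is for illustration only):

```python
import asyncio

from crawlee.crawlers._basic._logging_utils import _get_only_innermost_exception
from crawlee.errors import UserHandlerTimeoutError

try:
    try:
        # Simulate a timeout raised inside a user-defined handler.
        raise asyncio.TimeoutError('timed out in user code')
    except asyncio.TimeoutError as e:
        # Mirror what Router.__call__ does after this change.
        raise UserHandlerTimeoutError('Timeout raised by user defined handler') from e
except UserHandlerTimeoutError as wrapper:
    # The wrapper is skipped and its __cause__ is returned directly, so log
    # summaries point at the user's TimeoutError, not the crawlee raise site.
    assert isinstance(_get_only_innermost_exception(wrapper), asyncio.TimeoutError)
```

Note that the exact-type check (`type(error) is ...`) means subclasses of `UserHandlerTimeoutError` would still take the generic recursive path.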
diff --git a/src/crawlee/router.py b/src/crawlee/router.py
index a6278d98e1..94312cb1d1 100644
--- a/src/crawlee/router.py
+++ b/src/crawlee/router.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 from collections.abc import Awaitable, Callable
 from typing import Generic, TypeVar
 
@@ -8,6 +9,8 @@
 
 __all__ = ['Router']
 
+from crawlee.errors import UserHandlerTimeoutError
+
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext)
 RequestHandler = Callable[[TCrawlingContext], Awaitable[None]]
 
@@ -95,7 +98,12 @@ async def __call__(self, context: TCrawlingContext) -> None:
                     f'No handler matches label `{context.request.label}` and no default handler is configured'
                 )
 
-            return await self._default_handler(context)
+            user_defined_handler = self._default_handler
+        else:
+            user_defined_handler = self._handlers_by_label[context.request.label]
 
-        handler = self._handlers_by_label[context.request.label]
-        return await handler(context)
+        try:
+            return await user_defined_handler(context)
+        except asyncio.TimeoutError as e:
+            # A timeout raised inside the handler, as opposed to the handler itself timing out.
+            raise UserHandlerTimeoutError('Timeout raised by user defined handler') from e
diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py
index e9606fb10c..c72002e1c6 100644
--- a/tests/unit/crawlers/_basic/test_basic_crawler.py
+++ b/tests/unit/crawlers/_basic/test_basic_crawler.py
@@ -6,8 +6,10 @@
 import json
 import logging
 import os
+import re
 import sys
 import time
+from asyncio import Future
 from collections import Counter
 from dataclasses import dataclass
 from datetime import timedelta
@@ -1540,6 +1542,7 @@ async def test_reduced_logs_from_timed_out_request_handler(caplog: pytest.LogCap
     caplog.set_level(logging.INFO)
     crawler = BasicCrawler(
         configure_logging=False,
+        max_request_retries=1,
         request_handler_timeout=timedelta(seconds=1),
     )
 
@@ -1565,6 +1568,31 @@ async def handler(context: BasicCrawlingContext) -> None:
     assert found_timeout_message, 'Expected log message about request handler error was not found.'
 
 
+async def test_reduced_logs_from_time_out_in_request_handler(caplog: pytest.LogCaptureFixture) -> None:
+    crawler = BasicCrawler(configure_logging=False, max_request_retries=1)
+
+    @crawler.router.default_handler
+    async def default_handler(_: BasicCrawlingContext) -> None:
+        await asyncio.wait_for(Future(), timeout=1)
+
+    # Capture all logs from the 'crawlee' logger at INFO level or higher.
+    with caplog.at_level(logging.INFO, logger='crawlee'):
+        await crawler.run([Request.from_url('https://a.placeholder.com')])
+
+    # Check for the one-line summary message.
+    found_timeout_message = False
+    for record in caplog.records:
+        if re.match(
+            r'Retrying request to .* due to: Timeout raised by user defined handler\. File .*, line .*,'
+            r' in default_handler, await asyncio.wait_for\(Future\(\), timeout=1\)',
+            record.message,
+        ):
+            found_timeout_message = True
+            break
+
+    assert found_timeout_message, 'Expected log message about request handler error was not found.'
+
+
 async def test_status_message_callback() -> None:
     """Test that status message callback is called with the correct message."""
     status_message_callback = AsyncMock()
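To see the new one-line summary locally, here is a minimal end-to-end sketch distilled from `test_reduced_logs_from_time_out_in_request_handler` above; the placeholder URL and `timeout=1` come from that test, and the `BasicCrawlingContext` import path is assumed from current crawlee:

```python
import asyncio
from asyncio import Future

from crawlee import Request
from crawlee.crawlers import BasicCrawler, BasicCrawlingContext


async def main() -> None:
    crawler = BasicCrawler(max_request_retries=1)

    @crawler.router.default_handler
    async def default_handler(_: BasicCrawlingContext) -> None:
        # Times out *inside* the handler; the router re-raises it as
        # UserHandlerTimeoutError, so the retry warning carries a one-line
        # summary pointing at this await rather than a full traceback.
        await asyncio.wait_for(Future(), timeout=1)

    await crawler.run([Request.from_url('https://a.placeholder.com')])


if __name__ == '__main__':
    asyncio.run(main())
```

With the patch applied, the retry warning should match the regex asserted in the new test.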