diff --git a/.gitignore b/.gitignore
index 74cb9fe9c0..ea73de62b4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
 # Cache
 __pycache__
-.mypy_cache
+.uv_cache
 .pytest_cache
 .ruff_cache
 .uv-cache
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 940b6b7ec4..95b1982bdb 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -46,7 +46,7 @@ make format
 ### Type checking
-Type checking is handled by [mypy](https://mypy.readthedocs.io/), verifying code against type annotations. Configuration settings can be found in `pyproject.toml`.
+Type checking is handled by [ty](https://docs.astral.sh/ty/), verifying code against type annotations. Configuration settings can be found in `pyproject.toml`.
 To run type checking:
diff --git a/Makefile b/Makefile
index 390181e21a..7224fcb752 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ E2E_TESTS_CONCURRENCY = 1
 clean:
-	rm -rf .mypy_cache .pytest_cache .ruff_cache .uv-cache build dist htmlcov .coverage
+	rm -rf .uv_cache .pytest_cache .ruff_cache .uv-cache build dist htmlcov .coverage
 install-sync:
 	uv sync --all-extras
@@ -27,7 +27,7 @@ lint:
 	uv run ruff check
 type-check:
-	uv run mypy
+	uv run ty check
 unit-tests:
 	uv run pytest \
diff --git a/docs/deployment/code_examples/google/cloud_run_example.py b/docs/deployment/code_examples/google/cloud_run_example.py
index 4176cf60e4..27d23b99eb 100644
--- a/docs/deployment/code_examples/google/cloud_run_example.py
+++ b/docs/deployment/code_examples/google/cloud_run_example.py
@@ -1,4 +1,3 @@
-# mypy: disable-error-code="misc"
 import json
 import os
@@ -9,7 +8,7 @@ from crawlee.storage_clients import MemoryStorageClient
-@get('/')  # type: ignore[untyped-decorator]
+@get('/')
 async def main() -> str:
     """The crawler entry point that will be called when the HTTP endpoint is accessed."""
     # highlight-start
diff --git a/docs/deployment/code_examples/google/google_example.py b/docs/deployment/code_examples/google/google_example.py
index 474e121b71..68deac804c 100644
--- a/docs/deployment/code_examples/google/google_example.py
+++ b/docs/deployment/code_examples/google/google_example.py
@@ -1,4 +1,3 @@
-# mypy: disable-error-code="misc"
 import asyncio
 import json
 from datetime import timedelta
@@ -48,7 +47,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     # highlight-end
-@functions_framework.http  # type: ignore[untyped-decorator]
+@functions_framework.http
 def crawlee_run(request: Request) -> Response:
     # You can pass data to your crawler using `request`
     function_id = request.headers['Function-Execution-Id']
diff --git a/docs/guides/code_examples/avoid_blocking/default_fingerprint_generator_with_args.py b/docs/guides/code_examples/avoid_blocking/default_fingerprint_generator_with_args.py
index a6d2072ad3..4e6ed92aa6 100644
--- a/docs/guides/code_examples/avoid_blocking/default_fingerprint_generator_with_args.py
+++ b/docs/guides/code_examples/avoid_blocking/default_fingerprint_generator_with_args.py
@@ -9,7 +9,7 @@ async def main() -> None:
     fingerprint_generator = DefaultFingerprintGenerator(
-        header_options=HeaderGeneratorOptions(browsers=['chromium']),
+        header_options=HeaderGeneratorOptions(browsers=['chrome']),
         screen_options=ScreenOptions(min_width=400),
     )
diff --git a/docs/guides/code_examples/running_in_web_server/server.py b/docs/guides/code_examples/running_in_web_server/server.py
index 09be14e2be..64e192af37 100644
--- a/docs/guides/code_examples/running_in_web_server/server.py
+++ b/docs/guides/code_examples/running_in_web_server/server.py
@@ -14,7 +14,7 @@ app = FastAPI(lifespan=lifespan, title='Crawler app')
-@app.get('/', response_class=HTMLResponse)  # type: ignore[untyped-decorator]
+@app.get('/', response_class=HTMLResponse)
 def index() -> str:
     return """
@@ -32,7 +32,7 @@ def index() -> str:
     """
-@app.get('/scrape')  # type: ignore[untyped-decorator]
+@app.get('/scrape')
 async def scrape_url(request: Request, url: str | None = None) -> dict:
     if not url:
         return {'url': 'missing', 'scrape result': 'no results'}
diff --git a/pyproject.toml b/pyproject.toml
index b7a9d72679..54fecf8ce4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -102,7 +102,6 @@ dev = [
     "build<2.0.0",  # For e2e tests.
     "dycw-pytest-only<3.0.0",
     "fakeredis[probabilistic,json,lua]<3.0.0",
-    "mypy~=1.19.0",
     "pre-commit<5.0.0",
     "proxy-py<3.0.0",
     "pydoc-markdown<5.0.0",
@@ -113,6 +112,7 @@ dev = [
     "pytest<9.0.0",
     "ruff~=0.14.0",
     "setuptools",  # setuptools are used by pytest, but not explicitly required
+    "ty~=0.0.0",
     "types-beautifulsoup4<5.0.0",
     "types-cachetools<7.0.0",
     "types-colorama<1.0.0",
@@ -230,57 +230,24 @@ filterwarnings = [
     "ignore:websockets.server.WebSocketServerProtocol is deprecated:DeprecationWarning",
 ]
-[tool.mypy]
-python_version = "3.10"
-plugins = ["pydantic.mypy"]
+[tool.ty.environment]
+python-version = "3.10"
+
+[tool.ty.src]
+include = ["src", "tests", "scripts", "docs", "website"]
 exclude = [
     "src/crawlee/project_template",
     "docs/guides/code_examples/storage_clients/custom_storage_client_example.py",
 ]
-files = ["src", "tests", "docs", "website"]
-check_untyped_defs = true
-disallow_incomplete_defs = true
-disallow_untyped_calls = true
-disallow_untyped_decorators = true
-disallow_untyped_defs = true
-no_implicit_optional = true
-warn_redundant_casts = true
-warn_return_any = true
-warn_unreachable = true
-warn_unused_ignores = true
-
-[[tool.mypy.overrides]]
-# Example codes are sometimes showing integration of crawlee with external tool, which is not dependency of crawlee.
-module = [
-    "apify",  # Example code shows integration of apify and crawlee.
-    "apify_fingerprint_datapoints",  # Untyped and stubs not available
-    "camoufox",  # Example code shows integration of camoufox and crawlee.
-    "fastapi",  # Example code shows running in webserver.
-    "stagehand.*",  # Example code shows integration of Stagehand and crawlee.
-    "starlette.*",  # Example code shows running in webserver.
-    "flask",  # Example code shows deploy on Google Cloud.
-    "functions_framework",  # Example code shows deploy on Google Cloud.
-    "jaro",  # Untyped and stubs not available
-    "litestar",  # Example code shows deploy on Google Cloud Run.
-    "loguru",  # Example code shows integration of loguru and crawlee for JSON logging.
-    "sklearn.linear_model",  # Untyped and stubs not available
-    "cookiecutter.*",  # Untyped and stubs not available
-    "inquirer.*",  # Untyped and stubs not available
-    "warcio.*",  # Example code shows WARC files creation.
- "wrapt" # Untyped and stubs not available -] -ignore_missing_imports = true -[[tool.mypy.overrides]] -module = [ - "running_in_web_server.*", # False positive when fastapi not available +[[tool.ty.overrides]] +include = [ + "docs/**/*.py", + "website/**/*.py", ] -disable_error_code = ["misc"] -[tool.basedpyright] -pythonVersion = "3.10" -typeCheckingMode = "standard" -include = ["src", "tests", "docs", "website"] +[tool.ty.overrides.rules] +unresolved-import = "ignore" [tool.coverage.report] exclude_lines = ["pragma: no cover", "if TYPE_CHECKING:", "assert_never()"] diff --git a/src/crawlee/_browserforge_workaround.py b/src/crawlee/_browserforge_workaround.py index 495d2a8298..8e8dcceca4 100644 --- a/src/crawlee/_browserforge_workaround.py +++ b/src/crawlee/_browserforge_workaround.py @@ -20,7 +20,7 @@ def patch_browserforge() -> None: def DownloadIfNotExists(**flags: bool) -> None: pass - download.DownloadIfNotExists = DownloadIfNotExists + download.DownloadIfNotExists = DownloadIfNotExists # ty: ignore[invalid-assignment] import browserforge.bayesian_network diff --git a/src/crawlee/_request.py b/src/crawlee/_request.py index 9e0fd1dfc6..fd1feef791 100644 --- a/src/crawlee/_request.py +++ b/src/crawlee/_request.py @@ -93,7 +93,7 @@ def __setitem__(self, key: str, value: JsonSerializable) -> None: def __delitem__(self, key: str) -> None: del self.__pydantic_extra__[key] - def __iter__(self) -> Iterator[str]: # type: ignore[override] + def __iter__(self) -> Iterator[str]: # ty: ignore[invalid-method-override] yield from self.__pydantic_extra__ def __len__(self) -> int: @@ -195,7 +195,7 @@ class Request(BaseModel): ] = None """HTTP request payload.""" - # Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory + # Workaround for Pydantic and type checkers when using Annotated with default_factory if TYPE_CHECKING: headers: HttpHeaders = HttpHeaders() """HTTP request headers.""" diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index a98664d02d..bf10dd6ff3 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -62,14 +62,14 @@ class HttpHeaders(RootModel, Mapping[str, str]): model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) - # Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory + # Workaround for Pydantic and type checkers when using Annotated with default_factory if TYPE_CHECKING: root: dict[str, str] = {} else: root: Annotated[ dict[str, str], PlainValidator(lambda value: _normalize_headers(value)), - Field(default_factory=dict), + Field(default_factory=lambda: dict[str, str]()), ] def __getitem__(self, key: str) -> str: @@ -91,7 +91,7 @@ def __ror__(self, other: HttpHeaders) -> HttpHeaders: combined_headers = {**other, **self.root} return HttpHeaders(combined_headers) - def __iter__(self) -> Iterator[str]: # type: ignore[override] + def __iter__(self) -> Iterator[str]: # ty: ignore[invalid-method-override] yield from self.root def __len__(self) -> int: @@ -671,17 +671,16 @@ def create_modified_copy( get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None, ) -> Self: """Create a modified copy of the crawling context with specified changes.""" - original_fields = {field.name: getattr(self, field.name) for field in dataclasses.fields(self)} - modified_fields = { - key: value - for key, value in { - 'push_data': push_data, - 'add_requests': add_requests, - 'get_key_value_store': get_key_value_store, - }.items() - if value - } - return 
+        modifications = dict[str, Any]()
+
+        if push_data is not None:
+            modifications['push_data'] = push_data
+        if add_requests is not None:
+            modifications['add_requests'] = add_requests
+        if get_key_value_store is not None:
+            modifications['get_key_value_store'] = get_key_value_store
+
+        return dataclasses.replace(self, **modifications)
 class GetDataKwargs(TypedDict):
diff --git a/src/crawlee/_utils/context.py b/src/crawlee/_utils/context.py
index fb750cf0e7..6f3a65094b 100644
--- a/src/crawlee/_utils/context.py
+++ b/src/crawlee/_utils/context.py
@@ -44,4 +44,4 @@ async def async_wrapper(self: Any, *args: Any, **kwargs: Any) -> Any:
             return await method(self, *args, **kwargs)
-    return async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper  # type: ignore[return-value]
+    return async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper  # ty: ignore[invalid-return-type]
diff --git a/src/crawlee/_utils/file.py b/src/crawlee/_utils/file.py
index f53b6bab0a..1d297fa724 100644
--- a/src/crawlee/_utils/file.py
+++ b/src/crawlee/_utils/file.py
@@ -170,7 +170,7 @@ async def export_csv_to_stream(
     if 'lineterminator' not in kwargs:
         kwargs['lineterminator'] = '\n'
-    writer = csv.writer(dst, **kwargs)  # type: ignore[arg-type]
+    writer = csv.writer(dst, **kwargs)
     write_header = True
     # Iterate over the dataset and write to CSV.
diff --git a/src/crawlee/_utils/globs.py b/src/crawlee/_utils/globs.py
index f7e1a57927..aed82e1f18 100644
--- a/src/crawlee/_utils/globs.py
+++ b/src/crawlee/_utils/globs.py
@@ -36,7 +36,7 @@ def _translate(
     if not seps:
         seps = (os.path.sep, os.path.altsep) if os.path.altsep else os.path.sep
-    escaped_seps = ''.join(map(re.escape, seps))
+    escaped_seps = ''.join(map(re.escape, seps))  # ty: ignore[invalid-argument-type]
     any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps
     not_sep = f'[^{escaped_seps}]'
     if include_hidden:
diff --git a/src/crawlee/_utils/recurring_task.py b/src/crawlee/_utils/recurring_task.py
index 3a6553b6c0..ba80f8f8b0 100644
--- a/src/crawlee/_utils/recurring_task.py
+++ b/src/crawlee/_utils/recurring_task.py
@@ -25,7 +25,7 @@ class RecurringTask:
     """
     def __init__(self, func: Callable, delay: timedelta) -> None:
-        logger.debug(f'Calling RecurringTask.__init__(func={func.__name__}, delay={delay})...')
+        logger.debug(f'Calling RecurringTask.__init__(func={func.__name__}, delay={delay})...')  # ty: ignore[unresolved-attribute]
         self.func = func
         self.delay = delay
         self.task: asyncio.Task | None = None
@@ -55,7 +55,7 @@ async def _wrapper(self) -> None:
     def start(self) -> None:
         """Start the recurring task execution."""
-        self.task = asyncio.create_task(self._wrapper(), name=f'Task-recurring-{self.func.__name__}')
+        self.task = asyncio.create_task(self._wrapper(), name=f'Task-recurring-{self.func.__name__}')  # ty: ignore[possibly-missing-attribute]
     async def stop(self) -> None:
         """Stop the recurring task execution."""
diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py
index 0d839cd1ed..ba844ca47b 100644
--- a/src/crawlee/_utils/sitemap.py
+++ b/src/crawlee/_utils/sitemap.py
@@ -430,10 +430,10 @@ async def parse_sitemap(
     up to the specified maximum depth.
""" # Set default options - options = options or {} - emit_nested_sitemaps = options.get('emit_nested_sitemaps', False) - max_depth = options.get('max_depth', float('inf')) - sitemap_retries = options.get('sitemap_retries', 3) + options = options or {} # ty: ignore[invalid-assignment] + emit_nested_sitemaps = options.get('emit_nested_sitemaps', False) # ty: ignore[possibly-missing-attribute] + max_depth = options.get('max_depth', float('inf')) # ty: ignore[possibly-missing-attribute] + sitemap_retries = options.get('sitemap_retries', 3) # ty: ignore[possibly-missing-attribute] # Setup working state sources = list(initial_sources) @@ -472,7 +472,7 @@ async def parse_sitemap( sitemap_retries, emit_nested_sitemaps=emit_nested_sitemaps, proxy_info=proxy_info, - timeout=options.get('timeout', timedelta(seconds=30)), + timeout=options.get('timeout', timedelta(seconds=30)), # ty: ignore[possibly-missing-attribute] ): yield result else: diff --git a/src/crawlee/_utils/system.py b/src/crawlee/_utils/system.py index d1f1cd9976..56eeaadf24 100644 --- a/src/crawlee/_utils/system.py +++ b/src/crawlee/_utils/system.py @@ -5,7 +5,7 @@ from contextlib import suppress from datetime import datetime, timezone from logging import getLogger -from typing import Annotated +from typing import TYPE_CHECKING, Annotated import psutil from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator @@ -41,11 +41,19 @@ class CpuInfo(BaseModel): used_ratio: Annotated[float, Field(alias='usedRatio')] """The ratio of CPU currently in use, represented as a float between 0 and 1.""" - created_at: datetime = Field( - alias='createdAt', - default_factory=lambda: datetime.now(timezone.utc), - ) - """The time at which the measurement was taken.""" + # Workaround for Pydantic and type checkers when using Annotated with default_factory + if TYPE_CHECKING: + created_at: datetime = datetime.now(timezone.utc) + """The time at which the measurement was taken.""" + else: + created_at: Annotated[ + datetime, + Field( + alias='createdAt', + default_factory=lambda: datetime.now(timezone.utc), + ), + ] + """The time at which the measurement was taken.""" class MemoryUsageInfo(BaseModel): @@ -61,11 +69,19 @@ class MemoryUsageInfo(BaseModel): ] """Memory usage of the current Python process and its children.""" - created_at: datetime = Field( - alias='createdAt', - default_factory=lambda: datetime.now(timezone.utc), - ) - """The time at which the measurement was taken.""" + # Workaround for Pydantic and type checkers when using Annotated with default_factory + if TYPE_CHECKING: + created_at: datetime = datetime.now(timezone.utc) + """The time at which the measurement was taken.""" + else: + created_at: Annotated[ + datetime, + Field( + alias='createdAt', + default_factory=lambda: datetime.now(timezone.utc), + ), + ] + """The time at which the measurement was taken.""" class MemoryInfo(MemoryUsageInfo): diff --git a/src/crawlee/browsers/_browser_pool.py b/src/crawlee/browsers/_browser_pool.py index 7d3fe0409c..480fb9fac5 100644 --- a/src/crawlee/browsers/_browser_pool.py +++ b/src/crawlee/browsers/_browser_pool.py @@ -142,7 +142,7 @@ def with_default_plugin( plugin_options['browser_new_context_options'] = browser_new_context_options or {} if headless is not None: - plugin_options['browser_launch_options']['headless'] = headless + plugin_options['browser_launch_options']['headless'] = headless # ty: ignore[invalid-assignment] if use_incognito_pages is not None: plugin_options['use_incognito_pages'] = use_incognito_pages diff 
diff --git a/src/crawlee/browsers/_playwright_browser.py b/src/crawlee/browsers/_playwright_browser.py
index aba8e6b7e1..c66dcb21be 100644
--- a/src/crawlee/browsers/_playwright_browser.py
+++ b/src/crawlee/browsers/_playwright_browser.py
@@ -78,7 +78,7 @@ async def new_context(self, **context_options: Any) -> BrowserContext:
     async def _delete_temp_dir(self, _: BrowserContext | None) -> None:
         if self._temp_dir and self._temp_dir.exists():
-            await asyncio.to_thread(shutil.rmtree, self._temp_dir, ignore_errors=True)
+            await asyncio.to_thread(lambda: shutil.rmtree(self._temp_dir, ignore_errors=True))  # ty: ignore[invalid-argument-type]
     @override
     async def close(self, **kwargs: Any) -> None:
diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
index 6c1fbb63f8..059b3adbe9 100644
--- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
+++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
@@ -102,7 +102,7 @@ def create_parsed_http_crawler_class(
     class _ParsedHttpCrawler(
         AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult]
-    ):
+    ):  # ty: ignore[invalid-generic-class]
         def __init__(
             self,
             parser: AbstractHttpParser[TParseResult, TSelectResult] = static_parser,
@@ -122,9 +122,9 @@ def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpC
             ContextPipeline()
             .compose(self._execute_pre_navigation_hooks)
             .compose(self._make_http_request)
-            .compose(self._handle_status_code_response)
+            .compose(self._handle_status_code_response)  # ty: ignore[invalid-argument-type]
             .compose(self._parse_http_response)
-            .compose(self._handle_blocked_request_by_content)
+            .compose(self._handle_blocked_request_by_content)  # ty: ignore[invalid-argument-type]
         )
     async def _execute_pre_navigation_hooks(
diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py
index b3b99e6f59..139e902c0e 100644
--- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py
+++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py
@@ -160,7 +160,7 @@ def __init__(
         super().__init__(statistics=statistics, **kwargs)
         # Sub crawlers related.
-        playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or {}
+        playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or {}  # ty: ignore[invalid-assignment]
         # Each sub crawler will use custom logger .
         static_logger = getLogger('Subcrawler_static')
@@ -181,7 +181,7 @@ def __init__(
         )
         playwright_crawler = PlaywrightCrawler(
             statistics=_NonPersistentStatistics(),
-            **playwright_crawler_specific_kwargs,
+            **playwright_crawler_specific_kwargs,  # ty: ignore[invalid-argument-type]
             **basic_crawler_kwargs_for_pw_crawler,
         )
@@ -335,7 +335,7 @@ async def from_static_pipeline_to_top_router(
                 )
                 await self.router(adaptive_crawling_context)
-            return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router)
+            return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]
         if rendering_type == 'client only':
@@ -345,7 +345,7 @@ async def from_pw_pipeline_to_top_router(context: PlaywrightCrawlingContext) ->
                 )
                 await self.router(adaptive_crawling_context)
-            return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router)
+            return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]
         raise RuntimeError(
             f'Not a valid rendering type. Must be one of the following: {", ".join(get_args(RenderingType))}'
diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py
index 4f6f52ed43..5b88dee47b 100644
--- a/src/crawlee/crawlers/_basic/_basic_crawler.py
+++ b/src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -410,7 +410,7 @@ def __init__(
         self._context_result_map = WeakKeyDictionary[BasicCrawlingContext, RequestHandlerRunResult]()
         # Context pipeline
-        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)
+        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)  # ty: ignore[invalid-argument-type]
         # Crawl settings
         self._max_request_retries = max_request_retries
@@ -774,7 +774,7 @@ async def _run_crawler(self) -> None:
         async with AsyncExitStack() as exit_stack:
             for context in contexts_to_enter:
-                await exit_stack.enter_async_context(context)  # type: ignore[arg-type]
+                await exit_stack.enter_async_context(context)  # ty: ignore[invalid-argument-type]
             await self._autoscaled_pool.run()
@@ -873,7 +873,7 @@ async def export_data(
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         dataset_alias: str | None = None,
-        **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],  # type: ignore[misc]
+        **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],
     ) -> None:
         """Export all items from a Dataset to a JSON or CSV file.
diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
index 0beb04a375..c71bb71510 100644
--- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py
+++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -183,7 +183,7 @@ def __init__(
             generator_browser_type = [fingerprint_browser_type_from_playwright_browser_type(browser_type)]
             fingerprint_generator = DefaultFingerprintGenerator(
-                header_options=HeaderGeneratorOptions(browsers=generator_browser_type)
+                header_options=HeaderGeneratorOptions(browsers=generator_browser_type)  # ty: ignore[invalid-argument-type]
             )
             browser_pool = BrowserPool.with_default_plugin(
@@ -202,9 +202,9 @@ def __init__(
         kwargs['_context_pipeline'] = (
             ContextPipeline()
             .compose(self._open_page)
-            .compose(self._navigate)
+            .compose(self._navigate)  # ty: ignore[invalid-argument-type]
             .compose(self._handle_status_code_response)
-            .compose(self._handle_blocked_request_by_content)
+            .compose(self._handle_blocked_request_by_content)  # ty: ignore[invalid-argument-type]
         )
         kwargs['_additional_context_managers'] = [self._browser_pool]
         kwargs.setdefault('_logger', logging.getLogger(__name__))
@@ -516,7 +516,7 @@ async def _get_cookies(self, page: Page) -> list[PlaywrightCookieParam]:
     async def _update_cookies(self, page: Page, cookies: list[PlaywrightCookieParam]) -> None:
         """Update the cookies in the page context."""
-        await page.context.add_cookies([{**cookie} for cookie in cookies])
+        await page.context.add_cookies([{**cookie} for cookie in cookies])  # ty: ignore[invalid-argument-type]
     async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
         """Find the robots.txt file for a given URL.
diff --git a/src/crawlee/events/_event_manager.py b/src/crawlee/events/_event_manager.py
index c623b341c1..2183727483 100644
--- a/src/crawlee/events/_event_manager.py
+++ b/src/crawlee/events/_event_manager.py
@@ -178,7 +178,7 @@ async def listener_wrapper(event_data: EventData) -> None:
                 else asyncio.to_thread(cast('Callable[..., None]', listener), *bound_args.args, **bound_args.kwargs)
             )
-            listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener.__name__}')
+            listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener.__name__}')  # ty: ignore[invalid-argument-type, unresolved-attribute]
             self._listener_tasks.add(listener_task)
             try:
@@ -189,7 +189,7 @@ async def listener_wrapper(event_data: EventData) -> None:
                 # We need to swallow the exception and just log it here, otherwise it could break the event emitter
                 logger.exception(
                     'Exception in the event listener',
-                    extra={'event_name': event.value, 'listener_name': listener.__name__},
+                    extra={'event_name': event.value, 'listener_name': listener.__name__},  # ty: ignore[unresolved-attribute]
                 )
             finally:
                 logger.debug('EventManager.on.listener_wrapper(): Removing listener task from the set...')
diff --git a/src/crawlee/http_clients/_curl_impersonate.py b/src/crawlee/http_clients/_curl_impersonate.py
index b4eff2421b..342a60ef08 100644
--- a/src/crawlee/http_clients/_curl_impersonate.py
+++ b/src/crawlee/http_clients/_curl_impersonate.py
@@ -93,12 +93,12 @@ async def read(self) -> bytes:
         return self._response.content
     async def read_stream(self) -> AsyncGenerator[bytes, None]:
-        if not self._response.astream_task or self._response.astream_task.done():  # type: ignore[attr-defined]
+        if not self._response.astream_task or self._response.astream_task.done():  # ty: ignore[possibly-missing-attribute]
             raise RuntimeError(
                 'Cannot read stream: either already consumed or Response not obtained from `stream` method'
             )
-        async for chunk in self._response.aiter_content():  # type: ignore[no-untyped-call]
+        async for chunk in self._response.aiter_content():
             yield chunk
@@ -156,7 +156,7 @@ async def crawl(
         try:
             response = await client.request(
                 url=request.url,
-                method=request.method.upper(),  # type: ignore[arg-type]  # curl-cffi requires uppercase method
+                method=request.method.upper(),  # ty: ignore[invalid-argument-type]
                 headers=request.headers,
                 data=request.payload,
                 cookies=session.cookies.jar if session else None,
@@ -203,7 +203,7 @@ async def send_request(
         try:
             response = await client.request(
                 url=url,
-                method=method.upper(),  # type: ignore[arg-type]  # curl-cffi requires uppercase method
+                method=method.upper(),  # ty: ignore[invalid-argument-type]
                 headers=dict(headers) if headers else None,
                 data=payload,
                 cookies=session.cookies.jar if session else None,
@@ -244,7 +244,7 @@ async def stream(
         try:
             response = await client.request(
                 url=url,
-                method=method.upper(),  # type: ignore[arg-type]  # curl-cffi requires uppercase method
+                method=method.upper(),  # ty: ignore[invalid-argument-type]
                 headers=dict(headers) if headers else None,
                 data=payload,
                 cookies=session.cookies.jar if session else None,
@@ -309,8 +309,8 @@ def _is_proxy_error(error: CurlRequestError) -> bool:
     @staticmethod
     def _get_cookies(curl: Curl) -> list[Cookie]:
         cookies: list[Cookie] = []
-        for curl_cookie in curl.getinfo(CurlInfo.COOKIELIST):  # type: ignore[union-attr]
-            curl_morsel = CurlMorsel.from_curl_format(curl_cookie)  # type: ignore[arg-type]
+        for curl_cookie in curl.getinfo(CurlInfo.COOKIELIST):  # ty: ignore[not-iterable]
+            curl_morsel = CurlMorsel.from_curl_format(curl_cookie)  # ty: ignore[invalid-argument-type]
             cookie = curl_morsel.to_cookiejar_cookie()
             cookies.append(cookie)
         return cookies
diff --git a/src/crawlee/otel/crawler_instrumentor.py b/src/crawlee/otel/crawler_instrumentor.py
index 09f2fda525..9c12e9e17e 100644
--- a/src/crawlee/otel/crawler_instrumentor.py
+++ b/src/crawlee/otel/crawler_instrumentor.py
@@ -3,9 +3,7 @@
 import inspect
 from typing import TYPE_CHECKING, Any
-from opentelemetry.instrumentation.instrumentor import (  # type:ignore[attr-defined]  # Mypy has troubles with OTEL
-    BaseInstrumentor,
-)
+from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
 from opentelemetry.instrumentation.utils import unwrap
 from opentelemetry.semconv.attributes.code_attributes import CODE_FUNCTION_NAME
 from opentelemetry.semconv.attributes.http_attributes import HTTP_REQUEST_METHOD
diff --git a/src/crawlee/sessions/_cookies.py b/src/crawlee/sessions/_cookies.py
index eb5a6a12ea..1089fc37f5 100644
--- a/src/crawlee/sessions/_cookies.py
+++ b/src/crawlee/sessions/_cookies.py
@@ -68,7 +68,7 @@ def __init__(self, cookies: SessionCookies | CookieJar | dict[str, str] | list[C
         if isinstance(cookies, dict):
             for key, value in cookies.items():
-                self.set(key, value)
+                self.set(key, value)  # ty: ignore[invalid-argument-type]
         elif isinstance(cookies, list):
             for item in cookies:
@@ -152,7 +152,7 @@ def _convert_cookie_to_dict(self, cookie: Cookie) -> CookieParam:
             cookie_dict['expires'] = cookie.expires
         if (same_site := cookie.get_nonstandard_attr('SameSite')) and same_site in {'Lax', 'None', 'Strict'}:
-            cookie_dict['same_site'] = same_site  # type: ignore[typeddict-item]
+            cookie_dict['same_site'] = same_site  # ty: ignore[invalid-assignment]
         return cookie_dict
diff --git a/src/crawlee/sessions/_models.py b/src/crawlee/sessions/_models.py
index da709f1cdb..2f5b4a0483 100644
--- a/src/crawlee/sessions/_models.py
+++ b/src/crawlee/sessions/_models.py
@@ -63,19 +63,19 @@ class SessionPoolModel(BaseModel):
         ),
     ]
-    @computed_field(alias='sessionCount')  # type: ignore[prop-decorator]
+    @computed_field(alias='sessionCount')
     @property
     def session_count(self) -> int:
         """Get the total number of sessions currently maintained in the pool."""
         return len(self.sessions)
-    @computed_field(alias='usableSessionCount')  # type: ignore[prop-decorator]
+    @computed_field(alias='usableSessionCount')
     @property
     def usable_session_count(self) -> int:
         """Get the number of sessions that are currently usable."""
         return len([session for _, session in self.sessions.items() if session.is_usable])
-    @computed_field(alias='retiredSessionCount')  # type: ignore[prop-decorator]
+    @computed_field(alias='retiredSessionCount')
     @property
     def retired_session_count(self) -> int:
         """Get the number of sessions that are no longer usable."""
diff --git a/src/crawlee/statistics/_models.py b/src/crawlee/statistics/_models.py
index 11b4310f3a..b17c618540 100644
--- a/src/crawlee/statistics/_models.py
+++ b/src/crawlee/statistics/_models.py
@@ -4,7 +4,7 @@
 import warnings
 from dataclasses import asdict, dataclass
 from datetime import datetime, timedelta, timezone
-from typing import Annotated, Any
+from typing import TYPE_CHECKING, Annotated, Any
 from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator, computed_field
 from typing_extensions import override
@@ -77,9 +77,20 @@ class StatisticsState(BaseModel):
     crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None
     crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None
     crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None
-    errors: dict[str, Any] = Field(default_factory=dict)
-    retry_errors: dict[str, Any] = Field(alias='retryErrors', default_factory=dict)
-    requests_with_status_code: dict[str, int] = Field(alias='requestsWithStatusCode', default_factory=dict)
+
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        errors: dict[str, Any] = {}
+        retry_errors: dict[str, Any] = {}
+        requests_with_status_code: dict[str, int] = {}
+    else:
+        errors: Annotated[dict[str, Any], Field(default_factory=dict)]
+        retry_errors: Annotated[dict[str, Any], Field(alias='retryErrors', default_factory=dict)]
+        requests_with_status_code: Annotated[
+            dict[str, int],
+            Field(alias='requestsWithStatusCode', default_factory=dict),
+        ]
+
     stats_persisted_at: Annotated[
         datetime | None, Field(alias='statsPersistedAt'), PlainSerializer(lambda _: datetime.now(timezone.utc))
     ] = None
@@ -124,22 +135,22 @@ def crawler_runtime_for_serialization(self) -> timedelta:
             return self._runtime_offset + finished_at - self.crawler_last_started_at
         return self._runtime_offset
-    @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms)  # type: ignore[prop-decorator]
+    @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms)
     @property
     def request_total_duration(self) -> timedelta:
         return self.request_total_finished_duration + self.request_total_failed_duration
-    @computed_field(alias='requestAvgFailedDurationMillis', return_type=timedelta_ms | None)  # type: ignore[prop-decorator]
+    @computed_field(alias='requestAvgFailedDurationMillis', return_type=timedelta_ms | None)
    @property
     def request_avg_failed_duration(self) -> timedelta | None:
         return (self.request_total_failed_duration / self.requests_failed) if self.requests_failed else None
-    @computed_field(alias='requestAvgFinishedDurationMillis', return_type=timedelta_ms | None)  # type: ignore[prop-decorator]
+    @computed_field(alias='requestAvgFinishedDurationMillis', return_type=timedelta_ms | None)
     @property
     def request_avg_finished_duration(self) -> timedelta | None:
         return (self.request_total_finished_duration / self.requests_finished) if self.requests_finished else None
-    @computed_field(alias='requestsTotal')  # type: ignore[prop-decorator]
+    @computed_field(alias='requestsTotal')
     @property
     def requests_total(self) -> int:
         return self.requests_failed + self.requests_finished
diff --git a/src/crawlee/storage_clients/_base/_dataset_client.py b/src/crawlee/storage_clients/_base/_dataset_client.py
index 87573a3916..d2eeb86665 100644
--- a/src/crawlee/storage_clients/_base/_dataset_client.py
+++ b/src/crawlee/storage_clients/_base/_dataset_client.py
@@ -87,8 +87,8 @@ async def iterate_items(
         The backend method for the `Dataset.iterate_items` call.
         """
-        # This syntax is to make mypy properly work with abstract AsyncIterator.
+        # This syntax is to make the type checker work properly with abstract AsyncIterator.
         # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
         raise NotImplementedError
-        if False:  # type: ignore[unreachable]
+        if False:
             yield 0
diff --git a/src/crawlee/storage_clients/_base/_key_value_store_client.py b/src/crawlee/storage_clients/_base/_key_value_store_client.py
index d23abd7d70..33c36f67bd 100644
--- a/src/crawlee/storage_clients/_base/_key_value_store_client.py
+++ b/src/crawlee/storage_clients/_base/_key_value_store_client.py
@@ -72,10 +72,10 @@ async def iterate_keys(
         The backend method for the `KeyValueStore.iterate_keys` call.
         """
-        # This syntax is to make mypy properly work with abstract AsyncIterator.
+        # This syntax is to make the type checker work properly with abstract AsyncIterator.
         # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
         raise NotImplementedError
-        if False:  # type: ignore[unreachable]
+        if False:
             yield 0
     @abstractmethod
diff --git a/src/crawlee/storage_clients/_file_system/_dataset_client.py b/src/crawlee/storage_clients/_file_system/_dataset_client.py
index 55130d587f..4a222dc037 100644
--- a/src/crawlee/storage_clients/_file_system/_dataset_client.py
+++ b/src/crawlee/storage_clients/_file_system/_dataset_client.py
@@ -120,7 +120,7 @@ async def open(
         dataset_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
         if not dataset_base_path.exists():
-            await asyncio.to_thread(dataset_base_path.mkdir, parents=True, exist_ok=True)
+            await asyncio.to_thread(lambda: dataset_base_path.mkdir(parents=True, exist_ok=True))
         # Get a new instance by ID.
         if id:
@@ -134,7 +134,7 @@ async def open(
                     continue
                 try:
-                    file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
+                    file = await asyncio.to_thread(lambda p=path_to_metadata: p.open(mode='r', encoding='utf-8'))
                     try:
                         file_content = json.load(file)
                         metadata = DatasetMetadata(**file_content)
@@ -163,7 +163,7 @@ async def open(
             # If the dataset directory exists, reconstruct the client from the metadata file.
             if path_to_dataset.exists() and path_to_metadata.exists():
-                file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
+                file = await asyncio.to_thread(lambda: path_to_metadata.open(mode='r', encoding='utf-8'))
                 try:
                     file_content = json.load(file)
                 finally:
@@ -211,7 +211,7 @@ async def drop(self) -> None:
     async def purge(self) -> None:
         async with self._lock:
             for file_path in await self._get_sorted_data_files():
-                await asyncio.to_thread(file_path.unlink, missing_ok=True)
+                await asyncio.to_thread(lambda f=file_path: f.unlink(missing_ok=True))
             await self._update_metadata(
                 update_accessed_at=True,
@@ -435,7 +435,7 @@ async def _update_metadata(
             self._metadata.item_count = new_item_count
         # Ensure the parent directory for the metadata file exists.
-        await asyncio.to_thread(self.path_to_metadata.parent.mkdir, parents=True, exist_ok=True)
+        await asyncio.to_thread(lambda: self.path_to_metadata.parent.mkdir(parents=True, exist_ok=True))
         # Dump the serialized metadata to the file.
         data = await json_dumps(self._metadata.model_dump())
@@ -456,7 +456,7 @@ async def _push_item(self, item: dict[str, Any], item_id: int) -> None:
         file_path = self.path_to_dataset / filename
         # Ensure the dataset directory exists.
-        await asyncio.to_thread(self.path_to_dataset.mkdir, parents=True, exist_ok=True)
+        await asyncio.to_thread(lambda: self.path_to_dataset.mkdir(parents=True, exist_ok=True))
         # Dump the serialized item to the file.
         data = await json_dumps(item)
@@ -473,9 +473,10 @@ async def _get_sorted_data_files(self) -> list[Path]:
         """
         # Retrieve and sort all JSON files in the dataset directory numerically.
         files = await asyncio.to_thread(
-            sorted,
-            self.path_to_dataset.glob('*.json'),
-            key=lambda f: int(f.stem) if f.stem.isdigit() else 0,
+            lambda: sorted(
+                self.path_to_dataset.glob('*.json'),
+                key=lambda f: int(f.stem) if f.stem.isdigit() else 0,
+            )
         )
         # Remove the metadata file from the list if present.
diff --git a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py
index 6a3db78fbc..28e724fda8 100644
--- a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py
+++ b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py
@@ -119,7 +119,7 @@ async def open(
         kvs_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
         if not kvs_base_path.exists():
-            await asyncio.to_thread(kvs_base_path.mkdir, parents=True, exist_ok=True)
+            await asyncio.to_thread(lambda: kvs_base_path.mkdir(parents=True, exist_ok=True))
         # Get a new instance by ID.
         if id:
@@ -133,7 +133,7 @@ async def open(
                     continue
                 try:
-                    file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
+                    file = await asyncio.to_thread(lambda p=path_to_metadata: p.open(mode='r', encoding='utf-8'))
                     try:
                         file_content = json.load(file)
                         metadata = KeyValueStoreMetadata(**file_content)
@@ -162,7 +162,7 @@ async def open(
             # If the key-value store directory exists, reconstruct the client from the metadata file.
             if path_to_kvs.exists() and path_to_metadata.exists():
-                file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
+                file = await asyncio.to_thread(lambda: path_to_metadata.open(mode='r', encoding='utf-8'))
                 try:
                     file_content = json.load(file)
                 finally:
@@ -212,7 +212,7 @@ async def purge(self) -> None:
             for file_path in self.path_to_kvs.glob('*'):
                 if file_path.name == METADATA_FILENAME:
                     continue
-                await asyncio.to_thread(file_path.unlink, missing_ok=True)
+                await asyncio.to_thread(lambda f=file_path: f.unlink(missing_ok=True))
             await self._update_metadata(
                 update_accessed_at=True,
@@ -239,7 +239,7 @@ async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
         # Read the metadata file
         async with self._lock:
             try:
-                file = await asyncio.to_thread(open, record_metadata_filepath, 'r', encoding='utf-8')
+                file = await asyncio.to_thread(lambda: record_metadata_filepath.open(mode='r', encoding='utf-8'))
             except FileNotFoundError:
                 logger.warning(f'Metadata file disappeared for key "{key}", aborting get_value')
                 return None
@@ -346,11 +346,11 @@ async def delete_value(self, *, key: str) -> None:
         async with self._lock:
             # Delete the value file and its metadata if found
             if record_path.exists():
-                await asyncio.to_thread(record_path.unlink, missing_ok=True)
+                await asyncio.to_thread(lambda: record_path.unlink(missing_ok=True))
                 # Delete the metadata file if it exists
                 if metadata_path.exists():
-                    await asyncio.to_thread(metadata_path.unlink, missing_ok=True)
+                    await asyncio.to_thread(lambda: metadata_path.unlink(missing_ok=True))
                 else:
                     logger.warning(f'Found value file for key "{key}" but no metadata file when trying to delete it.')
@@ -373,7 +373,7 @@ async def iterate_keys(
         # List and sort all files *inside* a brief lock, then release it immediately:
         async with self._lock:
-            files = sorted(await asyncio.to_thread(list, self.path_to_kvs.glob('*')))
+            files = sorted(await asyncio.to_thread(lambda: list(self.path_to_kvs.glob('*'))))
         count = 0
@@ -395,7 +395,7 @@ async def iterate_keys(
             # Try to read and parse the metadata file
             try:
-                metadata_content = await asyncio.to_thread(file_path.read_text, encoding='utf-8')
+                metadata_content = await asyncio.to_thread(lambda f=file_path: f.read_text(encoding='utf-8'))
             except FileNotFoundError:
                 logger.warning(f'Metadata file disappeared for key "{key_name}", skipping it.')
                 continue
@@ -475,7 +475,7 @@ async def _update_metadata(
             self._metadata.modified_at = now
         # Ensure the parent directory for the metadata file exists.
-        await asyncio.to_thread(self.path_to_metadata.parent.mkdir, parents=True, exist_ok=True)
+        await asyncio.to_thread(lambda: self.path_to_metadata.parent.mkdir(parents=True, exist_ok=True))
         # Dump the serialized metadata to the file.
         data = await json_dumps(self._metadata.model_dump())
diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py
index e49771b7c9..1a91ecea9e 100644
--- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py
+++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py
@@ -183,7 +183,7 @@ async def open(
         rq_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
         if not rq_base_path.exists():
-            await asyncio.to_thread(rq_base_path.mkdir, parents=True, exist_ok=True)
+            await asyncio.to_thread(lambda: rq_base_path.mkdir(parents=True, exist_ok=True))
         # Open an existing RQ by its ID, raise an error if not found.
         if id:
@@ -197,7 +197,7 @@ async def open(
                     continue
                 try:
-                    file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
+                    file = await asyncio.to_thread(lambda p=path_to_metadata: p.open(mode='r', encoding='utf-8'))
                     try:
                         file_content = json.load(file)
                         metadata = RequestQueueMetadata(**file_content)
@@ -232,7 +232,7 @@ async def open(
             # If the RQ directory exists, reconstruct the client from the metadata file.
             if path_to_rq.exists() and path_to_metadata.exists():
-                file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
+                file = await asyncio.to_thread(lambda: path_to_metadata.open(encoding='utf-8'))
                 try:
                     file_content = json.load(file)
                 finally:
@@ -300,7 +300,7 @@ async def purge(self) -> None:
         request_files = await self._get_request_files(self.path_to_rq)
         for file_path in request_files:
-            await asyncio.to_thread(file_path.unlink, missing_ok=True)
+            await asyncio.to_thread(lambda f=file_path: f.unlink(missing_ok=True))
         # Clear recoverable state
         await self._state.reset()
@@ -675,7 +675,7 @@ async def _update_metadata(
             self._metadata.had_multiple_clients = True
         # Ensure the parent directory for the metadata file exists.
-        await asyncio.to_thread(self.path_to_metadata.parent.mkdir, parents=True, exist_ok=True)
+        await asyncio.to_thread(lambda: self.path_to_metadata.parent.mkdir(parents=True, exist_ok=True))
         # Dump the serialized metadata to the file.
         data = await json_dumps(self._metadata.model_dump())
@@ -753,10 +753,10 @@ async def _get_request_files(cls, path_to_rq: Path) -> list[Path]:
             A list of paths to all request files.
         """
         # Create the requests directory if it doesn't exist.
-        await asyncio.to_thread(path_to_rq.mkdir, parents=True, exist_ok=True)
+        await asyncio.to_thread(lambda: path_to_rq.mkdir(parents=True, exist_ok=True))
         # List all the json files.
-        files = await asyncio.to_thread(list, path_to_rq.glob('*.json'))
+        files = await asyncio.to_thread(lambda: list(path_to_rq.glob('*.json')))
         # Filter out metadata file and non-file entries.
         filtered = filter(lambda request_file: request_file.is_file() and request_file.name != METADATA_FILENAME, files)
@@ -775,7 +775,7 @@ async def _parse_request_file(cls, file_path: Path) -> Request | None:
         """
         # Open the request file.
         try:
-            file = await asyncio.to_thread(open, file_path, 'r', encoding='utf-8')
+            file = await asyncio.to_thread(lambda f=file_path: f.open(mode='r', encoding='utf-8'))
         except FileNotFoundError:
             logger.warning(f'Request file "{file_path}" not found.')
             return None
diff --git a/src/crawlee/storage_clients/_redis/_client_mixin.py b/src/crawlee/storage_clients/_redis/_client_mixin.py
index 6c66e5db7b..8a54896577 100644
--- a/src/crawlee/storage_clients/_redis/_client_mixin.py
+++ b/src/crawlee/storage_clients/_redis/_client_mixin.py
@@ -179,7 +179,7 @@ async def _get_pipeline(self, *, with_execute: bool = True) -> AsyncIterator[Pip
         """Create a new Redis pipeline."""
         async with self._redis.pipeline() as pipe:
             try:
-                pipe.multi()  # type: ignore[no-untyped-call]
+                pipe.multi()
                 yield pipe
             finally:
                 if with_execute:
@@ -187,7 +187,6 @@ async def _get_pipeline(self, *, with_execute: bool = True) -> AsyncIterator[Pip
     async def _create_storage(self, pipeline: Pipeline) -> None:
         """Create the actual storage structure in Redis."""
-        _ = pipeline  # To avoid unused variable mypy error
     async def _create_script(self, script_name: str) -> AsyncScript:
         """Load a Lua script from a file and return a Script object."""
@@ -262,8 +261,6 @@ async def _specific_update_metadata(self, pipeline: Pipeline, **kwargs: Any) ->
             pipeline: The Redis pipeline to use for the update.
             **kwargs: Storage-specific update parameters.
         """
-        _ = pipeline  # To avoid unused variable mypy error
-        _ = kwargs
     async def _update_metadata(
         self,
diff --git a/src/crawlee/storage_clients/_redis/_dataset_client.py b/src/crawlee/storage_clients/_redis/_dataset_client.py
index 44a78bce62..74c9d6c496 100644
--- a/src/crawlee/storage_clients/_redis/_dataset_client.py
+++ b/src/crawlee/storage_clients/_redis/_dataset_client.py
@@ -179,13 +179,13 @@ async def get_data(
             case (True, int(), None):
                 json_path += f'[:-{offset}]'
             case (True, int(), int()):
-                json_path += f'[-{offset + limit}:-{offset}]'
+                json_path += f'[-{offset + limit}:-{offset}]'  # ty: ignore[unsupported-operator]
             case (False, 0, int()):
                 json_path += f'[:{limit}]'
             case (False, int(), None):
                 json_path += f'[{offset}:]'
             case (False, int(), int()):
-                json_path += f'[{offset}:{offset + limit}]'
+                json_path += f'[{offset}:{offset + limit}]'  # ty: ignore[unsupported-operator]
         if json_path == '$':
             json_path = '$[*]'
@@ -210,7 +210,7 @@ async def get_data(
             limit=limit or (total - offset),
             total=total,
             desc=desc,
-            items=data,
+            items=data,  # ty: ignore[invalid-argument-type]
         )
     @override
diff --git a/src/crawlee/storage_clients/_redis/_key_value_store_client.py b/src/crawlee/storage_clients/_redis/_key_value_store_client.py
index 99f9665ea7..8aeaa1a01d 100644
--- a/src/crawlee/storage_clients/_redis/_key_value_store_client.py
+++ b/src/crawlee/storage_clients/_redis/_key_value_store_client.py
@@ -144,7 +144,7 @@ async def set_value(self, *, key: str, value: Any, content_type: str | None = No
         async with self._get_pipeline() as pipe:
             # redis-py typing issue
-            await await_redis_response(pipe.hset(self._items_key, key, value_bytes))  # type: ignore[arg-type]
+            await await_redis_response(pipe.hset(self._items_key, key, value_bytes))  # ty: ignore[invalid-argument-type]
             await await_redis_response(
                 pipe.hset(
@@ -174,9 +174,7 @@ async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
         # Query the record by key
         # redis-py typing issue
-        value_bytes: bytes | None = await await_redis_response(
-            self._redis.hget(self._items_key, key)  # type: ignore[arg-type]
-        )
+        value_bytes: bytes | None = await await_redis_response(self._redis.hget(self._items_key, key))  # ty: ignore[invalid-assignment]
         if value_bytes is None:
             logger.warning(f'Value for key "{key}" is missing.')
@@ -225,7 +223,7 @@ async def iterate_keys(
             raise TypeError('The items data was received in an incorrect format.')
         # Get all keys, sorted alphabetically
-        keys = sorted(items_data.keys())
+        keys = sorted(items_data.keys())  # ty: ignore[invalid-argument-type]
         # Apply exclusive_start_key filter if provided
         if exclusive_start_key is not None:
diff --git a/src/crawlee/storage_clients/_redis/_request_queue_client.py b/src/crawlee/storage_clients/_redis/_request_queue_client.py
index 90a86ee64f..74f9028bec 100644
--- a/src/crawlee/storage_clients/_redis/_request_queue_client.py
+++ b/src/crawlee/storage_clients/_redis/_request_queue_client.py
@@ -247,7 +247,6 @@ async def add_batch_of_requests(
         *,
         forefront: bool = False,
     ) -> AddRequestsResponse:
-        # Mypy workaround
         if self._add_requests_script is None:
             raise RuntimeError('Scripts not loaded. Call _ensure_scripts_loaded() before using the client.')
@@ -264,8 +263,8 @@ async def add_batch_of_requests(
                 await await_redis_response(pipe.smismember(self._pending_set_key, unique_keys))
                 await await_redis_response(pipe.smismember(self._handled_set_key, unique_keys))
             elif self._dedup_strategy == 'bloom':
-                await await_redis_response(pipe.bf().mexists(self._added_filter_key, *unique_keys))  # type: ignore[no-untyped-call]
-                await await_redis_response(pipe.bf().mexists(self._handled_filter_key, *unique_keys))  # type: ignore[no-untyped-call]
+                await await_redis_response(pipe.bf().mexists(self._added_filter_key, *unique_keys))
+                await await_redis_response(pipe.bf().mexists(self._handled_filter_key, *unique_keys))
             pipe_results = await pipe.execute()
@@ -353,7 +352,6 @@ async def fetch_next_request(self) -> Request | None:
         if self._pending_fetch_cache:
             return self._pending_fetch_cache.popleft()
-        # Mypy workaround
         if self._fetch_script is None:
             raise RuntimeError('Scripts not loaded. Call _ensure_scripts_loaded() before using the client.')
@@ -399,7 +397,7 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest |
                 await await_redis_response(pipe.sadd(self._handled_set_key, request.unique_key))
                 await await_redis_response(pipe.srem(self._pending_set_key, request.unique_key))
             elif self._dedup_strategy == 'bloom':
-                await await_redis_response(pipe.bf().add(self._handled_filter_key, request.unique_key))  # type: ignore[no-untyped-call]
+                await await_redis_response(pipe.bf().add(self._handled_filter_key, request.unique_key))
             await await_redis_response(pipe.hdel(self._in_progress_key, request.unique_key))
             await await_redis_response(pipe.hdel(self._data_key, request.unique_key))
@@ -499,17 +497,16 @@ async def _create_storage(self, pipeline: Pipeline) -> None:
         await await_redis_response(
             pipeline.bf().create(
                 self._added_filter_key, errorRate=self._bloom_error_rate, capacity=100000, expansion=10
-            )  # type: ignore[no-untyped-call]
+            )
         )
         await await_redis_response(
             pipeline.bf().create(
                 self._handled_filter_key, errorRate=self._bloom_error_rate, capacity=100000, expansion=10
-            )  # type: ignore[no-untyped-call]
+            )
         )
     async def _reclaim_stale_requests(self) -> None:
         """Reclaim requests that have been in progress for too long."""
-        # Mypy workaround
         if self._reclaim_stale_script is None:
             raise RuntimeError('Scripts not loaded. Call _ensure_scripts_loaded() before using the client.')
diff --git a/src/crawlee/storage_clients/_redis/_storage_client.py b/src/crawlee/storage_clients/_redis/_storage_client.py
index 78e7bed603..a6c39f5def 100644
--- a/src/crawlee/storage_clients/_redis/_storage_client.py
+++ b/src/crawlee/storage_clients/_redis/_storage_client.py
@@ -57,16 +57,19 @@ def __init__(
             queue_bloom_error_rate: Desired false positive rate for Bloom filter deduplication. Only relevant if
                 `queue_dedup_strategy` is set to 'bloom'.
         """
-        match (redis, connection_string):
-            case (None, None):
-                raise ValueError('Either redis or connection_string must be provided.')
-            case (Redis(), None):
-                self._redis = redis
-            case (None, str()):
-                self._redis = Redis.from_url(connection_string)
-            case (Redis(), str()):
-                raise ValueError('Either redis or connection_string must be provided, not both.')
+        if redis is None and connection_string is None:
+            raise ValueError('Either redis or connection_string must be provided.')
+        if redis is not None and connection_string is not None:
+            raise ValueError('Either redis or connection_string must be provided, not both.')
+
+        if isinstance(redis, Redis) and connection_string is None:
+            self._redis = redis
+
+        if isinstance(connection_string, str) and redis is None:
+            self._redis = Redis.from_url(connection_string)
+
+        self._redis: Redis  # to help type checker
         self._queue_dedup_strategy = queue_dedup_strategy
         self._queue_bloom_error_rate = queue_bloom_error_rate
diff --git a/src/crawlee/storage_clients/_redis/_utils.py b/src/crawlee/storage_clients/_redis/_utils.py
index a86f979fa4..27f051d692 100644
--- a/src/crawlee/storage_clients/_redis/_utils.py
+++ b/src/crawlee/storage_clients/_redis/_utils.py
@@ -19,5 +19,5 @@ async def await_redis_response(response: Awaitable[T] | T) -> T:
 def read_lua_script(script_name: str) -> str:
     """Read a Lua script from a file."""
     file_path = Path(__file__).parent / 'lua_scripts' / script_name
-    with file_path.open('r', encoding='utf-8') as file:
+    with file_path.open(mode='r', encoding='utf-8') as file:
         return file.read()
diff --git a/src/crawlee/storage_clients/_sql/_client_mixin.py b/src/crawlee/storage_clients/_sql/_client_mixin.py
index c681e3a220..e7ee2ae8d9 100644
--- a/src/crawlee/storage_clients/_sql/_client_mixin.py
+++ b/src/crawlee/storage_clients/_sql/_client_mixin.py
@@ -105,7 +105,7 @@ async def _open(
             else:
                 stmt = select(cls._METADATA_TABLE).where(cls._METADATA_TABLE.internal_name == internal_name)
             result = await session.execute(stmt)
-            orm_metadata = result.scalar_one_or_none()  # type: ignore[assignment]
+            orm_metadata = result.scalar_one_or_none()
             if orm_metadata:
                 client = cls(id=orm_metadata.id, storage_client=storage_client)
diff --git a/src/crawlee/storage_clients/models.py b/src/crawlee/storage_clients/models.py
index e7b5927d7d..2ebd65914d 100644
--- a/src/crawlee/storage_clients/models.py
+++ b/src/crawlee/storage_clients/models.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 from datetime import datetime
-from typing import Annotated, Any, Generic
+from typing import TYPE_CHECKING, Annotated, Any, Generic
 from pydantic import BaseModel, BeforeValidator, ConfigDict, Field
 from typing_extensions import TypeVar
@@ -127,8 +127,13 @@ class DatasetItemsListPage(BaseModel):
     desc: Annotated[bool, Field(default=False)]
     """Indicates if the returned list is in descending order."""
-    items: Annotated[list[dict], Field(default_factory=list)]
-    """The list of dataset items returned on this page."""
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        items: list[dict] = []
+        """The list of dataset items returned on this page."""
+    else:
+        items: Annotated[list[dict], Field(default_factory=list)]
+        """The list of dataset items returned on this page."""
 @docs_group('Storage data')
diff --git a/src/crawlee/storages/_storage_instance_manager.py b/src/crawlee/storages/_storage_instance_manager.py
index f64d6e2c9f..e08fc80080 100644
--- a/src/crawlee/storages/_storage_instance_manager.py
+++ b/src/crawlee/storages/_storage_instance_manager.py
@@ -160,7 +160,7 @@ async def open_storage_instance(
         metadata = await client.get_metadata()
-        instance = cls(client, metadata.id, metadata.name)  # type: ignore[call-arg]
+        instance = cls(client, metadata.id, metadata.name)  # ty: ignore[too-many-positional-arguments]
         instance_name = getattr(instance, 'name', None)
         # Cache the instance.
diff --git a/tests/unit/_autoscaling/test_snapshotter.py b/tests/unit/_autoscaling/test_snapshotter.py
index 06a5682a32..7b3d50d75d 100644
--- a/tests/unit/_autoscaling/test_snapshotter.py
+++ b/tests/unit/_autoscaling/test_snapshotter.py
@@ -139,7 +139,10 @@ async def test_get_cpu_sample(
     events_data = [
         EventSystemInfoData(
-            cpu_info=CpuInfo(used_ratio=0.5, created_at=now - timedelta(hours=delta)),
+            cpu_info=CpuInfo(
+                used_ratio=0.5,
+                created_at=now - timedelta(hours=delta),
+            ),
             memory_info=default_memory_info,
         )
         for delta in range(5, 0, -1)
diff --git a/tests/unit/_utils/test_html_to_text.py b/tests/unit/_utils/test_html_to_text.py
index c802eee248..0a535e58db 100644
--- a/tests/unit/_utils/test_html_to_text.py
+++ b/tests/unit/_utils/test_html_to_text.py
@@ -191,7 +191,7 @@ def test_html_to_text(source: str, expected_text: str, html_to_text: Callable[[s
 @pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup])
 def test_html_to_text_raises_on_wrong_input_type(html_to_text: Callable[[str], str]) -> None:
     with pytest.raises(TypeError):
-        html_to_text(1)  # type: ignore[arg-type]  # Intentional wrong type test.
+        html_to_text(1)  # ty: ignore[invalid-argument-type]
 def test_html_to_text_parsel() -> None:
diff --git a/tests/unit/_utils/test_recurring_task.py b/tests/unit/_utils/test_recurring_task.py
index 78f43601eb..61951ec11e 100644
--- a/tests/unit/_utils/test_recurring_task.py
+++ b/tests/unit/_utils/test_recurring_task.py
@@ -48,7 +48,7 @@ async def test_execution(function: AsyncMock, delay: timedelta) -> None:
     await asyncio.sleep(0.1)  # Wait enough for the task to execute a few times
     await task.stop()
-    assert isinstance(task.func, AsyncMock)  # To let MyPy know that the function is a mocked
+    assert isinstance(task.func, AsyncMock)  # To let the type checker know that the function is a mock
     assert task.func.call_count >= 3
     await task.stop()
diff --git a/tests/unit/_utils/test_timedelta_ms.py b/tests/unit/_utils/test_timedelta_ms.py
index dd5fc7a8f0..5f5b0f4f4f 100644
--- a/tests/unit/_utils/test_timedelta_ms.py
+++ b/tests/unit/_utils/test_timedelta_ms.py
@@ -30,6 +30,6 @@ class _ModelWithTimedeltaMs(BaseModel):
 def test_model_with_timedelta_ms_input_types(
     time_delta_input: float | timedelta | Any | None, expected_time_delta: timedelta, expected_model_dump_value: int
 ) -> None:
-    model = _ModelWithTimedeltaMs(time_delta=time_delta_input)
+    model = _ModelWithTimedeltaMs(time_delta=time_delta_input)  # ty: ignore[invalid-argument-type]
     assert model.time_delta == expected_time_delta
     assert model.model_dump() == {'time_delta': expected_model_dump_value}
diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py
index c2d0f153b1..3174adfd64 100644
--- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py
+++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py
@@ -115,7 +115,7 @@ class TestInput:
             TestInput(
                 expected_pw_count=0,
                 expected_static_count=2,
-                rendering_types=cycle(['static']),
+                rendering_types=cycle(['static']),  # ty: ignore[invalid-argument-type]
                 detection_probability_recommendation=cycle([0]),
             ),
             id='Static only',
@@ -124,7 +124,7 @@ class TestInput:
             TestInput(
                 expected_pw_count=2,
                 expected_static_count=0,
-                rendering_types=cycle(['client only']),
+                rendering_types=cycle(['client only']),  # ty: ignore[invalid-argument-type]
                 detection_probability_recommendation=cycle([0]),
             ),
             id='Client only',
@@ -133,7 +133,7 @@ class TestInput:
             TestInput(
                 expected_pw_count=1,
                 expected_static_count=1,
-                rendering_types=cycle(['static', 'client only']),
+                rendering_types=cycle(['static', 'client only']),  # ty: ignore[invalid-argument-type]
                 detection_probability_recommendation=cycle([0]),
             ),
             id='Mixed',
@@ -142,7 +142,7 @@ class TestInput:
             TestInput(
                 expected_pw_count=2,
                 expected_static_count=2,
-                rendering_types=cycle(['static', 'client only']),
+                rendering_types=cycle(['static', 'client only']),  # ty: ignore[invalid-argument-type]
                 detection_probability_recommendation=cycle([1]),
             ),
             id='Enforced rendering type detection',
@@ -206,7 +206,8 @@ async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None
 async def test_adaptive_crawling_parsel(test_urls: list[str]) -> None:
     """Top level test for parsel. Only one argument combination. (The rest of code is tested with bs variant.)"""
     predictor = _SimpleRenderingTypePredictor(
-        rendering_types=cycle(['static', 'client only']), detection_probability_recommendation=cycle([0])
+        rendering_types=cycle(['static', 'client only']),  # ty: ignore[invalid-argument-type]
+        detection_probability_recommendation=cycle([0]),
     )
     crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
@@ -687,7 +688,8 @@ async def test_adaptive_context_helpers_on_changed_selector(test_urls: list[str]
     dynamically changed text instead of the original static text.
     """
     browser_only_predictor_no_detection = _SimpleRenderingTypePredictor(
-        rendering_types=cycle(['client only']), detection_probability_recommendation=cycle([0])
+        rendering_types=cycle(['client only']),  # ty: ignore[invalid-argument-type]
+        detection_probability_recommendation=cycle([0]),
    )
     expected_h3_tag = f'