docs/examples/code_examples/export_entire_dataset_to_file_csv.py
@@ -30,7 +30,8 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
await crawler.run(['https://crawlee.dev'])

# Export the entire dataset to a CSV file.
await crawler.export_data(path='results.csv')
# Use semicolon as delimiter and always quote strings.
await crawler.export_data(path='results.csv', delimiter=';', quoting='all')


if __name__ == '__main__':
docs/examples/code_examples/export_entire_dataset_to_file_json.py
@@ -30,7 +30,8 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
await crawler.run(['https://crawlee.dev'])

# Export the entire dataset to a JSON file.
await crawler.export_data(path='results.json')
# Set ensure_ascii=False to allow Unicode characters in the output.
await crawler.export_data(path='results.json', ensure_ascii=False)


if __name__ == '__main__':
2 changes: 1 addition & 1 deletion docs/examples/export_entire_dataset_to_file.mdx
@@ -11,7 +11,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
import JsonExample from '!!raw-loader!roa-loader!./code_examples/export_entire_dataset_to_file_json.py';
import CsvExample from '!!raw-loader!roa-loader!./code_examples/export_entire_dataset_to_file_csv.py';

This example demonstrates how to use the <ApiLink to="class/BasicCrawler#export_data">`BasicCrawler.export_data`</ApiLink> method of the crawler to export the entire default dataset to a single file. This method supports exporting data in either CSV or JSON format.
This example demonstrates how to use the <ApiLink to="class/BasicCrawler#export_data">`BasicCrawler.export_data`</ApiLink> method of the crawler to export the entire default dataset to a single file. This method supports exporting data in either CSV or JSON format and also accepts additional keyword arguments so you can fine-tune the underlying `json.dump` or `csv.writer` behavior.

:::note

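The updated documentation sentence is easier to follow with a concrete call. A minimal sketch of how the forwarded keyword arguments might be used: the kwargs are taken from the examples and tests in this PR, while the import paths (`crawlee.crawlers.BasicCrawler`, `crawlee.storages.Dataset`) and the `Dataset.open()`/`push_data` setup mirror the new unit test and are assumptions about the current Crawlee layout, not part of this change:

```python
import asyncio

from crawlee.crawlers import BasicCrawler
from crawlee.storages import Dataset


async def main() -> None:
    crawler = BasicCrawler()

    # Put a couple of items into the default dataset so there is something to export.
    dataset = await Dataset.open()
    await dataset.push_data([{'id': 0, 'name': 'Žižkov'}, {'id': 1, 'name': 'Łódź'}])

    # CSV export: extra keyword arguments are forwarded to csv.writer.
    await crawler.export_data(path='results.csv', delimiter=';', quoting='all')

    # JSON export: extra keyword arguments are forwarded to json.dump.
    await crawler.export_data(path='results.json', ensure_ascii=False, sort_keys=True)


asyncio.run(main())
```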
7 changes: 7 additions & 0 deletions src/crawlee/_utils/file.py
@@ -163,6 +163,13 @@ async def export_csv_to_stream(
dst: TextIO,
**kwargs: Unpack[ExportDataCsvKwargs],
) -> None:
# Set lineterminator to '\n' if not explicitly provided. This prevents double line endings on Windows.
# The csv.writer default is '\r\n', which when written to a file in text mode on Windows gets converted
# to '\r\r\n' due to newline translation. By using '\n', we let the platform handle the line ending
# conversion: '\n' stays as '\n' on Unix, and becomes '\r\n' on Windows.
if 'lineterminator' not in kwargs:
kwargs['lineterminator'] = '\n'

writer = csv.writer(dst, **kwargs) # type: ignore[arg-type]
write_header = True

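For context on the comment above: with the csv module's default `lineterminator='\r\n'`, writing through a text-mode file handle on Windows turns each row ending into `'\r\r\n'`, because the text layer translates the `'\n'` once more. A standalone illustration (not code from this PR) of the value the fix produces before any file write:

```python
import csv
import io

buffer = io.StringIO()

# With lineterminator='\n', the writer emits plain '\n' endings; the text-mode file
# layer then applies at most one platform translation when the string is written out.
writer = csv.writer(buffer, lineterminator='\n')
writer.writerow(['id', 'test'])
writer.writerow([0, 'test'])

print(repr(buffer.getvalue()))  # 'id,test\n0,test\n'
```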
20 changes: 15 additions & 5 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -14,6 +14,7 @@
from contextlib import AsyncExitStack, suppress
from datetime import timedelta
from functools import partial
from io import StringIO
from pathlib import Path
from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
from urllib.parse import ParseResult, urlparse
@@ -32,6 +33,8 @@
from crawlee._types import (
BasicCrawlingContext,
EnqueueLinksKwargs,
ExportDataCsvKwargs,
ExportDataJsonKwargs,
GetKeyValueStoreFromRequestHandlerFunction,
HttpHeaders,
HttpPayload,
@@ -41,7 +44,7 @@
SkippedReason,
)
from crawlee._utils.docs import docs_group
from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
from crawlee._utils.recurring_task import RecurringTask
from crawlee._utils.robots import RobotsTxtFile
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
@@ -868,6 +871,7 @@ async def export_data(
dataset_id: str | None = None,
dataset_name: str | None = None,
dataset_alias: str | None = None,
**additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs], # type: ignore[misc]
) -> None:
"""Export all items from a Dataset to a JSON or CSV file.

@@ -880,6 +884,7 @@
dataset_id: The ID of the Dataset to export from.
dataset_name: The name of the Dataset to export from (global scope, named storage).
dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
"""
dataset = await Dataset.open(
id=dataset_id,
@@ -889,13 +894,18 @@
configuration=self._service_locator.get_configuration(),
)

path = path if isinstance(path, Path) else Path(path)
dst = path.open('w', newline='')
path = Path(path)

if path.suffix == '.csv':
await export_csv_to_stream(dataset.iterate_items(), dst)
dst = StringIO()
csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
await atomic_write(path, dst.getvalue())
elif path.suffix == '.json':
await export_json_to_stream(dataset.iterate_items(), dst)
dst = StringIO()
json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
await atomic_write(path, dst.getvalue())
else:
raise ValueError(f'Unsupported file extension: {path.suffix}')

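The rewritten branch renders the whole export into an in-memory `StringIO` buffer and only then hands the finished string to `atomic_write`, so an interrupted export can no longer leave a truncated file at the destination. A rough sketch of what such a helper typically does; this is illustrative only, since the actual `atomic_write` in `crawlee._utils.file` may differ in details such as encoding handling, temp-file naming, or async file I/O:

```python
import asyncio
import os
import tempfile
from pathlib import Path


async def atomic_write_sketch(path: Path, content: str) -> None:
    def _write() -> None:
        # Write to a temporary file in the target directory, then atomically replace
        # the destination, so readers never observe a partially written file.
        fd, tmp_name = tempfile.mkstemp(dir=path.parent, suffix='.tmp')
        try:
            with os.fdopen(fd, 'w', encoding='utf-8') as tmp:
                tmp.write(content)
            os.replace(tmp_name, path)
        except BaseException:
            os.unlink(tmp_name)
            raise

    # Keep the event loop responsive by running the blocking file I/O in a thread.
    await asyncio.to_thread(_write)
```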
31 changes: 29 additions & 2 deletions tests/unit/crawlers/_basic/test_basic_crawler.py
@@ -732,7 +732,29 @@ async def test_crawler_push_and_export_data(tmp_path: Path) -> None:
{'id': 1, 'test': 'test'},
{'id': 2, 'test': 'test'},
]
assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n'

# On Windows, text mode file writes convert \n to \r\n, resulting in \r\n line endings.
# On Unix/Linux, \n remains as \n.
if sys.platform == 'win32':
assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n'
else:
assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\n0,test\n1,test\n2,test\n'


async def test_crawler_export_data_additional_kwargs(tmp_path: Path) -> None:
crawler = BasicCrawler()
dataset = await Dataset.open()

await dataset.push_data({'z': 1, 'a': 2})

json_path = tmp_path / 'dataset.json'
csv_path = tmp_path / 'dataset.csv'

await crawler.export_data(path=json_path, sort_keys=True, separators=(',', ':'))
await crawler.export_data(path=csv_path, delimiter=';', lineterminator='\n')

assert json_path.read_text() == '[{"a":2,"z":1}]'
assert csv_path.read_text() == 'z;a\n1;2\n'


async def test_context_push_and_export_data(tmp_path: Path) -> None:
@@ -754,7 +776,12 @@ async def handler(context: BasicCrawlingContext) -> None:
{'id': 2, 'test': 'test'},
]

assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n'
# On Windows, text mode file writes convert \n to \r\n, resulting in \r\n line endings.
# On Unix/Linux, \n remains as \n.
if sys.platform == 'win32':
assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n'
else:
assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\n0,test\n1,test\n2,test\n'


async def test_context_update_kv_store() -> None: