Skip to content
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20260212002508389038.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "add streaming to the first two workflows"
}
6 changes: 6 additions & 0 deletions packages/graphrag-input/graphrag_input/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,18 @@

import csv
import logging
import sys

from graphrag_input.structured_file_reader import StructuredFileReader
from graphrag_input.text_document import TextDocument

logger = logging.getLogger(__name__)

# Raise the csv module's per-field size limit so very large cells can be read.
# The first attempt requests sys.maxsize; if the csv module rejects that value
# with OverflowError, fall back to a fixed 100 MiB limit instead.
# NOTE(review): the overflow presumably happens on platforms where sys.maxsize
# does not fit the integer type csv uses internally — confirm.
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    csv.field_size_limit(100 * 1024 * 1024)


class CSVFileReader(StructuredFileReader):
"""Reader implementation for csv files."""
Expand Down
4 changes: 4 additions & 0 deletions packages/graphrag-storage/graphrag_storage/file_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,10 @@ async def get_creation_date(self, key: str) -> str:

return get_timestamp_formatted_with_local_tz(creation_time_utc)

def get_path(self, key: str) -> Path:
    """Resolve a storage key to its full path under this storage's base directory.

    Exposed so callers can open the underlying file directly (streaming access).

    Args:
        key: Storage key, joined onto the base directory.

    Returns:
        The joined path for ``key``.
    """
    base_dir = self._base_dir
    return _join_path(base_dir, key)


def _join_path(file_path: Path, file_name: str) -> Path:
"""Join a path and a file. Independent of the OS."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

"""Table provider module for GraphRAG storage."""

from .table import Table
from .table_provider import TableProvider

__all__ = ["TableProvider"]
__all__ = ["Table", "TableProvider"]
165 changes: 165 additions & 0 deletions packages/graphrag-storage/graphrag_storage/tables/csv_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
# Copyright (c) 2025 Microsoft Corporation.
# Licensed under the MIT License

"""A CSV-based implementation of the Table abstraction for streaming row access."""

from __future__ import annotations

import csv
import inspect
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Any

import aiofiles

from graphrag_storage.file_storage import FileStorage
from graphrag_storage.tables.table import RowTransformer, Table

if TYPE_CHECKING:
from collections.abc import AsyncIterator
from io import TextIOWrapper

from graphrag_storage import Storage

# Raise the csv module's per-field size limit so very large cells can be read.
# The first attempt requests sys.maxsize; if the csv module rejects that value
# with OverflowError, fall back to a fixed 100 MiB limit instead.
# NOTE(review): the overflow presumably happens on platforms where sys.maxsize
# does not fit the integer type csv uses internally — confirm.
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    csv.field_size_limit(100 * 1024 * 1024)


def _identity(row: dict[str, Any]) -> Any:
"""Return row unchanged (default transformer)."""
return row


def _apply_transformer(transformer: RowTransformer, row: dict[str, Any]) -> Any:
"""Apply transformer to row, handling both callables and classes.

If transformer is a class (e.g., Pydantic model), calls it with **row.
Otherwise calls it with row as positional argument.
"""
if inspect.isclass(transformer):
return transformer(**row)
return transformer(row)


class CSVTable(Table):
    """Row-by-row streaming interface for CSV tables.

    Rows are read from and written to ``<table_name>.csv`` inside the given
    storage. File access is only performed when the storage is a
    ``FileStorage``; for any other backend iteration yields nothing,
    ``length`` returns 0 and ``write`` is a no-op.
    """

    def __init__(
        self,
        storage: Storage,
        table_name: str,
        transformer: RowTransformer | None = None,
        truncate: bool = True,
        encoding: str = "utf-8",
    ):
        """Initialize with storage backend and table name.

        Args:
            storage: Storage instance. Only ``FileStorage`` is acted upon
                by this class.
            table_name: Name of the table (e.g., "documents"); the backing
                file key is ``f"{table_name}.csv"``.
            transformer: Optional callable or class applied to each row
                before it is yielded. Callables receive the row dict; classes
                are instantiated with ``**row``. Defaults to identity
                (no transformation).
            truncate: If True (default), truncate file on first write.
                If False, append to existing file.
            encoding: Character encoding for reading/writing CSV files.
                Defaults to "utf-8".
        """
        self._storage = storage
        self._table_name = table_name
        self._file_key = f"{table_name}.csv"
        self._transformer = transformer or _identity
        self._truncate = truncate
        self._encoding = encoding
        # Write-side state is created lazily on the first write() call.
        self._write_file: TextIOWrapper | None = None
        self._writer: csv.DictWriter | None = None
        self._header_written = False

    def __aiter__(self) -> AsyncIterator[Any]:
        """Iterate through rows one at a time.

        The transformer is applied to each row before yielding.
        If transformer is a Pydantic model, yields model instances.

        Yields
        ------
            Any:
                Each row as dict or transformed type (e.g., Pydantic model).
        """
        return self._aiter_impl()

    async def _aiter_impl(self) -> AsyncIterator[Any]:
        """Implement async iteration over rows.

        Reads the CSV file with ``csv.DictReader`` and yields each row after
        passing it through the configured transformer.
        """
        if isinstance(self._storage, FileStorage):
            file_path = self._storage.get_path(self._file_key)
            # newline="" lets the csv module handle line endings itself, so
            # "\r" / "\r\n" embedded in quoted fields are preserved rather than
            # being rewritten by universal-newline translation.
            with file_path.open("r", encoding=self._encoding, newline="") as f:
                for row in csv.DictReader(f):
                    yield _apply_transformer(self._transformer, row)

    async def length(self) -> int:
        """Return the number of data rows in the table (header excluded).

        The file is parsed with ``csv.DictReader`` so the result matches the
        number of rows iteration yields: newlines inside quoted fields are
        not counted as rows, a missing trailing newline does not drop the
        last row, and an empty file reports 0 instead of a negative value.

        Returns
        -------
            int:
                Row count, or 0 when the storage is not a ``FileStorage``.
        """
        if not isinstance(self._storage, FileStorage):
            return 0
        file_path = self._storage.get_path(self._file_key)
        with file_path.open("r", encoding=self._encoding, newline="") as f:
            # DictReader consumes the header itself and skips blank lines.
            return sum(1 for _ in csv.DictReader(f))

    async def has(self, row_id: str) -> bool:
        """Check if row with given ID exists.

        Scans the table sequentially and compares each row's ``id`` (dict key
        or attribute) with ``row_id``.
        """
        async for row in self:
            # Handle both dict and object (e.g., Pydantic model)
            if isinstance(row, dict):
                if row.get("id") == row_id:
                    return True
            elif getattr(row, "id", None) == row_id:
                return True
        return False

    async def write(self, row: dict[str, Any]) -> None:
        """Write a single row to the CSV file.

        On first write, opens the file. If truncate=True, overwrites any existing
        file and writes header. If truncate=False, appends to existing file
        (skips header if file exists and is non-empty).

        The column order is taken from the keys of the first row written.

        Args
        ----
            row: Dictionary representing a single row to write.
        """
        if isinstance(self._storage, FileStorage) and self._write_file is None:
            file_path = self._storage.get_path(self._file_key)
            file_path.parent.mkdir(parents=True, exist_ok=True)
            file_exists = file_path.exists() and file_path.stat().st_size > 0
            mode = "w" if self._truncate else "a"
            write_header = self._truncate or not file_exists
            # The handle stays open across write() calls; close() releases it.
            self._write_file = file_path.open(
                mode, encoding=self._encoding, newline=""
            )
            self._writer = csv.DictWriter(self._write_file, fieldnames=list(row.keys()))
            if write_header:
                self._writer.writeheader()
            self._header_written = write_header

        if self._writer is not None:
            self._writer.writerow(row)

    async def close(self) -> None:
        """Flush buffered writes and release resources.

        Closes the file handle if writing was performed and resets the
        write-side state so a later write() reopens the file.
        """
        if self._write_file is not None:
            self._write_file.close()
            self._write_file = None
            self._writer = None
            self._header_written = False
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024 Microsoft Corporation.
# Copyright (c) 2025 Microsoft Corporation.
# Licensed under the MIT License

"""CSV-based table provider implementation."""
Expand All @@ -9,7 +9,10 @@

import pandas as pd

from graphrag_storage.file_storage import FileStorage
from graphrag_storage.storage import Storage
from graphrag_storage.tables.csv_table import CSVTable
from graphrag_storage.tables.table import RowTransformer
from graphrag_storage.tables.table_provider import TableProvider

logger = logging.getLogger(__name__)
Expand All @@ -32,6 +35,9 @@ def __init__(self, storage: Storage, **kwargs) -> None:
**kwargs: Any
Additional keyword arguments (currently unused).
"""
if not isinstance(storage, FileStorage):
msg = "CSVTableProvider only works with FileStorage backends for now. "
raise TypeError(msg)
self._storage = storage

async def read_dataframe(self, table_name: str) -> pd.DataFrame:
Expand Down Expand Up @@ -108,3 +114,27 @@ def list(self) -> list[str]:
file.replace(".csv", "")
for file in self._storage.find(re.compile(r"\.csv$"))
]

def open(
    self,
    table_name: str,
    transformer: RowTransformer | None = None,
    truncate: bool = True,
    encoding: str = "utf-8",
) -> CSVTable:
    """Create a streaming ``CSVTable`` backed by this provider's storage.

    Args:
        table_name: Name of the table to open.
        transformer: Optional callable applied to each row.
        truncate: If True, the file is truncated on first write.
        encoding: Character encoding used for reading/writing the CSV file.
            Defaults to "utf-8".

    Returns:
        A ``CSVTable`` constructed with the arguments above.
    """
    table = CSVTable(
        self._storage,
        table_name,
        transformer=transformer,
        truncate=truncate,
        encoding=encoding,
    )
    return table
Loading