Commit 782fc70

Merge pull request rapidsai#18830 from rapidsai/branch-25.06
Forward-merge branch-25.06 into branch-25.08
2 parents: 773e21f + 9e3e5b4

3 files changed: +67, -13 lines changed

python/cudf_polars/cudf_polars/dsl/ir.py

Lines changed: 15 additions & 6 deletions
@@ -408,9 +408,18 @@ def __init__(
                     "Multi-character comment prefix not supported for CSV reader"
                 )
             if not self.reader_options["has_header"]:
-                # Need to do some file introspection to get the number
-                # of columns so that column projection works right.
-                raise NotImplementedError("Reading CSV without header")
+                # TODO: To support reading headerless CSV files without requiring new
+                # column names, we would need to do file introspection to infer the number
+                # of columns so column projection works right.
+                reader_schema = self.reader_options.get("schema")
+                if not (
+                    reader_schema
+                    and isinstance(schema, dict)
+                    and "fields" in reader_schema
+                ):
+                    raise NotImplementedError(
+                        "Reading CSV without header requires user-provided column names via new_columns"
+                    )
         elif self.typ == "ndjson":
             # TODO: consider handling the low memory option here
             # (maybe use chunked JSON reader)

@@ -510,8 +519,8 @@ def read_csv_header(
             # file provides column names
             column_names = None
             usecols = with_columns
-            # TODO: support has_header=False
-            header = 0
+            has_header = reader_options["has_header"]
+            header = 0 if has_header else -1

             # polars defaults to no null recognition
             null_values = [""]

@@ -557,7 +566,7 @@ def read_csv_header(
                 options.set_names([str(name) for name in column_names])
             else:
                 if (
-                    not POLARS_VERSION_LT_128 and skip_rows > header
+                    not POLARS_VERSION_LT_128 and header > -1 and skip_rows > header
                 ):  # pragma: no cover
                     # We need to read the header otherwise we would skip it
                     column_names = read_csv_header(path, str(sep))
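
In effect, this change lets the GPU engine scan a headerless CSV as long as the user supplies column names. A minimal user-facing sketch, assuming an illustrative file path and column names (not taken from this commit):

import polars as pl

# Headerless CSV plus new_columns: the schema comes from the user-supplied
# names, so no file introspection is needed for column projection.
q = pl.scan_csv(
    "data_no_header.csv",  # hypothetical path
    has_header=False,
    new_columns=["a", "b", "c"],
).select(pl.col("a"), pl.col("c"))

# With raise_on_fail=True an unsupported query raises instead of silently
# falling back to the CPU engine.
result = q.collect(engine=pl.GPUEngine(raise_on_fail=True))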

python/cudf_polars/cudf_polars/testing/io.py

Lines changed: 7 additions & 2 deletions
@@ -23,6 +23,7 @@ def make_partitioned_source(
     *,
     n_files: int = 1,
     row_group_size: int | None = None,
+    write_kwargs: dict | None = None,
 ) -> None:
     """
     Write the Polars DataFrame to one or more files of the desired format.

@@ -39,19 +40,23 @@ def make_partitioned_source(
         If greater than 1, splits the data into multiple files.
     row_group_size : optional, int
         Only used for Parquet. Specifies the row group size per file.
+    write_kwargs : dict, optional
+        Additional keyword arguments to pass to the write_* functions.
     """
     path = Path(path)
+    write_kwargs = write_kwargs or {}

     def write(part: pl.DataFrame, file_path: Path) -> None:
         match fmt:
             case "csv":
-                part.write_csv(file_path)
+                part.write_csv(file_path, **write_kwargs)
             case "ndjson":
-                part.write_ndjson(file_path)
+                part.write_ndjson(file_path, **write_kwargs)
             case "parquet" | "chunked_parquet":
                 part.write_parquet(
                     file_path,
                     row_group_size=row_group_size or (len(part) // 2),
+                    **write_kwargs,
                 )
             case _:
                 raise ValueError(f"Unsupported format: {fmt}")
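
A minimal usage sketch of the extended helper, mirroring how the new tests below call it (the import path follows the file location; the DataFrame and output path are illustrative):

import polars as pl
from cudf_polars.testing.io import make_partitioned_source

df = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# write_kwargs is forwarded verbatim to the underlying write_csv /
# write_ndjson / write_parquet call; here it produces a headerless CSV.
make_partitioned_source(
    df, "/tmp/test.csv", "csv", write_kwargs={"include_header": False}
)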

python/cudf_polars/tests/test_scan.py

Lines changed: 45 additions & 5 deletions
@@ -63,7 +63,7 @@ def mask(request):
     params=[None, (1, 1)],
     ids=["no_slice", "slice_second"],
 )
-def slice(request):
+def zlice(request):
     # For use in testing that we handle
     # polars slice pushdown correctly
     return request.param

@@ -85,7 +85,7 @@ def scan_fn(format):


 def test_scan(
-    tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, slice, request
+    tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, zlice, request
 ):
     name, offset = row_index
     is_chunked = format == "chunked_parquet"

@@ -102,7 +102,7 @@ def test_scan(
             pytest.mark.xfail(
                 condition=(
                     not POLARS_VERSION_LT_128
-                    and slice is not None
+                    and zlice is not None
                     and scan_fn is pl.scan_ndjson
                 ),
                 reason="slice pushdown not supported in the libcudf JSON reader",

@@ -116,8 +116,8 @@ def test_scan(
     )
     engine = pl.GPUEngine(raise_on_fail=True, parquet_options={"chunked": is_chunked})

-    if slice is not None:
-        q = q.slice(*slice)
+    if zlice is not None:
+        q = q.slice(*zlice)
     if mask is not None:
         q = q.filter(mask)
     if columns is not None:

@@ -422,3 +422,43 @@ def test_select_arbitrary_order_with_row_index_column(request, tmp_path):
         [pl.col("a"), pl.col("foo")]
     )
     assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize(
+    "has_header,new_columns",
+    [
+        (True, None),
+        (False, ["a", "b", "c"]),
+    ],
+)
+def test_scan_csv_with_and_without_header(
+    df, tmp_path, has_header, new_columns, row_index, columns, zlice
+):
+    path = tmp_path / "test.csv"
+    make_partitioned_source(
+        df, path, "csv", write_kwargs={"include_header": has_header}
+    )
+
+    name, offset = row_index
+
+    q = pl.scan_csv(
+        path,
+        has_header=has_header,
+        new_columns=new_columns,
+        row_index_name=name,
+        row_index_offset=offset,
+    )
+
+    if zlice is not None:
+        q = q.slice(*zlice)
+    if columns is not None:
+        q = q.select(columns)
+
+    assert_gpu_result_equal(q)
+
+
+def test_scan_csv_without_header_and_new_column_names_raises(df, tmp_path):
+    path = tmp_path / "test.csv"
+    make_partitioned_source(df, path, "csv", write_kwargs={"include_header": False})
+    q = pl.scan_csv(path, has_header=False)
+    assert_ir_translation_raises(q, NotImplementedError)
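
Aside: the slice -> zlice fixture rename above is presumably to stop the parameter from shadowing Python's builtin slice inside the test bodies; a tiny illustration of that hazard (names are made up):

def test_example(slice):  # fixture value, e.g. (1, 1) or None
    # The builtin `slice` is shadowed here, so constructing a slice object
    # by name fails at runtime.
    window = slice(0, 2)  # TypeError: 'tuple' object is not callable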
