From dfe8df994d79dc212885731901026ffb60b7c2ee Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 15 Oct 2025 09:18:00 -0400 Subject: [PATCH 01/12] pin <1.35 --- .../all_cuda-129_arch-aarch64.yaml | 2 +- .../all_cuda-129_arch-x86_64.yaml | 2 +- .../all_cuda-130_arch-aarch64.yaml | 2 +- .../all_cuda-130_arch-x86_64.yaml | 2 +- conda/recipes/cudf-polars/recipe.yaml | 2 +- dependencies.yaml | 2 +- .../dsl/expressions/aggregation.py | 39 +++- .../cudf_polars/dsl/expressions/binaryop.py | 8 +- .../cudf_polars/dsl/expressions/boolean.py | 6 +- .../cudf_polars/dsl/expressions/datetime.py | 6 +- .../cudf_polars/dsl/expressions/string.py | 8 +- .../cudf_polars/dsl/expressions/struct.py | 6 +- .../cudf_polars/dsl/expressions/ternary.py | 10 +- python/cudf_polars/cudf_polars/dsl/ir.py | 23 ++- .../cudf_polars/cudf_polars/dsl/translate.py | 187 ++++++++++++------ .../cudf_polars/typing/__init__.py | 62 +++--- .../cudf_polars/cudf_polars/utils/versions.py | 1 + python/cudf_polars/pyproject.toml | 2 +- .../tests/dsl/test_serialization.py | 4 +- python/cudf_polars/tests/test_select.py | 3 +- .../cudf_polars/tests/testing/test_asserts.py | 15 +- 21 files changed, 267 insertions(+), 125 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index a3bdcf0f4f2..d7c77079283 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -66,7 +66,7 @@ dependencies: - pandas - pandas>=2.0,<2.4.0dev0 - pandoc -- polars>=1.29,<1.34 +- polars>=1.29,<1.35 - pre-commit - pyarrow>=15.0.0 - pydata-sphinx-theme>=0.15.4 diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 469f3996091..cc9309eae17 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -67,7 +67,7 @@ dependencies: - pandas - pandas>=2.0,<2.4.0dev0 - pandoc -- polars>=1.29,<1.34 +- polars>=1.29,<1.35 - pre-commit - pyarrow>=15.0.0 - pydata-sphinx-theme>=0.15.4 diff --git a/conda/environments/all_cuda-130_arch-aarch64.yaml b/conda/environments/all_cuda-130_arch-aarch64.yaml index 38ed98e4f98..3ccb13c4420 100644 --- a/conda/environments/all_cuda-130_arch-aarch64.yaml +++ b/conda/environments/all_cuda-130_arch-aarch64.yaml @@ -66,7 +66,7 @@ dependencies: - pandas - pandas>=2.0,<2.4.0dev0 - pandoc -- polars>=1.29,<1.34 +- polars>=1.29,<1.35 - pre-commit - pyarrow>=15.0.0 - pydata-sphinx-theme>=0.15.4 diff --git a/conda/environments/all_cuda-130_arch-x86_64.yaml b/conda/environments/all_cuda-130_arch-x86_64.yaml index 9d54dd734ac..cd543e3d863 100644 --- a/conda/environments/all_cuda-130_arch-x86_64.yaml +++ b/conda/environments/all_cuda-130_arch-x86_64.yaml @@ -67,7 +67,7 @@ dependencies: - pandas - pandas>=2.0,<2.4.0dev0 - pandoc -- polars>=1.29,<1.34 +- polars>=1.29,<1.35 - pre-commit - pyarrow>=15.0.0 - pydata-sphinx-theme>=0.15.4 diff --git a/conda/recipes/cudf-polars/recipe.yaml b/conda/recipes/cudf-polars/recipe.yaml index d83a0f601df..745aea6e370 100644 --- a/conda/recipes/cudf-polars/recipe.yaml +++ b/conda/recipes/cudf-polars/recipe.yaml @@ -51,7 +51,7 @@ requirements: - nvidia-ml-py>=12 - python - pylibcudf =${{ version }} - - polars >=1.29,<1.34 + - polars >=1.29,<1.35 - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - if: python == "3.10" then: typing_extensions diff --git a/dependencies.yaml b/dependencies.yaml index eb543fc2657..9238dbd4649 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -727,7 +727,7 @@ dependencies: # 'nvidia-ml-py' provides the 'pynvml' module - &nvidia_ml_py nvidia-ml-py>=12 - packaging - - polars>=1.29,<1.34 + - polars>=1.29,<1.35 specific: - output_types: [requirements, pyproject] matrices: diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py index fbbe8a0bd75..8a2c604d39f 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py @@ -22,7 +22,14 @@ class Agg(Expr): - __slots__ = ("context", "name", "op", "options", "request") + __slots__ = ( + "_nunique_include_nulls", + "context", + "name", + "op", + "options", + "request", + ) _non_child = ("dtype", "name", "options", "context") def __init__( @@ -43,6 +50,10 @@ def __init__( raise NotImplementedError( f"Unsupported aggregation {name=}" ) # pragma: no cover; all valid aggs are supported + if name == "sum": + child = children[0] + if plc.traits.is_fixed_point(child.dtype.plc_type): + self.dtype = child.dtype # TODO: nan handling in groupby case if name == "min": req = plc.aggregation.min() @@ -52,6 +63,7 @@ def __init__( req = plc.aggregation.median() elif name == "n_unique": # TODO: datatype of result + self._nunique_include_nulls = True req = plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE) elif name == "first" or name == "last": req = None @@ -159,6 +171,20 @@ def _reduce( and self.dtype.plc_type.id() in {plc.TypeId.FLOAT32, plc.TypeId.FLOAT64} ): column = column.astype(self.dtype) + + if column.size == 0 or column.null_count == column.size: + z = None + if self.name == "n_unique": + include_nulls = getattr(self, "_nunique_include_nulls", False) + if column.size == 0: + z = 0 + else: + z = 1 if include_nulls else 0 + return Column( + plc.Column.from_scalar(plc.Scalar.from_py(z, self.dtype.plc_type), 1), + name=column.name, + dtype=self.dtype, + ) return Column( plc.Column.from_scalar( plc.reduce.reduce(column.obj, request, self.dtype.plc_type), 1 @@ -188,6 +214,17 @@ def _sum(self, column: Column) -> Column: name=column.name, dtype=self.dtype, ) + if plc.traits.is_fixed_point(column.dtype.plc_type): + return Column( + plc.Column.from_scalar( + plc.reduce.reduce( + column.obj, plc.aggregation.sum(), column.dtype.plc_type + ), + 1, + ), + name=column.name, + dtype=column.dtype, + ) return self._reduce(column, request=plc.aggregation.sum()) def _min(self, column: Column, *, propagate_nans: bool) -> Column: diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py index 9d64ca99c3a..8a397532b5a 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py @@ -8,13 +8,15 @@ from typing import TYPE_CHECKING, ClassVar -from polars.polars import _expr_nodes as pl_expr +from polars import polars import pylibcudf as plc from cudf_polars.containers import Column from cudf_polars.dsl.expressions.base import ExecutionContext, Expr +pl_expr = polars._expr_nodes + if TYPE_CHECKING: from cudf_polars.containers import DataFrame, DataType @@ -59,7 +61,9 @@ def __init__( plc.binaryop.BinaryOperator.LOGICAL_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, } - _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { + _MAPPING: ClassVar[ + dict[polars._expr_nodes.Operator, plc.binaryop.BinaryOperator] + ] = { pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, pl_expr.Operator.NotEq: plc.binaryop.BinaryOperator.NOT_EQUAL, diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py index 27994b71bfa..79b3710cefb 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py @@ -22,7 +22,9 @@ from typing_extensions import Self import polars.type_aliases as pl_types - from polars.polars import _expr_nodes as pl_expr + from polars import polars + + pl_expr = polars._expr_nodes from cudf_polars.containers import DataFrame @@ -53,7 +55,7 @@ class Name(IntEnum): Not = auto() @classmethod - def from_polars(cls, obj: pl_expr.BooleanFunction) -> Self: + def from_polars(cls, obj: polars._expr_nodes.BooleanFunction) -> Self: """Convert from polars' `BooleanFunction`.""" try: function, name = str(obj).split(".", maxsplit=1) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py index b36ab7da033..cb07fee44fc 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py @@ -17,7 +17,9 @@ if TYPE_CHECKING: from typing_extensions import Self - from polars.polars import _expr_nodes as pl_expr + from polars import polars + + pl_expr = polars._expr_nodes from cudf_polars.containers import DataFrame, DataType @@ -75,7 +77,7 @@ class Name(IntEnum): Year = auto() @classmethod - def from_polars(cls, obj: pl_expr.TemporalFunction) -> Self: + def from_polars(cls, obj: polars._expr_nodes.TemporalFunction) -> Self: """Convert from polars' `TemporalFunction`.""" try: function, name = str(obj).split(".", maxsplit=1) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py index cf2cfe04552..1239f33c91b 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -12,8 +12,8 @@ from enum import IntEnum, auto from typing import TYPE_CHECKING, Any, ClassVar +from polars import polars from polars.exceptions import InvalidOperationError -from polars.polars import dtype_str_repr import pylibcudf as plc @@ -23,10 +23,12 @@ from cudf_polars.dsl.utils.reshape import broadcast from cudf_polars.utils.versions import POLARS_VERSION_LT_132 +dtype_str_repr = polars.dtype_str_repr + if TYPE_CHECKING: from typing_extensions import Self - from polars.polars import _expr_nodes as pl_expr + pl_expr = polars._expr_nodes from cudf_polars.containers import DataFrame, DataType @@ -101,7 +103,7 @@ class Name(IntEnum): ZFill = auto() @classmethod - def from_polars(cls, obj: pl_expr.StringFunction) -> Self: + def from_polars(cls, obj: polars._expr_nodes.StringFunction) -> Self: """Convert from polars' `StringFunction`.""" try: function, name = str(obj).split(".", maxsplit=1) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/struct.py b/python/cudf_polars/cudf_polars/dsl/expressions/struct.py index 5d3ff47da97..2355e94e467 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/struct.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/struct.py @@ -18,7 +18,9 @@ if TYPE_CHECKING: from typing_extensions import Self - from polars.polars import _expr_nodes as pl_expr + from polars import polars + + pl_expr = polars._expr_nodes from cudf_polars.containers import DataFrame, DataType @@ -42,7 +44,7 @@ class Name(IntEnum): ) # https://github.com/pola-rs/polars/pull/23022#issuecomment-2933910958 @classmethod - def from_polars(cls, obj: pl_expr.StructFunction) -> Self: + def from_polars(cls, obj: polars._expr_nodes.StructFunction) -> Self: """Convert from polars' `StructFunction`.""" try: function, name = str(obj).split(".", maxsplit=1) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py index 1bf6ea476f1..279c47ec33d 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py @@ -34,15 +34,21 @@ def __init__( self.children = (when, then, otherwise) self.is_pointwise = True - def do_evaluate( + def do_evaluate( # noqa: D102 self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME ) -> Column: - """Evaluate this expression given a dataframe for context.""" when, then, otherwise = ( child.evaluate(df, context=context) for child in self.children ) + + if then.dtype.plc_type != self.dtype.plc_type: + then = then.astype(self.dtype) + if otherwise.dtype.plc_type != self.dtype.plc_type: + otherwise = otherwise.astype(self.dtype) + then_obj = then.obj_scalar if then.is_scalar else then.obj otherwise_obj = otherwise.obj_scalar if otherwise.is_scalar else otherwise.obj + return Column( plc.copying.copy_if_else(then_obj, otherwise_obj, when.obj), dtype=self.dtype, diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 91c4d86aa5f..62b8866747a 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -39,7 +39,7 @@ from cudf_polars.dsl.utils.reshape import broadcast from cudf_polars.dsl.utils.windows import range_window_bounds from cudf_polars.utils import dtypes -from cudf_polars.utils.versions import POLARS_VERSION_LT_131 +from cudf_polars.utils.versions import POLARS_VERSION_LT_131, POLARS_VERSION_LT_134 if TYPE_CHECKING: from collections.abc import Callable, Hashable, Iterable, Sequence @@ -47,7 +47,9 @@ from typing_extensions import Self - from polars.polars import _expr_nodes as pl_expr + from polars import polars + + pl_expr = polars._expr_nodes from cudf_polars.containers.dataframe import NamedColumn from cudf_polars.typing import CSECache, ClosedInterval, Schema, Slice as Zlice @@ -1687,6 +1689,21 @@ def _strip_predicate_casts(node: expr.Expr) -> expr.Expr: ): return child + if ( + not POLARS_VERSION_LT_134 + and isinstance(child, expr.ColRef) + and ( + plc.traits.is_floating_point(src.plc_type) + and plc.traits.is_floating_point(dst.plc_type) + ) + ) or ( + plc.traits.is_integral(src.plc_type) + and plc.traits.is_integral(dst.plc_type) + ): + return child + + return expr.Cast(dst, child) + if not node.children: return node return node.reconstruct([_strip_predicate_casts(child) for child in node.children]) @@ -1800,7 +1817,7 @@ def __reduce__(self) -> tuple[Any, ...]: options: tuple[ tuple[ str, - pl_expr.Operator | Iterable[pl_expr.Operator], + polars._expr_nodes.Operator | Iterable[polars._expr_nodes.Operator], ] | None, bool, diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 82bbcd5d61f..a66aa612e57 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -14,8 +14,7 @@ from typing_extensions import assert_never import polars as pl -import polars.polars as plrs -from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir +from polars import polars as plrs import pylibcudf as plc @@ -34,6 +33,7 @@ POLARS_VERSION_LT_131, POLARS_VERSION_LT_132, POLARS_VERSION_LT_133, + POLARS_VERSION_LT_134, POLARS_VERSION_LT_1323, ) @@ -228,7 +228,7 @@ def _translate_ir(node: Any, translator: Translator, schema: Schema) -> ir.IR: @_translate_ir.register -def _(node: pl_ir.PythonScan, translator: Translator, schema: Schema) -> ir.IR: +def _(node: plrs._ir_nodes.PythonScan, translator: Translator, schema: Schema) -> ir.IR: scan_fn, with_columns, source_type, predicate, nrows = node.options options = (scan_fn, with_columns, source_type, nrows) predicate = ( @@ -240,7 +240,7 @@ def _(node: pl_ir.PythonScan, translator: Translator, schema: Schema) -> ir.IR: @_translate_ir.register -def _(node: pl_ir.Scan, translator: Translator, schema: Schema) -> ir.IR: +def _(node: plrs._ir_nodes.Scan, translator: Translator, schema: Schema) -> ir.IR: typ, *options = node.scan_type paths = node.paths # Polars can produce a Scan with an empty ``node.paths`` (eg. the native @@ -293,7 +293,7 @@ def _(node: pl_ir.Scan, translator: Translator, schema: Schema) -> ir.IR: @_translate_ir.register -def _(node: pl_ir.Cache, translator: Translator, schema: Schema) -> ir.IR: +def _(node: plrs._ir_nodes.Cache, translator: Translator, schema: Schema) -> ir.IR: if POLARS_VERSION_LT_1323: # pragma: no cover refcount = node.cache_hits else: @@ -312,7 +312,9 @@ def _(node: pl_ir.Cache, translator: Translator, schema: Schema) -> ir.IR: @_translate_ir.register -def _(node: pl_ir.DataFrameScan, translator: Translator, schema: Schema) -> ir.IR: +def _( + node: plrs._ir_nodes.DataFrameScan, translator: Translator, schema: Schema +) -> ir.IR: return ir.DataFrameScan( schema, node.df, @@ -321,7 +323,7 @@ def _(node: pl_ir.DataFrameScan, translator: Translator, schema: Schema) -> ir.I @_translate_ir.register -def _(node: pl_ir.Select, translator: Translator, schema: Schema) -> ir.IR: +def _(node: plrs._ir_nodes.Select, translator: Translator, schema: Schema) -> ir.IR: with set_node(translator.visitor, node.input): inp = translator.translate_ir(n=None) exprs = [ @@ -331,7 +333,7 @@ def _(node: pl_ir.Select, translator: Translator, schema: Schema) -> ir.IR: @_translate_ir.register -def _(node: pl_ir.GroupBy, translator: Translator, schema: Schema) -> ir.IR: +def _(node: plrs._ir_nodes.GroupBy, translator: Translator, schema: Schema) -> ir.IR: with set_node(translator.visitor, node.input): inp = translator.translate_ir(n=None) keys = [ @@ -381,7 +383,7 @@ def _align_decimal_scales( @_translate_ir.register -def _(node: pl_ir.Join, translator: Translator, schema: Schema) -> ir.IR: +def _(node: plrs._ir_nodes.Join, translator: Translator, schema: Schema) -> ir.IR: # Join key dtypes are dependent on the schema of the left and # right inputs, so these must be translated with the relevant # input active. @@ -460,7 +462,7 @@ def _(node: pl_ir.Join, translator: Translator, schema: Schema) -> ir.IR: @_translate_ir.register -def _(node: pl_ir.HStack, translator: Translator, schema: Schema) -> ir.IR: +def _(node: plrs._ir_nodes.HStack, translator: Translator, schema: Schema) -> ir.IR: with set_node(translator.visitor, node.input): inp = translator.translate_ir(n=None) exprs = [ @@ -471,7 +473,7 @@ def _(node: pl_ir.HStack, translator: Translator, schema: Schema) -> ir.IR: @_translate_ir.register def _( - node: pl_ir.Reduce, translator: Translator, schema: Schema + node: plrs._ir_nodes.Reduce, translator: Translator, schema: Schema ) -> ir.IR: # pragma: no cover; polars doesn't emit this node yet with set_node(translator.visitor, node.input): inp = translator.translate_ir(n=None) @@ -482,7 +484,7 @@ def _( @_translate_ir.register -def _(node: pl_ir.Distinct, translator: Translator, schema: Schema) -> ir.IR: +def _(node: plrs._ir_nodes.Distinct, translator: Translator, schema: Schema) -> ir.IR: (keep, subset, maintain_order, zlice) = node.options keep = ir.Distinct._KEEP_MAP[keep] subset = frozenset(subset) if subset is not None else None @@ -497,7 +499,7 @@ def _(node: pl_ir.Distinct, translator: Translator, schema: Schema) -> ir.IR: @_translate_ir.register -def _(node: pl_ir.Sort, translator: Translator, schema: Schema) -> ir.IR: +def _(node: plrs._ir_nodes.Sort, translator: Translator, schema: Schema) -> ir.IR: with set_node(translator.visitor, node.input): inp = translator.translate_ir(n=None) by = [ @@ -512,14 +514,14 @@ def _(node: pl_ir.Sort, translator: Translator, schema: Schema) -> ir.IR: @_translate_ir.register -def _(node: pl_ir.Slice, translator: Translator, schema: Schema) -> ir.IR: +def _(node: plrs._ir_nodes.Slice, translator: Translator, schema: Schema) -> ir.IR: return ir.Slice( schema, node.offset, node.len, translator.translate_ir(n=node.input) ) @_translate_ir.register -def _(node: pl_ir.Filter, translator: Translator, schema: Schema) -> ir.IR: +def _(node: plrs._ir_nodes.Filter, translator: Translator, schema: Schema) -> ir.IR: with set_node(translator.visitor, node.input): inp = translator.translate_ir(n=None) mask = translate_named_expr(translator, n=node.predicate, schema=inp.schema) @@ -527,12 +529,16 @@ def _(node: pl_ir.Filter, translator: Translator, schema: Schema) -> ir.IR: @_translate_ir.register -def _(node: pl_ir.SimpleProjection, translator: Translator, schema: Schema) -> ir.IR: +def _( + node: plrs._ir_nodes.SimpleProjection, translator: Translator, schema: Schema +) -> ir.IR: return ir.Projection(schema, translator.translate_ir(n=node.input)) @_translate_ir.register -def _(node: pl_ir.MergeSorted, translator: Translator, schema: Schema) -> ir.IR: +def _( + node: plrs._ir_nodes.MergeSorted, translator: Translator, schema: Schema +) -> ir.IR: key = node.key inp_left = translator.translate_ir(n=node.input_left) inp_right = translator.translate_ir(n=node.input_right) @@ -545,7 +551,9 @@ def _(node: pl_ir.MergeSorted, translator: Translator, schema: Schema) -> ir.IR: @_translate_ir.register -def _(node: pl_ir.MapFunction, translator: Translator, schema: Schema) -> ir.IR: +def _( + node: plrs._ir_nodes.MapFunction, translator: Translator, schema: Schema +) -> ir.IR: name, *options = node.function return ir.MapFunction( schema, @@ -556,14 +564,14 @@ def _(node: pl_ir.MapFunction, translator: Translator, schema: Schema) -> ir.IR: @_translate_ir.register -def _(node: pl_ir.Union, translator: Translator, schema: Schema) -> ir.IR: +def _(node: plrs._ir_nodes.Union, translator: Translator, schema: Schema) -> ir.IR: return ir.Union( schema, node.options, *(translator.translate_ir(n=n) for n in node.inputs) ) @_translate_ir.register -def _(node: pl_ir.HConcat, translator: Translator, schema: Schema) -> ir.IR: +def _(node: plrs._ir_nodes.HConcat, translator: Translator, schema: Schema) -> ir.IR: return ir.HConcat( schema, False, # noqa: FBT003 @@ -572,7 +580,7 @@ def _(node: pl_ir.HConcat, translator: Translator, schema: Schema) -> ir.IR: @_translate_ir.register -def _(node: pl_ir.Sink, translator: Translator, schema: Schema) -> ir.IR: +def _(node: plrs._ir_nodes.Sink, translator: Translator, schema: Schema) -> ir.IR: payload = json.loads(node.payload) try: file = payload["File"] @@ -605,7 +613,7 @@ def _(node: pl_ir.Sink, translator: Translator, schema: Schema) -> ir.IR: def translate_named_expr( - translator: Translator, *, n: pl_expr.PyExprIR, schema: Schema + translator: Translator, *, n: plrs._expr_nodes.PyExprIR, schema: Schema ) -> expr.NamedExpr: """ Translate a polars-internal named expression IR object into our representation. @@ -651,15 +659,18 @@ def _translate_expr( @_translate_expr.register def _( - node: pl_expr.Function, translator: Translator, dtype: DataType, schema: Schema + node: plrs._expr_nodes.Function, + translator: Translator, + dtype: DataType, + schema: Schema, ) -> expr.Expr: name, *options = node.function_data options = tuple(options) - if isinstance(name, pl_expr.StringFunction): + if isinstance(name, plrs._expr_nodes.StringFunction): if name in { - pl_expr.StringFunction.StripChars, - pl_expr.StringFunction.StripCharsStart, - pl_expr.StringFunction.StripCharsEnd, + plrs._expr_nodes.StringFunction.StripChars, + plrs._expr_nodes.StringFunction.StripCharsStart, + plrs._expr_nodes.StringFunction.StripCharsEnd, }: column, chars = ( translator.translate_expr(n=n, schema=schema) for n in node.input @@ -688,8 +699,8 @@ def _( options, *(translator.translate_expr(n=n, schema=schema) for n in node.input), ) - elif isinstance(name, pl_expr.BooleanFunction): - if name == pl_expr.BooleanFunction.IsBetween: + elif isinstance(name, plrs._expr_nodes.BooleanFunction): + if name == plrs._expr_nodes.BooleanFunction.IsBetween: column, lo, hi = ( translator.translate_expr(n=n, schema=schema) for n in node.input ) @@ -707,19 +718,19 @@ def _( options, *(translator.translate_expr(n=n, schema=schema) for n in node.input), ) - elif isinstance(name, pl_expr.TemporalFunction): + elif isinstance(name, plrs._expr_nodes.TemporalFunction): # functions for which evaluation of the expression may not return # the same dtype as polars, either due to libcudf returning a different # dtype, or due to our internal processing affecting what libcudf returns needs_cast = { - pl_expr.TemporalFunction.Year, - pl_expr.TemporalFunction.Month, - pl_expr.TemporalFunction.Day, - pl_expr.TemporalFunction.WeekDay, - pl_expr.TemporalFunction.Hour, - pl_expr.TemporalFunction.Minute, - pl_expr.TemporalFunction.Second, - pl_expr.TemporalFunction.Millisecond, + plrs._expr_nodes.TemporalFunction.Year, + plrs._expr_nodes.TemporalFunction.Month, + plrs._expr_nodes.TemporalFunction.Day, + plrs._expr_nodes.TemporalFunction.WeekDay, + plrs._expr_nodes.TemporalFunction.Hour, + plrs._expr_nodes.TemporalFunction.Minute, + plrs._expr_nodes.TemporalFunction.Second, + plrs._expr_nodes.TemporalFunction.Millisecond, } result_expr = expr.TemporalFunction( dtype, @@ -730,7 +741,9 @@ def _( if name in needs_cast: return expr.Cast(dtype, result_expr) return result_expr - elif not POLARS_VERSION_LT_131 and isinstance(name, pl_expr.StructFunction): + elif not POLARS_VERSION_LT_131 and isinstance( + name, plrs._expr_nodes.StructFunction + ): return expr.StructFunction( dtype, expr.StructFunction.Name.from_polars(name), @@ -756,14 +769,19 @@ def _( ) else: (child, base) = children - return expr.Cast( - DataType(pl.Float64()), - expr.BinOp( - dtype, - plc.binaryop.BinaryOperator.LOG_BASE, - child, - expr.Literal(dtype, base.value), - ), + res = expr.BinOp( + dtype, + plc.binaryop.BinaryOperator.LOG_BASE, + child, + expr.Literal(dtype, base.value), + ) + return ( + res + if not POLARS_VERSION_LT_134 + else expr.Cast( + DataType(pl.Float64()), + res, + ) ) elif name == "pow": return expr.BinOp(dtype, plc.binaryop.BinaryOperator.POW, *children) @@ -775,9 +793,12 @@ def _( @_translate_expr.register def _( - node: pl_expr.Window, translator: Translator, dtype: DataType, schema: Schema + node: plrs._expr_nodes.Window, + translator: Translator, + dtype: DataType, + schema: Schema, ) -> expr.Expr: - if isinstance(node.options, pl_expr.RollingGroupOptions): + if isinstance(node.options, plrs._expr_nodes.RollingGroupOptions): # pl.col("a").rolling(...) with set_expr_context(translator, ExecutionContext.ROLLING): agg = translator.translate_expr(n=node.function, schema=schema) @@ -819,7 +840,7 @@ def _( for agg in named_aggs } return replace([named_post_agg.value], replacements)[0] - elif isinstance(node.options, pl_expr.WindowMapping): + elif isinstance(node.options, plrs._expr_nodes.WindowMapping): # pl.col("a").over(...) with set_expr_context(translator, ExecutionContext.WINDOW): agg = translator.translate_expr(n=node.function, schema=schema) @@ -860,7 +881,10 @@ def _( @_translate_expr.register def _( - node: pl_expr.Literal, translator: Translator, dtype: DataType, schema: Schema + node: plrs._expr_nodes.Literal, + translator: Translator, + dtype: DataType, + schema: Schema, ) -> expr.Expr: if isinstance(node.value, plrs.PySeries): return expr.LiteralColumn(dtype, pl.Series._from_pyseries(node.value)) @@ -872,7 +896,7 @@ def _( @_translate_expr.register def _( - node: pl_expr.Sort, translator: Translator, dtype: DataType, schema: Schema + node: plrs._expr_nodes.Sort, translator: Translator, dtype: DataType, schema: Schema ) -> expr.Expr: # TODO: raise in groupby return expr.Sort( @@ -882,7 +906,10 @@ def _( @_translate_expr.register def _( - node: pl_expr.SortBy, translator: Translator, dtype: DataType, schema: Schema + node: plrs._expr_nodes.SortBy, + translator: Translator, + dtype: DataType, + schema: Schema, ) -> expr.Expr: options = node.sort_options return expr.SortBy( @@ -895,7 +922,10 @@ def _( @_translate_expr.register def _( - node: pl_expr.Slice, translator: Translator, dtype: DataType, schema: Schema + node: plrs._expr_nodes.Slice, + translator: Translator, + dtype: DataType, + schema: Schema, ) -> expr.Expr: offset = translator.translate_expr(n=node.offset, schema=schema) length = translator.translate_expr(n=node.length, schema=schema) @@ -911,7 +941,10 @@ def _( @_translate_expr.register def _( - node: pl_expr.Gather, translator: Translator, dtype: DataType, schema: Schema + node: plrs._expr_nodes.Gather, + translator: Translator, + dtype: DataType, + schema: Schema, ) -> expr.Expr: return expr.Gather( dtype, @@ -922,7 +955,10 @@ def _( @_translate_expr.register def _( - node: pl_expr.Filter, translator: Translator, dtype: DataType, schema: Schema + node: plrs._expr_nodes.Filter, + translator: Translator, + dtype: DataType, + schema: Schema, ) -> expr.Expr: return expr.Filter( dtype, @@ -933,7 +969,7 @@ def _( @_translate_expr.register def _( - node: pl_expr.Cast, translator: Translator, dtype: DataType, schema: Schema + node: plrs._expr_nodes.Cast, translator: Translator, dtype: DataType, schema: Schema ) -> expr.Expr: inner = translator.translate_expr(n=node.expr, schema=schema) # Push casts into literals so we can handle Cast(Literal(Null)) @@ -948,19 +984,25 @@ def _( @_translate_expr.register def _( - node: pl_expr.Column, translator: Translator, dtype: DataType, schema: Schema + node: plrs._expr_nodes.Column, + translator: Translator, + dtype: DataType, + schema: Schema, ) -> expr.Expr: return expr.Col(dtype, node.name) @_translate_expr.register def _( - node: pl_expr.Agg, translator: Translator, dtype: DataType, schema: Schema + node: plrs._expr_nodes.Agg, translator: Translator, dtype: DataType, schema: Schema ) -> expr.Expr: agg_name = node.name args = [translator.translate_expr(n=arg, schema=schema) for arg in node.arguments] - if agg_name not in ("count", "n_unique", "mean", "median", "quantile"): + aggs = ["count", "n_unique", "mean", "median"] + if POLARS_VERSION_LT_134: + aggs.append("quantile") + if agg_name not in aggs: args = [ expr.Cast(dtype, arg) if plc.traits.is_fixed_point(arg.dtype.plc_type) @@ -978,7 +1020,10 @@ def _( @_translate_expr.register def _( - node: pl_expr.Ternary, translator: Translator, dtype: DataType, schema: Schema + node: plrs._expr_nodes.Ternary, + translator: Translator, + dtype: DataType, + schema: Schema, ) -> expr.Expr: return expr.Ternary( dtype, @@ -990,7 +1035,7 @@ def _( @_translate_expr.register def _( - node: pl_expr.BinaryExpr, + node: plrs._expr_nodes.BinaryExpr, translator: Translator, dtype: DataType, schema: Schema, @@ -1000,10 +1045,10 @@ def _( if ( POLARS_VERSION_LT_133 and plc.traits.is_boolean(dtype.plc_type) - and node.op == pl_expr.Operator.TrueDivide + and node.op == plrs._expr_nodes.Operator.TrueDivide ): dtype = DataType(pl.Float64()) # pragma: no cover - if node.op == pl_expr.Operator.TrueDivide and ( + if node.op == plrs._expr_nodes.Operator.TrueDivide and ( plc.traits.is_fixed_point(left.dtype.plc_type) or plc.traits.is_fixed_point(right.dtype.plc_type) ): @@ -1018,6 +1063,18 @@ def _( ), ) + if ( + not POLARS_VERSION_LT_134 + and node.op == plrs._expr_nodes.Operator.Multiply + and plc.traits.is_fixed_point(left.dtype.plc_type) + and plc.traits.is_fixed_point(right.dtype.plc_type) + ): + dtype = DataType( + pl.Decimal( + 38, -(left.dtype.plc_type.scale() + right.dtype.plc_type.scale()) + ) + ) + return expr.BinOp( dtype, expr.BinOp._MAPPING[node.op], @@ -1028,7 +1085,7 @@ def _( @_translate_expr.register def _( - node: pl_expr.Len, translator: Translator, dtype: DataType, schema: Schema + node: plrs._expr_nodes.Len, translator: Translator, dtype: DataType, schema: Schema ) -> expr.Expr: value = expr.Len(dtype) if dtype.id() != plc.TypeId.INT32: diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py index c41ed09ca1e..e974712e769 100644 --- a/python/cudf_polars/cudf_polars/typing/__init__.py +++ b/python/cudf_polars/cudf_polars/typing/__init__.py @@ -19,7 +19,7 @@ import polars as pl import polars.datatypes -from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir +from polars import polars as plrs if TYPE_CHECKING: from collections.abc import Callable @@ -54,39 +54,39 @@ ] PolarsIR: TypeAlias = Union[ - pl_ir.PythonScan, - pl_ir.Scan, - pl_ir.Cache, - pl_ir.DataFrameScan, - pl_ir.Select, - pl_ir.GroupBy, - pl_ir.Join, - pl_ir.HStack, - pl_ir.Distinct, - pl_ir.Sort, - pl_ir.Slice, - pl_ir.Filter, - pl_ir.SimpleProjection, - pl_ir.MapFunction, - pl_ir.Union, - pl_ir.HConcat, - pl_ir.ExtContext, + plrs._ir_nodes.PythonScan, + plrs._ir_nodes.Scan, + plrs._ir_nodes.Cache, + plrs._ir_nodes.DataFrameScan, + plrs._ir_nodes.Select, + plrs._ir_nodes.GroupBy, + plrs._ir_nodes.Join, + plrs._ir_nodes.HStack, + plrs._ir_nodes.Distinct, + plrs._ir_nodes.Sort, + plrs._ir_nodes.Slice, + plrs._ir_nodes.Filter, + plrs._ir_nodes.SimpleProjection, + plrs._ir_nodes.MapFunction, + plrs._ir_nodes.Union, + plrs._ir_nodes.HConcat, + plrs._ir_nodes.ExtContext, ] PolarsExpr: TypeAlias = Union[ - pl_expr.Function, - pl_expr.Window, - pl_expr.Literal, - pl_expr.Sort, - pl_expr.SortBy, - pl_expr.Gather, - pl_expr.Filter, - pl_expr.Cast, - pl_expr.Column, - pl_expr.Agg, - pl_expr.BinaryExpr, - pl_expr.Len, - pl_expr.PyExprIR, + plrs._expr_nodes.Function, + plrs._expr_nodes.Window, + plrs._expr_nodes.Literal, + plrs._expr_nodes.Sort, + plrs._expr_nodes.SortBy, + plrs._expr_nodes.Gather, + plrs._expr_nodes.Filter, + plrs._expr_nodes.Cast, + plrs._expr_nodes.Column, + plrs._expr_nodes.Agg, + plrs._expr_nodes.BinaryExpr, + plrs._expr_nodes.Len, + plrs._expr_nodes.PyExprIR, ] PolarsSchema: TypeAlias = dict[str, pl.DataType] diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py index cf1556551e7..2f81376524b 100644 --- a/python/cudf_polars/cudf_polars/utils/versions.py +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -18,6 +18,7 @@ POLARS_VERSION_LT_1321 = POLARS_VERSION < parse("1.32.1") POLARS_VERSION_LT_1323 = POLARS_VERSION < parse("1.32.3") POLARS_VERSION_LT_133 = POLARS_VERSION < parse("1.33.0") +POLARS_VERSION_LT_134 = POLARS_VERSION < parse("1.34.0") def _ensure_polars_version() -> None: diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 046b506a10f..a88bb41d98d 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -21,7 +21,7 @@ requires-python = ">=3.10" dependencies = [ "nvidia-ml-py>=12", "packaging", - "polars>=1.29,<1.34", + "polars>=1.29,<1.35", "pylibcudf==25.12.*,>=0.0.0a0", "typing-extensions; python_version < '3.11'", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cudf_polars/tests/dsl/test_serialization.py b/python/cudf_polars/tests/dsl/test_serialization.py index b4b89992e67..35fad5db594 100644 --- a/python/cudf_polars/tests/dsl/test_serialization.py +++ b/python/cudf_polars/tests/dsl/test_serialization.py @@ -7,7 +7,7 @@ import pytest -from polars.polars import _expr_nodes as pl_expr +from polars import polars from cudf_polars.dsl.expressions.boolean import BooleanFunction from cudf_polars.dsl.expressions.datetime import TemporalFunction @@ -47,7 +47,7 @@ def test_function_name_invalid(function): def test_from_polars_all_names(function): # Test that all valid names of polars expressions are correctly converted - polars_function = getattr(pl_expr, function.__name__) + polars_function = getattr(polars._expr_nodes, function.__name__) polars_names = [name for name in dir(polars_function) if not name.startswith("_")] # Check names advertised by polars are the same as we advertise polars_names_set = set(polars_names) diff --git a/python/cudf_polars/tests/test_select.py b/python/cudf_polars/tests/test_select.py index 10fcf9f660d..b55d709be6f 100644 --- a/python/cudf_polars/tests/test_select.py +++ b/python/cudf_polars/tests/test_select.py @@ -12,6 +12,7 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) +from cudf_polars.utils.versions import POLARS_VERSION_LT_134 def test_select(): @@ -48,7 +49,7 @@ def test_select_decimal_precision_none_result_max_precision(): query = ldf.select(pl.col("a")) cpu_result = query.collect() gpu_result = query.collect(engine="gpu") - assert cpu_result.schema["a"].precision is None + assert cpu_result.schema["a"].precision is None if POLARS_VERSION_LT_134 else 38 assert gpu_result.schema["a"].precision == 38 diff --git a/python/cudf_polars/tests/testing/test_asserts.py b/python/cudf_polars/tests/testing/test_asserts.py index ce1548fe680..22c78ebe2c2 100644 --- a/python/cudf_polars/tests/testing/test_asserts.py +++ b/python/cudf_polars/tests/testing/test_asserts.py @@ -3,6 +3,7 @@ from __future__ import annotations +from datetime import datetime from pathlib import Path import pytest @@ -18,7 +19,17 @@ def test_translation_assert_raises(): - df = pl.LazyFrame({"a": [1, 2, 3]}) + df = pl.LazyFrame( + { + "time": pl.datetime_range( + start=datetime(2021, 12, 16), + end=datetime(2021, 12, 16, 3), + interval="30m", + eager=True, + ), + "n": range(7), + } + ) # This should succeed assert_gpu_result_equal(df) @@ -30,7 +41,7 @@ def test_translation_assert_raises(): class E(Exception): pass - unsupported = df.group_by("a").agg(pl.col("a").upper_bound().alias("b")) + unsupported = df.group_by_dynamic("time", every="1d").agg(pl.col("n").sum()) # Unsupported query should raise NotImplementedError assert_ir_translation_raises(unsupported, NotImplementedError) From df9606c0af068e7c2594448ec3019885a92c2117 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 15 Oct 2025 10:37:00 -0400 Subject: [PATCH 02/12] adhere to new polars multiplication scale promotion rules --- .../cudf_polars/cudf_polars/dsl/translate.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index a66aa612e57..62b61620764 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -1069,10 +1069,20 @@ def _( and plc.traits.is_fixed_point(left.dtype.plc_type) and plc.traits.is_fixed_point(right.dtype.plc_type) ): - dtype = DataType( - pl.Decimal( - 38, -(left.dtype.plc_type.scale() + right.dtype.plc_type.scale()) - ) + left_scale = -left.dtype.plc_type.scale() + right_scale = -right.dtype.plc_type.scale() + out_scale = max(left_scale, right_scale) + + return expr.UnaryFunction( + DataType(pl.Decimal(38, out_scale)), + "round", + (out_scale, "half_to_even"), + expr.BinOp( + DataType(pl.Decimal(38, left_scale + right_scale)), + expr.BinOp._MAPPING[node.op], + left, + right, + ), ) return expr.BinOp( From de6344f7c1ebf44dd392d8d21cccfacb88b69fec Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 15 Oct 2025 14:34:29 -0400 Subject: [PATCH 03/12] pass more upstream polars tests --- .../cudf_polars/containers/dataframe.py | 2 +- python/cudf_polars/cudf_polars/dsl/ir.py | 51 +++++++++++++------ .../cudf_polars/cudf_polars/dsl/translate.py | 11 ++++ .../cudf_polars/cudf_polars/testing/plugin.py | 5 +- 4 files changed, 49 insertions(+), 20 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 8d1d6a3b0db..25f5dca5878 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -324,7 +324,7 @@ def rename_columns(self, mapping: Mapping[str, str]) -> Self: """Rename some columns.""" return type(self)(c.rename(mapping.get(c.name, c.name)) for c in self.columns) - def select_columns(self, names: Set[str]) -> list[Column]: + def select_columns(self, names: Iterable[str]) -> list[Column]: """Select columns by name.""" return [c for c in self.columns if c.name in names] diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 6cdcf8ff901..cd8afa2309c 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -1711,15 +1711,18 @@ def _strip_predicate_casts(node: expr.Expr) -> expr.Expr: not POLARS_VERSION_LT_134 and isinstance(child, expr.ColRef) and ( - plc.traits.is_floating_point(src.plc_type) - and plc.traits.is_floating_point(dst.plc_type) + ( + plc.traits.is_floating_point(src.plc_type) + and plc.traits.is_floating_point(dst.plc_type) + ) + or ( + plc.traits.is_integral(src.plc_type) + and plc.traits.is_integral(dst.plc_type) + and src.plc_type.id() == dst.plc_type.id() + ) ) - ) or ( - plc.traits.is_integral(src.plc_type) - and plc.traits.is_integral(dst.plc_type) ): return child - return expr.Cast(dst, child) if not node.children: @@ -2222,22 +2225,40 @@ def do_evaluate( right.dtypes, ) if coalesce and how == "Full": + left_keys_tbl = plc.copying.gather(left_on.table, lg, left_policy) + right_keys_tbl = plc.copying.gather(right_on.table, rg, right_policy) + + left_keys_df = DataFrame.from_table( + left_keys_tbl, left_on.column_names, left_on.dtypes + ) + right_keys_df = DataFrame.from_table( + right_keys_tbl, right_on.column_names, right_on.dtypes + ) + + left_key_names = [c.name for c in left_on.columns] + right_key_names = [c.name for c in right_on.columns] + + left_keys = { + c.name: c for c in left_keys_df.select_columns(left_key_names) + } + right_keys = { + c.name: c for c in right_keys_df.select_columns(right_key_names) + } + left = left.with_columns( ( Column( - plc.replace.replace_nulls(left_col.obj, right_col.obj), - name=left_col.name, - dtype=left_col.dtype, - ) - for left_col, right_col in zip( - left.select_columns(left_on.column_names_set), - right.select_columns(right_on.column_names_set), - strict=True, + plc.replace.replace_nulls( + left_keys[name].obj, right_keys[name].obj + ), + name=name, + dtype=left_keys[name].dtype, ) + for name in left_key_names ), replace_only=True, ) - right = right.discard_columns(right_on.column_names_set) + right = right.discard_columns(set(right_key_names)) if how == "Right": # Undo the swap for right join before gluing together. left, right = right, left diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 62b61620764..d7f1ebc2f9f 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -972,6 +972,17 @@ def _( node: plrs._expr_nodes.Cast, translator: Translator, dtype: DataType, schema: Schema ) -> expr.Expr: inner = translator.translate_expr(n=node.expr, schema=schema) + + if plc.traits.is_floating_point(inner.dtype.plc_type) and plc.traits.is_fixed_point( + dtype.plc_type + ): + return expr.Cast( + dtype, + expr.UnaryFunction( + inner.dtype, "round", (-dtype.plc_type.scale(), "half_to_even"), inner + ), + ) + # Push casts into literals so we can handle Cast(Literal(Null)) if isinstance(inner, expr.Literal): return inner.astype(dtype) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 4958b023046..26984daf0f9 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -122,7 +122,6 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/lazyframe/test_lazyframe.py::test_round[dtype1-123.55-1-123.6]": "Rounding midpoints is handled incorrectly", "tests/unit/lazyframe/test_lazyframe.py::test_cast_frame": "Casting that raises not supported on GPU", "tests/unit/lazyframe/test_lazyframe.py::test_lazy_cache_hit": "Debug output on stderr doesn't match", - "tests/unit/lazyframe/test_collect_schema.py::test_collect_schema_parametric": "polars returns decimal column with precision=None", "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list", "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context", "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values", @@ -143,7 +142,6 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/operations/test_group_by.py::test_group_by_series_lit_22103[False]": "Incorrect broadcasting of literals in groupby-agg", "tests/unit/operations/test_group_by.py::test_group_by_series_lit_22103[True]": "Incorrect broadcasting of literals in groupby-agg", "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins", - "tests/unit/operations/test_join.py::test_join_filter_pushdown_iejoin": "Row order differs due to multiple matches per left row index; join results are correct but unsorted", # We match the behavior of the polars[cpu] streaming engine (it makes doesn't make any ordering guarantees either when maintain_order is none). # But this test does because the test is run with the polars[cpu] in-memory engine, which still preserves the order of the left dataframe # when maintain order is none. @@ -174,8 +172,6 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-row_groups]": "allow_missing_columns argument in read_parquet not translated in IR", "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-prefiltered]": "allow_missing_columns argument in read_parquet not translated in IR", "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-none]": "allow_missing_columns argument in read_parquet not translated in IR", - "tests/unit/datatypes/test_decimal.py::test_decimal_aggregations": "https://github.com/pola-rs/polars/issues/23899", - "tests/unit/datatypes/test_decimal.py::test_decimal_arithmetic_schema": "https://github.com/pola-rs/polars/issues/23899", "tests/unit/test_cse.py::test_cse_predicate_self_join[False]": "polars removed the refcount in the logical plan", "tests/unit/io/test_multiscan.py::test_multiscan_row_index[scan_csv-write_csv]": "CSV multiscan with row_index and no row limit is not yet supported.", "tests/unit/io/test_scan.py::test_scan_empty_paths_friendly_error[scan_parquet-failed to retrieve first file schema (parquet)-'parquet scan']": "Debug output on stderr doesn't match", @@ -200,6 +196,7 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/operations/test_slice.py::test_schema_slice_on_literal_23999[lit2-0-0-False]": "Aggregating a list literal: cudf#19610", "tests/unit/operations/test_slice.py::test_schema_slice_on_literal_23999[lit2-0-len1-False]": "Aggregating a list literal: cudf#19610", "tests/unit/operations/test_slice.py::test_schema_slice_on_literal_23999[lit1-0-0-False]": "Aggregating a list literal: cudf#19610", + "tests/unit/operations/namespaces/test_binary.py::test_binary_compounded_literal_aggstate_24460": "Aggregating a list literal: cudf#19610", "tests/unit/operations/test_top_k.py::test_top_k_non_elementwise_by_24163": "Ternary with scalar predicate does not broadcast mask cudf#20210", } From 605ab022c31e808f0390f64a65603c97d2f57b60 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 15 Oct 2025 17:26:07 -0400 Subject: [PATCH 04/12] pass more upstream polars tests --- python/cudf_polars/cudf_polars/dsl/ir.py | 41 +++++++++---------- .../cudf_polars/cudf_polars/testing/plugin.py | 1 + 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index cd8afa2309c..66e601a3d8f 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -2225,36 +2225,35 @@ def do_evaluate( right.dtypes, ) if coalesce and how == "Full": - left_keys_tbl = plc.copying.gather(left_on.table, lg, left_policy) - right_keys_tbl = plc.copying.gather(right_on.table, rg, right_policy) - - left_keys_df = DataFrame.from_table( - left_keys_tbl, left_on.column_names, left_on.dtypes - ) - right_keys_df = DataFrame.from_table( - right_keys_tbl, right_on.column_names, right_on.dtypes - ) - - left_key_names = [c.name for c in left_on.columns] - right_key_names = [c.name for c in right_on.columns] - - left_keys = { - c.name: c for c in left_keys_df.select_columns(left_key_names) - } - right_keys = { - c.name: c for c in right_keys_df.select_columns(right_key_names) - } + left_key_names = left_on.column_names + right_key_names = right_on.column_names + left_key_cols = left.select(left_key_names).column_map + right_key_cols = right.select(right_key_names).column_map + + def _align(rc: Column | None, target: Column) -> Column | None: + if rc is None: + return None + if rc.dtype.plc_type.id() == target.dtype.plc_type.id(): + return rc + return ( + rc.astype(target.dtype) + if dtypes.can_cast(rc.dtype.plc_type, target.dtype.plc_type) + else None + ) left = left.with_columns( ( Column( plc.replace.replace_nulls( - left_keys[name].obj, right_keys[name].obj + left_key_cols[name].obj, + _x.obj, ), name=name, - dtype=left_keys[name].dtype, + dtype=left_key_cols[name].dtype, ) for name in left_key_names + if (_x := _align(right_key_cols.get(name), left_key_cols[name])) + is not None ), replace_only=True, ) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 26984daf0f9..998b285299f 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -113,6 +113,7 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-columns]": "Mismatching column read cudf#16394", "tests/unit/io/test_parquet.py::test_scan_parquet_filter_statistics_load_missing_column_21391": "Mismatching column read cudf#16394", "tests/unit/io/test_parquet.py::test_field_overwrites_metadata": "cannot serialize in-memory sink target.", + "tests/unit/io/test_parquet.py::test_binary_offset_roundtrip": "binary offset type unsupported", "tests/unit/io/test_parquet_field_overwrites.py::test_required_flat": "cannot serialize in-memory sink target.", "tests/unit/io/test_parquet_field_overwrites.py::test_required_list[dtype0]": "cannot serialize in-memory sink target.", "tests/unit/io/test_parquet_field_overwrites.py::test_required_list[dtype1]": "cannot serialize in-memory sink target.", From 5b8275274f181a01cbc8a6bb9324e32a3dd60e57 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Thu, 16 Oct 2025 09:38:42 -0400 Subject: [PATCH 05/12] add missing xfail --- python/cudf_polars/cudf_polars/testing/plugin.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index f673a5d81db..9ff012e7615 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -148,6 +148,7 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-columns]": "Mismatching column read cudf#16394", "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-columns]": "Mismatching column read cudf#16394", "tests/unit/io/test_parquet.py::test_scan_parquet_filter_statistics_load_missing_column_21391": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_binary_offset_roundtrip": "binary offset type unsupported", "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[gpu]": "Expect this to pass because cudf-polars is installed", "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[engine1]": "Expect this to pass because cudf-polars is installed", "tests/unit/lazyframe/test_lazyframe.py::test_round[dtype1-123.55-1-123.6]": "Rounding midpoints is handled incorrectly", From 4d37a68238caae76657913c38504c65314033f10 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Thu, 16 Oct 2025 13:34:05 -0400 Subject: [PATCH 06/12] code coverage --- .../dsl/expressions/aggregation.py | 8 +-- .../cudf_polars/dsl/expressions/ternary.py | 8 +-- .../cudf_polars/dsl/expressions/unary.py | 2 +- python/cudf_polars/cudf_polars/dsl/ir.py | 67 +++++++------------ .../cudf_polars/cudf_polars/dsl/translate.py | 4 +- .../cudf_polars/dsl/utils/aggregations.py | 27 ++++---- .../tests/expressions/test_numeric_binops.py | 13 ++++ 7 files changed, 59 insertions(+), 70 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py index f259f8ae5c7..696cb2d1c3f 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py @@ -23,7 +23,6 @@ class Agg(Expr): __slots__ = ( - "_nunique_include_nulls", "context", "name", "op", @@ -63,7 +62,6 @@ def __init__( req = plc.aggregation.median() elif name == "n_unique": # TODO: datatype of result - self._nunique_include_nulls = True req = plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE) elif name == "first" or name == "last": req = None @@ -176,11 +174,7 @@ def _reduce( if column.size == 0 or column.null_count == column.size: z = None if self.name == "n_unique": - include_nulls = getattr(self, "_nunique_include_nulls", False) - if column.size == 0: - z = 0 - else: - z = 1 if include_nulls else 0 + z = 0 if column.size == 0 else 1 return Column( plc.Column.from_scalar(plc.Scalar.from_py(z, self.dtype.plc_type), 1), name=column.name, diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py index 279c47ec33d..8e9ebe98528 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py @@ -34,18 +34,14 @@ def __init__( self.children = (when, then, otherwise) self.is_pointwise = True - def do_evaluate( # noqa: D102 + def do_evaluate( self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME ) -> Column: + """Evaluate this expression given a dataframe for context.""" when, then, otherwise = ( child.evaluate(df, context=context) for child in self.children ) - if then.dtype.plc_type != self.dtype.plc_type: - then = then.astype(self.dtype) - if otherwise.dtype.plc_type != self.dtype.plc_type: - otherwise = otherwise.astype(self.dtype) - then_obj = then.obj_scalar if then.is_scalar else then.obj otherwise_obj = otherwise.obj_scalar if otherwise.is_scalar else otherwise.obj diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py index 4e1adb4f9de..eaf857e327e 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -149,7 +149,7 @@ def __init__( ) if self.name not in UnaryFunction._supported_fns: - raise NotImplementedError(f"Unary function {name=}") + raise NotImplementedError(f"Unary function {name=}") # pragma: no cover if self.name in UnaryFunction._supported_cum_aggs: (reverse,) = self.options if reverse: diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index d8e748cd7e3..a377e3ede22 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -260,6 +260,22 @@ def _parquet_physical_types( return dict(zip(schema.keys(), [c.type() for c in df.tbl.columns()], strict=True)) +def _cast_literal_to_decimal( + side: expr.Expr, lit: expr.Literal, phys_type_map: dict[str, plc.DataType] +) -> expr.Expr: + if isinstance(side, expr.Cast): + col = side.children[0] + assert isinstance(col, expr.Col) + name = col.name + else: + assert isinstance(side, expr.Col) + name = side.name + if phys_type_map[name].id() in _DECIMAL_IDS: + scale = abs(phys_type_map[name].scale()) + return expr.Cast(side.dtype, expr.Cast(DataType(pl.Decimal(38, scale)), lit)) + return lit + + def _cast_literals_to_physical_types( node: expr.Expr, phys_type_map: dict[str, plc.DataType] ) -> expr.Expr: @@ -268,32 +284,14 @@ def _cast_literals_to_physical_types( left = _cast_literals_to_physical_types(left, phys_type_map) right = _cast_literals_to_physical_types(right, phys_type_map) if node.op in _COMPARISON_BINOPS: - if ( - isinstance(left, expr.Col) - and isinstance(right, expr.Literal) - and phys_type_map[left.name].id() in _DECIMAL_IDS + if isinstance(left, (expr.Col, expr.Cast)) and isinstance( + right, expr.Literal ): - right = expr.Cast( - left.dtype, - expr.Cast( - DataType(pl.Decimal(38, abs(phys_type_map[left.name].scale()))), - right, - ), - ) - elif ( - isinstance(right, expr.Col) - and isinstance(left, expr.Literal) - and phys_type_map[right.name].id() in _DECIMAL_IDS + right = _cast_literal_to_decimal(left, right, phys_type_map) + elif isinstance(right, (expr.Col, expr.Cast)) and isinstance( + left, expr.Literal ): - left = expr.Cast( - right.dtype, - expr.Cast( - DataType( - pl.Decimal(38, abs(phys_type_map[right.name].scale())) - ), - left, - ), - ) + left = _cast_literal_to_decimal(right, left, phys_type_map) return node.reconstruct([left, right]) return node @@ -2228,30 +2226,17 @@ def do_evaluate( left_key_cols = left.select(left_key_names).column_map right_key_cols = right.select(right_key_names).column_map - def _align(rc: Column | None, target: Column) -> Column | None: - if rc is None: - return None - if rc.dtype.plc_type.id() == target.dtype.plc_type.id(): - return rc - return ( - rc.astype(target.dtype) - if dtypes.can_cast(rc.dtype.plc_type, target.dtype.plc_type) - else None - ) - left = left.with_columns( ( Column( - plc.replace.replace_nulls( - left_key_cols[name].obj, - _x.obj, - ), + plc.replace.replace_nulls(left_key_cols[name].obj, _rc.obj), name=name, dtype=left_key_cols[name].dtype, ) for name in left_key_names - if (_x := _align(right_key_cols.get(name), left_key_cols[name])) - is not None + if (_rc := right_key_cols.get(name)) is not None + and _rc.dtype.plc_type.id() + == left_key_cols[name].dtype.plc_type.id() ), replace_only=True, ) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index d7f1ebc2f9f..8ac91c34d2e 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -1010,9 +1010,7 @@ def _( agg_name = node.name args = [translator.translate_expr(n=arg, schema=schema) for arg in node.arguments] - aggs = ["count", "n_unique", "mean", "median"] - if POLARS_VERSION_LT_134: - aggs.append("quantile") + aggs = ["count", "n_unique", "mean", "median", "quantile"] if agg_name not in aggs: args = [ expr.Cast(dtype, arg) diff --git a/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py b/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py index 5699dc639bd..bb1ce5e8351 100644 --- a/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py +++ b/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py @@ -16,7 +16,7 @@ from cudf_polars.containers import DataType from cudf_polars.dsl import expr, ir from cudf_polars.dsl.expressions.base import ExecutionContext -from cudf_polars.utils.versions import POLARS_VERSION_LT_1323 +from cudf_polars.utils.versions import POLARS_VERSION_LT_134, POLARS_VERSION_LT_1323 if TYPE_CHECKING: from collections.abc import Callable, Generator, Iterable, Sequence @@ -185,18 +185,21 @@ def decompose_single_agg( # mean/median on decimal: Polars returns float -> pre-cast decimal_unsupported = False if plc.traits.is_fixed_point(child_dtype): - if is_quantile: + cast_for_quantile = is_quantile and not POLARS_VERSION_LT_134 + cast_for_mean_or_median = ( + agg.name in {"mean", "median"} + ) and plc.traits.is_floating_point(agg.dtype.plc_type) + + if cast_for_quantile or cast_for_mean_or_median: + child = expr.Cast( + agg.dtype + if plc.traits.is_floating_point(agg.dtype.plc_type) + else DataType(pl.Float64()), + child, + ) + child_dtype = child.dtype.plc_type + elif is_quantile and POLARS_VERSION_LT_134: decimal_unsupported = True - elif agg.name in {"mean", "median"}: - tid = agg.dtype.plc_type.id() - if tid in {plc.TypeId.FLOAT32, plc.TypeId.FLOAT64}: - cast_to = ( - DataType(pl.Float64()) - if tid == plc.TypeId.FLOAT64 - else DataType(pl.Float32()) - ) - child = expr.Cast(cast_to, child) - child_dtype = child.dtype.plc_type is_group_quantile_supported = plc.traits.is_integral( child_dtype diff --git a/python/cudf_polars/tests/expressions/test_numeric_binops.py b/python/cudf_polars/tests/expressions/test_numeric_binops.py index 280437def50..50ca29d361f 100644 --- a/python/cudf_polars/tests/expressions/test_numeric_binops.py +++ b/python/cudf_polars/tests/expressions/test_numeric_binops.py @@ -121,3 +121,16 @@ def test_true_div_with_decimals(): ) q = df.select(pl.col("bar") / pl.col("foo")) assert_gpu_result_equal(q, check_dtypes=not POLARS_VERSION_LT_132) + + +def test_multiply_with_decimals(): + df = pl.LazyFrame( + { + "x": [Decimal("1.23"), Decimal("4.56"), Decimal("7.89")], + "y": [Decimal("2.00"), Decimal("3.00"), Decimal("4.00")], + }, + schema={"x": pl.Decimal(10, 2), "y": pl.Decimal(10, 3)}, + ) + + q = df.select(pl.col("x") * pl.col("y")) + assert_gpu_result_equal(q) From be68d0cf5dc511ebc097a23af42a4ad58a9e1540 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Sun, 19 Oct 2025 13:08:30 -0400 Subject: [PATCH 07/12] version guard test --- python/cudf_polars/tests/expressions/test_numeric_binops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf_polars/tests/expressions/test_numeric_binops.py b/python/cudf_polars/tests/expressions/test_numeric_binops.py index 50ca29d361f..e499583d28f 100644 --- a/python/cudf_polars/tests/expressions/test_numeric_binops.py +++ b/python/cudf_polars/tests/expressions/test_numeric_binops.py @@ -133,4 +133,4 @@ def test_multiply_with_decimals(): ) q = df.select(pl.col("x") * pl.col("y")) - assert_gpu_result_equal(q) + assert_gpu_result_equal(q, check_dtypes=not POLARS_VERSION_LT_132) From 1023972797536fa193c76e29b7300ed68d0c6764 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Thu, 23 Oct 2025 15:12:57 -0400 Subject: [PATCH 08/12] clean up --- .../cudf_polars/cudf_polars/dsl/expressions/aggregation.py | 4 ---- python/cudf_polars/cudf_polars/dsl/expressions/boolean.py | 2 -- python/cudf_polars/cudf_polars/dsl/expressions/datetime.py | 2 -- python/cudf_polars/cudf_polars/dsl/expressions/string.py | 6 +----- python/cudf_polars/cudf_polars/dsl/expressions/struct.py | 2 -- python/cudf_polars/cudf_polars/dsl/expressions/ternary.py | 2 -- python/cudf_polars/cudf_polars/dsl/ir.py | 2 -- 7 files changed, 1 insertion(+), 19 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py index c403ca969f3..10390d129e3 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py @@ -51,10 +51,6 @@ def __init__( raise NotImplementedError( f"Unsupported aggregation {name=}" ) # pragma: no cover; all valid aggs are supported - if name == "sum": - child = children[0] - if plc.traits.is_fixed_point(child.dtype.plc_type): - self.dtype = child.dtype # TODO: nan handling in groupby case if name == "min": req = plc.aggregation.min() diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py index 9c2d76684f2..14ad38ab540 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py @@ -24,8 +24,6 @@ import polars.type_aliases as pl_types from polars import polars - pl_expr = polars._expr_nodes - from cudf_polars.containers import DataFrame __all__ = ["BooleanFunction"] diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py index cb07fee44fc..069003ebc15 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py @@ -19,8 +19,6 @@ from polars import polars - pl_expr = polars._expr_nodes - from cudf_polars.containers import DataFrame, DataType __all__ = ["TemporalFunction"] diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py index 697d693c14f..4d7bf6daa27 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -23,13 +23,9 @@ from cudf_polars.dsl.utils.reshape import broadcast from cudf_polars.utils.versions import POLARS_VERSION_LT_132 -dtype_str_repr = polars.dtype_str_repr - if TYPE_CHECKING: from typing_extensions import Self - pl_expr = polars._expr_nodes - from cudf_polars.containers import DataFrame, DataType __all__ = ["StringFunction"] @@ -285,7 +281,7 @@ def _validate_input(self) -> None: and width.value is not None and width.value < 0 ): # pragma: no cover - dtypestr = dtype_str_repr(width.dtype.polars_type) + dtypestr = polars.dtype_str_repr(width.dtype.polars_type) raise InvalidOperationError( f"conversion from `{dtypestr}` to `u64` " f"failed in column 'literal' for 1 out of " diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/struct.py b/python/cudf_polars/cudf_polars/dsl/expressions/struct.py index de0e1da365b..8af5cd1ec85 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/struct.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/struct.py @@ -20,8 +20,6 @@ from polars import polars - pl_expr = polars._expr_nodes - from cudf_polars.containers import DataFrame, DataType __all__ = ["StructFunction"] diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py index 8e9ebe98528..1bf6ea476f1 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py @@ -41,10 +41,8 @@ def do_evaluate( when, then, otherwise = ( child.evaluate(df, context=context) for child in self.children ) - then_obj = then.obj_scalar if then.is_scalar else then.obj otherwise_obj = otherwise.obj_scalar if otherwise.is_scalar else otherwise.obj - return Column( plc.copying.copy_if_else(then_obj, otherwise_obj, when.obj), dtype=self.dtype, diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index b63410080db..f38584f4150 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -54,8 +54,6 @@ from polars import polars - pl_expr = polars._expr_nodes - from rmm.pylibrmm.stream import Stream from cudf_polars.containers.dataframe import NamedColumn From 07e84d66d655bf009423ac70471e78271be7f36c Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 5 Nov 2025 10:54:16 -0500 Subject: [PATCH 09/12] rerun CI --- .github/workflows/pr.yaml | 204 +++++++++++++++++++------------------- 1 file changed, 102 insertions(+), 102 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index f9d00bfe3e9..62c3977a072 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -13,9 +13,9 @@ jobs: # Please keep pr-builder as the top job here pr-builder: needs: - # - check-nightly-ci - # - changed-files - # - checks + - check-nightly-ci + - changed-files + - checks # - conda-cpp-build # - cpp-linters # - conda-cpp-checks @@ -39,100 +39,100 @@ jobs: # - unit-tests-cudf-pandas # - pandas-tests # - narwhals-tests - # - telemetry-setup + - telemetry-setup # - third-party-integration-tests-cudf-pandas secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@main if: always() with: needs: ${{ toJSON(needs) }} - # telemetry-setup: - # continue-on-error: true - # runs-on: ubuntu-latest - # env: - # OTEL_SERVICE_NAME: 'pr-cudf' - # steps: - # - name: Telemetry setup - # if: ${{ vars.TELEMETRY_ENABLED == 'true' }} - # uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main - # check-nightly-ci: - # runs-on: ubuntu-latest - # env: - # RAPIDS_GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # steps: - # - name: Check if nightly CI is passing - # uses: rapidsai/shared-actions/check_nightly_success/dispatch@main - # with: - # repo: cudf - # changed-files: - # secrets: inherit - # needs: telemetry-setup - # uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@main - # with: - # files_yaml: | - # test_cpp: - # - '**' - # - '!.devcontainer/**' - # - '!CONTRIBUTING.md' - # - '!README.md' - # - '!ci/cudf_pandas_scripts/**' - # - '!ci/release/update-version.sh' - # - '!docs/**' - # - '!img/**' - # - '!java/**' - # - '!notebooks/**' - # - '!python/**' - # test_cudf_pandas: - # - '**' - # - '!.clang-format' - # - '!.devcontainer/**' - # - '!CONTRIBUTING.md' - # - '!README.md' - # - '!ci/release/update-version.sh' - # - '!docs/**' - # - '!img/**' - # - '!java/**' - # - '!notebooks/**' - # test_java: - # - '**' - # - '!.clang-format' - # - '!.devcontainer/**' - # - '!CONTRIBUTING.md' - # - '!README.md' - # - '!ci/cudf_pandas_scripts/**' - # - '!ci/release/update-version.sh' - # - '!docs/**' - # - '!img/**' - # - '!notebooks/**' - # - '!python/**' - # test_notebooks: - # - '**' - # - '!.clang-format' - # - '!.devcontainer/**' - # - '!CONTRIBUTING.md' - # - '!README.md' - # - '!ci/cudf_pandas_scripts/**' - # - '!ci/release/update-version.sh' - # - '!java/**' - # test_python: - # - '**' - # - '!.clang-format' - # - '!.devcontainer/**' - # - '!CONTRIBUTING.md' - # - '!README.md' - # - '!ci/cudf_pandas_scripts/**' - # - '!ci/release/update-version.sh' - # - '!docs/**' - # - '!img/**' - # - '!java/**' - # - '!notebooks/**' - # checks: - # secrets: inherit - # needs: telemetry-setup - # uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@main - # with: - # enable_check_generated_files: false - # ignored_pr_jobs: "telemetry-summarize spark-rapids-jni wheel-tests-cudf-polars-with-rapidsmpf" + telemetry-setup: + continue-on-error: true + runs-on: ubuntu-latest + env: + OTEL_SERVICE_NAME: 'pr-cudf' + steps: + - name: Telemetry setup + if: ${{ vars.TELEMETRY_ENABLED == 'true' }} + uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main + check-nightly-ci: + runs-on: ubuntu-latest + env: + RAPIDS_GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + steps: + - name: Check if nightly CI is passing + uses: rapidsai/shared-actions/check_nightly_success/dispatch@main + with: + repo: cudf + changed-files: + secrets: inherit + needs: telemetry-setup + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@main + with: + files_yaml: | + test_cpp: + - '**' + - '!.devcontainer/**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!ci/cudf_pandas_scripts/**' + - '!ci/release/update-version.sh' + - '!docs/**' + - '!img/**' + - '!java/**' + - '!notebooks/**' + - '!python/**' + test_cudf_pandas: + - '**' + - '!.clang-format' + - '!.devcontainer/**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!ci/release/update-version.sh' + - '!docs/**' + - '!img/**' + - '!java/**' + - '!notebooks/**' + test_java: + - '**' + - '!.clang-format' + - '!.devcontainer/**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!ci/cudf_pandas_scripts/**' + - '!ci/release/update-version.sh' + - '!docs/**' + - '!img/**' + - '!notebooks/**' + - '!python/**' + test_notebooks: + - '**' + - '!.clang-format' + - '!.devcontainer/**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!ci/cudf_pandas_scripts/**' + - '!ci/release/update-version.sh' + - '!java/**' + test_python: + - '**' + - '!.clang-format' + - '!.devcontainer/**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!ci/cudf_pandas_scripts/**' + - '!ci/release/update-version.sh' + - '!docs/**' + - '!img/**' + - '!java/**' + - '!notebooks/**' + checks: + secrets: inherit + needs: telemetry-setup + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@main + with: + enable_check_generated_files: false + ignored_pr_jobs: "telemetry-summarize spark-rapids-jni wheel-tests-cudf-polars-with-rapidsmpf" # conda-cpp-build: # needs: checks # secrets: inherit @@ -398,14 +398,14 @@ jobs: # uses: ./.github/workflows/spark-rapids-jni.yaml # if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_java - # telemetry-summarize: - # # This job must use a self-hosted runner to record telemetry traces. - # runs-on: linux-amd64-cpu4 - # needs: pr-builder - # if: ${{ vars.TELEMETRY_ENABLED == 'true' && !cancelled() }} - # continue-on-error: true - # steps: - # - name: Telemetry summarize - # uses: rapidsai/shared-actions/telemetry-dispatch-summarize@main - # env: - # GH_TOKEN: ${{ github.token }} + telemetry-summarize: + # This job must use a self-hosted runner to record telemetry traces. + runs-on: linux-amd64-cpu4 + needs: pr-builder + if: ${{ vars.TELEMETRY_ENABLED == 'true' && !cancelled() }} + continue-on-error: true + steps: + - name: Telemetry summarize + uses: rapidsai/shared-actions/telemetry-dispatch-summarize@main + env: + GH_TOKEN: ${{ github.token }} From 11dd18f45b5e113306604008dd9d376b4215b2a2 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 5 Nov 2025 12:12:42 -0500 Subject: [PATCH 10/12] address review --- .../dsl/expressions/aggregation.py | 57 +++---------------- python/cudf_polars/cudf_polars/dsl/ir.py | 4 +- .../cudf_polars/cudf_polars/dsl/translate.py | 7 ++- .../cudf_polars/cudf_polars/testing/plugin.py | 3 + 4 files changed, 19 insertions(+), 52 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py index 262f1b0f255..6a8b7c55de7 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py @@ -9,18 +9,16 @@ from functools import partial from typing import TYPE_CHECKING, Any, ClassVar -import polars as pl - import pylibcudf as plc -from cudf_polars.containers import Column, DataType +from cudf_polars.containers import Column from cudf_polars.dsl.expressions.base import ExecutionContext, Expr from cudf_polars.dsl.expressions.literal import Literal if TYPE_CHECKING: from rmm.pylibrmm.stream import Stream - from cudf_polars.containers import DataFrame + from cudf_polars.containers import DataFrame, DataType __all__ = ["Agg"] @@ -164,44 +162,22 @@ def agg_request(self) -> plc.aggregation.Aggregation: # noqa: D102 def _reduce( self, column: Column, *, request: plc.aggregation.Aggregation, stream: Stream ) -> Column: - is_mean_or_median = self.name in {"mean", "median"} - is_quantile = self.name == "quantile" - - out_dtype = self.dtype - if plc.traits.is_fixed_point(column.dtype.plc_type) and ( - is_mean_or_median or is_quantile + if ( + self.name in {"mean", "median"} + and plc.traits.is_fixed_point(column.dtype.plc_type) + and self.dtype.plc_type.id() in {plc.TypeId.FLOAT32, plc.TypeId.FLOAT64} ): - cast_to = ( - self.dtype - if is_mean_or_median - and plc.traits.is_floating_point(self.dtype.plc_type) - else DataType(pl.Float64()) - ) - column = column.astype(cast_to, stream=stream) - out_dtype = cast_to - if column.size == 0 or column.null_count == column.size: - res = None - if self.name == "n_unique": - res = 0 if column.size == 0 else 1 - return Column( - plc.Column.from_scalar( - plc.Scalar.from_py(res, out_dtype.plc_type, stream=stream), - 1, - stream=stream, - ), - name=column.name, - dtype=out_dtype, - ) + column = column.astype(self.dtype, stream=stream) return Column( plc.Column.from_scalar( plc.reduce.reduce( - column.obj, request, out_dtype.plc_type, stream=stream + column.obj, request, self.dtype.plc_type, stream=stream ), 1, stream=stream, ), name=column.name, - dtype=out_dtype, + dtype=self.dtype, ) def _count(self, column: Column, *, include_nulls: bool, stream: Stream) -> Column: @@ -229,21 +205,6 @@ def _sum(self, column: Column, stream: Stream) -> Column: name=column.name, dtype=self.dtype, ) - if plc.traits.is_fixed_point(column.dtype.plc_type): - return Column( - plc.Column.from_scalar( - plc.reduce.reduce( - column.obj, - plc.aggregation.sum(), - column.dtype.plc_type, - stream=stream, - ), - 1, - stream=stream, - ), - name=column.name, - dtype=column.dtype, - ) return self._reduce(column, request=plc.aggregation.sum(), stream=stream) def _min(self, column: Column, *, propagate_nans: bool, stream: Stream) -> Column: diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 2747beab2bb..9c277515482 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -318,8 +318,8 @@ def _cast_literal_to_decimal( else: assert isinstance(side, expr.Col) name = side.name - if phys_type_map[name].id() in _DECIMAL_IDS: - scale = abs(phys_type_map[name].scale()) + if (type_ := phys_type_map[name]).id() in _DECIMAL_IDS: + scale = abs(type_.scale()) return expr.Cast(side.dtype, expr.Cast(DataType(pl.Decimal(38, scale)), lit)) return lit diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 37b07782ea1..85cfb14b1ce 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -14,6 +14,10 @@ from typing_extensions import assert_never import polars as pl + +# polars.polars is not a part of the public API, +# so we cannot rely on importing it directly +# See https://github.com/pola-rs/polars/issues/24826 from polars import polars as plrs import pylibcudf as plc @@ -1028,8 +1032,7 @@ def _( agg_name = node.name args = [translator.translate_expr(n=arg, schema=schema) for arg in node.arguments] - aggs = ["count", "n_unique", "mean", "median", "quantile"] - if agg_name not in aggs: + if agg_name not in ("count", "n_unique", "mean", "median", "quantile"): args = [ expr.Cast(dtype, arg) if plc.traits.is_fixed_point(arg.dtype.plc_type) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index a8a29656506..017aeef5d1b 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -178,6 +178,9 @@ def pytest_configure(config: pytest.Config) -> None: # But this test does because the test is run with the polars[cpu] in-memory engine, which still preserves the order of the left dataframe # when maintain order is none. "tests/unit/operations/test_join.py::test_join_preserve_order_left": "polars[gpu] makes no ordering guarantees when maintain_order is none", + # TODO: As of polars 1.34, the column names for left and right came in unaligned, which causes the dtypes to mismatch when calling plc.replace.replace_nulls + # Need to investigate what changed in polars + "tests/unit/operations/test_join.py::test_join_coalesce_column_order_23177": "Misaligned left/right column names left and right tables in join op", "tests/unit/operations/namespaces/string/test_pad.py::test_str_zfill_unicode_not_respected": "polars doesn't add zeros for unicode characters.", "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU", "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU", From 4ac55a87c88afa67d1f21cd9636d4a324c1755bc Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 5 Nov 2025 14:25:55 -0500 Subject: [PATCH 11/12] xfail a test --- .../cudf_polars/dsl/expressions/aggregation.py | 8 +------- python/cudf_polars/cudf_polars/testing/plugin.py | 1 + python/cudf_polars/tests/test_select.py | 2 ++ 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py index 6a8b7c55de7..99ac3ece406 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py @@ -24,13 +24,7 @@ class Agg(Expr): - __slots__ = ( - "context", - "name", - "op", - "options", - "request", - ) + __slots__ = ("context", "name", "op", "options", "request") _non_child = ("dtype", "name", "options", "context") def __init__( diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 017aeef5d1b..39016ae831a 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -194,6 +194,7 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/test_predicates.py::test_predicate_pushdown_split_pushable": "Casting that raises not supported on GPU", "tests/unit/io/test_scan_row_deletion.py::test_scan_row_deletion_skips_file_with_all_rows_deleted": "The test intentionally corrupts the parquet file, so we cannot read the row count from the header.", "tests/unit/io/test_multiscan.py::test_multiscan_row_index[scan_csv-write_csv-csv]": "Debug output on stderr doesn't match", + "tests/unit/datatypes/test_decimal.py::test_decimal_aggregations": "https://github.com/rapidsai/cudf/issues/20508", "tests/unit/datatypes/test_struct.py::test_struct_agg_all": "Needs nested list[struct] support", "tests/unit/constructors/test_structs.py::test_constructor_non_strict_schema_17956": "Needs nested list[struct] support", "tests/unit/io/test_delta.py::test_read_delta_arrow_map_type": "Needs nested list[struct] support", diff --git a/python/cudf_polars/tests/test_select.py b/python/cudf_polars/tests/test_select.py index b55d709be6f..afe94a6055a 100644 --- a/python/cudf_polars/tests/test_select.py +++ b/python/cudf_polars/tests/test_select.py @@ -49,6 +49,8 @@ def test_select_decimal_precision_none_result_max_precision(): query = ldf.select(pl.col("a")) cpu_result = query.collect() gpu_result = query.collect(engine="gpu") + # See github.com/pola-rs/polars/issues/19784 + # for context on the decimal changes. assert cpu_result.schema["a"].precision is None if POLARS_VERSION_LT_134 else 38 assert gpu_result.schema["a"].precision == 38 From 88bd2de6d069f8d5afc38bfc23a1ac2269577bf7 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Thu, 6 Nov 2025 10:35:37 -0500 Subject: [PATCH 12/12] revert CI --- .github/workflows/pr.yaml | 438 +++++++++++++++++++------------------- 1 file changed, 219 insertions(+), 219 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 62c3977a072..6021483bdef 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -16,31 +16,31 @@ jobs: - check-nightly-ci - changed-files - checks - # - conda-cpp-build - # - cpp-linters - # - conda-cpp-checks - # - conda-cpp-tests - # - conda-python-build - # - conda-python-cudf-tests - # - conda-python-other-tests - # - conda-java-tests - # - conda-notebook-tests - # - docs-build + - conda-cpp-build + - cpp-linters + - conda-cpp-checks + - conda-cpp-tests + - conda-python-build + - conda-python-cudf-tests + - conda-python-other-tests + - conda-java-tests + - conda-notebook-tests + - docs-build - wheel-build-libcudf - wheel-build-pylibcudf - # - wheel-build-cudf - # - wheel-tests-cudf + - wheel-build-cudf + - wheel-tests-cudf - wheel-build-cudf-polars - wheel-tests-cudf-polars - cudf-polars-polars-tests - # - wheel-build-dask-cudf - # - wheel-tests-dask-cudf - # - devcontainer - # - unit-tests-cudf-pandas - # - pandas-tests - # - narwhals-tests + - wheel-build-dask-cudf + - wheel-tests-dask-cudf + - devcontainer + - unit-tests-cudf-pandas + - pandas-tests + - narwhals-tests - telemetry-setup - # - third-party-integration-tests-cudf-pandas + - third-party-integration-tests-cudf-pandas secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@main if: always() @@ -133,92 +133,92 @@ jobs: with: enable_check_generated_files: false ignored_pr_jobs: "telemetry-summarize spark-rapids-jni wheel-tests-cudf-polars-with-rapidsmpf" - # conda-cpp-build: - # needs: checks - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main - # with: - # build_type: pull-request - # node_type: "cpu16" - # script: ci/build_cpp.sh - # cpp-linters: - # secrets: inherit - # needs: checks - # uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main - # with: - # build_type: pull-request - # script: "ci/cpp_linters.sh" - # node_type: "cpu16" - # conda-cpp-checks: - # needs: conda-cpp-build - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@main - # with: - # build_type: pull-request - # conda-cpp-tests: - # needs: [conda-cpp-build, changed-files] - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main - # if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp - # with: - # build_type: pull-request - # script: ci/test_cpp.sh - # conda-python-build: - # needs: conda-cpp-build - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main - # with: - # build_type: pull-request - # script: ci/build_python.sh - # conda-python-cudf-tests: - # needs: [conda-python-build, changed-files] - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main - # if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python - # with: - # build_type: pull-request - # script: "ci/test_python_cudf.sh" - # conda-python-other-tests: - # # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism - # needs: [conda-python-build, changed-files] - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main - # if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python - # with: - # build_type: pull-request - # script: "ci/test_python_other.sh" - # conda-java-tests: - # needs: [conda-cpp-build, changed-files] - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main - # if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_java - # with: - # build_type: pull-request - # node_type: "gpu-l4-latest-1" - # arch: "amd64" - # container_image: "rapidsai/ci-conda:25.12-latest" - # script: "ci/test_java.sh" - # conda-notebook-tests: - # needs: [conda-python-build, changed-files] - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main - # if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_notebooks - # with: - # build_type: pull-request - # node_type: "gpu-l4-latest-1" - # arch: "amd64" - # container_image: "rapidsai/ci-conda:25.12-latest" - # script: "ci/test_notebooks.sh" - # docs-build: - # needs: conda-python-build - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main - # with: - # build_type: pull-request - # node_type: "gpu-l4-latest-1" - # arch: "amd64" - # container_image: "rapidsai/ci-conda:25.12-latest" - # script: "ci/build_docs.sh" + conda-cpp-build: + needs: checks + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main + with: + build_type: pull-request + node_type: "cpu16" + script: ci/build_cpp.sh + cpp-linters: + secrets: inherit + needs: checks + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main + with: + build_type: pull-request + script: "ci/cpp_linters.sh" + node_type: "cpu16" + conda-cpp-checks: + needs: conda-cpp-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@main + with: + build_type: pull-request + conda-cpp-tests: + needs: [conda-cpp-build, changed-files] + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp + with: + build_type: pull-request + script: ci/test_cpp.sh + conda-python-build: + needs: conda-cpp-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main + with: + build_type: pull-request + script: ci/build_python.sh + conda-python-cudf-tests: + needs: [conda-python-build, changed-files] + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python + with: + build_type: pull-request + script: "ci/test_python_cudf.sh" + conda-python-other-tests: + # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism + needs: [conda-python-build, changed-files] + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python + with: + build_type: pull-request + script: "ci/test_python_other.sh" + conda-java-tests: + needs: [conda-cpp-build, changed-files] + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_java + with: + build_type: pull-request + node_type: "gpu-l4-latest-1" + arch: "amd64" + container_image: "rapidsai/ci-conda:25.12-latest" + script: "ci/test_java.sh" + conda-notebook-tests: + needs: [conda-python-build, changed-files] + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_notebooks + with: + build_type: pull-request + node_type: "gpu-l4-latest-1" + arch: "amd64" + container_image: "rapidsai/ci-conda:25.12-latest" + script: "ci/test_notebooks.sh" + docs-build: + needs: conda-python-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main + with: + build_type: pull-request + node_type: "gpu-l4-latest-1" + arch: "amd64" + container_image: "rapidsai/ci-conda:25.12-latest" + script: "ci/build_docs.sh" wheel-build-libcudf: needs: checks secrets: inherit @@ -240,23 +240,23 @@ jobs: script: "ci/build_wheel_pylibcudf.sh" package-name: pylibcudf package-type: python - # wheel-build-cudf: - # needs: wheel-build-pylibcudf - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main - # with: - # build_type: pull-request - # script: "ci/build_wheel_cudf.sh" - # package-name: cudf - # package-type: python - # wheel-tests-cudf: - # needs: [wheel-build-cudf, changed-files] - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main - # if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python - # with: - # build_type: pull-request - # script: ci/test_wheel_cudf.sh + wheel-build-cudf: + needs: wheel-build-pylibcudf + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main + with: + build_type: pull-request + script: "ci/build_wheel_cudf.sh" + package-name: cudf + package-type: python + wheel-tests-cudf: + needs: [wheel-build-cudf, changed-files] + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python + with: + build_type: pull-request + script: ci/test_wheel_cudf.sh wheel-build-cudf-polars: needs: wheel-build-pylibcudf secrets: inherit @@ -300,103 +300,103 @@ jobs: matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request script: "ci/test_cudf_polars_polars_tests.sh" - # wheel-build-dask-cudf: - # needs: wheel-build-cudf - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main - # with: - # # This selects "ARCH=amd64 + the latest supported Python + CUDA". - # matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) - # build_type: pull-request - # script: "ci/build_wheel_dask_cudf.sh" - # package-name: dask_cudf - # package-type: python - # pure-wheel: true - # wheel-tests-dask-cudf: - # needs: [wheel-build-dask-cudf, changed-files] - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main - # if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python - # with: - # # This selects "ARCH=amd64 + the latest supported Python + CUDA". - # matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) - # build_type: pull-request - # script: ci/test_wheel_dask_cudf.sh - # devcontainer: - # secrets: inherit - # needs: telemetry-setup - # uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@main - # with: - # arch: '["amd64", "arm64"]' - # cuda: '["13.0"]' - # node_type: "cpu8" - # rapids-aux-secret-1: GIST_REPO_READ_ORG_GITHUB_TOKEN - # env: | - # SCCACHE_DIST_MAX_RETRIES=inf - # SCCACHE_SERVER_LOG=sccache=debug - # SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=false - # SCCACHE_DIST_AUTH_TOKEN_VAR=RAPIDS_AUX_SECRET_1 - # build_command: | - # sccache --zero-stats; - # build-all -j0 -DBUILD_BENCHMARKS=ON --verbose 2>&1 | tee telemetry-artifacts/build.log; - # sccache --show-adv-stats | tee telemetry-artifacts/sccache-stats.txt; - # unit-tests-cudf-pandas: - # needs: [wheel-build-cudf, changed-files] - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main - # if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas - # with: - # # This selects the latest supported Python + CUDA minor versions for each ARCH/CUDA major version combo - # matrix_filter: group_by([(.ARCH), (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) - # build_type: pull-request - # script: ci/cudf_pandas_scripts/run_tests.sh - # third-party-integration-tests-cudf-pandas: - # needs: [conda-python-build, changed-files] - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main - # if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas - # with: - # build_type: pull-request - # branch: ${{ inputs.branch }} - # date: ${{ inputs.date }} - # sha: ${{ inputs.sha }} - # node_type: "gpu-l4-latest-1" - # continue-on-error: true - # # TODO: Switch to ci-conda:25-10-latest when XGBoost has CUDA 13 packages - # container_image: "rapidsai/ci-conda:25.12-cuda12.9.1-ubuntu24.04-py3.13" - # script: | - # ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml - # pandas-tests: - # # run the Pandas unit tests using PR branch - # needs: [wheel-build-cudf, changed-files] - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main - # if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas - # with: - # build_type: pull-request - # branch: ${{ inputs.branch }} - # date: ${{ inputs.date }} - # sha: ${{ inputs.sha }} - # node_type: "gpu-l4-latest-1" - # container_image: "rapidsai/citestwheel:25.12-latest" - # script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr - # narwhals-tests: - # needs: [conda-python-build, changed-files] - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main - # if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python - # with: - # build_type: pull-request - # branch: ${{ inputs.branch }} - # date: ${{ inputs.date }} - # sha: ${{ inputs.sha }} - # node_type: "gpu-l4-latest-1" - # container_image: "rapidsai/ci-conda:25.12-latest" - # script: ci/test_narwhals.sh - # spark-rapids-jni: - # needs: changed-files - # uses: ./.github/workflows/spark-rapids-jni.yaml - # if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_java + wheel-build-dask-cudf: + needs: wheel-build-cudf + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main + with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + build_type: pull-request + script: "ci/build_wheel_dask_cudf.sh" + package-name: dask_cudf + package-type: python + pure-wheel: true + wheel-tests-dask-cudf: + needs: [wheel-build-dask-cudf, changed-files] + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python + with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + build_type: pull-request + script: ci/test_wheel_dask_cudf.sh + devcontainer: + secrets: inherit + needs: telemetry-setup + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@main + with: + arch: '["amd64", "arm64"]' + cuda: '["13.0"]' + node_type: "cpu8" + rapids-aux-secret-1: GIST_REPO_READ_ORG_GITHUB_TOKEN + env: | + SCCACHE_DIST_MAX_RETRIES=inf + SCCACHE_SERVER_LOG=sccache=debug + SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=false + SCCACHE_DIST_AUTH_TOKEN_VAR=RAPIDS_AUX_SECRET_1 + build_command: | + sccache --zero-stats; + build-all -j0 -DBUILD_BENCHMARKS=ON --verbose 2>&1 | tee telemetry-artifacts/build.log; + sccache --show-adv-stats | tee telemetry-artifacts/sccache-stats.txt; + unit-tests-cudf-pandas: + needs: [wheel-build-cudf, changed-files] + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas + with: + # This selects the latest supported Python + CUDA minor versions for each ARCH/CUDA major version combo + matrix_filter: group_by([(.ARCH), (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + build_type: pull-request + script: ci/cudf_pandas_scripts/run_tests.sh + third-party-integration-tests-cudf-pandas: + needs: [conda-python-build, changed-files] + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas + with: + build_type: pull-request + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + node_type: "gpu-l4-latest-1" + continue-on-error: true + # TODO: Switch to ci-conda:25-10-latest when XGBoost has CUDA 13 packages + container_image: "rapidsai/ci-conda:25.12-cuda12.9.1-ubuntu24.04-py3.13" + script: | + ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml + pandas-tests: + # run the Pandas unit tests using PR branch + needs: [wheel-build-cudf, changed-files] + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas + with: + build_type: pull-request + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + node_type: "gpu-l4-latest-1" + container_image: "rapidsai/citestwheel:25.12-latest" + script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr + narwhals-tests: + needs: [conda-python-build, changed-files] + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python + with: + build_type: pull-request + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + node_type: "gpu-l4-latest-1" + container_image: "rapidsai/ci-conda:25.12-latest" + script: ci/test_narwhals.sh + spark-rapids-jni: + needs: changed-files + uses: ./.github/workflows/spark-rapids-jni.yaml + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_java telemetry-summarize: # This job must use a self-hosted runner to record telemetry traces.