diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 34488ef9132..82df38e3354 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -12,6 +12,8 @@ import pyarrow as pa from typing_extensions import Self +import pylibcudf as plc + import cudf from cudf.api.types import is_scalar from cudf.core.column import column @@ -30,12 +32,11 @@ if TYPE_CHECKING: from collections.abc import Mapping, MutableSequence, Sequence - from pylibcudf import Scalar as plc_Scalar - from cudf._typing import ( ColumnBinaryOperand, ColumnLike, Dtype, + DtypeObj, ScalarLike, ) from cudf.core.buffer import Buffer @@ -53,18 +54,6 @@ _DEFAULT_CATEGORICAL_VALUE = np.int8(-1) -def validate_categorical_children(children) -> None: - if not ( - len(children) == 1 - and isinstance(children[0], cudf.core.column.numerical.NumericalColumn) - and children[0].dtype.kind in "iu" - ): - # TODO: Enforce unsigned integer? - raise ValueError( - "Must specify exactly one child NumericalColumn of integers for representing the codes." - ) - - class CategoricalColumn(column.ColumnBase): """ Implements operations for Columns of Categorical type @@ -95,35 +84,59 @@ class CategoricalColumn(column.ColumnBase): "__gt__", "__ge__", } + # TODO: See if we can narrow these integer types + _VALID_PLC_TYPES = { + plc.TypeId.INT8, + plc.TypeId.INT16, + plc.TypeId.INT32, + plc.TypeId.INT64, + plc.TypeId.UINT8, + plc.TypeId.UINT16, + plc.TypeId.UINT32, + plc.TypeId.UINT64, + } def __init__( self, - data: None, + plc_column: plc.Column, size: int, dtype: CategoricalDtype, - mask: Buffer | None, offset: int, null_count: int, - children: tuple[NumericalColumn], - ): - if data is not None: - raise ValueError(f"{data=} must be None") - validate_categorical_children(children) + exposed: bool, + ) -> None: if not isinstance(dtype, CategoricalDtype): raise ValueError( f"{dtype=} must be cudf.CategoricalDtype instance." ) super().__init__( - data=data, + plc_column=plc_column, size=size, dtype=dtype, - mask=mask, offset=offset, null_count=null_count, - children=children, + exposed=exposed, ) self._codes = self.children[0].set_mask(self.mask) + def _get_data_buffer_from_pylibcudf_column( + self, plc_column: plc.Column, exposed: bool + ) -> None: + """ + This column considers the plc_column (i.e. codes) as children + """ + return None + + def _get_children_from_pylibcudf_column( + self, plc_column: plc.Column, dtype: DtypeObj, exposed: bool + ) -> tuple[ColumnBase]: + """ + This column considers the plc_column (i.e. codes) as children + """ + return ( + type(self).from_pylibcudf(plc_column, data_ptr_exposed=exposed), + ) + @property def base_size(self) -> int: return int( @@ -137,15 +150,6 @@ def __contains__(self, item: ScalarLike) -> bool: return False return self._encode(item) in self.codes - def set_base_data(self, value): - if value is not None: - raise RuntimeError( - "CategoricalColumns do not use data attribute of Column, use " - "`set_base_children` instead" - ) - else: - super().set_base_data(value) - def _process_values_for_isin( self, values: Sequence ) -> tuple[ColumnBase, ColumnBase]: @@ -158,7 +162,6 @@ def set_base_mask(self, value: Buffer | None) -> None: def set_base_children(self, value: tuple[NumericalColumn]) -> None: # type: ignore[override] super().set_base_children(value) - validate_categorical_children(value) self._codes = value[0].set_mask(self.mask) @property @@ -217,7 +220,7 @@ def __setitem__(self, key, value): def _fill( self, - fill_value: plc_Scalar, + fill_value: plc.Scalar, begin: int, end: int, inplace: bool = False, @@ -374,7 +377,7 @@ def unique(self) -> Self: def _cast_self_and_other_for_where( self, other: ScalarLike | ColumnBase, inplace: bool - ) -> tuple[ColumnBase, plc_Scalar | ColumnBase]: + ) -> tuple[ColumnBase, plc.Scalar | ColumnBase]: if is_scalar(other): try: other = self._encode(other) @@ -569,7 +572,7 @@ def notnull(self) -> ColumnBase: def _validate_fillna_value( self, fill_value: ScalarLike | ColumnLike - ) -> plc_Scalar | ColumnBase: + ) -> plc.Scalar | ColumnBase: """Align fill_value for .fillna based on column type.""" if is_scalar(fill_value): if fill_value != _DEFAULT_CATEGORICAL_VALUE: @@ -711,13 +714,12 @@ def _concat( def _with_type_metadata(self: Self, dtype: Dtype) -> Self: if isinstance(dtype, CategoricalDtype): return type(self)( - data=self.data, # type: ignore[arg-type] + plc_column=self.plc_column, size=self.size, dtype=dtype, - mask=self.base_mask, offset=self.offset, null_count=self.null_count, - children=self.base_children, # type: ignore[arg-type] + exposed=False, ) return self diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 64477879b32..4cb159d4470 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -9,7 +9,7 @@ from functools import cached_property from itertools import chain from types import SimpleNamespace -from typing import TYPE_CHECKING, Any, Literal, cast +from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast import cupy as cp import numpy as np @@ -176,17 +176,25 @@ class ColumnBase(Serializable, BinaryOperand, Reducible): "max", "min", } + _VALID_PLC_TYPES: ClassVar[set[plc.TypeId]] = set() def __init__( self, - data: None | Buffer, + plc_column: plc.Column, size: int, dtype: DtypeObj, - mask: None | Buffer, offset: int, null_count: int, - children: tuple[ColumnBase, ...], + exposed: bool, ) -> None: + if not ( + isinstance(plc_column, plc.Column) + and plc_column.type().id() in self._VALID_PLC_TYPES + ): + raise ValueError( + f"plc_column must be a pylibcudf.Column with a TypeId in {self._VALID_PLC_TYPES}" + ) + self.plc_column = plc_column if size < 0: raise ValueError("size must be >=0") self._size = size @@ -200,10 +208,103 @@ def __init__( self._base_mask = None self._data = None self._children = None + data = self._get_data_buffer_from_pylibcudf_column( + self.plc_column, exposed + ) + mask = self._get_mask_buffer_from_pylibcudf_column( + self.plc_column, exposed + ) + children = self._get_children_from_pylibcudf_column( + self.plc_column, + dtype, + exposed, + ) self.set_base_children(children) self.set_base_data(data) self.set_base_mask(mask) + def _get_data_buffer_from_pylibcudf_column( + self, plc_column: plc.Column, exposed: bool + ) -> Buffer | None: + """ + Extract the data buffer from a pylibcudf.Column. + + Necessary to wrap the data buffer in a cuDF Buffer for spilling support. + + Parameters + ---------- + plc_column : plc.Column + The pylibcudf.Column to extract the data buffer from. + exposed : bool + Whether the data buffer is exposed. + + Returns + ------- + Buffer | None + The data buffer. + """ + data_view = plc_column.data() + return ( + as_buffer(data_view.obj, exposed=exposed) + if data_view is not None + else None + ) + + def _get_mask_buffer_from_pylibcudf_column( + self, plc_column: plc.Column, exposed: bool + ) -> Buffer | None: + """ + Extract the mask buffer from a pylibcudf.Column. + + Necessary to wrap the mask buffer in a cuDF Buffer for spilling support. + + Parameters + ---------- + plc_column : plc.Column + The pylibcudf.Column to extract the mask buffer from. + exposed : bool + Whether the mask buffer is exposed. + + Returns + ------- + Buffer | None + The mask buffer. + """ + mask_view = plc_column.null_mask() + return ( + as_buffer(mask_view.obj, exposed=exposed) + if mask_view is not None + else None + ) + + def _get_children_from_pylibcudf_column( + self, + plc_column: plc.Column, + dtype: DtypeObj, + exposed: bool, + ) -> tuple[ColumnBase, ...]: + """ + Extract the children columns from a pylibcudf.Column. + + ColumnBase currently assumes children are also ColumnBase objects. + + Parameters + ---------- + plc_column : plc.Column + The pylibcudf.Column to extract the children columns from. + exposed : bool + Whether the children columns are exposed. + + Returns + ------- + tuple[ColumnBase, ...] + The children columns. + """ + return tuple( + type(self).from_pylibcudf(child, data_ptr_exposed=exposed) + for child in plc_column.children() + ) + @property def _PANDAS_NA_VALUE(self): """Return appropriate NA value based on dtype.""" @@ -510,7 +611,6 @@ def to_pylibcudf( pylibcudf.Column A new pylibcudf.Column referencing the same data. """ - # TODO: Categoricals will need to be treated differently eventually. # There is no 1-1 correspondence between cudf and libcudf for # categoricals because cudf supports ordered and unordered categoricals @@ -606,23 +706,13 @@ def from_pylibcudf( dtype = dtype_from_pylibcudf_column(col) - data_view = col.data() - mask_view = col.null_mask() return build_column( # type: ignore[return-value] - data=as_buffer(data_view.obj, exposed=data_ptr_exposed) - if data_view is not None - else None, - dtype=dtype, + plc_column=col, size=col.size(), - mask=as_buffer(mask_view.obj, exposed=data_ptr_exposed) - if mask_view is not None - else None, + dtype=dtype, offset=col.offset(), null_count=col.null_count(), - children=tuple( - cls.from_pylibcudf(child, data_ptr_exposed=data_ptr_exposed) - for child in col.children() - ), + exposed=data_ptr_exposed, ) @classmethod @@ -1042,24 +1132,30 @@ def copy(self, deep: bool = True) -> Self: ) return result._with_type_metadata(self.dtype) # type: ignore[return-value] else: - return cast( - Self, - build_column( - data=self.base_data - if self.base_data is None - else self.base_data.copy(deep=False), - dtype=self.dtype, - mask=self.base_mask - if self.base_mask is None - else self.base_mask.copy(deep=False), - size=self.size, - offset=self.offset, - null_count=self.null_count, - children=tuple( - col.copy(deep=False) for col in self.base_children - ), - ), + col = type(self)( + plc_column=self.plc_column, + size=self.size, + dtype=self.dtype, + offset=self.offset, + null_count=self.null_count, + exposed=False, + ) + # copy-on-write logic tracked on the Buffers + # so copy over the Buffers from self + col.set_base_children( + tuple(child.copy(deep=False) for child in self.base_children) + ) + col.set_base_data( + self.base_data.copy(deep=False) + if self.base_data is not None + else None + ) + col.set_base_mask( + self.base_mask.copy(deep=False) + if self.base_mask is not None + else None ) + return col def element_indexing(self, index: int): """Default implementation for indexing to an element @@ -2521,70 +2617,62 @@ def column_empty( def build_column( - data: Buffer | None, + plc_column: plc.Column, dtype: DtypeObj, *, size: int, - mask: Buffer | None, offset: int, null_count: int, - children: tuple[ColumnBase, ...], + exposed: bool, ) -> ColumnBase: """ Build a Column of the appropriate type from the given parameters Parameters ---------- - data : Buffer - The data buffer (can be None if constructing certain Column - types like StringColumn, ListColumn, or CategoricalColumn) + plc_column : plc.Column + The backing pylibcudf.Column dtype The dtype associated with the Column to construct - mask : Buffer, optional - The mask buffer size : int, optional offset : int, optional - children : tuple, optional + null_count : int, optional """ if isinstance(dtype, CategoricalDtype): return cudf.core.column.CategoricalColumn( - data=data, # type: ignore[arg-type] - dtype=dtype, - mask=mask, + plc_column=plc_column, size=size, + dtype=dtype, offset=offset, null_count=null_count, - children=children, # type: ignore[arg-type] + exposed=exposed, ) elif isinstance(dtype, pd.DatetimeTZDtype): return cudf.core.column.datetime.DatetimeTZColumn( - data=data, # type: ignore[arg-type] - dtype=dtype, - mask=mask, + plc_column=plc_column, size=size, + dtype=dtype, offset=offset, null_count=null_count, - children=children, + exposed=exposed, ) elif dtype.kind == "M": return cudf.core.column.DatetimeColumn( - data=data, # type: ignore[arg-type] - dtype=dtype, - mask=mask, + plc_column=plc_column, size=size, + dtype=dtype, offset=offset, null_count=null_count, - children=children, + exposed=exposed, ) elif dtype.kind == "m": return cudf.core.column.TimeDeltaColumn( - data=data, # type: ignore[arg-type] - dtype=dtype, - mask=mask, + plc_column=plc_column, size=size, + dtype=dtype, offset=offset, null_count=null_count, - children=children, + exposed=exposed, ) elif ( dtype == CUDF_STRING_DTYPE @@ -2593,83 +2681,75 @@ def build_column( or (isinstance(dtype, pd.ArrowDtype) and dtype.kind == "U") ): return cudf.core.column.StringColumn( - data=data, # type: ignore[arg-type] + plc_column=plc_column, size=size, dtype=dtype, - mask=mask, offset=offset, - children=children, # type: ignore[arg-type] null_count=null_count, + exposed=exposed, ) elif isinstance(dtype, ListDtype): return cudf.core.column.ListColumn( - data=None, + plc_column=plc_column, size=size, dtype=dtype, - mask=mask, offset=offset, null_count=null_count, - children=children, # type: ignore[arg-type] + exposed=exposed, ) elif isinstance(dtype, IntervalDtype): return cudf.core.column.IntervalColumn( - data=None, + plc_column=plc_column, size=size, dtype=dtype, - mask=mask, offset=offset, null_count=null_count, - children=children, # type: ignore[arg-type] + exposed=exposed, ) elif isinstance(dtype, StructDtype): return cudf.core.column.StructColumn( - data=None, + plc_column=plc_column, size=size, dtype=dtype, - mask=mask, offset=offset, null_count=null_count, - children=children, + exposed=exposed, ) elif isinstance(dtype, cudf.Decimal64Dtype): return cudf.core.column.Decimal64Column( - data=data, # type: ignore[arg-type] + plc_column=plc_column, size=size, - offset=offset, dtype=dtype, - mask=mask, + offset=offset, null_count=null_count, - children=children, + exposed=exposed, ) elif isinstance(dtype, cudf.Decimal32Dtype): return cudf.core.column.Decimal32Column( - data=data, # type: ignore[arg-type] + plc_column=plc_column, size=size, - offset=offset, dtype=dtype, - mask=mask, + offset=offset, null_count=null_count, - children=children, + exposed=exposed, ) elif isinstance(dtype, cudf.Decimal128Dtype): return cudf.core.column.Decimal128Column( - data=data, # type: ignore[arg-type] + plc_column=plc_column, size=size, - offset=offset, dtype=dtype, - mask=mask, + offset=offset, null_count=null_count, - children=children, + exposed=exposed, ) elif dtype.kind in "iufb": return cudf.core.column.NumericalColumn( - data=data, # type: ignore[arg-type] - dtype=dtype, - mask=mask, + plc_column=plc_column, size=size, + dtype=dtype, offset=offset, null_count=null_count, - children=children, + exposed=exposed, ) else: raise TypeError(f"Unrecognized dtype: {dtype}") diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index a48749bc176..7eac38b6eec 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -24,7 +24,7 @@ get_compatible_timezone, get_tz_data, ) -from cudf.core.buffer import Buffer, acquire_spill_lock +from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase, as_column from cudf.core.column.temporal_base import TemporalBaseColumn from cudf.utils.dtypes import ( @@ -101,26 +101,30 @@ class DatetimeColumn(TemporalBaseColumn): "__radd__", "__rsub__", } + _VALID_PLC_TYPES = { + plc.TypeId.TIMESTAMP_SECONDS, + plc.TypeId.TIMESTAMP_MILLISECONDS, + plc.TypeId.TIMESTAMP_MICROSECONDS, + plc.TypeId.TIMESTAMP_NANOSECONDS, + } def __init__( self, - data: Buffer, + plc_column: plc.Column, size: int, dtype: np.dtype | pd.DatetimeTZDtype, - mask: Buffer | None, offset: int, null_count: int, - children: tuple, - ): + exposed: bool, + ) -> None: dtype = self._validate_dtype_instance(dtype) super().__init__( - data=data, + plc_column=plc_column, size=size, dtype=dtype, - mask=mask, offset=offset, null_count=null_count, - children=children, + exposed=exposed, ) def _clear_cache(self) -> None: @@ -678,13 +682,12 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: def _with_type_metadata(self, dtype: DtypeObj) -> DatetimeColumn: if isinstance(dtype, pd.DatetimeTZDtype): return DatetimeTZColumn( - data=self.base_data, # type: ignore[arg-type] - dtype=dtype, - mask=self.base_mask, + plc_column=self.plc_column, size=self.size, + dtype=dtype, offset=self.offset, null_count=self.null_count, - children=self.base_children, + exposed=False, ) if cudf.get_option("mode.pandas_compatible"): self._dtype = get_dtype_of_same_type(dtype, self.dtype) @@ -846,13 +849,12 @@ def time_unit(self) -> str: def _utc_time(self) -> DatetimeColumn: """Return UTC time as naive timestamps.""" return DatetimeColumn( - data=self.base_data, # type: ignore[arg-type] - dtype=_get_base_dtype(self.dtype), - mask=self.base_mask, + plc_column=self.plc_column, size=self.size, + dtype=_get_base_dtype(self.dtype), offset=self.offset, null_count=self.null_count, - children=self.base_children, + exposed=False, ) @functools.cached_property diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index d2ef90a7eff..de45c2c67e4 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -49,7 +49,6 @@ DtypeObj, ScalarLike, ) - from cudf.core.buffer import Buffer from cudf.core.column.numerical import NumericalColumn from cudf.core.column.string import StringColumn @@ -77,26 +76,22 @@ class DecimalBaseColumn(NumericalBaseColumn): def __init__( self, - data: Buffer, + plc_column: plc.Column, size: int, dtype: DecimalDtype, - mask: Buffer | None, offset: int, null_count: int, - children: tuple, - ): - if not isinstance(size, int): - raise ValueError("Must specify an integer size") + exposed: bool, + ) -> None: if not isinstance(dtype, DecimalDtype): raise ValueError(f"{dtype=} must be a DecimalDtype instance") super().__init__( - data=data, + plc_column=plc_column, size=size, dtype=dtype, - mask=mask, offset=offset, null_count=null_count, - children=children, + exposed=exposed, ) @property @@ -376,26 +371,26 @@ def as_numerical_column(self, dtype: np.dtype) -> NumericalColumn: class Decimal32Column(DecimalBaseColumn): + _VALID_PLC_TYPES = {plc.TypeId.DECIMAL32} + def __init__( self, - data: Buffer, + plc_column: plc.Column, size: int, dtype: Decimal32Dtype, - mask: Buffer | None, offset: int, null_count: int, - children: tuple, - ): + exposed: bool, + ) -> None: if not isinstance(dtype, Decimal32Dtype): raise ValueError(f"{dtype=} must be a Decimal32Dtype instance") super().__init__( - data=data, + plc_column=plc_column, size=size, dtype=dtype, - mask=mask, offset=offset, null_count=null_count, - children=children, + exposed=exposed, ) @classmethod @@ -446,16 +441,17 @@ def _with_type_metadata(self: Self, dtype: DtypeObj) -> Self: class Decimal128Column(DecimalBaseColumn): + _VALID_PLC_TYPES = {plc.TypeId.DECIMAL128} + def __init__( self, - data: Buffer, + plc_column: plc.Column, size: int, dtype: Decimal128Dtype, - mask: Buffer | None, offset: int, null_count: int, - children: tuple, - ): + exposed: bool, + ) -> None: if ( not cudf.get_option("mode.pandas_compatible") and not isinstance(dtype, Decimal128Dtype) @@ -465,13 +461,12 @@ def __init__( ): raise ValueError(f"{dtype=} must be a Decimal128Dtype instance") super().__init__( - data=data, + plc_column=plc_column, size=size, dtype=dtype, - mask=mask, offset=offset, null_count=null_count, - children=children, + exposed=exposed, ) @classmethod @@ -498,26 +493,26 @@ def _with_type_metadata(self: Self, dtype: DtypeObj) -> Self: class Decimal64Column(DecimalBaseColumn): + _VALID_PLC_TYPES = {plc.TypeId.DECIMAL64} + def __init__( self, - data: Buffer, + plc_column: plc.Column, size: int, dtype: Decimal64Dtype, - mask: Buffer | None, offset: int, null_count: int, - children: tuple, - ): + exposed: bool, + ) -> None: if not isinstance(dtype, Decimal64Dtype): raise ValueError(f"{dtype=} must be a Decimal64Dtype instance") super().__init__( - data=data, + plc_column=plc_column, size=size, dtype=dtype, - mask=mask, offset=offset, null_count=null_count, - children=children, + exposed=exposed, ) @classmethod diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 4d931dc3948..92bd7ffffc6 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -17,33 +17,32 @@ if TYPE_CHECKING: from typing_extensions import Self - from cudf.core.buffer import Buffer + import pylibcudf as plc + from cudf.core.column import ColumnBase class IntervalColumn(StructColumn): def __init__( self, - data: None, + plc_column: plc.Column, size: int, dtype: IntervalDtype, - mask: Buffer | None, offset: int, null_count: int, - children: tuple[ColumnBase, ColumnBase], - ): - if len(children) != 2: + exposed: bool, + ) -> None: + if plc_column.num_children() != 2: raise ValueError( - "children must be a tuple of two columns (left edges, right edges)." + "plc_column must have two children (left edges, right edges)." ) super().__init__( - data=data, + plc_column=plc_column, size=size, dtype=dtype, - mask=mask, offset=offset, null_count=null_count, - children=children, + exposed=exposed, ) @staticmethod diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 62592813d1a..579d051ce90 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -35,25 +35,22 @@ from typing_extensions import Self from cudf._typing import ColumnBinaryOperand, ColumnLike, DtypeObj - from cudf.core.buffer import Buffer from cudf.core.column.string import StringColumn class ListColumn(ColumnBase): _VALID_BINARY_OPERATIONS = {"__add__", "__radd__"} + _VALID_PLC_TYPES = {plc.TypeId.LIST} def __init__( self, - data: None, + plc_column: plc.Column, size: int, dtype: ListDtype, - mask: Buffer | None, offset: int, null_count: int, - children: tuple[NumericalColumn, ColumnBase], - ): - if data is not None: - raise ValueError("data must be None") + exposed: bool, + ) -> None: if ( not cudf.get_option("mode.pandas_compatible") and not isinstance(dtype, ListDtype) @@ -62,24 +59,27 @@ def __init__( and not is_dtype_obj_list(dtype) ): raise ValueError("dtype must be a cudf.ListDtype") - if not ( - len(children) == 2 - and isinstance(children[0], NumericalColumn) - # TODO: Enforce int32_t (size_type) used in libcudf? - and children[0].dtype.kind == "i" - and isinstance(children[1], ColumnBase) - ): - raise ValueError( - "children must a tuple of 2 columns of (signed integer offsets, list values)" - ) super().__init__( - data=data, + plc_column=plc_column, size=size, dtype=dtype, - mask=mask, offset=offset, null_count=null_count, - children=children, + exposed=exposed, + ) + + def _get_children_from_pylibcudf_column( + self, + plc_column: plc.Column, + dtype: ListDtype, # type: ignore[override] + exposed: bool, + ) -> tuple[ColumnBase, ColumnBase]: + children = super()._get_children_from_pylibcudf_column( + plc_column, dtype, exposed + ) + return ( + children[0], + children[1]._with_type_metadata(dtype.element_type), ) def _prep_pandas_compat_repr(self) -> StringColumn | Self: @@ -203,15 +203,6 @@ def to_arrow(self) -> pa.Array: children=[elements], ) - def set_base_data(self, value: None | Buffer) -> None: - if value is not None: - raise RuntimeError( - "ListColumn's do not use data attribute of Column, use " - "`set_base_children` instead" - ) - else: - super().set_base_data(value) - @property def __cuda_array_interface__(self) -> Mapping[str, Any]: raise NotImplementedError( @@ -223,14 +214,26 @@ def _with_type_metadata(self: Self, dtype: DtypeObj) -> Self: elements = self.base_children[1]._with_type_metadata( dtype.element_type ) + new_children = [ + self.plc_column.children()[0], + elements.to_pylibcudf(mode="read"), + ] + new_plc_column = plc.Column( + plc.DataType(plc.TypeId.LIST), + self.plc_column.size(), + self.plc_column.data(), + self.plc_column.null_mask(), + self.plc_column.null_count(), + self.plc_column.offset(), + new_children, + ) return type(self)( - data=None, - dtype=dtype, - mask=self.base_mask, + plc_column=new_plc_column, size=self.size, + dtype=dtype, offset=self.offset, null_count=self.null_count, - children=(self.base_children[0], elements), # type: ignore[arg-type] + exposed=False, ) # For pandas dtypes, store them directly in the column's dtype property elif isinstance(dtype, pd.ArrowDtype) and isinstance( diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 08b7c8fabd1..1b94aa0d92e 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -48,7 +48,6 @@ DtypeObj, ScalarLike, ) - from cudf.core.buffer import Buffer from cudf.core.column import DecimalBaseColumn from cudf.core.column.datetime import DatetimeColumn from cudf.core.column.string import StringColumn @@ -69,17 +68,29 @@ class NumericalColumn(NumericalBaseColumn): """ _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS + _VALID_PLC_TYPES = { + plc.TypeId.INT8, + plc.TypeId.INT16, + plc.TypeId.INT32, + plc.TypeId.INT64, + plc.TypeId.UINT8, + plc.TypeId.UINT16, + plc.TypeId.UINT32, + plc.TypeId.UINT64, + plc.TypeId.FLOAT32, + plc.TypeId.FLOAT64, + plc.TypeId.BOOL8, + } def __init__( self, - data: Buffer, + plc_column: plc.Column, size: int, dtype: np.dtype, - mask: Buffer | None, offset: int, null_count: int, - children: tuple, - ): + exposed: bool, + ) -> None: if ( cudf.get_option("mode.pandas_compatible") and dtype.kind not in "iufb" @@ -91,13 +102,12 @@ def __init__( f"dtype must be a floating, integer or boolean dtype. Got: {dtype}" ) super().__init__( - data=data, + plc_column=plc_column, size=size, dtype=dtype, - mask=mask, offset=offset, null_count=null_count, - children=children, + exposed=exposed, ) def _clear_cache(self) -> None: @@ -892,19 +902,18 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: def _with_type_metadata( self: Self, - dtype: Dtype, + dtype: DtypeObj, ) -> ColumnBase: if isinstance(dtype, CategoricalDtype): codes_dtype = min_unsigned_type(len(dtype.categories)) codes = cast(NumericalColumn, self.astype(codes_dtype)) return CategoricalColumn( - data=None, - size=self.size, + plc_column=codes.to_pylibcudf(mode="read"), + size=codes.size, dtype=dtype, - mask=self.base_mask, - offset=self.offset, - null_count=self.null_count, - children=(codes,), + offset=codes.offset, + null_count=codes.null_count, + exposed=False, ) if cudf.get_option("mode.pandas_compatible"): res_dtype = get_dtype_of_same_type(dtype, self.dtype) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 3133238e3d4..2952f7047fa 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -11,7 +11,7 @@ import pylibcudf as plc import cudf -from cudf.core.buffer import Buffer, acquire_spill_lock +from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase, column_empty from cudf.core.missing import NA from cudf.core.mixins import Scannable @@ -19,7 +19,6 @@ if TYPE_CHECKING: from cudf._typing import ScalarLike - from cudf.core.column.decimal import DecimalDtype _unaryop_map = { @@ -56,30 +55,6 @@ class NumericalBaseColumn(ColumnBase, Scannable): "cummax", } - def __init__( - self, - data: Buffer, - size: int, - dtype: DecimalDtype | np.dtype, - mask: Buffer | None, - offset: int, - null_count: int, - children: tuple, - ): - if not isinstance(data, Buffer): - raise ValueError("data must be a Buffer instance.") - if len(children) != 0: - raise ValueError(f"{type(self).__name__} must have no children.") - super().__init__( - data=data, - size=size, - dtype=dtype, - mask=mask, - offset=offset, - null_count=null_count, - children=children, - ) - def _can_return_nan(self, skipna: bool | None = None) -> bool: return not skipna and self.has_nulls() diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index a1df138668a..648dd4541d2 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -23,7 +23,6 @@ from cudf.errors import MixedTypeError from cudf.utils.dtypes import ( CUDF_STRING_DTYPE, - SIZE_TYPE_DTYPE, cudf_dtype_to_pa_type, dtype_to_pylibcudf_type, get_dtype_of_same_kind, @@ -115,19 +114,17 @@ class StringColumn(ColumnBase): "__truediv__", "__floordiv__", } + _VALID_PLC_TYPES = {plc.TypeId.STRING} def __init__( self, - data: Buffer, + plc_column: plc.Column, size: int, dtype: np.dtype, - mask: Buffer | None, offset: int, null_count: int, - children: tuple[ColumnBase], - ): - if not isinstance(data, Buffer): - raise ValueError("data must be a Buffer") + exposed: bool, + ) -> None: if ( not cudf.get_option("mode.pandas_compatible") and dtype != CUDF_STRING_DTYPE @@ -143,33 +140,19 @@ def __init__( and dtype.kind == "U" ): dtype = CUDF_STRING_DTYPE - if len(children) > 1: - raise ValueError("StringColumn must have at most 1 offset column.") - - if len(children) == 0 and size != 0: - # all nulls-column: - offsets = as_column(0, length=size + 1, dtype=SIZE_TYPE_DTYPE) - - children = (offsets,) super().__init__( - data=data, + plc_column=plc_column, size=size, dtype=dtype, - mask=mask, offset=offset, null_count=null_count, - children=children, + exposed=exposed, ) self._start_offset = None self._end_offset = None - def copy(self, deep: bool = True) -> Self: - # Since string columns are immutable, both deep - # and shallow copies share the underlying device data and mask. - return super().copy(deep=False) - @property def start_offset(self) -> int: if self._start_offset is None: diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index c57c829f12e..440941c3323 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -8,6 +8,8 @@ import pandas as pd import pyarrow as pa +import pylibcudf as plc + import cudf from cudf.core.column.column import ColumnBase from cudf.core.dtypes import StructDtype @@ -26,10 +28,7 @@ from typing_extensions import Self - import pylibcudf as plc - from cudf._typing import DtypeObj - from cudf.core.buffer import Buffer from cudf.core.column.string import StringColumn @@ -51,27 +50,42 @@ class StructColumn(ColumnBase): the number of fields in the Struct Dtype. """ + _VALID_PLC_TYPES = {plc.TypeId.STRUCT} + def __init__( self, - data: None, + plc_column: plc.Column, size: int, dtype: StructDtype, - mask: Buffer | None, offset: int, null_count: int, - children: tuple[ColumnBase, ...], + exposed: bool, ): - if data is not None: - raise ValueError("data must be None.") dtype = self._validate_dtype_instance(dtype) super().__init__( - data=data, + plc_column=plc_column, size=size, dtype=dtype, - mask=mask, offset=offset, null_count=null_count, - children=children, + exposed=exposed, + ) + + def _get_children_from_pylibcudf_column( + self, + plc_column: plc.Column, + dtype: StructDtype, # type: ignore[override] + exposed: bool, + ) -> tuple[ColumnBase, ...]: + return tuple( + child._with_type_metadata(field_dtype) + for child, field_dtype in zip( + super()._get_children_from_pylibcudf_column( + plc_column, dtype=dtype, exposed=exposed + ), + dtype.fields.values(), + strict=True, + ) ) def _prep_pandas_compat_repr(self) -> StringColumn | Self: @@ -202,29 +216,50 @@ def _with_type_metadata( # Check IntervalDtype first because it's a subclass of StructDtype if isinstance(dtype, IntervalDtype): + new_children = [ + child.astype(dtype.subtype).to_pylibcudf(mode="read") + for child in self.base_children + ] + new_plc_column = plc.Column( + plc.DataType(plc.TypeId.STRUCT), + self.plc_column.size(), + self.plc_column.data(), + self.plc_column.null_mask(), + self.plc_column.null_count(), + self.plc_column.offset(), + new_children, + ) return IntervalColumn( - data=None, + plc_column=new_plc_column, size=self.size, dtype=dtype, - mask=self.base_mask, offset=self.offset, null_count=self.null_count, - children=tuple( # type: ignore[arg-type] - child.astype(dtype.subtype) for child in self.base_children - ), + exposed=False, ) elif isinstance(dtype, StructDtype): + new_children = [ + self.base_children[i] + ._with_type_metadata(dtype.fields[f]) + .to_pylibcudf(mode="read") + for i, f in enumerate(dtype.fields.keys()) + ] + new_plc_column = plc.Column( + plc.DataType(plc.TypeId.STRUCT), + self.plc_column.size(), + self.plc_column.data(), + self.plc_column.null_mask(), + self.plc_column.null_count(), + self.plc_column.offset(), + new_children, + ) return StructColumn( - data=None, - dtype=dtype, - children=tuple( - self.base_children[i]._with_type_metadata(dtype.fields[f]) - for i, f in enumerate(dtype.fields.keys()) - ), - mask=self.base_mask, + plc_column=new_plc_column, size=self.size, + dtype=dtype, offset=self.offset, null_count=self.null_count, + exposed=False, ) # For pandas dtypes, store them directly in the column's dtype property elif isinstance(dtype, pd.ArrowDtype) and isinstance( diff --git a/python/cudf/cudf/core/column/temporal_base.py b/python/cudf/cudf/core/column/temporal_base.py index c3c7438b305..07855d12b84 100644 --- a/python/cudf/cudf/core/column/temporal_base.py +++ b/python/cudf/cudf/core/column/temporal_base.py @@ -17,7 +17,6 @@ import cudf from cudf.api.types import is_scalar -from cudf.core.buffer.buffer import Buffer from cudf.core.column.column import ColumnBase, as_column, column_empty from cudf.utils.dtypes import ( CUDF_STRING_DTYPE, @@ -49,30 +48,6 @@ class TemporalBaseColumn(ColumnBase): _NP_SCALAR: ClassVar[type[np.datetime64] | type[np.timedelta64]] _PD_SCALAR: pd.Timestamp | pd.Timedelta - def __init__( - self, - data: Buffer, - size: int, - dtype: np.dtype | pd.DatetimeTZDtype, - mask: Buffer | None, - offset: int, - null_count: int, - children: tuple, - ): - if not isinstance(data, Buffer): - raise ValueError("data must be a Buffer.") - if len(children) != 0: - raise ValueError(f"{type(self).__name__} must have no children.") - super().__init__( - data=data, - size=size, - dtype=dtype, - mask=mask, - offset=offset, - null_count=null_count, - children=children, - ) - def __contains__(self, item: np.datetime64 | np.timedelta64) -> bool: """ Check if the column contains a given value. diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index e289daceac9..bd13444ecbc 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -15,7 +15,7 @@ import cudf from cudf.core._internals import binaryop -from cudf.core.buffer import Buffer, acquire_spill_lock +from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase, as_column from cudf.core.column.temporal_base import TemporalBaseColumn from cudf.utils.dtypes import ( @@ -47,24 +47,6 @@ def get_np_td_unit_conversion( class TimeDeltaColumn(TemporalBaseColumn): - """ - Parameters - ---------- - data : Buffer - The Timedelta values - dtype : np.dtype - The data type - size : int - Size of memory allocation. - mask : Buffer; optional - The validity mask - offset : int - Data offset - null_count : int, optional - The number of null values. - If None, it is calculated automatically. - """ - _NP_SCALAR = np.timedelta64 _PD_SCALAR = pd.Timedelta _VALID_BINARY_OPERATIONS = { @@ -87,30 +69,34 @@ class TimeDeltaColumn(TemporalBaseColumn): "__rtruediv__", "__rfloordiv__", } + _VALID_PLC_TYPES = { + plc.TypeId.DURATION_SECONDS, + plc.TypeId.DURATION_MILLISECONDS, + plc.TypeId.DURATION_MICROSECONDS, + plc.TypeId.DURATION_NANOSECONDS, + } def __init__( self, - data: Buffer, + plc_column: plc.Column, size: int, dtype: np.dtype, - mask: Buffer | None, offset: int, null_count: int, - children: tuple = (), - ): + exposed: bool, + ) -> None: if cudf.get_option("mode.pandas_compatible"): if not dtype.kind == "m": raise ValueError("dtype must be a timedelta numpy dtype.") elif not (isinstance(dtype, np.dtype) and dtype.kind == "m"): raise ValueError("dtype must be a timedelta numpy dtype.") super().__init__( - data=data, + plc_column=plc_column, size=size, dtype=dtype, - mask=mask, offset=offset, null_count=null_count, - children=children, + exposed=exposed, ) def _clear_cache(self) -> None: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 796ccb205d7..9d82a342097 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3574,7 +3574,7 @@ def _apply(self, func, kernel_class, *args, **kwargs): else: col = as_column(ans_col, retty) - col.set_base_mask(ans_mask.as_mask()) + col = col.set_mask(ans_mask.as_mask()) result = cudf.Series._from_column( col, index=self.index, attrs=self.attrs ) diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_copy.py b/python/cudf/cudf/tests/indexes/index/methods/test_copy.py index ba83d743ad8..665944d71d0 100644 --- a/python/cudf/cudf/tests/indexes/index/methods/test_copy.py +++ b/python/cudf/cudf/tests/indexes/index/methods/test_copy.py @@ -35,14 +35,7 @@ def test_index_copy(data, deep, copy_on_write): with cudf.option_context("copy_on_write", copy_on_write): if not isinstance(cidx, cudf.RangeIndex): - if ( - isinstance(cidx._column, cudf.core.column.StringColumn) - or not deep - or (copy_on_write and not deep) - ): - # StringColumn is immutable hence, deep copies of a - # Index with string dtype will share the same StringColumn. - + if not deep or (copy_on_write and not deep): # When `copy_on_write` is turned on, Index objects will # have unique column object but they all point to same # data pointers. diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_copy.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_copy.py index 6f6acd969e0..36662d78855 100644 --- a/python/cudf/cudf/tests/indexes/multiindex/methods/test_copy.py +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_copy.py @@ -1,14 +1,16 @@ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 -import operator -from functools import reduce import pandas as pd import pytest import cudf from cudf.testing import assert_eq +from cudf.testing._utils import ( + assert_column_memory_eq, + assert_column_memory_ne, +) def test_multiindex_copy_sem(): @@ -27,8 +29,8 @@ def test_multiindex_copy_sem(): @pytest.mark.parametrize( "data", [ - { - "Date": [ + [ + [ "2020-08-27", "2020-08-28", "2020-08-31", @@ -39,18 +41,7 @@ def test_multiindex_copy_sem(): "2020-08-28", "2020-08-31", ], - "Close": [ - 3400.00, - 3401.80, - 3450.96, - 226.58, - 228.91, - 225.53, - 505.13, - 525.91, - 534.98, - ], - "Symbol": [ + [ "AMZN", "AMZN", "AMZN", @@ -61,7 +52,7 @@ def test_multiindex_copy_sem(): "NVDA", "NVDA", ], - }, + ], pd.MultiIndex( levels=[[1001, 1002], [2001, 2002]], codes=[[1, 1, 0, 0], [0, 1, 0, 1]], @@ -73,31 +64,18 @@ def test_multiindex_copy_sem(): @pytest.mark.parametrize("deep", [True, False]) def test_multiindex_copy_deep(data, copy_on_write, deep): """Test memory identity for deep copy - Case1: Constructed from GroupBy, StringColumns + Case1: Constructed from arrays, StringColumns Case2: Constructed from MultiIndex, NumericColumns """ with cudf.option_context("copy_on_write", copy_on_write): - if isinstance(data, dict): - gdf = cudf.DataFrame(data) - mi1 = gdf.groupby(["Date", "Symbol"]).mean().index + if isinstance(data, list): + mi1 = cudf.MultiIndex.from_arrays(data) mi2 = mi1.copy(deep=deep) - - lchildren = [col.children for col in mi1._columns] - rchildren = [col.children for col in mi2._columns] - - # Flatten - lchildren = reduce(operator.add, lchildren) - rchildren = reduce(operator.add, rchildren) - - lptrs = [ - child.base_data.get_ptr(mode="read") for child in lchildren - ] - rptrs = [ - child.base_data.get_ptr(mode="read") for child in rchildren - ] - - assert all((x == y) for x, y in zip(lptrs, rptrs, strict=True)) - + for col1, col2 in zip(mi1._columns, mi2._columns, strict=True): + if not deep or (copy_on_write and not deep): + assert_column_memory_eq(col1, col2) + else: + assert_column_memory_ne(col1, col2) elif isinstance(data, pd.MultiIndex): data = cudf.MultiIndex( levels=data.levels, diff --git a/python/cudf/cudf/tests/private_objects/test_column.py b/python/cudf/cudf/tests/private_objects/test_column.py index 0af66141b64..4a6d1bf9403 100644 --- a/python/cudf/cudf/tests/private_objects/test_column.py +++ b/python/cudf/cudf/tests/private_objects/test_column.py @@ -1,6 +1,5 @@ # SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 -import sys from decimal import Decimal import cupy as cp @@ -9,14 +8,11 @@ import pyarrow as pa import pytest -import rmm - import cudf from cudf.core._compat import ( PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION, ) -from cudf.core.buffer import as_buffer from cudf.core.column.column import _can_values_be_equal, as_column from cudf.core.column.decimal import Decimal32Column, Decimal64Column from cudf.testing import assert_eq @@ -93,13 +89,12 @@ def test_column_set_equal_length_object_by_mask(): def test_column_offset_and_size(pandas_input, offset, size): col = as_column(pandas_input) col = cudf.core.column.build_column( - data=col.base_data, - dtype=col.dtype, - mask=col.base_mask, + plc_column=col.to_pylibcudf(mode="read"), size=size, + dtype=col.dtype, offset=offset, null_count=col.null_count, - children=col.base_children, + exposed=False, ) if isinstance(col.dtype, cudf.CategoricalDtype): @@ -515,25 +510,6 @@ def test__can_values_be_equal(left, right, expected): assert _can_values_be_equal(right, left) is expected -def test_string_no_children_properties(): - empty_col = cudf.core.column.StringColumn( - as_buffer(rmm.DeviceBuffer(size=0)), - size=0, - dtype=np.dtype("object"), - mask=None, - offset=0, - null_count=0, - children=(), - ) - assert empty_col.base_children == () - assert empty_col.base_size == 0 - - assert empty_col.children == () - assert empty_col.size == 0 - - assert sys.getsizeof(empty_col) >= 0 # Accounts for Python GC overhead - - def test_string_int_to_ipv4(): gsr = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449]).astype( "uint32" diff --git a/python/cudf/cudf/tests/testing/test_assert_column_equal.py b/python/cudf/cudf/tests/testing/test_assert_column_equal.py index 0aa90290684..e9006049a19 100644 --- a/python/cudf/cudf/tests/testing/test_assert_column_equal.py +++ b/python/cudf/cudf/tests/testing/test_assert_column_equal.py @@ -1,7 +1,6 @@ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 -import numpy as np import pyarrow as pa import pytest @@ -55,25 +54,23 @@ def test_assert_column_memory_slice(arrow_arrays): def test_assert_column_memory_basic_same(arrow_arrays): data = cudf.core.column.ColumnBase.from_arrow(arrow_arrays) - buf = cudf.core.buffer.as_buffer(data.base_data) + plc_col = data.to_pylibcudf(mode="read") left = cudf.core.column.build_column( - buf, - dtype=np.dtype(np.int8), - size=len(arrow_arrays), - mask=None, + plc_column=plc_col, + dtype=data.dtype, + size=data.size, offset=0, null_count=data.null_count, - children=(), + exposed=False, ) right = cudf.core.column.build_column( - buf, - dtype=np.dtype(np.int8), - size=len(arrow_arrays), - mask=None, + plc_column=plc_col, + dtype=data.dtype, + size=data.size, offset=0, null_count=data.null_count, - children=(), + exposed=False, ) assert_column_memory_eq(left, right)