Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
e789cb6
Make ColumnBase.deserialize construct via pylibcudf
mroeschke Sep 30, 2025
c64df29
Merge remote-tracking branch 'upstream/branch-25.12' into ref/cudf/co…
mroeschke Sep 30, 2025
fe4caa1
remove initial warning filter
mroeschke Sep 30, 2025
d6d3aec
Merge remote-tracking branch 'upstream/branch-25.12' into ref/cudf/co…
mroeschke Oct 1, 2025
d96929e
Merge remote-tracking branch 'upstream/branch-25.12' into ref/cudf/co…
mroeschke Oct 7, 2025
1711292
Fold in Tom's review from set_mask PR
mroeschke Oct 7, 2025
8f66659
Merge remote-tracking branch 'upstream/branch-25.12' into ref/cudf/co…
mroeschke Oct 8, 2025
3420a3d
Merge remote-tracking branch 'upstream/branch-25.12' into ref/cudf/co…
mroeschke Oct 8, 2025
606a820
deserialize(): wrap SpillableBuffer in a spillable_gpumemoryview
madsbk Oct 10, 2025
ec5dc5e
Merge branch 'branch-25.12' into ref/cudf/column_serialize
madsbk Oct 10, 2025
abf5b53
Merge branch 'ref/cudf/column_serialize' of https://github.com/mroesc…
mroeschke Oct 10, 2025
7777e0b
Merge remote-tracking branch 'upstream/branch-25.12' into ref/cudf/co…
mroeschke Oct 10, 2025
522941a
Undo warning changes in test_spilling.py
mroeschke Oct 10, 2025
8755ec4
Remove default arguments from Column constructors
mroeschke Oct 10, 2025
2c0f8f2
CategoricalColumn size was never None
mroeschke Oct 10, 2025
e61defc
TemporalColumns never have None size
mroeschke Oct 10, 2025
366311e
NumericalColumn never gets None size
mroeschke Oct 10, 2025
508ccba
StringColumn never has None size
mroeschke Oct 10, 2025
aa9a118
Validate that null_count >=0
mroeschke Oct 10, 2025
22ff2fa
Try annotating dtype
mroeschke Oct 10, 2025
f586eba
Merge remote-tracking branch 'upstream/branch-25.12' into ref/cudf/co…
mroeschke Oct 16, 2025
068ae77
Merge remote-tracking branch 'upstream/branch-25.12' into ref/cudf/co…
mroeschke Oct 16, 2025
c29c9ee
Merge remote-tracking branch 'upstream/branch-25.12' into ref/cudf/co…
mroeschke Oct 17, 2025
1b13b5a
Adjust constructors to just take pylibcudf columns
mroeschke Oct 17, 2025
5522d58
Create functions to convert data, mask, children to cudf compatible o…
mroeschke Oct 17, 2025
2137b9d
Fix naming of method
mroeschke Oct 17, 2025
f6621d0
Pass dtype to children construction
mroeschke Oct 18, 2025
088a1df
Fix test_assert_column_memory_basic_same
mroeschke Oct 20, 2025
7f7fd85
Remove StringColumn.copy override
mroeschke Oct 21, 2025
62e0377
Merge remote-tracking branch 'upstream/main' into ref/cudf/construct_plc
mroeschke Oct 21, 2025
15e0f79
Fix copy(deep=False) to account for copy on write
mroeschke Oct 21, 2025
01fe248
Merge remote-tracking branch 'upstream/main' into ref/cudf/construct_plc
mroeschke Oct 22, 2025
bc23e3e
Merge remote-tracking branch 'upstream/main' into ref/cudf/construct_plc
mroeschke Oct 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 42 additions & 40 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import pyarrow as pa
from typing_extensions import Self

import pylibcudf as plc

import cudf
from cudf.api.types import is_scalar
from cudf.core.column import column
Expand All @@ -29,12 +31,11 @@
if TYPE_CHECKING:
from collections.abc import Mapping, MutableSequence, Sequence

from pylibcudf import Scalar as plc_Scalar

from cudf._typing import (
ColumnBinaryOperand,
ColumnLike,
Dtype,
DtypeObj,
ScalarLike,
)
from cudf.core.buffer import Buffer
Expand All @@ -52,18 +53,6 @@
_DEFAULT_CATEGORICAL_VALUE = np.int8(-1)


def validate_categorical_children(children) -> None:
    """Ensure ``children`` is a valid container for categorical codes.

    Parameters
    ----------
    children
        Sized, indexable collection expected to hold the single codes
        column.

    Raises
    ------
    ValueError
        If ``children`` does not contain exactly one element, or if that
        element is not a ``NumericalColumn`` whose dtype kind is ``"i"``
        or ``"u"``.
    """
    # The length test comes first so an empty or over-long collection is
    # rejected before ``children[0]`` is inspected.
    if len(children) == 1:
        codes = children[0]
        if (
            isinstance(codes, cudf.core.column.numerical.NumericalColumn)
            and codes.dtype.kind in "iu"
        ):
            return
    # TODO: Enforce unsigned integer?
    raise ValueError(
        "Must specify exactly one child NumericalColumn of integers for representing the codes."
    )


class CategoricalColumn(column.ColumnBase):
"""
Implements operations for Columns of Categorical type
Expand Down Expand Up @@ -94,35 +83,59 @@ class CategoricalColumn(column.ColumnBase):
"__gt__",
"__ge__",
}
# TODO: See if we can narrow these integer types
_VALID_PLC_TYPES = {
plc.TypeId.INT8,
plc.TypeId.INT16,
plc.TypeId.INT32,
plc.TypeId.INT64,
plc.TypeId.UINT8,
plc.TypeId.UINT16,
plc.TypeId.UINT32,
plc.TypeId.UINT64,
}

def __init__(
self,
data: None,
plc_column: plc.Column,
size: int,
dtype: CategoricalDtype,
mask: Buffer | None,
offset: int,
null_count: int,
children: tuple[NumericalColumn],
):
if data is not None:
raise ValueError(f"{data=} must be None")
validate_categorical_children(children)
exposed: bool,
) -> None:
if not isinstance(dtype, CategoricalDtype):
raise ValueError(
f"{dtype=} must be cudf.CategoricalDtype instance."
)
super().__init__(
data=data,
plc_column=plc_column,
size=size,
dtype=dtype,
mask=mask,
offset=offset,
null_count=null_count,
children=children,
exposed=exposed,
)
self._codes = self.children[0].set_mask(self.mask)

def _get_data_buffer_from_pylibcudf_column(
self, plc_column: plc.Column, exposed: bool
) -> None:
"""
This column considers the plc_column (i.e. codes) as children
"""
return None

def _get_children_from_pylibcudf_column(
    self, plc_column: plc.Column, dtype: DtypeObj, exposed: bool
) -> tuple[ColumnBase]:
    """
    Build the children of this column from ``plc_column``.

    This column considers the plc_column (i.e. codes) as children, so the
    result is a one-element tuple holding whatever ``from_pylibcudf``
    returns for ``plc_column``. ``exposed`` is forwarded as
    ``data_ptr_exposed``; ``dtype`` is not used here.
    """
    codes = type(self).from_pylibcudf(plc_column, data_ptr_exposed=exposed)
    return (codes,)

@property
def base_size(self) -> int:
return int(
Expand All @@ -136,15 +149,6 @@ def __contains__(self, item: ScalarLike) -> bool:
return False
return self._encode(item) in self.codes

def set_base_data(self, value):
    """
    Set the base data buffer; only ``None`` is accepted.

    Raises
    ------
    RuntimeError
        If ``value`` is not ``None``.
    """
    # Reject anything but None up front; the error message points callers
    # at `set_base_children` instead.
    if value is not None:
        raise RuntimeError(
            "CategoricalColumns do not use data attribute of Column, use "
            "`set_base_children` instead"
        )
    super().set_base_data(value)

def _process_values_for_isin(
self, values: Sequence
) -> tuple[ColumnBase, ColumnBase]:
Expand All @@ -157,7 +161,6 @@ def set_base_mask(self, value: Buffer | None) -> None:

def set_base_children(self, value: tuple[NumericalColumn]) -> None:  # type: ignore[override]
    """
    Replace the base children and refresh the ``_codes`` attribute.

    The parent implementation stores ``value`` first; ``value`` is then
    checked with ``validate_categorical_children``, and ``_codes`` is set
    to the first child with this column's mask applied via ``set_mask``.
    """
    super().set_base_children(value)
    validate_categorical_children(value)
    first_child = value[0]
    self._codes = first_child.set_mask(self.mask)

@property
Expand Down Expand Up @@ -216,7 +219,7 @@ def __setitem__(self, key, value):

def _fill(
self,
fill_value: plc_Scalar,
fill_value: plc.Scalar,
begin: int,
end: int,
inplace: bool = False,
Expand Down Expand Up @@ -373,7 +376,7 @@ def unique(self) -> Self:

def _cast_self_and_other_for_where(
self, other: ScalarLike | ColumnBase, inplace: bool
) -> tuple[ColumnBase, plc_Scalar | ColumnBase]:
) -> tuple[ColumnBase, plc.Scalar | ColumnBase]:
if is_scalar(other):
try:
other = self._encode(other)
Expand Down Expand Up @@ -568,7 +571,7 @@ def notnull(self) -> ColumnBase:

def _validate_fillna_value(
self, fill_value: ScalarLike | ColumnLike
) -> plc_Scalar | ColumnBase:
) -> plc.Scalar | ColumnBase:
"""Align fill_value for .fillna based on column type."""
if is_scalar(fill_value):
if fill_value != _DEFAULT_CATEGORICAL_VALUE:
Expand Down Expand Up @@ -710,13 +713,12 @@ def _concat(
def _with_type_metadata(self: Self, dtype: Dtype) -> Self:
if isinstance(dtype, CategoricalDtype):
return type(self)(
data=self.data, # type: ignore[arg-type]
plc_column=self.plc_column,
size=self.size,
dtype=dtype,
mask=self.base_mask,
offset=self.offset,
null_count=self.null_count,
children=self.base_children, # type: ignore[arg-type]
exposed=False,
)

return self
Expand Down
Loading
Loading