GH-43683: [Python] Use pandas StringDtype when enabled (pandas 3+) #44195

Open · wants to merge 15 commits into main
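For context, a minimal sketch of the user-facing behavior this PR targets (illustrative only, not part of the diff; it assumes pandas 3+ or pandas 2.x with the future string option enabled, and the exact dtype repr depends on the pandas version):

```python
import numpy as np
import pandas as pd
import pyarrow as pa

table = pa.table({"strings": ["foo", None, "bar"]})
df = table.to_pandas()

# With the new default enabled, string columns come back as pandas'
# StringDtype with NaN as the missing value, instead of object dtype.
print(df["strings"].dtype)
print(df["strings"].dtype == pd.StringDtype(na_value=np.nan))
```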
6 changes: 6 additions & 0 deletions dev/tasks/tasks.yml
@@ -1582,6 +1582,12 @@ tasks:
# ensure we have at least one build with parquet encryption disabled
PARQUET_REQUIRE_ENCRYPTION: "OFF"
{% endif %}
{% if pandas_version == "nightly" %}
# TODO can be removed once this is enabled by default in pandas >= 3
# This is to enable the Pandas feature.
# See: https://github.com/pandas-dev/pandas/pull/58459
PANDAS_FUTURE_INFER_STRING: "1"
{% endif %}
{% if not cache_leaf %}
# use the latest pandas release, so prevent reusing any cached layers
flags: --no-leaf-cache
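For reference, the CI switch added above can be reproduced locally by exporting PANDAS_FUTURE_INFER_STRING=1 before running the tests or, on pandas 2.x, by flipping the option directly; a small sketch, not part of this diff:

```python
import pandas as pd

# Equivalent of the PANDAS_FUTURE_INFER_STRING=1 environment variable used in CI:
# opt into the future default string dtype on pandas 2.x.
pd.set_option("future.infer_string", True)
print(pd.options.future.infer_string)  # True
```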
1 change: 1 addition & 0 deletions docker-compose.yml
@@ -1323,6 +1323,7 @@ services:
PYTEST_ARGS: # inherit
HYPOTHESIS_PROFILE: # inherit
PYARROW_TEST_HYPOTHESIS: # inherit
PANDAS_FUTURE_INFER_STRING: # inherit
volumes: *conda-volumes
command: *python-conda-command

2 changes: 2 additions & 0 deletions python/pyarrow/array.pxi
@@ -117,6 +117,8 @@ def _handle_arrow_array_protocol(obj, type, mask, size):
"return a pyarrow Array or ChunkedArray.")
if isinstance(res, ChunkedArray) and res.num_chunks == 1:
res = res.chunk(0)
if type is not None and res.type != type:
res = res.cast(type)
return res


17 changes: 16 additions & 1 deletion python/pyarrow/pandas-shim.pxi
@@ -38,7 +38,7 @@ cdef class _PandasAPIShim(object):
object _array_like_types, _is_extension_array_dtype, _lock
bint has_sparse
bint _pd024
bint _is_v1, _is_ge_v21, _is_ge_v3
bint _is_v1, _is_ge_v21, _is_ge_v3, _is_ge_v3_strict

def __init__(self):
self._lock = Lock()
@@ -80,6 +80,7 @@ cdef class _PandasAPIShim(object):
self._is_v1 = self._loose_version < Version('2.0.0')
self._is_ge_v21 = self._loose_version >= Version('2.1.0')
self._is_ge_v3 = self._loose_version >= Version('3.0.0.dev0')
self._is_ge_v3_strict = self._loose_version >= Version('3.0.0')

self._compat_module = pdcompat
self._data_frame = pd.DataFrame
@@ -174,6 +175,20 @@ cdef class _PandasAPIShim(object):
self._check_import()
return self._is_ge_v3

def is_ge_v3_strict(self):
self._check_import()
return self._is_ge_v3_strict

def uses_string_dtype(self):
if self.is_ge_v3_strict():
return True
try:
if self.pd.options.future.infer_string:
return True
except:
pass
return False

@property
def categorical_type(self):
self._check_import()
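The new uses_string_dtype() shim is what the rest of the changes key off; a minimal usage sketch, mirroring how the tests below consult it:

```python
from pyarrow.pandas_compat import _pandas_api

# True on pandas >= 3.0.0, or on pandas 2.x when pd.options.future.infer_string
# is enabled; the conversion layer then prefers pd.StringDtype(na_value=np.nan)
# over object dtype for Arrow string columns.
print(_pandas_api.uses_string_dtype())
```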
48 changes: 45 additions & 3 deletions python/pyarrow/pandas_compat.py
@@ -781,10 +781,12 @@ def table_to_dataframe(
table, index = _reconstruct_index(table, index_descriptors,
all_columns, types_mapper)
ext_columns_dtypes = _get_extension_dtypes(
table, all_columns, types_mapper)
table, all_columns, types_mapper, options, categories)
else:
index = _pandas_api.pd.RangeIndex(table.num_rows)
ext_columns_dtypes = _get_extension_dtypes(table, [], types_mapper)
ext_columns_dtypes = _get_extension_dtypes(
table, [], types_mapper, options, categories
)

_check_data_column_metadata_consistency(all_columns)
columns = _deserialize_column_index(table, all_columns, column_indexes)
@@ -829,7 +831,7 @@
}


def _get_extension_dtypes(table, columns_metadata, types_mapper=None):
def _get_extension_dtypes(table, columns_metadata, types_mapper, options, categories):
"""
Based on the stored column pandas metadata and the extension types
in the arrow schema, infer which columns should be converted to a
@@ -842,12 +844,25 @@ def _get_extension_dtypes(table, columns_metadata, types_mapper=None):
and then we can check if this dtype supports conversion from arrow.

"""
strings_to_categorical = options["strings_to_categorical"]
categories = categories or []

ext_columns = {}

# older pandas version that does not yet support extension dtypes
if _pandas_api.extension_dtype is None:
return ext_columns

# for pandas 3.0+, use pandas' new default string dtype
if _pandas_api.uses_string_dtype() and not strings_to_categorical:
for field in table.schema:
if (
pa.types.is_string(field.type)
or pa.types.is_large_string(field.type)
or pa.types.is_string_view(field.type)
) and field.name not in categories:
Member:

I am curious how categories were interpreted before inferring the new string type; was this just not taken into account on the arrow side?

Member (Author):

If field.name in categories is true, that means the user asked to convert this column to a categorical dtype on the pandas side. This is handled on the C++ side by dictionary-encoding the column, so in this case we don't have to specify any custom pandas extension dtype here; our conversion layer will then convert that dictionary-encoded column to a pandas categorical.
ext_columns[field.name] = _pandas_api.pd.StringDtype(na_value=np.nan)

# infer the extension columns from the pandas metadata
for col_meta in columns_metadata:
try:
@@ -861,6 +876,19 @@ def _get_extension_dtypes(table, columns_metadata, types_mapper=None):
# that are certainly numpy dtypes
pandas_dtype = _pandas_api.pandas_dtype(dtype)
if isinstance(pandas_dtype, _pandas_api.extension_dtype):
if isinstance(pandas_dtype, _pandas_api.pd.StringDtype):
# when the metadata indicate to use the string dtype,
# ignore this in case:
# - it is specified to convert strings / this column to categorical
# - the column itself is dictionary encoded and would otherwise be
# converted to categorical
if strings_to_categorical or name in categories:
continue
try:
if pa.types.is_dictionary(table.schema.field(name).type):
continue
except KeyError:
pass
if hasattr(pandas_dtype, "__from_arrow__"):
ext_columns[name] = pandas_dtype

@@ -1133,6 +1161,20 @@ def _reconstruct_columns_from_metadata(columns, column_indexes):
# GH-41503: if the column index was decimal, restore to decimal
elif pandas_dtype == "decimal":
level = _pandas_api.pd.Index([decimal.Decimal(i) for i in level])
elif (
level.dtype == "str" and numpy_dtype == "object"
and ("mixed" in pandas_dtype or pandas_dtype in ["unicode", "string"])
):
# the metadata indicate that the original dataframe used object dtype,
# but ignore this and keep string dtype if:
# - the original columns used mixed types -> we don't attempt to faithfully
# roundtrip in this case, but keep the column names as strings
# - the original columns were inferred to be strings but stored in object
# dtype -> we don't restore the object dtype because all metadata
# generated using pandas < 3 will have this case by default, and
# for pandas >= 3 we want to use the default string dtype for .columns
new_levels.append(level)
continue
elif level.dtype != dtype:
level = level.astype(dtype)
# ARROW-9096: if original DataFrame was upcast we keep that
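As the review discussion above explains, an explicit request for categoricals still takes precedence over the new string dtype; a short sketch of both code paths, using a hypothetical column name "s":

```python
import pyarrow as pa

table = pa.table({"s": ["a", "b", "a"]})

# Columns listed in `categories` are dictionary-encoded on the C++ side,
# so no StringDtype is injected for them.
print(table.to_pandas(categories=["s"])["s"].dtype)             # category

# strings_to_categorical=True likewise bypasses the StringDtype mapping.
print(table.to_pandas(strings_to_categorical=True)["s"].dtype)  # category
```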
6 changes: 4 additions & 2 deletions python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -2539,7 +2539,8 @@ Status ConvertCategoricals(const PandasOptions& options, ChunkedArrayVector* arr
}
if (options.strings_to_categorical) {
for (int i = 0; i < static_cast<int>(arrays->size()); i++) {
if (is_base_binary_like((*arrays)[i]->type()->id())) {
if (is_base_binary_like((*arrays)[i]->type()->id()) ||
is_binary_view_like((*arrays)[i]->type()->id())) {
columns_to_encode.push_back(i);
}
}
@@ -2573,7 +2574,8 @@ Status ConvertChunkedArrayToPandas(const PandasOptions& options,
py_ref = nullptr;
}

if (options.strings_to_categorical && is_base_binary_like(arr->type()->id())) {
if (options.strings_to_categorical && (is_base_binary_like(arr->type()->id()) ||
is_binary_view_like(arr->type()->id()))) {
if (options.zero_copy_only) {
return Status::Invalid("Need to dictionary encode a column, but ",
"only zero-copy conversions allowed");
19 changes: 10 additions & 9 deletions python/pyarrow/tests/test_compute.py
@@ -1020,7 +1020,7 @@ def test_replace_slice():
offsets = range(-3, 4)

arr = pa.array([None, '', 'a', 'ab', 'abc', 'abcd', 'abcde'])
series = arr.to_pandas()
series = arr.to_pandas().astype(object).replace({np.nan: None})
for start in offsets:
for stop in offsets:
expected = series.str.slice_replace(start, stop, 'XX')
@@ -1031,7 +1031,7 @@
assert pc.binary_replace_slice(arr, start, stop, 'XX') == actual

arr = pa.array([None, '', 'π', 'πb', 'πbθ', 'πbθd', 'πbθde'])
series = arr.to_pandas()
series = arr.to_pandas().astype(object).replace({np.nan: None})
for start in offsets:
for stop in offsets:
expected = series.str.slice_replace(start, stop, 'XX')
@@ -2125,50 +2125,51 @@ def test_strftime():
for fmt in formats:
options = pc.StrftimeOptions(fmt)
result = pc.strftime(tsa, options=options)
expected = pa.array(ts.strftime(fmt))
# cast to the same type as result to ignore string vs large_string
expected = pa.array(ts.strftime(fmt)).cast(result.type)
assert result.equals(expected)

fmt = "%Y-%m-%dT%H:%M:%S"

# Default format
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
result = pc.strftime(tsa, options=pc.StrftimeOptions())
expected = pa.array(ts.strftime(fmt))
expected = pa.array(ts.strftime(fmt)).cast(result.type)
assert result.equals(expected)

# Default format plus timezone
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z"))
expected = pa.array(ts.strftime(fmt + "%Z"))
expected = pa.array(ts.strftime(fmt + "%Z")).cast(result.type)
assert result.equals(expected)

# Pandas %S is equivalent to %S in arrow for unit="s"
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
options = pc.StrftimeOptions("%S")
result = pc.strftime(tsa, options=options)
expected = pa.array(ts.strftime("%S"))
expected = pa.array(ts.strftime("%S")).cast(result.type)
assert result.equals(expected)

# Pandas %S.%f is equivalent to %S in arrow for unit="us"
tsa = pa.array(ts, type=pa.timestamp("us", timezone))
options = pc.StrftimeOptions("%S")
result = pc.strftime(tsa, options=options)
expected = pa.array(ts.strftime("%S.%f"))
expected = pa.array(ts.strftime("%S.%f")).cast(result.type)
assert result.equals(expected)

# Test setting locale
tsa = pa.array(ts, type=pa.timestamp("s", timezone))
options = pc.StrftimeOptions(fmt, locale="C")
result = pc.strftime(tsa, options=options)
expected = pa.array(ts.strftime(fmt))
expected = pa.array(ts.strftime(fmt)).cast(result.type)
assert result.equals(expected)

# Test timestamps without timezone
fmt = "%Y-%m-%dT%H:%M:%S"
ts = pd.to_datetime(times)
tsa = pa.array(ts, type=pa.timestamp("s"))
result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt))
expected = pa.array(ts.strftime(fmt))
expected = pa.array(ts.strftime(fmt)).cast(result.type)

# Positional format
assert pc.strftime(tsa, fmt) == result
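The added casts above compensate for a string vs large_string mismatch: with pandas' new string dtype, converting the strftime output back through pa.array() can yield large_string, while pc.strftime returns string. A minimal sketch of the normalization, assuming the new dtype is active:

```python
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc

ts = pd.to_datetime(["2021-01-01 12:00:00"])
result = pc.strftime(pa.array(ts, type=pa.timestamp("s")), format="%Y-%m-%d")

# The expected side may come back as large_string under the new string dtype,
# so it is cast to result.type before comparing.
expected = pa.array(ts.strftime("%Y-%m-%d")).cast(result.type)
assert result.equals(expected)
```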
6 changes: 5 additions & 1 deletion python/pyarrow/tests/test_feather.py
@@ -426,7 +426,11 @@ def test_empty_strings(version):
@pytest.mark.pandas
def test_all_none(version):
df = pd.DataFrame({'all_none': [None] * 10})
_check_pandas_roundtrip(df, version=version)
if version == 1 and pa.pandas_compat._pandas_api.uses_string_dtype():
expected = df.astype("str")
else:
expected = df
_check_pandas_roundtrip(df, version=version, expected=expected)


@pytest.mark.pandas
58 changes: 44 additions & 14 deletions python/pyarrow/tests/test_pandas.py
@@ -349,7 +349,10 @@ def test_integer_index_column(self):
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')])
_check_pandas_roundtrip(df, preserve_index=True)

def test_index_metadata_field_name(self):
def test_index_metadata_field_name(self, request):
if _pandas_api.uses_string_dtype():
# https://github.com/pandas-dev/pandas/issues/59879
request.applymarker(pytest.mark.xfail(reason="bug in pandas string dtype"))
# test None case, and strangely named non-index columns
df = pd.DataFrame(
[(1, 'a', 3.1), (2, 'b', 2.2), (3, 'c', 1.3)],
@@ -411,7 +414,9 @@ def test_string_column_index(self):
column_indexes, = js['column_indexes']
assert column_indexes['name'] == 'stringz'
assert column_indexes['name'] == column_indexes['field_name']
assert column_indexes['numpy_type'] == 'object'
assert column_indexes['numpy_type'] == (
'str' if _pandas_api.uses_string_dtype() else 'object'
)
assert column_indexes['pandas_type'] == 'unicode'

md = column_indexes['metadata']
@@ -1680,7 +1685,10 @@ def test_pandas_unicode(self):
repeats = 1000
values = ['foo', None, 'bar', 'mañana', np.nan]
df = pd.DataFrame({'strings': values * repeats})
field = pa.field('strings', pa.string())
field = pa.field(
'strings',
pa.large_string() if _pandas_api.uses_string_dtype() else pa.string()
)
schema = pa.schema([field])
ex_values = ['foo', None, 'bar', 'mañana', None]
expected = pd.DataFrame({'strings': ex_values * repeats})
@@ -3299,6 +3307,10 @@ def _assert_nunique(obj, expected):


def test_to_pandas_deduplicate_strings_array_types():
if _pandas_api.uses_string_dtype():
pytest.skip(
"pandas uses string dtype and not object dtype, keyword has no effect"
)
nunique = 100
repeats = 10
values = _generate_dedup_example(nunique, repeats)
@@ -3311,6 +3323,10 @@ def test_to_pandas_deduplicate_strings_table_types():


def test_to_pandas_deduplicate_strings_table_types():
if _pandas_api.uses_string_dtype():
pytest.skip(
"pandas uses string dtype and not object dtype, keyword has no effect"
)
nunique = 100
repeats = 10
values = _generate_dedup_example(nunique, repeats)
@@ -3774,20 +3790,26 @@ def _check_to_pandas_memory_unchanged(obj, **kwargs):
x = obj.to_pandas(**kwargs) # noqa

# Memory allocation unchanged -- either zero copy or self-destructing
assert pa.total_allocated_bytes() == prior_allocation
if _pandas_api.uses_string_dtype():
# for the string array of the columns Index
# -> increase the size to account for overallocation for small arrays
max_index_allocation = max(192, x.columns.nbytes * 2)
assert pa.total_allocated_bytes() <= (prior_allocation + max_index_allocation)
else:
assert pa.total_allocated_bytes() == prior_allocation


def test_to_pandas_split_blocks():
# ARROW-3789
t = pa.table([
pa.array([1, 2, 3, 4, 5], type='i1'),
pa.array([1, 2, 3, 4, 5], type='i4'),
pa.array([1, 2, 3, 4, 5], type='i8'),
pa.array([1, 2, 3, 4, 5], type='f4'),
pa.array([1, 2, 3, 4, 5], type='f8'),
pa.array([1, 2, 3, 4, 5], type='f8'),
pa.array([1, 2, 3, 4, 5], type='f8'),
pa.array([1, 2, 3, 4, 5], type='f8'),
pa.array([1, 2, 3, 4, 5]*100, type='i1'),
pa.array([1, 2, 3, 4, 5]*100, type='i4'),
pa.array([1, 2, 3, 4, 5]*100, type='i8'),
pa.array([1, 2, 3, 4, 5]*100, type='f4'),
pa.array([1, 2, 3, 4, 5]*100, type='f8'),
pa.array([1, 2, 3, 4, 5]*100, type='f8'),
pa.array([1, 2, 3, 4, 5]*100, type='f8'),
pa.array([1, 2, 3, 4, 5]*100, type='f8'),
], ['f{}'.format(i) for i in range(8)])

_check_blocks_created(t, 8)
@@ -3832,7 +3854,12 @@ def test_table_uses_memory_pool():
prior_allocation = pa.total_allocated_bytes()
x = t.to_pandas()

assert pa.total_allocated_bytes() == (prior_allocation + 3 * N * 8)
new_allocation = 3 * N * 8
if _pandas_api.uses_string_dtype():
# for the small columns Index
new_allocation += 128

assert pa.total_allocated_bytes() == (prior_allocation + new_allocation)

# Check successful garbage collection
x = None # noqa
@@ -4110,7 +4137,10 @@ def test_dictionary_encoded_nested_to_pandas():

def test_dictionary_from_pandas():
cat = pd.Categorical(['a', 'b', 'a'])
expected_type = pa.dictionary(pa.int8(), pa.string())
expected_type = pa.dictionary(
pa.int8(),
pa.large_string() if _pandas_api.uses_string_dtype() else pa.string()
)

result = pa.array(cat)
assert result.to_pylist() == ['a', 'b', 'a']