
Commit 3b22273

Migrating to pandas=1.3.4 (#997)

* Migrating to pandas=1.3.4
* Fixing conda dependency resolution failure during SDC build

1 parent da003ef, commit 3b22273
14 files changed: +69 -46 lines

README.rst  (+2 -2)

@@ -85,7 +85,7 @@ Building on Linux with setuptools

     export PYVER=<3.6 or 3.7>
     export NUMPYVER=<1.16 or 1.17>
-    conda create -n sdc-env -q -y -c intel/label/beta -c defaults -c intel -c conda-forge python=$PYVER numpy=$NUMPYVER tbb-devel tbb4py numba=0.54.1 pandas=1.2.0 pyarrow=4.0.1 gcc_linux-64 gxx_linux-64
+    conda create -n sdc-env -q -y -c intel/label/beta -c defaults -c intel -c conda-forge python=$PYVER numpy=$NUMPYVER tbb-devel tbb4py numba=0.54.1 pandas=1.3.4 pyarrow=4.0.1 gcc_linux-64 gxx_linux-64
     source activate sdc-env
     git clone https://github.com/IntelPython/sdc.git
     cd sdc

@@ -123,7 +123,7 @@ Building on Windows with setuptools

     set PYVER=<3.6 or 3.7>
     set NUMPYVER=<1.16 or 1.17>
-    conda create -n sdc-env -c intel/label/beta -c defaults -c intel -c conda-forge python=%PYVER% numpy=%NUMPYVER% tbb-devel tbb4py numba=0.54.1 pandas=1.2.0 pyarrow=4.0.1
+    conda create -n sdc-env -c intel/label/beta -c defaults -c intel -c conda-forge python=%PYVER% numpy=%NUMPYVER% tbb-devel tbb4py numba=0.54.1 pandas=1.3.4 pyarrow=4.0.1
     conda activate sdc-env
     set INCLUDE=%INCLUDE%;%CONDA_PREFIX%\Library\include
     set LIB=%LIB%;%CONDA_PREFIX%\Library\lib

conda-recipe/meta.yaml  (+1 -2)

@@ -1,5 +1,5 @@
 {% set NUMBA_VERSION = "==0.54.1" %}
-{% set PANDAS_VERSION = "==1.2.0" %}
+{% set PANDAS_VERSION = "==1.3.4" %}
 {% set PYARROW_VERSION = "==4.0.1" %}

 package:

@@ -26,7 +26,6 @@ requirements:
     - setuptools
     - numba {{ NUMBA_VERSION }}
    - numpy
-    - pandas {{ PANDAS_VERSION }}
     - pyarrow {{ PYARROW_VERSION }}
     - wheel
     - tbb-devel

docs/source/getting_started.rst  (+2 -2)

@@ -41,14 +41,14 @@ Distribution includes Intel SDC for Python 3.6 and 3.7 for Windows and Linux pla
 Intel SDC conda package can be installed using the steps below:
 ::

-    > conda create -n sdc_env python=<3.7 or 3.6> pyarrow=4.0.1 pandas=1.2.0 -c anaconda -c conda-forge
+    > conda create -n sdc_env python=<3.7 or 3.6> pyarrow=4.0.1 pandas=1.3.4 -c anaconda -c conda-forge
     > conda activate sdc_env
     > conda install sdc -c intel/label/beta -c intel -c defaults -c conda-forge --override-channels

 Intel SDC wheel package can be installed using the steps below:
 ::

-    > conda create -n sdc_env python=<3.7 or 3.6> pip pyarrow=4.0.1 pandas=1.2.0 -c anaconda -c conda-forge
+    > conda create -n sdc_env python=<3.7 or 3.6> pip pyarrow=4.0.1 pandas=1.3.4 -c anaconda -c conda-forge
     > conda activate sdc_env
     > pip install --index-url https://pypi.anaconda.org/intel/label/beta/simple --extra-index-url https://pypi.anaconda.org/intel/simple --extra-index-url https://pypi.org/simple sdc


requirements.txt  (+1 -1)

@@ -1,5 +1,5 @@
 numpy>=1.16
-pandas==1.2.0
+pandas==1.3.4
 pyarrow==4.0.1
 numba==0.54.1
 tbb

sdc/datatypes/categorical/boxing.py  (+9 -11)

@@ -53,23 +53,21 @@ def box_Categorical(typ, val, c):
     pandas_module_name = c.context.insert_const_string(c.builder.module, "pandas")
     pandas_module = c.pyapi.import_module_noblock(pandas_module_name)

-    constructor = c.pyapi.object_getattr_string(pandas_module, "Categorical")
-
-    empty_list = c.pyapi.list_new(c.context.get_constant(types.intp, 0))
-    args = c.pyapi.tuple_pack([empty_list])
-    categorical = c.pyapi.call(constructor, args)
+    categorical_class = c.pyapi.object_getattr_string(pandas_module, "Categorical")
+    method_from_codes = c.pyapi.object_getattr_string(categorical_class, "from_codes")

     dtype = box_CategoricalDtype(typ.pd_dtype, val, c)
-    c.pyapi.object_setattr_string(categorical, "_dtype", dtype)
-
     codes = boxing.box_array(typ.codes, val, c)
-    c.pyapi.object_setattr_string(categorical, "_codes", codes)
+    py_none = c.pyapi.make_none()
+    args = c.pyapi.tuple_pack([codes, py_none, py_none, dtype])
+    categorical = c.pyapi.call(method_from_codes, args=args)

+    c.pyapi.decref(args)
+    c.pyapi.decref(py_none)
     c.pyapi.decref(codes)
     c.pyapi.decref(dtype)
-    c.pyapi.decref(args)
-    c.pyapi.decref(empty_list)
-    c.pyapi.decref(constructor)
+    c.pyapi.decref(method_from_codes)
+    c.pyapi.decref(categorical_class)
     c.pyapi.decref(pandas_module)
     return categorical
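
For reference, a minimal pandas-level sketch of what the new boxing path corresponds to (the values below are illustrative; in box_Categorical the codes and dtype are produced by Numba's boxing machinery):

    import pandas as pd

    # Illustrative inputs; the real ones come from boxing the native codes array
    # and the CategoricalDtype carried by the Numba type.
    dtype = pd.CategoricalDtype(categories=["a", "b", "c"], ordered=False)
    codes = [0, 2, 1, 0]

    # Old path (pandas 1.2): build an empty Categorical and assign private fields,
    # roughly: cat = pd.Categorical([]); cat._dtype = dtype; cat._codes = codes
    # This relied on pandas internals that changed in 1.3.x.

    # New path (pandas 1.3.4): use the public constructor, matching the boxed call
    # Categorical.from_codes(codes, None, None, dtype).
    cat = pd.Categorical.from_codes(codes, categories=None, ordered=None, dtype=dtype)
    print(cat)  # ['a', 'c', 'b', 'a'] with categories ['a', 'b', 'c']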

sdc/datatypes/hpat_pandas_functions.py  (+17 -10)

@@ -236,7 +236,13 @@ def _param_checker(x, accepted_types, defaults):

     # infer the resulting DF type as a numba type
     pandas_df_type = csv_reader_infer_nb_pandas_type(
-        py_filepath_or_buffer, py_sep, py_delimiter, py_names, py_usecols, py_dtype, py_skiprows, py_parse_dates
+        py_filepath_or_buffer,
+        delimiter=py_delimiter,
+        names=py_names,
+        usecols=py_usecols,
+        dtype=py_dtype,
+        skiprows=py_skiprows,
+        parse_dates=py_parse_dates
     )

     col_names = pandas_df_type.columns

@@ -260,6 +266,8 @@ def _check_usecol_type(py_val, py_type):
     use_user_converters = not (isinstance(converters, types.NoneType) or converters is None)
     if not use_user_converters:

+        # dtype parameter is deliberately captured into objmode as global value to avoid
+        # IR grow due to passing large tuples as function arguments
         def sdc_internal_read_csv_impl(filepath_or_buffer, sep, delimiter, names, usecols, dtype,
                                        converters, skiprows, parse_dates):
             with objmode(df=pandas_df_type):

@@ -278,7 +286,7 @@ def sdc_internal_read_csv_impl(filepath_or_buffer, sep, delimiter, names, usecol

                 # fix when PyArrow will support predicted categories
                 for cat_column_name in cat_columns_list:
-                    df[cat_column_name] = df[cat_column_name].astype(py_col_dtypes[cat_column_name])
+                    df[cat_column_name].cat.set_categories(py_col_dtypes[cat_column_name].categories, inplace=True)

                 return df

@@ -289,13 +297,12 @@ def sdc_internal_read_csv_impl(filepath_or_buffer, sep, delimiter, names, usecol
         converterted_cols = set(converters.fields)
         py_col_dtypes.update(dict.fromkeys(converterted_cols, 'str'))
         arrow_table_type = csv_reader_infer_nb_arrow_type(py_filepath_or_buffer,
-                                                          py_sep,
-                                                          py_delimiter,
-                                                          py_names,
-                                                          py_usecols,
-                                                          py_col_dtypes,
-                                                          py_skiprows,
-                                                          py_parse_dates)
+                                                          delimiter=py_delimiter,
+                                                          names=py_names,
+                                                          usecols=py_usecols,
+                                                          dtype=py_col_dtypes,
+                                                          skiprows=py_skiprows,
+                                                          parse_dates=py_parse_dates)

         n_cols = len(col_names)
         pa_table_type = PyarrowTableType()

@@ -340,7 +347,7 @@ def sdc_internal_read_csv_impl(filepath_or_buffer, sep, delimiter, names, usecol
                 col_as_series = pa_table.column(col_names[i]).to_pandas(categories=cat_columns_list)
                 # fix when PyArrow will support predicted categories
                 if isinstance(col_as_series, pd.CategoricalDtype):
-                    col_as_series = col_as_series.astype(py_col_dtypes[col_names[i]])
+                    col_as_series.cat.set_categories(py_col_dtypes[col_names[i]], inplace=True)
                 ret_cols[i] = col_as_series

         maybe_unboxed_columns = tuple(ret_cols)
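
A minimal sketch (with illustrative data, not taken from the commit) of the pandas-level difference behind the two categorical hunks above: instead of re-casting a parsed column with astype, the code now keeps the parsed values and only aligns the categories with the expected CategoricalDtype:

    import pandas as pd

    # Illustrative column; in read_csv above the data comes from pyarrow's
    # to_pandas(categories=...) and py_col_dtypes holds the expected dtype per column.
    s = pd.Series(["a", "b", "a"], dtype="category")
    expected = pd.CategoricalDtype(categories=["a", "b", "c"])

    # Old approach (pandas 1.2 code path): re-cast the whole column to the expected dtype.
    recast = s.astype(expected)

    # New approach: keep the existing codes and only widen the categories.
    # The commit passes inplace=True to avoid reassigning the column; shown here without it.
    aligned = s.cat.set_categories(expected.categories)

    assert list(aligned.cat.categories) == list(recast.cat.categories) == ["a", "b", "c"]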

sdc/datatypes/hpat_pandas_series_rolling_functions.py  (+3 -1)

@@ -1005,7 +1005,9 @@ def hpat_pandas_series_rolling_cov(self, other=None, pairwise=None, ddof=1):
     _hpat_pandas_series_rolling_cov_check_types(self, other=other,
                                                 pairwise=pairwise, ddof=ddof)

-    return _gen_hpat_pandas_rolling_series_cov_impl(other)
+    # prior to pandas_#39388 df.rolling.cov was different from series cov in handling inf values
+    # so this specific overload had align_finiteness=False
+    return _gen_hpat_pandas_rolling_series_cov_impl(other, align_finiteness=True)


 @sdc_overload_method(SeriesRollingType, 'kurt')
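
A short usage sketch of the API this overload targets, with illustrative data; per the comment in the diff, after pandas#39388 Series.rolling(...).cov and DataFrame.rolling(...).cov handle inf values the same way, so the overload now always aligns finiteness:

    import numpy as np
    import pandas as pd

    # Illustrative series containing a non-finite value.
    s = pd.Series([1.0, 2.0, np.inf, 4.0, 5.0])
    other = pd.Series([2.0, 1.0, 3.0, 5.0, 4.0])

    # Windows that include the inf value do not yield a finite covariance.
    print(s.rolling(window=3).cov(other))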

sdc/io/csv_ext.py  (+11 -11)

@@ -93,8 +93,11 @@ def wrapper(*args, **kwargs):
 @pyarrow_cpu_count_equal_numba_num_treads
 def do_read_csv(filepath_or_buffer, sep, delimiter, names, usecols, dtype, skiprows, parse_dates):

+    if delimiter is None:
+        delimiter = sep
+
     pa_options = get_pyarrow_read_csv_options(
-        sep, delimiter, names, usecols, dtype, skiprows, parse_dates)
+        delimiter, names, usecols, dtype, skiprows, parse_dates)

     table = csv.read_csv(
         filepath_or_buffer,

@@ -107,11 +110,11 @@ def do_read_csv(filepath_or_buffer, sep, delimiter, names, usecols, dtype, skipr


 def csv_reader_infer_nb_arrow_type(
-        filepath_or_buffer, sep, delimiter, names, usecols, dtype, skiprows, parse_dates
+        filepath_or_buffer, delimiter=',', names=None, usecols=None, dtype=None, skiprows=None, parse_dates=False
 ):

     read_opts, parse_opts, convert_opts = get_pyarrow_read_csv_options(
-        sep, delimiter, names, usecols, dtype, skiprows, parse_dates)
+        delimiter, names, usecols, dtype, skiprows, parse_dates)
     csv_reader = csv.open_csv(filepath_or_buffer,
                               read_options=read_opts,
                               parse_options=parse_opts,

@@ -138,13 +141,13 @@ def csv_reader_infer_nb_arrow_type(


 def csv_reader_infer_nb_pandas_type(
-        filepath_or_buffer, sep, delimiter, names, usecols, dtype, skiprows, parse_dates
+        filepath_or_buffer, delimiter=',', names=None, usecols=None, dtype=None, skiprows=None, parse_dates=False
 ):

     # infer column types from the first block (similarly as Arrow does this)
     # TO-DO: tune the block size or allow user configure it via env var
     rows_to_read = 1000
-    df = pd.read_csv(filepath_or_buffer, sep=sep, delimiter=delimiter, names=names,
+    df = pd.read_csv(filepath_or_buffer, delimiter=delimiter, names=names,
                      usecols=usecols, dtype=dtype, skiprows=skiprows, nrows=rows_to_read,
                      parse_dates=parse_dates)


@@ -185,10 +188,7 @@ def csv_reader_get_pyarrow_read_options(names, skiprows):
     return read_options


-def csv_reader_get_pyarrow_parse_options(delimiter, sep):
-
-    if delimiter is None:
-        delimiter = sep
+def csv_reader_get_pyarrow_parse_options(delimiter):

     parse_options = csv.ParseOptions(
         delimiter=delimiter,

@@ -264,11 +264,11 @@ def csv_reader_get_pyarrow_convert_options(names, usecols, dtype, parse_dates):
     return convert_options


-def get_pyarrow_read_csv_options(sep, delimiter, names, usecols, dtype, skiprows, parse_dates):
+def get_pyarrow_read_csv_options(delimiter, names, usecols, dtype, skiprows, parse_dates):
     """ This function attempts to map pandas read_csv parameters to pyarrow read_csv options to be used """

     read_opts = csv_reader_get_pyarrow_read_options(names, skiprows)
-    parse_opts = csv_reader_get_pyarrow_parse_options(delimiter, sep)
+    parse_opts = csv_reader_get_pyarrow_parse_options(delimiter)
     convert_opts = csv_reader_get_pyarrow_convert_options(names, usecols, dtype, parse_dates)

     return (read_opts, parse_opts, convert_opts)
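
A minimal sketch of the normalization moved into do_read_csv, using illustrative in-memory data; resolve_delimiter is a hypothetical helper written only to show the folding (in the commit the same two-line check sits at the top of do_read_csv), since pandas treats delimiter as an alias of sep while pyarrow only knows delimiter:

    import io

    import pandas as pd
    from pyarrow import csv as pa_csv

    # Hypothetical helper mirroring the new check in do_read_csv:
    # fold sep into delimiter once, then pass only delimiter downstream.
    def resolve_delimiter(sep=",", delimiter=None):
        return sep if delimiter is None else delimiter

    data = "a;b\n1;2\n3;4\n"
    delim = resolve_delimiter(sep=";")

    # pandas path (type inference over the first block, as in csv_reader_infer_nb_pandas_type)
    df = pd.read_csv(io.StringIO(data), delimiter=delim)

    # pyarrow path (the actual parallel read in do_read_csv)
    table = pa_csv.read_csv(io.BytesIO(data.encode()),
                            parse_options=pa_csv.ParseOptions(delimiter=delim))

    assert df.shape == (2, 2) and table.num_rows == 2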

sdc/tests/categorical/__init__.py  (+4 -4)

@@ -24,7 +24,7 @@
 # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 # *****************************************************************************

-from . import test_categorical
-from . import test_categoricaldtype
-from . import test_series_category
-from . import test_df_category
+from .test_categorical import *
+from .test_categoricaldtype import *
+from .test_series_category import *
+from .test_df_category import *

sdc/tests/categorical/test_categorical.py  (+5)

@@ -26,6 +26,7 @@

 from sdc.tests.test_base import TestCase

+import unittest
 import pandas as pd
 import numba as nb
 from numba import types

@@ -91,3 +92,7 @@ def func():

         boxed = func()
         assert(boxed.equals(self._pd_value()))
+
+
+if __name__ == "__main__":
+    unittest.main()

sdc/tests/categorical/test_df_category.py  (+5)

@@ -26,6 +26,7 @@

 from sdc.tests.test_base import TestCase

+import unittest
 import numpy as np
 import pandas as pd
 import numba as nb

@@ -123,3 +124,7 @@ def func():

         boxed = func()
         assert(boxed.equals(self._pd_value()))
+
+
+if __name__ == "__main__":
+    unittest.main()

sdc/tests/categorical/test_series_category.py  (+5)

@@ -26,6 +26,7 @@

 from sdc.tests.test_base import TestCase

+import unittest
 import pandas as pd
 import numpy as np
 import numba as nb

@@ -114,3 +115,7 @@ def func():

         boxed = func()
         assert(boxed.equals(self._pd_value()))
+
+
+if __name__ == "__main__":
+    unittest.main()

sdc/tests/test_groupby.py  (+3 -1)

@@ -211,6 +211,7 @@ def test_impl():

         pd.testing.assert_frame_equal(sdc_impl(), test_impl(), **kwargs)

+    @unittest.expectedFailure  # FIXME_pandas#43292: pandas groupby.sum impl broken
     def test_dataframe_groupby_mean(self):
         def test_impl(df):
             return df.groupby('A').mean()

@@ -222,6 +223,7 @@ def test_impl(df):
         # TODO: implement index classes, as current indexes do not have names
         pd.testing.assert_frame_equal(result, result_ref, check_names=False)

+    @unittest.expectedFailure  # FIXME_pandas#43292: pandas groupby.sum impl broken
     def test_dataframe_groupby_mean_no_unboxing(self):
         def test_impl():
             df = pd.DataFrame({

@@ -267,7 +269,6 @@ def test_impl():
         # TODO: implement index classes, as current indexes do not have names
         pd.testing.assert_frame_equal(result_jit, result_ref, check_names=False)

-    @unittest.expectedFailure  # pandas groupby.median returns unstable dtype (int or float) unlike series.median
     def test_dataframe_groupby_median_result_dtype(self):
         def test_impl(df):
             return df.groupby('A').median()

@@ -326,6 +327,7 @@ def test_impl(df):
         # TODO: implement index classes, as current indexes do not have names
         pd.testing.assert_frame_equal(result, result_ref, check_names=False)

+    @unittest.expectedFailure  # FIXME_pandas#43292: pandas groupby.sum impl broken
     def test_dataframe_groupby_sum_no_unboxing(self):
         def test_impl():
             df = pd.DataFrame({

setup.py  (+1 -1)

@@ -430,7 +430,7 @@ def run(self):
     package_data={'sdc.tests': ['*.bz2'], },
     install_requires=[
         'numpy>=1.16',
-        'pandas==1.2.0',
+        'pandas==1.3.4',
         'pyarrow==4.0.1',
         'numba==0.54.1',
         'tbb'
