
Commit 3b22273

Migrating to pandas=1.3.4 (#997)

* Migrating to pandas=1.3.4
* Fixing conda dependency resolution failure during SDC build

1 parent da003ef, commit 3b22273
14 files changed: +69 -46 lines

README.rst  (+2 -2)

@@ -85,7 +85,7 @@ Building on Linux with setuptools

     export PYVER=<3.6 or 3.7>
     export NUMPYVER=<1.16 or 1.17>
-    conda create -n sdc-env -q -y -c intel/label/beta -c defaults -c intel -c conda-forge python=$PYVER numpy=$NUMPYVER tbb-devel tbb4py numba=0.54.1 pandas=1.2.0 pyarrow=4.0.1 gcc_linux-64 gxx_linux-64
+    conda create -n sdc-env -q -y -c intel/label/beta -c defaults -c intel -c conda-forge python=$PYVER numpy=$NUMPYVER tbb-devel tbb4py numba=0.54.1 pandas=1.3.4 pyarrow=4.0.1 gcc_linux-64 gxx_linux-64
     source activate sdc-env
     git clone https://github.com/IntelPython/sdc.git
     cd sdc

@@ -123,7 +123,7 @@ Building on Windows with setuptools

     set PYVER=<3.6 or 3.7>
     set NUMPYVER=<1.16 or 1.17>
-    conda create -n sdc-env -c intel/label/beta -c defaults -c intel -c conda-forge python=%PYVER% numpy=%NUMPYVER% tbb-devel tbb4py numba=0.54.1 pandas=1.2.0 pyarrow=4.0.1
+    conda create -n sdc-env -c intel/label/beta -c defaults -c intel -c conda-forge python=%PYVER% numpy=%NUMPYVER% tbb-devel tbb4py numba=0.54.1 pandas=1.3.4 pyarrow=4.0.1
     conda activate sdc-env
     set INCLUDE=%INCLUDE%;%CONDA_PREFIX%\Library\include
     set LIB=%LIB%;%CONDA_PREFIX%\Library\lib

conda-recipe/meta.yaml  (+1 -2)

@@ -1,5 +1,5 @@
 {% set NUMBA_VERSION = "==0.54.1" %}
-{% set PANDAS_VERSION = "==1.2.0" %}
+{% set PANDAS_VERSION = "==1.3.4" %}
 {% set PYARROW_VERSION = "==4.0.1" %}

 package:

@@ -26,7 +26,6 @@ requirements:
     - setuptools
     - numba {{ NUMBA_VERSION }}
    - numpy
-    - pandas {{ PANDAS_VERSION }}
     - pyarrow {{ PYARROW_VERSION }}
     - wheel
     - tbb-devel

docs/source/getting_started.rst  (+2 -2)

@@ -41,14 +41,14 @@ Distribution includes Intel SDC for Python 3.6 and 3.7 for Windows and Linux pla
 Intel SDC conda package can be installed using the steps below:
 ::

-    > conda create -n sdc_env python=<3.7 or 3.6> pyarrow=4.0.1 pandas=1.2.0 -c anaconda -c conda-forge
+    > conda create -n sdc_env python=<3.7 or 3.6> pyarrow=4.0.1 pandas=1.3.4 -c anaconda -c conda-forge
     > conda activate sdc_env
     > conda install sdc -c intel/label/beta -c intel -c defaults -c conda-forge --override-channels

 Intel SDC wheel package can be installed using the steps below:
 ::

-    > conda create -n sdc_env python=<3.7 or 3.6> pip pyarrow=4.0.1 pandas=1.2.0 -c anaconda -c conda-forge
+    > conda create -n sdc_env python=<3.7 or 3.6> pip pyarrow=4.0.1 pandas=1.3.4 -c anaconda -c conda-forge
     > conda activate sdc_env
     > pip install --index-url https://pypi.anaconda.org/intel/label/beta/simple --extra-index-url https://pypi.anaconda.org/intel/simple --extra-index-url https://pypi.org/simple sdc


requirements.txt  (+1 -1)

@@ -1,5 +1,5 @@
 numpy>=1.16
-pandas==1.2.0
+pandas==1.3.4
 pyarrow==4.0.1
 numba==0.54.1
 tbb

sdc/datatypes/categorical/boxing.py  (+9 -11)

@@ -53,23 +53,21 @@ def box_Categorical(typ, val, c):
     pandas_module_name = c.context.insert_const_string(c.builder.module, "pandas")
     pandas_module = c.pyapi.import_module_noblock(pandas_module_name)

-    constructor = c.pyapi.object_getattr_string(pandas_module, "Categorical")
-
-    empty_list = c.pyapi.list_new(c.context.get_constant(types.intp, 0))
-    args = c.pyapi.tuple_pack([empty_list])
-    categorical = c.pyapi.call(constructor, args)
+    categorical_class = c.pyapi.object_getattr_string(pandas_module, "Categorical")
+    method_from_codes = c.pyapi.object_getattr_string(categorical_class, "from_codes")

     dtype = box_CategoricalDtype(typ.pd_dtype, val, c)
-    c.pyapi.object_setattr_string(categorical, "_dtype", dtype)
-
     codes = boxing.box_array(typ.codes, val, c)
-    c.pyapi.object_setattr_string(categorical, "_codes", codes)
+    py_none = c.pyapi.make_none()
+    args = c.pyapi.tuple_pack([codes, py_none, py_none, dtype])
+    categorical = c.pyapi.call(method_from_codes, args=args)

+    c.pyapi.decref(args)
+    c.pyapi.decref(py_none)
     c.pyapi.decref(codes)
     c.pyapi.decref(dtype)
-    c.pyapi.decref(args)
-    c.pyapi.decref(empty_list)
-    c.pyapi.decref(constructor)
+    c.pyapi.decref(method_from_codes)
+    c.pyapi.decref(categorical_class)
     c.pyapi.decref(pandas_module)
     return categorical
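
For reference, a minimal pandas-level sketch of what the new boxing path corresponds to (the values below are illustrative; in box_Categorical the codes and dtype are produced by Numba's boxing machinery):

    import pandas as pd

    # Illustrative inputs; the real ones come from boxing the native codes array
    # and the CategoricalDtype carried by the Numba type.
    dtype = pd.CategoricalDtype(categories=["a", "b", "c"], ordered=False)
    codes = [0, 2, 1, 0]

    # Old path (pandas 1.2): build an empty Categorical and assign private fields,
    # roughly: cat = pd.Categorical([]); cat._dtype = dtype; cat._codes = codes
    # This relied on pandas internals that changed in 1.3.x.

    # New path (pandas 1.3.4): use the public constructor, matching the boxed call
    # Categorical.from_codes(codes, None, None, dtype).
    cat = pd.Categorical.from_codes(codes, categories=None, ordered=None, dtype=dtype)
    print(cat)  # ['a', 'c', 'b', 'a'] with categories ['a', 'b', 'c']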

sdc/datatypes/hpat_pandas_functions.py  (+17 -10)

@@ -236,7 +236,13 @@ def _param_checker(x, accepted_types, defaults):

     # infer the resulting DF type as a numba type
     pandas_df_type = csv_reader_infer_nb_pandas_type(
-        py_filepath_or_buffer, py_sep, py_delimiter, py_names, py_usecols, py_dtype, py_skiprows, py_parse_dates
+        py_filepath_or_buffer,
+        delimiter=py_delimiter,
+        names=py_names,
+        usecols=py_usecols,
+        dtype=py_dtype,
+        skiprows=py_skiprows,
+        parse_dates=py_parse_dates
     )

     col_names = pandas_df_type.columns

@@ -260,6 +266,8 @@ def _check_usecol_type(py_val, py_type):
     use_user_converters = not (isinstance(converters, types.NoneType) or converters is None)
     if not use_user_converters:

+        # dtype parameter is deliberately captured into objmode as global value to avoid
+        # IR grow due to passing large tuples as function arguments
         def sdc_internal_read_csv_impl(filepath_or_buffer, sep, delimiter, names, usecols, dtype,
                                        converters, skiprows, parse_dates):
             with objmode(df=pandas_df_type):

@@ -278,7 +286,7 @@ def sdc_internal_read_csv_impl(filepath_or_buffer, sep, delimiter, names, usecol

                 # fix when PyArrow will support predicted categories
                 for cat_column_name in cat_columns_list:
-                    df[cat_column_name] = df[cat_column_name].astype(py_col_dtypes[cat_column_name])
+                    df[cat_column_name].cat.set_categories(py_col_dtypes[cat_column_name].categories, inplace=True)

                 return df

@@ -289,13 +297,12 @@ def sdc_internal_read_csv_impl(filepath_or_buffer, sep, delimiter, names, usecol
         converterted_cols = set(converters.fields)
         py_col_dtypes.update(dict.fromkeys(converterted_cols, 'str'))
         arrow_table_type = csv_reader_infer_nb_arrow_type(py_filepath_or_buffer,
-                                                          py_sep,
-                                                          py_delimiter,
-                                                          py_names,
-                                                          py_usecols,
-                                                          py_col_dtypes,
-                                                          py_skiprows,
-                                                          py_parse_dates)
+                                                          delimiter=py_delimiter,
+                                                          names=py_names,
+                                                          usecols=py_usecols,
+                                                          dtype=py_col_dtypes,
+                                                          skiprows=py_skiprows,
+                                                          parse_dates=py_parse_dates)

         n_cols = len(col_names)
         pa_table_type = PyarrowTableType()

@@ -340,7 +347,7 @@ def sdc_internal_read_csv_impl(filepath_or_buffer, sep, delimiter, names, usecol
                 col_as_series = pa_table.column(col_names[i]).to_pandas(categories=cat_columns_list)
                 # fix when PyArrow will support predicted categories
                 if isinstance(col_as_series, pd.CategoricalDtype):
-                    col_as_series = col_as_series.astype(py_col_dtypes[col_names[i]])
+                    col_as_series.cat.set_categories(py_col_dtypes[col_names[i]], inplace=True)
                 ret_cols[i] = col_as_series

         maybe_unboxed_columns = tuple(ret_cols)
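
A minimal sketch (with illustrative data, not taken from the commit) of the pandas-level difference behind the two categorical hunks above: instead of re-casting a parsed column with astype, the code now keeps the parsed values and only aligns the categories with the expected CategoricalDtype:

    import pandas as pd

    # Illustrative column; in read_csv above the data comes from pyarrow's
    # to_pandas(categories=...) and py_col_dtypes holds the expected dtype per column.
    s = pd.Series(["a", "b", "a"], dtype="category")
    expected = pd.CategoricalDtype(categories=["a", "b", "c"])

    # Old approach (pandas 1.2 code path): re-cast the whole column to the expected dtype.
    recast = s.astype(expected)

    # New approach: keep the existing codes and only widen the categories.
    # The commit passes inplace=True to avoid reassigning the column; shown here without it.
    aligned = s.cat.set_categories(expected.categories)

    assert list(aligned.cat.categories) == list(recast.cat.categories) == ["a", "b", "c"]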

sdc/datatypes/hpat_pandas_series_rolling_functions.py  (+3 -1)

@@ -1005,7 +1005,9 @@ def hpat_pandas_series_rolling_cov(self, other=None, pairwise=None, ddof=1):
     _hpat_pandas_series_rolling_cov_check_types(self, other=other,
                                                 pairwise=pairwise, ddof=ddof)

-    return _gen_hpat_pandas_rolling_series_cov_impl(other)
+    # prior to pandas_#39388 df.rolling.cov was different from series cov in handling inf values
+    # so this specific overload had align_finiteness=False
+    return _gen_hpat_pandas_rolling_series_cov_impl(other, align_finiteness=True)


 @sdc_overload_method(SeriesRollingType, 'kurt')
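
A short usage sketch of the API this overload targets, with illustrative data; per the comment in the diff, after pandas#39388 Series.rolling(...).cov and DataFrame.rolling(...).cov handle inf values the same way, so the overload now always aligns finiteness:

    import numpy as np
    import pandas as pd

    # Illustrative series containing a non-finite value.
    s = pd.Series([1.0, 2.0, np.inf, 4.0, 5.0])
    other = pd.Series([2.0, 1.0, 3.0, 5.0, 4.0])

    # Windows that include the inf value do not yield a finite covariance.
    print(s.rolling(window=3).cov(other))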

sdc/io/csv_ext.py  (+11 -11)

@@ -93,8 +93,11 @@ def wrapper(*args, **kwargs):
 @pyarrow_cpu_count_equal_numba_num_treads
 def do_read_csv(filepath_or_buffer, sep, delimiter, names, usecols, dtype, skiprows, parse_dates):

+    if delimiter is None:
+        delimiter = sep
+
     pa_options = get_pyarrow_read_csv_options(
-        sep, delimiter, names, usecols, dtype, skiprows, parse_dates)
+        delimiter, names, usecols, dtype, skiprows, parse_dates)

     table = csv.read_csv(
         filepath_or_buffer,

@@ -107,11 +110,11 @@ def do_read_csv(filepath_or_buffer, sep, delimiter, names, usecols, dtype, skipr


 def csv_reader_infer_nb_arrow_type(
-        filepath_or_buffer, sep, delimiter, names, usecols, dtype, skiprows, parse_dates
+        filepath_or_buffer, delimiter=',', names=None, usecols=None, dtype=None, skiprows=None, parse_dates=False
 ):

     read_opts, parse_opts, convert_opts = get_pyarrow_read_csv_options(
-        sep, delimiter, names, usecols, dtype, skiprows, parse_dates)
+        delimiter, names, usecols, dtype, skiprows, parse_dates)
     csv_reader = csv.open_csv(filepath_or_buffer,
                               read_options=read_opts,
                               parse_options=parse_opts,

@@ -138,13 +141,13 @@ def csv_reader_infer_nb_arrow_type(


 def csv_reader_infer_nb_pandas_type(
-        filepath_or_buffer, sep, delimiter, names, usecols, dtype, skiprows, parse_dates
+        filepath_or_buffer, delimiter=',', names=None, usecols=None, dtype=None, skiprows=None, parse_dates=False
 ):

     # infer column types from the first block (similarly as Arrow does this)
     # TO-DO: tune the block size or allow user configure it via env var
     rows_to_read = 1000
-    df = pd.read_csv(filepath_or_buffer, sep=sep, delimiter=delimiter, names=names,
+    df = pd.read_csv(filepath_or_buffer, delimiter=delimiter, names=names,
                      usecols=usecols, dtype=dtype, skiprows=skiprows, nrows=rows_to_read,
                      parse_dates=parse_dates)


@@ -185,10 +188,7 @@ def csv_reader_get_pyarrow_read_options(names, skiprows):
     return read_options


-def csv_reader_get_pyarrow_parse_options(delimiter, sep):
-
-    if delimiter is None:
-        delimiter = sep
+def csv_reader_get_pyarrow_parse_options(delimiter):

     parse_options = csv.ParseOptions(
         delimiter=delimiter,

@@ -264,11 +264,11 @@ def csv_reader_get_pyarrow_convert_options(names, usecols, dtype, parse_dates):
     return convert_options


-def get_pyarrow_read_csv_options(sep, delimiter, names, usecols, dtype, skiprows, parse_dates):
+def get_pyarrow_read_csv_options(delimiter, names, usecols, dtype, skiprows, parse_dates):
     """ This function attempts to map pandas read_csv parameters to pyarrow read_csv options to be used """

     read_opts = csv_reader_get_pyarrow_read_options(names, skiprows)
-    parse_opts = csv_reader_get_pyarrow_parse_options(delimiter, sep)
+    parse_opts = csv_reader_get_pyarrow_parse_options(delimiter)
     convert_opts = csv_reader_get_pyarrow_convert_options(names, usecols, dtype, parse_dates)

     return (read_opts, parse_opts, convert_opts)
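
A minimal sketch of the normalization moved into do_read_csv, using illustrative in-memory data; resolve_delimiter is a hypothetical helper written only to show the folding (in the commit the same two-line check sits at the top of do_read_csv), since pandas treats delimiter as an alias of sep while pyarrow only knows delimiter:

    import io

    import pandas as pd
    from pyarrow import csv as pa_csv

    # Hypothetical helper mirroring the new check in do_read_csv:
    # fold sep into delimiter once, then pass only delimiter downstream.
    def resolve_delimiter(sep=",", delimiter=None):
        return sep if delimiter is None else delimiter

    data = "a;b\n1;2\n3;4\n"
    delim = resolve_delimiter(sep=";")

    # pandas path (type inference over the first block, as in csv_reader_infer_nb_pandas_type)
    df = pd.read_csv(io.StringIO(data), delimiter=delim)

    # pyarrow path (the actual parallel read in do_read_csv)
    table = pa_csv.read_csv(io.BytesIO(data.encode()),
                            parse_options=pa_csv.ParseOptions(delimiter=delim))

    assert df.shape == (2, 2) and table.num_rows == 2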

sdc/tests/categorical/__init__.py  (+4 -4)

@@ -24,7 +24,7 @@
 # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 # *****************************************************************************

-from . import test_categorical
-from . import test_categoricaldtype
-from . import test_series_category
-from . import test_df_category
+from .test_categorical import *
+from .test_categoricaldtype import *
+from .test_series_category import *
+from .test_df_category import *

sdc/tests/categorical/test_categorical.py  (+5)

@@ -26,6 +26,7 @@

 from sdc.tests.test_base import TestCase

+import unittest
 import pandas as pd
 import numba as nb
 from numba import types

@@ -91,3 +92,7 @@ def func():

         boxed = func()
         assert(boxed.equals(self._pd_value()))
+
+
+if __name__ == "__main__":
+    unittest.main()

sdc/tests/categorical/test_df_category.py  (+5)

@@ -26,6 +26,7 @@

 from sdc.tests.test_base import TestCase

+import unittest
 import numpy as np
 import pandas as pd
 import numba as nb

@@ -123,3 +124,7 @@ def func():

         boxed = func()
         assert(boxed.equals(self._pd_value()))
+
+
+if __name__ == "__main__":
+    unittest.main()

sdc/tests/categorical/test_series_category.py  (+5)

@@ -26,6 +26,7 @@

 from sdc.tests.test_base import TestCase

+import unittest
 import pandas as pd
 import numpy as np
 import numba as nb

@@ -114,3 +115,7 @@ def func():

         boxed = func()
         assert(boxed.equals(self._pd_value()))
+
+
+if __name__ == "__main__":
+    unittest.main()

sdc/tests/test_groupby.py  (+3 -1)

@@ -211,6 +211,7 @@ def test_impl():

         pd.testing.assert_frame_equal(sdc_impl(), test_impl(), **kwargs)

+    @unittest.expectedFailure  # FIXME_pandas#43292: pandas groupby.sum impl broken
     def test_dataframe_groupby_mean(self):
         def test_impl(df):
             return df.groupby('A').mean()

@@ -222,6 +223,7 @@ def test_impl(df):
         # TODO: implement index classes, as current indexes do not have names
         pd.testing.assert_frame_equal(result, result_ref, check_names=False)

+    @unittest.expectedFailure  # FIXME_pandas#43292: pandas groupby.sum impl broken
     def test_dataframe_groupby_mean_no_unboxing(self):
         def test_impl():
             df = pd.DataFrame({

@@ -267,7 +269,6 @@ def test_impl():
         # TODO: implement index classes, as current indexes do not have names
         pd.testing.assert_frame_equal(result_jit, result_ref, check_names=False)

-    @unittest.expectedFailure  # pandas groupby.median returns unstable dtype (int or float) unlike series.median
     def test_dataframe_groupby_median_result_dtype(self):
         def test_impl(df):
             return df.groupby('A').median()

@@ -326,6 +327,7 @@ def test_impl(df):
         # TODO: implement index classes, as current indexes do not have names
         pd.testing.assert_frame_equal(result, result_ref, check_names=False)

+    @unittest.expectedFailure  # FIXME_pandas#43292: pandas groupby.sum impl broken
     def test_dataframe_groupby_sum_no_unboxing(self):
         def test_impl():
             df = pd.DataFrame({

setup.py  (+1 -1)

@@ -430,7 +430,7 @@ def run(self):
     package_data={'sdc.tests': ['*.bz2'], },
     install_requires=[
         'numpy>=1.16',
-        'pandas==1.2.0',
+        'pandas==1.3.4',
         'pyarrow==4.0.1',
         'numba==0.54.1',
         'tbb'
