diff --git a/sdc/datatypes/hpat_pandas_dataframe_types.py b/sdc/datatypes/hpat_pandas_dataframe_types.py deleted file mode 100644 index 4ecd57375..000000000 --- a/sdc/datatypes/hpat_pandas_dataframe_types.py +++ /dev/null @@ -1,226 +0,0 @@ -# ***************************************************************************** -# Copyright (c) 2019-2020, Intel Corporation All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# ***************************************************************************** - -""" - -| :class:`pandas.DataFrame` type implementation in Intel SDC -| Also, it contains related types and iterators for DataFrame type handling - -""" - - -import operator -import pandas - -from numba import types -from numba.core import cgutils -from numba.extending import (models, overload, register_model, make_attribute_wrapper, intrinsic, box, unbox) -from numba.core.datamodel import register_default, StructModel -from numba.core.typing.templates import signature, infer_global, AbstractTemplate - - -class DataFrameTypeIterator(types.SimpleIteratorType): - """ - Iterator type for DataFrameType type - - Members - ---------- - _data: :class:`DataFrameType` - input arg - """ - - def __init__(self, data=None): - self.data = data - - super(DataFrameTypeIterator, self).__init__("DataFrameTypeIterator(data={})".format(self.data), data) - - -@register_default(DataFrameTypeIterator) -class DataFrameTypeIteratorModel(StructModel): - """ - Model for DataFrameTypeIterator type - All members must be the same as main type for this model - - Test: - """ - - def __init__(self, dmm, fe_type): - members = [ - ('data', fe_type.data), - ] - super(DataFrameTypeIteratorModel, self).__init__(dmm, fe_type, members) - - -make_attribute_wrapper(DataFrameTypeIterator, 'data', '_data') - - -class DataFrameType(types.IterableType): - """ - Type definition for DataFrame functions handling. - - Members - ---------- - data: Dictinary of :class:`SeriesType` - input arg - - index: DataFrame index - *unsupported* - - Dictinary looks a bit ambigous due to keys are column names which already presented in Series. - This type selected due to pandas.DataFrame interprets input :class:`SeriesType` as rows instead - expected columns if passed as a list. - This data is interpreted as columns if passed as a dictinary only. - - Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_create - """ - - def __init__(self, data=None): - - self.data = data - - type_str = "DataFrameType(data={})".format(self.data) - super(DataFrameType, self).__init__(type_str) - - @property - def iterator_type(self): - return DataFrameTypeIterator(self) - - -@register_model(DataFrameType) -class DataFrameTypeModel(StructModel): - """ - Model for DataFrameType type - All members must be the same as main type for this model - - Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_create_numeric_column - """ - - def __init__(self, dmm, fe_type): - members = [ - ('data', fe_type.data) - ] - models.StructModel.__init__(self, dmm, fe_type, members) - - -make_attribute_wrapper(DataFrameType, 'data', '_data') - - -@intrinsic -def _hpat_pandas_dataframe_init(typingctx, data=None): - """ - Internal Numba required function to register DataFrameType and - connect it with corresponding Python type mentioned in @overload(pandas.DataFrame) - """ - - def _hpat_pandas_dataframe_init_codegen(context, builder, signature, args): - """ - It is looks like it creates DataFrameModel structure - - - Fixed number of parameters. Must be 4 - - increase reference counr for the data - """ - - [data_val] = args - - dataframe = cgutils.create_struct_proxy(signature.return_type)(context, builder) - dataframe.data = data_val - - if context.enable_nrt: - context.nrt.incref(builder, data, dataframe.data) - - return dataframe._getvalue() - - ret_typ = DataFrameType(data) - sig = signature(ret_typ, data) - """ - Construct signature of the Numba DataFrameType::ctor() - """ - - return sig, _hpat_pandas_dataframe_init_codegen - - -@overload(pandas.DataFrame) -def hpat_pandas_dataframe(data=None, index=None, columns=None, dtype=None, copy=False): - """ - Special Numba procedure to overload Python type pandas.DataFrame::ctor() with Numba registered model - """ - - if isinstance(data, types.DictType): - def hpat_pandas_dataframe_impl(data=None, index=None, columns=None, dtype=None, copy=False): - series_dict = {} - series_list = [] - - for key, value in data.items(): - """ - Convert input dictionary with: - key - unicode string - value - array - into dictinary of pandas.Series with same names and values - """ - - series_item = pandas.Series(data=value, name=key) - series_dict[key] = series_item - series_list.append(series_item) - - # return _hpat_pandas_dataframe_init(series_dict) - return _hpat_pandas_dataframe_init(series_list) - - return hpat_pandas_dataframe_impl - - -@box(DataFrameType) -def hpat_pandas_dataframe_box(typ, val, c): - """ - This method is to copy data from JITted region data structure - to new Python object data structure. - Python object data structure has creating in this procedure. - """ - - dataframe = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val) - - ir_ptr_data = c.box(typ.data, dataframe.data) - - dataframe_ctor_args = c.pyapi.tuple_pack([ir_ptr_data, ]) - # dataframe_ctor_kwargs = c.pyapi.dict_pack([("data", ir_ptr_data), ]) - """ - It is better to use kwargs but it fails into SIGSEGV - """ - - dataframe_ctor_fn = c.pyapi.unserialize(c.pyapi.serialize_object(pandas.DataFrame)) - """ - Create a pandas.DataFrame ctor() function pointer - """ - - df_obj = c.pyapi.call(dataframe_ctor_fn, dataframe_ctor_args) # kws=dataframe_ctor_kwargs) - """ - Call pandas.DataFrame function pointer with parameters - """ - - c.pyapi.decref(ir_ptr_data) - c.pyapi.decref(dataframe_ctor_args) - c.pyapi.decref(dataframe_ctor_fn) - - return df_obj diff --git a/sdc/datatypes/hpat_pandas_functions.py b/sdc/datatypes/hpat_pandas_functions.py index 781c7df98..403d45d4e 100644 --- a/sdc/datatypes/hpat_pandas_functions.py +++ b/sdc/datatypes/hpat_pandas_functions.py @@ -46,7 +46,7 @@ from sdc.types import CategoricalDtypeType, Categorical from sdc.datatypes.categorical.pdimpl import _reconstruct_CategoricalDtype from sdc.utilities.utils import sdc_overload -from sdc.utilities.sdc_typing_utils import has_python_value +from sdc.utilities.sdc_typing_utils import has_python_value, get_nbtype_literal_values from sdc.extensions.sdc_arrow_table_type import PyarrowTableType from sdc.extensions.sdc_arrow_table_ext import ( arrow_reader_create_tableobj, @@ -91,13 +91,6 @@ def _get_py_col_dtype(ctype): return numpy_support.as_dtype(dtype) -def get_nbtype_literal_values(nbtype): - assert all(isinstance(x, types.Literal) for x in nbtype), \ - f"Attempt to unliteral values of {nbtype} failed" - - return [x.literal_value for x in nbtype] - - @sdc_overload(pd.read_csv) def sdc_pandas_read_csv_ovld( filepath_or_buffer, sep=',', delimiter=None, header="infer", names=None, index_col=None, diff --git a/sdc/rewrites/dataframe_constructor.py b/sdc/rewrites/dataframe_constructor.py index cb3051684..0e62d3b8c 100644 --- a/sdc/rewrites/dataframe_constructor.py +++ b/sdc/rewrites/dataframe_constructor.py @@ -32,7 +32,9 @@ from numba.extending import overload from numba.core.extending import intrinsic from numba.core.typing import signature +from numba.core.target_extension import current_target, resolve_dispatcher_from_str +import numpy as np from pandas import DataFrame from sys import modules from textwrap import dedent @@ -45,13 +47,17 @@ insert_before) from sdc.hiframes import pd_dataframe_ext as pd_dataframe_ext_module from sdc.hiframes.pd_dataframe_type import DataFrameType, ColumnLoc -from sdc.hiframes.pd_dataframe_ext import get_structure_maps +from sdc.hiframes.pd_dataframe_ext import get_structure_maps, init_dataframe_internal from sdc.hiframes.api import fix_df_array, fix_df_index from sdc.str_ext import string_type from sdc.extensions.indexes.empty_index_ext import init_empty_index from sdc.datatypes.indexes.empty_index_type import EmptyIndexType -from sdc.utilities.sdc_typing_utils import TypeChecker +from sdc.utilities.sdc_typing_utils import TypeChecker, SDCLimitation from sdc.str_arr_type import StringArrayType +from sdc.functions.tuple_utils import sdc_tuple_map, sdc_tuple_zip +from sdc.datatypes.indexes.positional_index_type import PositionalIndexType +from sdc.utilities.utils import sdc_overload +from sdc.utilities.sdc_typing_utils import get_nbtype_literal_values, sdc_pandas_index_types @register_rewrite('before-inference') @@ -288,7 +294,7 @@ def gen_init_dataframe_func(func_name, func_text, global_vars): return loc_vars[func_name] -@overload(DataFrame) +@sdc_overload(DataFrame) def pd_dataframe_overload(data, index=None, columns=None, dtype=None, copy=False): """ Intel Scalable Dataframe Compiler User Guide @@ -302,20 +308,76 @@ def pd_dataframe_overload(data, index=None, columns=None, dtype=None, copy=False ty_checker = TypeChecker('Method DataFrame') - if not isinstance(data, (types.DictType, types.LiteralStrKeyDict)): - ty_checker.raise_exc(data, 'dict', 'data') + if not (isinstance(data, (types.DictType, types.LiteralStrKeyDict)) + or isinstance(data, types.Array) and data.ndim == 2 and isinstance(data.dtype, types.Number)): + ty_checker.raise_exc(data, 'dict or 2d numeric array', 'data') - if not (isinstance(index, (types.Omitted, types.ListType, types.List, - types.Array, StringArrayType, types.NoneType) or index is None)): + accepted_index_types = (types.Omitted, types.NoneType, types.ListType, types.List) + sdc_pandas_index_types + if not (isinstance(index, accepted_index_types) or index is None): ty_checker.raise_exc(index, 'array-like', 'index') - if not (isinstance(columns, (types.Omitted, types.NoneType, types.Tuple, types.UniTuple) or columns is None)): + if not (isinstance(columns, (types.Omitted, types.NoneType, types.Tuple, types.UniTuple)) or columns is None): ty_checker.raise_exc(columns, 'tuple of strings', 'columns') - if not (isinstance(dtype, (types.Omitted, types.NoneType) or dtype is None)): + if not (isinstance(dtype, (types.Omitted, types.NoneType)) or dtype is None): ty_checker.raise_exc(dtype, 'None', 'dtype') - if not (isinstance(copy, (types.Omitted, types.NoneType) or columns is False)): + if not (isinstance(copy, (types.Omitted, types.NoneType)) or copy is False): ty_checker.raise_exc(copy, 'False', 'copy') + if isinstance(data, types.Array): + # case of homogenous DF columns, is special as we can use views to input data + # when creating internal DF structure and avoid penalty in boxing DF as + # pd.DataFrame can be created from 2d array without copy + + nb_col_names = None + try: + nb_col_names = tuple(get_nbtype_literal_values(columns)) + except AssertionError: + ty_checker.raise_exc(columns, 'tuple of literal strings', 'columns') + + n_cols = len(columns) + if index is None and n_cols == 0: + # DataFrame index type cannot be defined unless columns argument is provided + # as it depends on the runtime number of columns in data + raise SDCLimitation("pd.DataFrame constructor from np.ndarray " \ + f"requires columns argument. Given columns={columns}.") + + # FIXME: there should be more accurate way to write this layout definition + if data.layout in ('C', 'A'): + col_type = types.Array(data.dtype, 1, 'A') + else: + col_type = types.Array(data.dtype, 1, 'C') + nb_col_types = tuple([col_type, ] * n_cols) + column_loc, _, _ = get_structure_maps(nb_col_types, nb_col_names) + + typingctx = resolve_dispatcher_from_str(current_target()).targetdescr.typing_context + fnty = typingctx.resolve_value_type(fix_df_index) + nb_index_type = types.none if index is None else index + fixed_index_sig = fnty.get_call_type(typingctx, (nb_index_type, nb_col_types[0]), {}) ### FIXME: need add column argument + fixed_index_typ = fixed_index_sig.return_type + need_fix_index = fixed_index_typ != index + + df_type = DataFrameType(nb_col_types, fixed_index_typ, nb_col_names, column_loc=column_loc) + + def pd_dataframe_2d_array_impl(data, index=None, columns=None, dtype=None, copy=False): + data_as_columns = np.transpose(data) + df_columns_list = [] + + if n_cols != data.shape[1]: + raise AssertionError("Number of columns must match data shape") + + for i in range(n_cols): + df_columns_list.append(data_as_columns[i]) + + data_tup = (df_columns_list, ) + if need_fix_index == True: # noqa + new_index = fix_df_index(index, df_columns_list[0]) + else: + new_index = index + + return init_dataframe_internal(data_tup, new_index, df_type) + + return pd_dataframe_2d_array_impl + return None diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index 23ec45639..afccbadf9 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -184,6 +184,37 @@ def test_impl(A, B, index): result_ref = test_impl(A, B, index) pd.testing.assert_frame_equal(result, result_ref) + def test_create_from_2d_array(self): + """ Verifies DF can be created from 2d np.ndarray """ + + n_cols = 4 + col_names = tuple(f'f{i}' for i in range(n_cols)) + + def test_impl(data, index): + df = pd.DataFrame(data, index=index, columns=col_names) + res_sum = df[col_names[0]].sum() + return len(df), res_sum + + sdc_func = self.jit(test_impl) + + n_rows = 10 + indexes_to_test = [ + None, + np.arange(n_rows), + pd.RangeIndex(n_rows), + ] + layout_to_test = ('C', 'F', 'A') + + np.random.seed(0) + data = np.random.rand(n_rows, n_cols) + for layout, index in product(layout_to_test, indexes_to_test): + with self.subTest(layout=layout, index=index): + data = data.copy(order=layout) + result = sdc_func(data, index) + result_ref = test_impl(data, index) + self.assertEqual(result[0], result_ref[0]) + self.assertAlmostEqual(result[1], result_ref[1]) + def test_unbox_empty_df(self): def test_impl(df): return df diff --git a/sdc/utilities/sdc_typing_utils.py b/sdc/utilities/sdc_typing_utils.py index 53fd39f8b..2312f57f7 100644 --- a/sdc/utilities/sdc_typing_utils.py +++ b/sdc/utilities/sdc_typing_utils.py @@ -257,3 +257,10 @@ def _check_dtype_param_type(dtype): valid_dtype_types = (types.NoneType, types.Omitted, types.UnicodeType, types.NumberClass) return isinstance(dtype, valid_dtype_types) or dtype is None + + +def get_nbtype_literal_values(nbtype): + assert all(isinstance(x, types.Literal) for x in nbtype), \ + f"Attempt to unliteral values of {nbtype} failed" + + return [x.literal_value for x in nbtype]