Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

hiframes: better error reporting for getattr #64

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 87 additions & 16 deletions hpat/hiframes.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,20 @@
from hpat.hiframes_rolling import get_rolling_setup_args, supported_rolling_funcs
from hpat.hiframes_aggregate import get_agg_func, supported_agg_funcs

import enum

LARGE_WIN_SIZE = 10

@enum.unique
class DataFrameAttr(enum.Enum):
    """Classification of a known data frame attribute name.

    Used to give better error messages to the user.

    Non-method attributes do not require immediate call tracking.
    Methods are required to be called right after getattr assignment.

    The class is decorated with ``enum.unique`` so that accidentally reusing
    a value fails at import time instead of silently turning one member into
    an alias of another (which would make equality-based dispatch on the
    kind pick the wrong branch).
    """

    # Plain attribute that is recognised but not supported yet.
    UNIMPLEMENTED_ATTR = 0
    # Method that is recognised but not supported yet.
    UNIMPLEMENTED_METHOD = 1
    # Plain attribute that is supported.
    IMPLEMENTED_ATTR = 2
    # Method that is supported.
    IMPLEMENTED_METHOD = 3

def remove_hiframes(rhs, lives, call_list):
# used in stencil generation of rolling
Expand Down Expand Up @@ -74,6 +86,47 @@ def remove_hiframes(rhs, lives, call_list):
return False


# TODO(quasilyte): remove duplication from this dict and
# methods handling specific attributes and methods.
#
# Known data frame attribute names, grouped by their DataFrameAttr kind.
df_spec = {
    DataFrameAttr.UNIMPLEMENTED_ATTR: ['ndim'],
    DataFrameAttr.UNIMPLEMENTED_METHOD: ['filter'],
    DataFrameAttr.IMPLEMENTED_ATTR: ['values', 'iat', 'loc', 'iloc'],
    DataFrameAttr.IMPLEMENTED_METHOD: [
        'apply', 'describe', 'sort_values', 'itertuples',
        'pivot_table', 'head', 'isin', 'append',
        'fillna', 'dropna', 'groupby', 'rolling',
    ],
}

# The literal above is convenient to read and edit, but lookups need the
# opposite direction (attribute name -> kind). Rebind the same global to the
# flattened mapping so no extra module-level names are introduced:
#   {0: ['a', 'b'], 1: ['c']}  =>  {'a': 0, 'b': 0, 'c': 1}
# If a name were listed under several kinds, the last listing wins.
df_spec = {
    attr_name: attr_kind
    for attr_kind, attr_names in df_spec.items()
    for attr_name in attr_names
}

numba.ir_utils.remove_call_handlers.append(remove_hiframes)

class HiFrames(object):
Expand All @@ -100,7 +153,6 @@ def __init__(self, func_ir, typingctx, args, _locals):
func_ir, typingctx, args, _locals, self.reverse_copies)
self.h5_handler = pio.PIO(self.func_ir, _locals, self.reverse_copies)


def run(self):
# FIXME: see why this breaks test_kmeans
# remove_dels(self.func_ir.blocks)
Expand Down Expand Up @@ -218,21 +270,40 @@ def _run_assign(self, assign, label):
return [hiframes_filter.Filter(lhs, in_df.name, index_var,
self.df_vars, rhs.loc)]

# d = df.column
if (rhs.op == 'getattr' and self._is_df_var(rhs.value)
and self._is_df_colname(rhs.value, rhs.attr)):
df = rhs.value.name
col_var = self._get_df_colvar(rhs.value, rhs.attr)
assign.value = col_var
# need to remove the lhs definition so that find_callname can
# match column function calls (i.e. A.f instead of df.A.f)
assert self.func_ir._definitions[lhs] == [rhs], "invalid def"
self.func_ir._definitions[lhs] = [None]

# A = df.values
if (rhs.op == 'getattr' and self._is_df_var(rhs.value)
and rhs.attr == 'values'):
return self._handle_df_values(assign.target, rhs.value)
if rhs.op == 'getattr' and self._is_df_var(rhs.value):
if self._is_df_colname(rhs.value, rhs.attr):
# Handle `d = df.column`
df = rhs.value.name
col_var = self._get_df_colvar(rhs.value, rhs.attr)
assign.value = col_var
# need to remove the lhs definition so that find_callname can
# match column function calls (i.e. A.f instead of df.A.f)
assert self.func_ir._definitions[lhs] == [rhs], "invalid def"
self.func_ir._definitions[lhs] = [None]
elif rhs.attr not in df_spec:
# Not a column name, not a statically known attribute.
# Should probably raise an AttributeError, but since
# some entries may be missing from fields table,
# print a warning for now.
selector = "{}.{}".format(assign.value.value, assign.value.attr)
warnings.warn("unknown attribute {} accessed at {}".format(
selector, assign.loc))
else:
kind = df_spec[rhs.attr]
if kind == DataFrameAttr.UNIMPLEMENTED_ATTR:
raise NotImplementedError(
"data frame attribute {} not implemented yet".format(rhs.attr))
elif kind == DataFrameAttr.UNIMPLEMENTED_METHOD:
raise NotImplementedError(
"data frame function {} not implemented yet".format(rhs.attr))
elif kind == DataFrameAttr.IMPLEMENTED_METHOD:
pass # Handling is done inside _run_call_df.
elif kind == DataFrameAttr.IMPLEMENTED_ATTR:
# Handle `A = df.values`
if rhs.attr == 'values':
return self._handle_df_values(assign.target, rhs.value)
else:
raise ValueError("unreachable: unexpected DataFrameAttr kind")

if isinstance(rhs, ir.Arg):
return self._run_arg(assign, label)
Expand Down
38 changes: 38 additions & 0 deletions hpat/tests/test_errors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import unittest
import pandas as pd
import numpy as np
import hpat
import warnings

class TestDataFrameErrors(unittest.TestCase):
    """Checks the diagnostics reported for data frame attribute access
    in functions compiled with ``hpat.jit``."""

    def test_get_unknown_attr(self):
        """An attribute that is neither a column nor a listed name warns."""
        def test_impl():
            df = pd.DataFrame({'A': [0, 0]})
            A = df['A']  # OK, known column
            field = df.loc  # OK, known field
            method = df.apply  # OK, known method
            something = df.non_existing  # Should warn

        with warnings.catch_warnings(record=True) as caught:
            # Without this, an active "ignore"/"once" filter (or an already
            # populated warning registry) can prevent the warning from being
            # recorded at all.
            warnings.simplefilter("always")
            hpat.jit(test_impl)()

        # Search every recorded warning instead of trusting position 0:
        # unrelated warnings may be emitted first, and an empty list should
        # produce an assertion failure rather than an IndexError.
        messages = [str(item.message) for item in caught]
        self.assertTrue(
            any("unknown attribute df.non_existing accessed" in text
                for text in messages),
            "expected warning not found, got: {}".format(messages))

    def test_get_unimplemented_attr(self):
        """Reading a known-but-unsupported attribute raises NotImplementedError."""
        def test_impl():
            df = pd.DataFrame({'A': [1.0, 2.0]})
            ndim = df.ndim

        with self.assertRaises(NotImplementedError) as err:
            hpat.jit(test_impl)()
        # Must stay outside the ``with`` body: statements placed after the
        # raising call inside it would never execute.
        self.assertIn(
            "data frame attribute ndim not implemented yet",
            str(err.exception))

    def test_call_unimplemented_method(self):
        """Calling a known-but-unsupported method raises NotImplementedError."""
        def test_impl():
            # Columns have equal length so the fixture itself is a valid
            # data frame and only the ``filter`` call is under test.
            df = pd.DataFrame({'A': [0.1, 0.2], 'B': [0.3, 0.4]})
            return df.filter(items=['A'])

        with self.assertRaises(NotImplementedError) as err:
            hpat.jit(test_impl)()
        # Must stay outside the ``with`` body (see test_get_unimplemented_attr).
        self.assertIn(
            "data frame function filter not implemented yet",
            str(err.exception))