
Commit debbe73

brills authored and tfx-copybara committed
Lift stats generator: use Tuple instead of FeaturePath when encoding feature paths as keys in a PTable.

Also changed the output type of some internal functions from numpy arrays to Python lists: elements of a numpy array are of numpy types, which do not have deterministic Beam coders.

PiperOrigin-RevId: 364593214
1 parent 1a08318 commit debbe73
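
The rationale above can be illustrated with a small, hedged sketch (not part of the commit): keys handed to Beam's GroupByKey need a deterministic coder, and both numpy scalar elements and FeaturePath objects fall back to coders Beam does not treat as deterministic, whereas native Python ints and plain tuples of step names do.

```python
# Illustrative only -- the names below are hypothetical, not from the commit.
import numpy as np

arr = np.array([1, 2, 2])
print(type(arr[0]))           # a numpy scalar type (e.g. numpy.int64)
print(type(arr.tolist()[0]))  # <class 'int'> -- a native Python type

# A feature path carried as a plain tuple of step names can be key-encoded
# deterministically, unlike a custom FeaturePath object.
x_path_tuple = ('parent_feature', 'child_feature')  # hypothetical path steps
key = (x_path_tuple, arr.tolist()[0])
print(key)  # (('parent_feature', 'child_feature'), 1)
```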

File tree

3 files changed: +43 -37 lines


tensorflow_data_validation/statistics/generators/lift_stats_generator.py

Lines changed: 29 additions & 17 deletions
@@ -63,25 +63,34 @@
 _SlicedYKey = typing.NamedTuple('_SlicedYKey', [('slice_key', types.SliceKey),
                                                 ('y', _YType)])

-_SlicedXKey = typing.NamedTuple('_SlicedXKey', [('slice_key', types.SliceKey),
-                                                ('x_path', types.FeaturePath),
-                                                ('x', _XType)])

-_SlicedXYKey = typing.NamedTuple('_SlicedXYKey', [('slice_key', types.SliceKey),
-                                                  ('x_path', types.FeaturePath),
-                                                  ('x', _XType), ('y', _YType)])
+# TODO(embr,zhuo): FeaturePathTuple is used instead of FeaturePath because:
+#  - FeaturePath does not have a deterministic coder
+#  - Even if it does, beam does not automatically derive a coder for a
+#    NamedTuple.
+# Once the latter is supported we can change all FEaturePathTuples back to
+# FeaturePaths.
+_SlicedXKey = typing.NamedTuple('_SlicedXKey',
+                                [('slice_key', types.SliceKey),
+                                 ('x_path', types.FeaturePathTuple),
+                                 ('x', _XType)])
+
+_SlicedXYKey = typing.NamedTuple('_SlicedXYKey',
+                                 [('slice_key', types.SliceKey),
+                                  ('x_path', types.FeaturePathTuple),
+                                  ('x', _XType), ('y', _YType)])

 _LiftSeriesKey = typing.NamedTuple('_LiftSeriesKey',
                                    [('slice_key', types.SliceKey),
-                                    ('x_path', types.FeaturePath),
+                                    ('x_path', types.FeaturePathTuple),
                                     ('y', _YType), ('y_count', _CountType)])

 _SlicedFeatureKey = typing.NamedTuple('_SlicedFeatureKey',
                                       [('slice_key', types.SliceKey),
-                                       ('x_path', types.FeaturePath)])
+                                       ('x_path', types.FeaturePathTuple)])

 _ConditionalYRate = typing.NamedTuple('_ConditionalYRate',
-                                      [('x_path', types.FeaturePath),
+                                      [('x_path', types.FeaturePathTuple),
                                        ('x', _XType), ('xy_count', _CountType),
                                        ('x_count', _CountType)])
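
As a rough sketch of the key shape defined above (assuming types.FeaturePathTuple is simply a tuple of step names), every field of these NamedTuples is now a primitive or a tuple of primitives:

```python
# Minimal stand-alone sketch; SliceKey and FeaturePathTuple here are
# stand-ins, not the TFDV definitions.
import typing

SliceKey = str
FeaturePathTuple = typing.Tuple[str, ...]

_SlicedXKey = typing.NamedTuple('_SlicedXKey',
                                [('slice_key', SliceKey),
                                 ('x_path', FeaturePathTuple),
                                 ('x', int)])

key = _SlicedXKey(slice_key='', x_path=('x',), x=1)
print(key)  # _SlicedXKey(slice_key='', x_path=('x',), x=1)
```

As the TODO notes, Beam still does not derive a coder for a NamedTuple automatically, which is why the FeaturePathTuple workaround cannot yet be reverted.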

@@ -171,15 +180,15 @@ def _get_example_value_presence(
   if is_binary_like:
     # return binary like values a pd.Categorical wrapped in a Series. This makes
     # subsqeuent operations like pd.Merge cheaper.
-    values = arr_flat_dict[values]
+    values = arr_flat_dict[values].tolist()
   else:
     values = values.tolist()  # converts values to python native types.
   if weight_column_name:
     weights = arrow_util.get_weight_feature(record_batch, weight_column_name)
-    weights = np.asarray(weights)[example_indices]
+    weights = np.asarray(weights)[example_indices].tolist()
   else:
     weights = np.ones(len(example_indices), dtype=int).tolist()
-  return _ValuePresence(example_indices, values, weights)
+  return _ValuePresence(example_indices.tolist(), values, weights)


 def _to_partial_copresence_counts(
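
The .tolist() calls above matter because numpy indexing returns numpy scalar elements; a minimal sketch of the conversion:

```python
import numpy as np

example_indices = np.array([0, 2])
weights = np.asarray([0.5, 1.0, 2.0])[example_indices]
print(type(weights[0]))          # numpy scalar element (e.g. numpy.float64)
print(weights.tolist())          # [0.5, 2.0] -- list of native Python floats
print(example_indices.tolist())  # [0, 2] -- native Python ints
```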
@@ -246,7 +255,8 @@ def _to_partial_copresence_counts(
   if num_xy_pairs_batch_copresent:
     num_xy_pairs_batch_copresent.update(len(copresence_counts))
   for (x, y), count in copresence_counts.items():
-    yield _SlicedXYKey(slice_key=slice_key, x_path=x_path, x=x, y=y), count
+    yield (_SlicedXYKey(slice_key=slice_key, x_path=x_path.steps(), x=x,
+                        y=y), count)


 def _to_partial_counts(
@@ -283,7 +293,7 @@ def _to_partial_x_counts(
       x_path,
       boundaries=None,
       weight_column_name=example_weight_map.get(x_path)):
-    yield _SlicedXKey(slice_key, x_path, x), x_count
+    yield _SlicedXKey(slice_key, x_path.steps(), x), x_count


 def _get_unicode_value(value: Union[Text, bytes]) -> Text:
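
Both yields above swap the FeaturePath object for its steps; assuming FeaturePath.steps() returns the path's step names as a tuple (as its use here suggests), the conversion looks like this:

```python
from tensorflow_data_validation import types

x_path = types.FeaturePath(['document', 'terms'])
print(x_path.steps())  # ('document', 'terms') -- a plain, key-encodable tuple
```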
@@ -324,11 +334,12 @@ def _make_dataset_feature_stats_proto(
     The populated DatasetFeatureStatistics proto.
   """
   key, lift_series_list = lifts
+  x_path = types.FeaturePath(key.x_path)
   stats = statistics_pb2.DatasetFeatureStatistics()
   cross_stats = stats.cross_features.add(
-      path_x=key.x_path.to_proto(), path_y=y_path.to_proto())
+      path_x=x_path.to_proto(), path_y=y_path.to_proto())
   if output_custom_stats:
-    feature_stats = stats.features.add(path=key.x_path.to_proto())
+    feature_stats = stats.features.add(path=x_path.to_proto())
   for lift_series in sorted(lift_series_list):
     lift_series_proto = (
         cross_stats.categorical_cross_stats.lift.lift_series.add())
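
Here the tuple stored in the key is turned back into a FeaturePath before building the proto; a hedged sketch of that round trip (assuming FeaturePath accepts an iterable of steps and exposes to_proto()):

```python
from tensorflow_data_validation import types

x_path_tuple = ('document', 'terms')      # as carried in key.x_path
x_path = types.FeaturePath(x_path_tuple)  # rebuild the path object
print(x_path.to_proto())                  # Path proto used in the stats output
```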
@@ -392,7 +403,8 @@ def _make_dataset_feature_stats_proto(
 def _cross_join_y_keys(
     join_info: Tuple[types.SliceKey, Dict[Text, Sequence[Any]]]
     # TODO(b/147153346) update dict value list element type annotation to:
-    # Union[_YKey, Tuple[_YType, Tuple[types.FeaturePath, _XType, _CountType]]]
+    # Union[_YKey, Tuple[_YType,
+    #                    Tuple[types.FeaturePathTuple, _XType, _CountType]]]
 ) -> Iterator[Tuple[_SlicedXYKey, _CountType]]:
   slice_key, join_args = join_info
   for x_path, x, _ in join_args['x_counts']:

tensorflow_data_validation/statistics/generators/lift_stats_generator_test.py

Lines changed: 14 additions & 15 deletions
@@ -17,7 +17,6 @@

 from absl.testing import absltest
 import apache_beam as beam
-from apache_beam.options import pipeline_options
 import numpy as np
 import pandas as pd
 import pyarrow as pa
@@ -31,10 +30,6 @@
 from tensorflow_metadata.proto.v0 import statistics_pb2


-# TODO(b/181911927): Remove this workaround.
-pipeline_options.TypeOptions.allow_non_deterministic_key_coders = True
-
-
 def _get_example_value_presence_as_dataframe(
     record_batch: pa.RecordBatch, path: types.FeaturePath,
     boundaries: Optional[Sequence[float]],
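
For context, the deleted workaround globally disabled Beam's determinism check for key coders; with tuple-based keys it should no longer be needed. A per-pipeline equivalent of the removed setting would look roughly like this (illustrative, not part of the commit):

```python
from apache_beam.options import pipeline_options

options = pipeline_options.PipelineOptions()
options.view_as(
    pipeline_options.TypeOptions).allow_non_deterministic_key_coders = True
```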
@@ -221,8 +216,8 @@ def test_to_partial_x_counts_unweighted(self):
     ], ['x'])
     x_path = types.FeaturePath(['x'])
     expected_counts = [
-        (lift_stats_generator._SlicedXYKey('', x_path, x=1, y=None), 2),
-        (lift_stats_generator._SlicedXYKey('', x_path, x=2, y=None), 1),
+        (lift_stats_generator._SlicedXYKey('', x_path.steps(), x=1, y=None), 2),
+        (lift_stats_generator._SlicedXYKey('', x_path.steps(), x=2, y=None), 1),
     ]
     for (expected_key, expected_count), (actual_key, actual_count) in zip(
         expected_counts,
@@ -241,8 +236,10 @@ def test_to_partial_x_counts_weighted(self):
     ], ['x', 'w'])
     x_path = types.FeaturePath(['x'])
     expected_counts = [
-        (lift_stats_generator._SlicedXYKey('', x_path, x=1, y=None), 2.5),
-        (lift_stats_generator._SlicedXYKey('', x_path, x=2, y=None), 0.5),
+        (lift_stats_generator._SlicedXYKey('', x_path.steps(), x=1,
+                                           y=None), 2.5),
+        (lift_stats_generator._SlicedXYKey('', x_path.steps(), x=2,
+                                           y=None), 0.5),
     ]
     for (expected_key, expected_count), (actual_key, actual_count) in zip(
         expected_counts,
@@ -263,9 +260,9 @@ def test_to_partial_copresence_counts_unweighted(self):
     ], ['x', 'y'])
     x_path = types.FeaturePath(['x'])
     expected_counts = [
-        (lift_stats_generator._SlicedXYKey('', x_path, x=1, y='a'), 1),
-        (lift_stats_generator._SlicedXYKey('', x_path, x=1, y='b'), 1),
-        (lift_stats_generator._SlicedXYKey('', x_path, x=2, y='a'), 1)
+        (lift_stats_generator._SlicedXYKey('', x_path.steps(), x=1, y='a'), 1),
+        (lift_stats_generator._SlicedXYKey('', x_path.steps(), x=1, y='b'), 1),
+        (lift_stats_generator._SlicedXYKey('', x_path.steps(), x=2, y='a'), 1)
     ]
     actual_counts = list(
         lift_stats_generator._to_partial_copresence_counts(
@@ -284,9 +281,11 @@ def test_to_partial_copresence_counts_weighted(self):
     ], ['x', 'y', 'w'])
     x_path = types.FeaturePath(['x'])
     expected_counts = [
-        (lift_stats_generator._SlicedXYKey('', x_path, x=1, y='a'), 0.5),
-        (lift_stats_generator._SlicedXYKey('', x_path, x=1, y='b'), 2.0),
-        (lift_stats_generator._SlicedXYKey('', x_path, x=2, y='a'), 0.5)
+        (lift_stats_generator._SlicedXYKey('', x_path.steps(), x=1,
+                                           y='a'), 0.5),
+        (lift_stats_generator._SlicedXYKey('', x_path.steps(), x=1,
+                                           y='b'), 2.0),
+        (lift_stats_generator._SlicedXYKey('', x_path.steps(), x=2, y='a'), 0.5)
     ]
     actual_counts = list(
         lift_stats_generator._to_partial_copresence_counts(

tensorflow_data_validation/statistics/stats_impl_test.py

Lines changed: 0 additions & 5 deletions
@@ -21,7 +21,6 @@
 from absl.testing import absltest
 from absl.testing import parameterized
 import apache_beam as beam
-from apache_beam.options import pipeline_options
 from apache_beam.testing import util
 import numpy as np
 import pyarrow as pa
@@ -41,10 +40,6 @@
 from tensorflow_metadata.proto.v0 import statistics_pb2


-# TODO(b/181911927): Remove this workaround.
-pipeline_options.TypeOptions.allow_non_deterministic_key_coders = True
-
-
 # Testing classes for 'custom_feature_generator' testcase.
 # They are defined module level in order to allow pickling.
 class _BaseCounter(stats_generator.CombinerFeatureStatsGenerator):
