|
63 | 63 | _SlicedYKey = typing.NamedTuple('_SlicedYKey', [('slice_key', types.SliceKey),
|
64 | 64 | ('y', _YType)])
|
65 | 65 |
|
66 |
| -_SlicedXKey = typing.NamedTuple('_SlicedXKey', [('slice_key', types.SliceKey), |
67 |
| - ('x_path', types.FeaturePath), |
68 |
| - ('x', _XType)]) |
69 | 66 |
|
70 |
| -_SlicedXYKey = typing.NamedTuple('_SlicedXYKey', [('slice_key', types.SliceKey), |
71 |
| - ('x_path', types.FeaturePath), |
72 |
| - ('x', _XType), ('y', _YType)]) |
| 67 | +# TODO(embr,zhuo): FeaturePathTuple is used instead of FeaturePath because: |
| 68 | +# - FeaturePath does not have a deterministic coder |
| 69 | +# - Even if it does, beam does not automatically derive a coder for a |
| 70 | +# NamedTuple. |
| 71 | +# Once the latter is supported we can change all FeaturePathTuples back to |
| 72 | +# FeaturePaths. |
| 73 | +_SlicedXKey = typing.NamedTuple('_SlicedXKey', |
| 74 | + [('slice_key', types.SliceKey), |
| 75 | + ('x_path', types.FeaturePathTuple), |
| 76 | + ('x', _XType)]) |
| 77 | + |
| 78 | +_SlicedXYKey = typing.NamedTuple('_SlicedXYKey', |
| 79 | + [('slice_key', types.SliceKey), |
| 80 | + ('x_path', types.FeaturePathTuple), |
| 81 | + ('x', _XType), ('y', _YType)]) |
73 | 82 |
|
74 | 83 | _LiftSeriesKey = typing.NamedTuple('_LiftSeriesKey',
|
75 | 84 | [('slice_key', types.SliceKey),
|
76 |
| - ('x_path', types.FeaturePath), |
| 85 | + ('x_path', types.FeaturePathTuple), |
77 | 86 | ('y', _YType), ('y_count', _CountType)])
|
78 | 87 |
|
79 | 88 | _SlicedFeatureKey = typing.NamedTuple('_SlicedFeatureKey',
|
80 | 89 | [('slice_key', types.SliceKey),
|
81 |
| - ('x_path', types.FeaturePath)]) |
| 90 | + ('x_path', types.FeaturePathTuple)]) |
82 | 91 |
|
83 | 92 | _ConditionalYRate = typing.NamedTuple('_ConditionalYRate',
|
84 |
| - [('x_path', types.FeaturePath), |
| 93 | + [('x_path', types.FeaturePathTuple), |
85 | 94 | ('x', _XType), ('xy_count', _CountType),
|
86 | 95 | ('x_count', _CountType)])
|
87 | 96 |
|
@@ -171,15 +180,15 @@ def _get_example_value_presence(
|
171 | 180 | if is_binary_like:
|
172 | 181 | # return binary-like values as a pd.Categorical wrapped in a Series. This makes
|
173 | 182 | # subsequent operations like pd.merge cheaper.
|
174 |
| - values = arr_flat_dict[values] |
| 183 | + values = arr_flat_dict[values].tolist() |
175 | 184 | else:
|
176 | 185 | values = values.tolist() # converts values to python native types.
|
177 | 186 | if weight_column_name:
|
178 | 187 | weights = arrow_util.get_weight_feature(record_batch, weight_column_name)
|
179 |
| - weights = np.asarray(weights)[example_indices] |
| 188 | + weights = np.asarray(weights)[example_indices].tolist() |
180 | 189 | else:
|
181 | 190 | weights = np.ones(len(example_indices), dtype=int).tolist()
|
182 |
| - return _ValuePresence(example_indices, values, weights) |
| 191 | + return _ValuePresence(example_indices.tolist(), values, weights) |
183 | 192 |
|
184 | 193 |
|
185 | 194 | def _to_partial_copresence_counts(
|
@@ -246,7 +255,8 @@ def _to_partial_copresence_counts(
|
246 | 255 | if num_xy_pairs_batch_copresent:
|
247 | 256 | num_xy_pairs_batch_copresent.update(len(copresence_counts))
|
248 | 257 | for (x, y), count in copresence_counts.items():
|
249 |
| - yield _SlicedXYKey(slice_key=slice_key, x_path=x_path, x=x, y=y), count |
| 258 | + yield (_SlicedXYKey(slice_key=slice_key, x_path=x_path.steps(), x=x, |
| 259 | + y=y), count) |
250 | 260 |
|
251 | 261 |
|
252 | 262 | def _to_partial_counts(
|
@@ -283,7 +293,7 @@ def _to_partial_x_counts(
|
283 | 293 | x_path,
|
284 | 294 | boundaries=None,
|
285 | 295 | weight_column_name=example_weight_map.get(x_path)):
|
286 |
| - yield _SlicedXKey(slice_key, x_path, x), x_count |
| 296 | + yield _SlicedXKey(slice_key, x_path.steps(), x), x_count |
287 | 297 |
|
288 | 298 |
|
289 | 299 | def _get_unicode_value(value: Union[Text, bytes]) -> Text:
|
@@ -324,11 +334,12 @@ def _make_dataset_feature_stats_proto(
|
324 | 334 | The populated DatasetFeatureStatistics proto.
|
325 | 335 | """
|
326 | 336 | key, lift_series_list = lifts
|
| 337 | + x_path = types.FeaturePath(key.x_path) |
327 | 338 | stats = statistics_pb2.DatasetFeatureStatistics()
|
328 | 339 | cross_stats = stats.cross_features.add(
|
329 |
| - path_x=key.x_path.to_proto(), path_y=y_path.to_proto()) |
| 340 | + path_x=x_path.to_proto(), path_y=y_path.to_proto()) |
330 | 341 | if output_custom_stats:
|
331 |
| - feature_stats = stats.features.add(path=key.x_path.to_proto()) |
| 342 | + feature_stats = stats.features.add(path=x_path.to_proto()) |
332 | 343 | for lift_series in sorted(lift_series_list):
|
333 | 344 | lift_series_proto = (
|
334 | 345 | cross_stats.categorical_cross_stats.lift.lift_series.add())
|
@@ -392,7 +403,8 @@ def _make_dataset_feature_stats_proto(
|
392 | 403 | def _cross_join_y_keys(
|
393 | 404 | join_info: Tuple[types.SliceKey, Dict[Text, Sequence[Any]]]
|
394 | 405 | # TODO(b/147153346) update dict value list element type annotation to:
|
395 |
| - # Union[_YKey, Tuple[_YType, Tuple[types.FeaturePath, _XType, _CountType]]] |
| 406 | + # Union[_YKey, Tuple[_YType, |
| 407 | + # Tuple[types.FeaturePathTuple, _XType, _CountType]]] |
396 | 408 | ) -> Iterator[Tuple[_SlicedXYKey, _CountType]]:
|
397 | 409 | slice_key, join_args = join_info
|
398 | 410 | for x_path, x, _ in join_args['x_counts']:
|
|
0 commit comments