|
39 | 39 |
|
40 | 40 | from __future__ import print_function
|
41 | 41 |
|
42 |
| -import logging |
43 | 42 | import operator
|
44 | 43 | import typing
|
45 | 44 | from typing import Any, Dict, Iterator, Iterable, Optional, Sequence, Text, Tuple, Union
|
|
108 | 107 | [('y', _YType), ('y_count', _CountType),
|
109 | 108 | ('lift_values', Iterable[_LiftValue])])
|
110 | 109 |
|
| 110 | +# Beam counter to track the number of non-utf8 values. |
| 111 | +_NON_UTF8_VALUES_COUNTER = beam.metrics.Metrics.counter( |
| 112 | + constants.METRICS_NAMESPACE, 'num_non_utf8_values_lift_generator') |
| 113 | + |
111 | 114 |
|
112 | 115 | def _get_example_value_presence(
|
113 | 116 | record_batch: pa.RecordBatch, path: types.FeaturePath,
|
@@ -279,13 +282,12 @@ def _to_partial_x_counts(
|
279 | 282 | yield _SlicedXKey(slice_key, x_path, x), x_count
|
280 | 283 |
|
281 | 284 |
|
282 |
| -def _get_unicode_value(value: Union[Text, bytes], path: types.FeaturePath |
283 |
| - ) -> Text: |
| 285 | +def _get_unicode_value(value: Union[Text, bytes]) -> Text: |
| 286 | + """Get feature value decoded as utf-8.""" |
284 | 287 | decoded_value = stats_util.maybe_get_utf8(value)
|
285 | 288 | # Check if we have a valid utf-8 string. If not, assign a placeholder.
|
286 | 289 | if decoded_value is None:
|
287 |
| - logging.warning('Feature "%s" has bytes value "%s" which cannot be ' |
288 |
| - 'decoded as a UTF-8 string.', path, value) |
| 290 | + _NON_UTF8_VALUES_COUNTER.inc() |
289 | 291 | decoded_value = constants.NON_UTF8_PLACEHOLDER
|
290 | 292 | return decoded_value
|
291 | 293 |
|
@@ -341,7 +343,7 @@ def _make_dataset_feature_stats_proto(
|
341 | 343 | lift_series_proto.y_string = y
|
342 | 344 | y_display_val = y
|
343 | 345 | elif isinstance(y, six.binary_type):
|
344 |
| - y_string = _get_unicode_value(y, y_path) |
| 346 | + y_string = _get_unicode_value(y) |
345 | 347 | lift_series_proto.y_string = y_string
|
346 | 348 | y_display_val = y_string
|
347 | 349 | else:
|
@@ -370,7 +372,7 @@ def _make_dataset_feature_stats_proto(
|
370 | 372 | lift_value_proto.x_string = x
|
371 | 373 | x_display_val = x
|
372 | 374 | elif isinstance(x, six.binary_type):
|
373 |
| - x_string = _get_unicode_value(x, key.x_path) |
| 375 | + x_string = _get_unicode_value(x) |
374 | 376 | lift_value_proto.x_string = x_string
|
375 | 377 | x_display_val = x_string
|
376 | 378 | else:
|
|
0 commit comments