Skip to content

Commit 010bad0

Browse files
paulgc17 authored and tfx-copybara committed
Remove warning and add counter to track non utf8 values.
PiperOrigin-RevId: 331036516
1 parent 065dc16 commit 010bad0

File tree

2 files changed

+15
-10
lines changed

2 files changed

+15
-10
lines changed

tensorflow_data_validation/statistics/generators/lift_stats_generator.py

+9 -7
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,6 @@
3939

4040
from __future__ import print_function
4141

42-
import logging
4342
import operator
4443
import typing
4544
from typing import Any, Dict, Iterator, Iterable, Optional, Sequence, Text, Tuple, Union
@@ -108,6 +107,10 @@
108107
[('y', _YType), ('y_count', _CountType),
109108
('lift_values', Iterable[_LiftValue])])
110109

110+
# Beam counter to track the number of non-utf8 values.
111+
_NON_UTF8_VALUES_COUNTER = beam.metrics.Metrics.counter(
112+
constants.METRICS_NAMESPACE, 'num_non_utf8_values_lift_generator')
113+
111114

112115
def _get_example_value_presence(
113116
record_batch: pa.RecordBatch, path: types.FeaturePath,
@@ -279,13 +282,12 @@ def _to_partial_x_counts(
279282
yield _SlicedXKey(slice_key, x_path, x), x_count
280283

281284

282-
def _get_unicode_value(value: Union[Text, bytes], path: types.FeaturePath
283-
) -> Text:
285+
def _get_unicode_value(value: Union[Text, bytes]) -> Text:
286+
"""Get feature value decoded as utf-8."""
284287
decoded_value = stats_util.maybe_get_utf8(value)
285288
# Check if we have a valid utf-8 string. If not, assign a placeholder.
286289
if decoded_value is None:
287-
logging.warning('Feature "%s" has bytes value "%s" which cannot be '
288-
'decoded as a UTF-8 string.', path, value)
290+
_NON_UTF8_VALUES_COUNTER.inc()
289291
decoded_value = constants.NON_UTF8_PLACEHOLDER
290292
return decoded_value
291293

@@ -341,7 +343,7 @@ def _make_dataset_feature_stats_proto(
341343
lift_series_proto.y_string = y
342344
y_display_val = y
343345
elif isinstance(y, six.binary_type):
344-
y_string = _get_unicode_value(y, y_path)
346+
y_string = _get_unicode_value(y)
345347
lift_series_proto.y_string = y_string
346348
y_display_val = y_string
347349
else:
@@ -370,7 +372,7 @@ def _make_dataset_feature_stats_proto(
370372
lift_value_proto.x_string = x
371373
x_display_val = x
372374
elif isinstance(x, six.binary_type):
373-
x_string = _get_unicode_value(x, key.x_path)
375+
x_string = _get_unicode_value(x)
374376
lift_value_proto.x_string = x_string
375377
x_display_val = x_string
376378
else:

tensorflow_data_validation/utils/top_k_uniques_stats_util.py

+6 -3
Original file line number | Diff line number | Diff line change
@@ -19,9 +19,9 @@
1919
from __future__ import print_function
2020

2121
import collections
22-
import logging
2322
from typing import FrozenSet, List, Optional, Union
2423

24+
import apache_beam as beam
2525
import six
2626
from tensorflow_data_validation import constants
2727
from tensorflow_data_validation import types
@@ -33,6 +33,10 @@
3333
FeatureValueCount = collections.namedtuple('FeatureValueCount',
3434
['feature_value', 'count'])
3535

36+
# Beam counter to track the number of non-utf8 values.
37+
_NON_UTF8_VALUES_COUNTER = beam.metrics.Metrics.counter(
38+
constants.METRICS_NAMESPACE, 'num_non_utf8_values_topk_uniques_generator')
39+
3640

3741
def make_feature_stats_proto_topk_uniques(
3842
feature_path: types.FeaturePath, is_categorical: bool,
@@ -174,8 +178,7 @@ def _make_feature_stats_proto_topk(
174178
if isinstance(value, six.binary_type):
175179
decoded_value = stats_util.maybe_get_utf8(value)
176180
if decoded_value is None:
177-
logging.warning('Feature "%s" has bytes value "%s" which cannot be '
178-
'decoded as a UTF-8 string.', feature_path, value)
181+
_NON_UTF8_VALUES_COUNTER.inc()
179182
value = constants.NON_UTF8_PLACEHOLDER
180183
else:
181184
value = decoded_value

0 commit comments

Comments
 (0)