Skip to content

Commit 065dc16

Browse files
paulgc17 authored and tfx-copybara committed
Deprecate sample_count option in tfdv.StatsOptions
PiperOrigin-RevId: 331000377
1 parent 256ca91 commit 065dc16

File tree

5 files changed

+5
-106
lines changed

5 files changed

+5
-106
lines changed

RELEASE.md

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,8 @@
2424
## Deprecations
2525

2626
* Deprecated Py3.5 support.
27+
* Deprecated `sample_count` option in `tfdv.StatsOptions`. Use `sample_rate`
28+
option instead.
2729

2830
# Version 0.23.0
2931

tensorflow_data_validation/api/stats_api.py

Lines changed: 1 addition & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,6 @@
4848
from typing import Generator, Text
4949
import apache_beam as beam
5050
import pyarrow as pa
51-
from tensorflow_data_validation import constants
5251
from tensorflow_data_validation.statistics import stats_impl
5352
from tensorflow_data_validation.statistics import stats_options
5453

@@ -91,27 +90,7 @@ def __init__(
9190
self._options = options
9291

9392
def expand(self, dataset: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
94-
# Sample input data if sample_count option is provided.
95-
# TODO(b/117229955): Consider providing an option to write the sample
96-
# to a file.
97-
# TODO(zhuo): clean this up once public APIs are changed to accept
98-
# PCollection[RecordBatch].
99-
if self._options.sample_count is not None:
100-
# TODO(pachristopher): Consider moving the sampling logic to decoders.
101-
# beam.combiners.Sample.FixedSizeGlobally returns a
102-
# PCollection[List[pa.RecordBatch]], which we then flatten to get a
103-
# PCollection[pa.RecordBatch].
104-
batch_size = (
105-
self._options.desired_batch_size if self._options.desired_batch_size
106-
and self._options.desired_batch_size > 0 else
107-
constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE)
108-
batch_count = (
109-
int(self._options.sample_count / batch_size) +
110-
(1 if self._options.sample_count % batch_size else 0))
111-
dataset |= ('SampleExamples(%s)' % self._options.sample_count >>
112-
beam.combiners.Sample.FixedSizeGlobally(batch_count)
113-
| 'FlattenExamples' >> beam.FlatMap(lambda lst: lst))
114-
elif self._options.sample_rate is not None:
93+
if self._options.sample_rate is not None:
11594
dataset |= ('SampleExamplesAtRate(%s)' % self._options.sample_rate >>
11695
beam.FlatMap(_sample_at_rate,
11796
sample_rate=self._options.sample_rate))

tensorflow_data_validation/api/stats_api_test.py

Lines changed: 0 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -584,34 +584,6 @@ def test_stats_pipeline_with_zero_examples(self):
584584
test_util.make_dataset_feature_stats_list_proto_equal_fn(
585585
self, expected_result))
586586

587-
def test_stats_pipeline_with_sample_count(self):
588-
record_batches = [
589-
pa.RecordBatch.from_arrays(
590-
[pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])], ['c']),
591-
pa.RecordBatch.from_arrays(
592-
[pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])], ['c']),
593-
pa.RecordBatch.from_arrays(
594-
[pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])], ['c']),
595-
]
596-
597-
with beam.Pipeline() as p:
598-
options = stats_options.StatsOptions(
599-
sample_count=3000,
600-
num_top_values=2,
601-
num_rank_histogram_buckets=2,
602-
num_values_histogram_buckets=2,
603-
num_histogram_buckets=2,
604-
num_quantiles_histogram_buckets=2,
605-
epsilon=0.001,
606-
desired_batch_size=3000)
607-
result = (
608-
p | beam.Create(record_batches)
609-
| stats_api.GenerateStatistics(options))
610-
util.assert_that(
611-
result,
612-
test_util.make_dataset_feature_stats_list_proto_equal_fn(
613-
self, self._sampling_test_expected_result))
614-
615587
def test_stats_pipeline_with_sample_rate(self):
616588
record_batches = [
617589
pa.RecordBatch.from_arrays(

tensorflow_data_validation/statistics/stats_options.py

Lines changed: 2 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -22,10 +22,9 @@
2222
import copy
2323
import json
2424
import types as python_types
25-
25+
from typing import List, Optional, Text
2626
from tensorflow_data_validation import types
2727
from tensorflow_data_validation.statistics.generators import stats_generator
28-
from typing import List, Optional, Text
2928

3029
from google.protobuf import json_format
3130
from tensorflow_metadata.proto.v0 import schema_pb2
@@ -37,7 +36,6 @@
3736

3837

3938
# TODO(b/118833241): Set MI default configs when MI is a default generator
40-
# TODO(b/162776976): Consider deprecating sample_count option.
4139
class StatsOptions(object):
4240
"""Options for generating statistics."""
4341

@@ -49,7 +47,6 @@ def __init__(
4947
label_feature: Optional[types.FeatureName] = None,
5048
weight_feature: Optional[types.FeatureName] = None,
5149
slice_functions: Optional[List[types.SliceFunction]] = None,
52-
sample_count: Optional[int] = None,
5350
sample_rate: Optional[float] = None,
5451
num_top_values: int = 20,
5552
frequency_threshold: int = 1,
@@ -79,14 +76,8 @@ def __init__(
7976
slice_functions: An optional list of functions that generate slice keys
8077
for each example. Each slice function should take an example dict as
8178
input and return a list of zero or more slice keys.
82-
sample_count: An optional number of examples to include in the sample. If
83-
specified, statistics is computed over the sample. Only one of
84-
sample_count or sample_rate can be specified. Note that since TFDV
85-
batches input examples, the sample count is only a desired count and we
86-
may include more examples in certain cases.
8779
sample_rate: An optional sampling rate. If specified, statistics is
88-
computed over the sample. Only one of sample_count or sample_rate can
89-
be specified.
80+
computed over the sample.
9081
num_top_values: An optional number of most frequent feature values to keep
9182
for string features.
9283
frequency_threshold: An optional minimum number of examples the most
@@ -126,7 +117,6 @@ def __init__(
126117
self.label_feature = label_feature
127118
self.weight_feature = weight_feature
128119
self.slice_functions = slice_functions
129-
self.sample_count = sample_count
130120
self.sample_rate = sample_rate
131121
self.num_top_values = num_top_values
132122
self.frequency_threshold = frequency_threshold
@@ -245,30 +235,13 @@ def slice_functions(
245235
raise TypeError('slice_functions must contain functions only.')
246236
self._slice_functions = slice_functions
247237

248-
@property
249-
def sample_count(self) -> Optional[int]:
250-
return self._sample_count
251-
252-
@sample_count.setter
253-
def sample_count(self, sample_count: Optional[int]) -> None:
254-
if sample_count is not None:
255-
if hasattr(self, 'sample_rate') and self.sample_rate is not None:
256-
raise ValueError('Only one of sample_count or sample_rate can be '
257-
'specified.')
258-
if sample_count < 1:
259-
raise ValueError('Invalid sample_count %d' % sample_count)
260-
self._sample_count = sample_count
261-
262238
@property
263239
def sample_rate(self) -> Optional[float]:
264240
return self._sample_rate
265241

266242
@sample_rate.setter
267243
def sample_rate(self, sample_rate: Optional[float]):
268244
if sample_rate is not None:
269-
if hasattr(self, 'sample_count') and self.sample_count is not None:
270-
raise ValueError('Only one of sample_count or sample_rate can be '
271-
'specified.')
272245
if not 0 < sample_rate <= 1:
273246
raise ValueError('Invalid sample_rate %f' % sample_rate)
274247
self._sample_rate = sample_rate

tensorflow_data_validation/statistics/stats_options_test.py

Lines changed: 0 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -79,32 +79,6 @@
7979
'exception_type': TypeError,
8080
'error_message': 'slice_functions must contain functions only.'
8181
},
82-
{
83-
'testcase_name': 'sample_count_zero',
84-
'stats_options_kwargs': {
85-
'sample_count': 0
86-
},
87-
'exception_type': ValueError,
88-
'error_message': 'Invalid sample_count 0'
89-
},
90-
{
91-
'testcase_name': 'sample_count_negative',
92-
'stats_options_kwargs': {
93-
'sample_count': -1
94-
},
95-
'exception_type': ValueError,
96-
'error_message': 'Invalid sample_count -1'
97-
},
98-
{
99-
'testcase_name': 'both_sample_count_and_sample_rate',
100-
'stats_options_kwargs': {
101-
'sample_count': 100,
102-
'sample_rate': 0.5
103-
},
104-
'exception_type': ValueError,
105-
'error_message': 'Only one of sample_count or sample_rate can be '
106-
'specified.'
107-
},
10882
{
10983
'testcase_name': 'sample_rate_zero',
11084
'stats_options_kwargs': {
@@ -304,7 +278,6 @@ def test_stats_options_from_json(self):
304278
"weight_feature": null,
305279
"label_feature": null,
306280
"_slice_functions": null,
307-
"_sample_count": null,
308281
"_sample_rate": null,
309282
"num_top_values": 20,
310283
"frequency_threshold": 1,

0 commit comments

Comments
 (0)