Deprecate sample_count option in tfdv.StatsOptions

paulgc17 · tfx-copybara · commit 065dc161c40f · 2020-09-10T13:07:08.000-07:00
PiperOrigin-RevId: 331000377
diff --git a/RELEASE.md b/RELEASE.md
@@ -24,6 +24,8 @@
 ## Deprecations
 
 *   Deprecated Py3.5 support.
+*   Deprecated and removed the `sample_count` option in `tfdv.StatsOptions`.
+    Use the `sample_rate` option instead.
 
 # Version 0.23.0
 
diff --git a/tensorflow_data_validation/api/stats_api.py b/tensorflow_data_validation/api/stats_api.py
@@ -48,7 +48,6 @@
 from typing import Generator, Text
 import apache_beam as beam
 import pyarrow as pa
-from tensorflow_data_validation import constants
 from tensorflow_data_validation.statistics import stats_impl
 from tensorflow_data_validation.statistics import stats_options
 
@@ -91,27 +90,7 @@ def __init__(
     self._options = options
 
   def expand(self, dataset: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
-    # Sample input data if sample_count option is provided.
-    # TODO(b/117229955): Consider providing an option to write the sample
-    # to a file.
-    # TODO(zhuo): clean this up once public APIs are changed to accept
-    # PCollection[RecordBatch].
-    if self._options.sample_count is not None:
-      # TODO(pachristopher): Consider moving the sampling logic to decoders.
-      # beam.combiners.Sample.FixedSizeGlobally returns a
-      # PCollection[List[pa.RecordBatch]], which we then flatten to get a
-      # PCollection[pa.RecordBatch].
-      batch_size = (
-          self._options.desired_batch_size if self._options.desired_batch_size
-          and self._options.desired_batch_size > 0 else
-          constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE)
-      batch_count = (
-          int(self._options.sample_count / batch_size) +
-          (1 if self._options.sample_count % batch_size else 0))
-      dataset |= ('SampleExamples(%s)' % self._options.sample_count >>
-                  beam.combiners.Sample.FixedSizeGlobally(batch_count)
-                  | 'FlattenExamples' >> beam.FlatMap(lambda lst: lst))
-    elif self._options.sample_rate is not None:
+    if self._options.sample_rate is not None:
       dataset |= ('SampleExamplesAtRate(%s)' % self._options.sample_rate >>
                   beam.FlatMap(_sample_at_rate,
                                sample_rate=self._options.sample_rate))
diff --git a/tensorflow_data_validation/api/stats_api_test.py b/tensorflow_data_validation/api/stats_api_test.py
@@ -584,34 +584,6 @@ def test_stats_pipeline_with_zero_examples(self):
           test_util.make_dataset_feature_stats_list_proto_equal_fn(
               self, expected_result))
 
-  def test_stats_pipeline_with_sample_count(self):
-    record_batches = [
-        pa.RecordBatch.from_arrays(
-            [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])], ['c']),
-        pa.RecordBatch.from_arrays(
-            [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])], ['c']),
-        pa.RecordBatch.from_arrays(
-            [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])], ['c']),
-    ]
-
-    with beam.Pipeline() as p:
-      options = stats_options.StatsOptions(
-          sample_count=3000,
-          num_top_values=2,
-          num_rank_histogram_buckets=2,
-          num_values_histogram_buckets=2,
-          num_histogram_buckets=2,
-          num_quantiles_histogram_buckets=2,
-          epsilon=0.001,
-          desired_batch_size=3000)
-      result = (
-          p | beam.Create(record_batches)
-          | stats_api.GenerateStatistics(options))
-      util.assert_that(
-          result,
-          test_util.make_dataset_feature_stats_list_proto_equal_fn(
-              self, self._sampling_test_expected_result))
-
   def test_stats_pipeline_with_sample_rate(self):
     record_batches = [
         pa.RecordBatch.from_arrays(
diff --git a/tensorflow_data_validation/statistics/stats_options.py b/tensorflow_data_validation/statistics/stats_options.py
@@ -22,10 +22,9 @@
 import copy
 import json
 import types as python_types
-
+from typing import List, Optional, Text
 from tensorflow_data_validation import types
 from tensorflow_data_validation.statistics.generators import stats_generator
-from typing import List, Optional, Text
 
 from google.protobuf import json_format
 from tensorflow_metadata.proto.v0 import schema_pb2
@@ -37,7 +36,6 @@
 
 
 # TODO(b/118833241): Set MI default configs when MI is a default generator
-# TODO(b/162776976): Consider deprecating sample_count option.
 class StatsOptions(object):
   """Options for generating statistics."""
 
@@ -49,7 +47,6 @@ def __init__(
       label_feature: Optional[types.FeatureName] = None,
       weight_feature: Optional[types.FeatureName] = None,
       slice_functions: Optional[List[types.SliceFunction]] = None,
-      sample_count: Optional[int] = None,
       sample_rate: Optional[float] = None,
       num_top_values: int = 20,
       frequency_threshold: int = 1,
@@ -79,14 +76,8 @@ def __init__(
       slice_functions: An optional list of functions that generate slice keys
         for each example. Each slice function should take an example dict as
         input and return a list of zero or more slice keys.
-      sample_count: An optional number of examples to include in the sample. If
-        specified, statistics is computed over the sample. Only one of
-        sample_count or sample_rate can be specified. Note that since TFDV
-        batches input examples, the sample count is only a desired count and we
-        may include more examples in certain cases.
       sample_rate: An optional sampling rate. If specified, statistics is
-        computed over the sample. Only one of sample_count or sample_rate can
-        be specified.
+        computed over the sample.
       num_top_values: An optional number of most frequent feature values to keep
         for string features.
       frequency_threshold: An optional minimum number of examples the most
@@ -126,7 +117,6 @@ def __init__(
     self.label_feature = label_feature
     self.weight_feature = weight_feature
     self.slice_functions = slice_functions
-    self.sample_count = sample_count
     self.sample_rate = sample_rate
     self.num_top_values = num_top_values
     self.frequency_threshold = frequency_threshold
@@ -245,30 +235,13 @@ def slice_functions(
           raise TypeError('slice_functions must contain functions only.')
     self._slice_functions = slice_functions
 
-  @property
-  def sample_count(self) -> Optional[int]:
-    return self._sample_count
-
-  @sample_count.setter
-  def sample_count(self, sample_count: Optional[int]) -> None:
-    if sample_count is not None:
-      if hasattr(self, 'sample_rate') and self.sample_rate is not None:
-        raise ValueError('Only one of sample_count or sample_rate can be '
-                         'specified.')
-      if sample_count < 1:
-        raise ValueError('Invalid sample_count %d' % sample_count)
-    self._sample_count = sample_count
-
   @property
   def sample_rate(self) -> Optional[float]:
     return self._sample_rate
 
   @sample_rate.setter
   def sample_rate(self, sample_rate: Optional[float]):
     if sample_rate is not None:
-      if hasattr(self, 'sample_count') and self.sample_count is not None:
-        raise ValueError('Only one of sample_count or sample_rate can be '
-                         'specified.')
       if not 0 < sample_rate <= 1:
         raise ValueError('Invalid sample_rate %f' % sample_rate)
     self._sample_rate = sample_rate
diff --git a/tensorflow_data_validation/statistics/stats_options_test.py b/tensorflow_data_validation/statistics/stats_options_test.py
@@ -79,32 +79,6 @@
         'exception_type': TypeError,
         'error_message': 'slice_functions must contain functions only.'
     },
-    {
-        'testcase_name': 'sample_count_zero',
-        'stats_options_kwargs': {
-            'sample_count': 0
-        },
-        'exception_type': ValueError,
-        'error_message': 'Invalid sample_count 0'
-    },
-    {
-        'testcase_name': 'sample_count_negative',
-        'stats_options_kwargs': {
-            'sample_count': -1
-        },
-        'exception_type': ValueError,
-        'error_message': 'Invalid sample_count -1'
-    },
-    {
-        'testcase_name': 'both_sample_count_and_sample_rate',
-        'stats_options_kwargs': {
-            'sample_count': 100,
-            'sample_rate': 0.5
-        },
-        'exception_type': ValueError,
-        'error_message': 'Only one of sample_count or sample_rate can be '
-                         'specified.'
-    },
     {
         'testcase_name': 'sample_rate_zero',
         'stats_options_kwargs': {
@@ -304,7 +278,6 @@ def test_stats_options_from_json(self):
       "weight_feature": null,
       "label_feature": null,
       "_slice_functions": null,
-      "_sample_count": null,
       "_sample_rate": null,
       "num_top_values": 20,
       "frequency_threshold": 1,