diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 00000000..4b33fe18 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,52 @@ +name: Deploy docs +on: + workflow_dispatch: + push: + branches: + - 'master' + pull_request: +permissions: + contents: write +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + + - name: Configure Git Credentials + run: | + git config user.name github-actions[bot] + git config user.email 41898282+github-actions[bot]@users.noreply.github.com + if: (github.event_name != 'pull_request') + + - name: Set up Python 3.9 + uses: actions/setup-python@v5 + with: + python-version: '3.9' + cache: 'pip' + cache-dependency-path: | + setup.py + requirements-docs.txt + + - name: Save time for cache for mkdocs + run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV + + - name: Caching + uses: actions/cache@v4 + with: + key: mkdocs-material-${{ env.cache_id }} + path: .cache + restore-keys: | + mkdocs-material- + + - name: Install Dependencies + run: pip install -r requirements-docs.txt + + - name: Deploy to GitHub Pages + run: mkdocs gh-deploy --force + if: (github.event_name != 'pull_request') + + - name: Build docs to check for errors + run: mkdocs build + if: (github.event_name == 'pull_request') diff --git a/g3doc/_toc.yaml b/docs/_toc.yaml similarity index 100% rename from g3doc/_toc.yaml rename to docs/_toc.yaml diff --git a/g3doc/anomalies.md b/docs/anomalies.md similarity index 100% rename from g3doc/anomalies.md rename to docs/anomalies.md diff --git a/docs/api.md b/docs/api.md new file mode 100644 index 00000000..14df0022 --- /dev/null +++ b/docs/api.md @@ -0,0 +1,4 @@ +# TensorFlow Data Validation API Documentation + + +::: tensorflow_data_validation diff --git a/g3doc/custom_data_validation.md b/docs/custom_data_validation.md similarity index 97% rename from g3doc/custom_data_validation.md rename to docs/custom_data_validation.md index 
a67d72b0..6d92f10a 100644 --- a/g3doc/custom_data_validation.md +++ b/docs/custom_data_validation.md @@ -6,9 +6,9 @@ freshness: { owner: 'kuochuntsai' reviewed: '2022-11-29' } TFDV supports custom data validation using SQL. You can run custom data validation using -[validate_statistics](https://github.com/tensorflow/data-validation/blob/master/tensorflow_data_validation/api/validation_api.py;l=236;rcl=488721853) +[validate_statistics](https://github.com/tensorflow/data-validation/blob/master/tensorflow_data_validation/api/validation_api.py#L236) or -[custom_validate_statistics](https://github.com/tensorflow/data-validation/blob/master/tensorflow_data_validation/api/validation_api.py;l=535;rcl=488721853). +[custom_validate_statistics](https://github.com/tensorflow/data-validation/blob/master/tensorflow_data_validation/api/validation_api.py#L535). Use `validate_statistics` to run standard, schema-based data validation along with custom validation. Use `custom_validate_statistics` to run only custom validation. 
diff --git a/g3doc/get_started.md b/docs/get_started.md similarity index 100% rename from g3doc/get_started.md rename to docs/get_started.md diff --git a/g3doc/images/anomaly.png b/docs/images/anomaly.png similarity index 100% rename from g3doc/images/anomaly.png rename to docs/images/anomaly.png diff --git a/docs/images/feature_stats.png b/docs/images/feature_stats.png new file mode 100644 index 00000000..d1cb3718 Binary files /dev/null and b/docs/images/feature_stats.png differ diff --git a/g3doc/images/schema.png b/docs/images/schema.png similarity index 100% rename from g3doc/images/schema.png rename to docs/images/schema.png diff --git a/g3doc/images/serving_anomaly.png b/docs/images/serving_anomaly.png similarity index 100% rename from g3doc/images/serving_anomaly.png rename to docs/images/serving_anomaly.png diff --git a/g3doc/images/skew_anomaly.png b/docs/images/skew_anomaly.png similarity index 100% rename from g3doc/images/skew_anomaly.png rename to docs/images/skew_anomaly.png diff --git a/g3doc/images/stats.png b/docs/images/stats.png similarity index 100% rename from g3doc/images/stats.png rename to docs/images/stats.png diff --git a/docs/images/tf_full_color_primary_icon.svg b/docs/images/tf_full_color_primary_icon.svg new file mode 100644 index 00000000..3e724777 --- /dev/null +++ b/docs/images/tf_full_color_primary_icon.svg @@ -0,0 +1 @@ +FullColorPrimary Icon \ No newline at end of file diff --git a/docs/images/unbalanced.png b/docs/images/unbalanced.png new file mode 100644 index 00000000..3738522c Binary files /dev/null and b/docs/images/unbalanced.png differ diff --git a/docs/images/uniform.png b/docs/images/uniform.png new file mode 100644 index 00000000..793b1d1b Binary files /dev/null and b/docs/images/uniform.png differ diff --git a/docs/images/uniform_cumulative.png b/docs/images/uniform_cumulative.png new file mode 100644 index 00000000..d8bf0573 Binary files /dev/null and b/docs/images/uniform_cumulative.png differ diff --git 
a/docs/images/zero_length.png b/docs/images/zero_length.png new file mode 100644 index 00000000..07b3b103 Binary files /dev/null and b/docs/images/zero_length.png differ diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..87619c6b --- /dev/null +++ b/docs/index.md @@ -0,0 +1,319 @@ +# TensorFlow Data Validation: Checking and analyzing your data + +Once your data is in a TFX pipeline, you can use TFX components to +analyze and transform it. You can use these tools even before you train +a model. + +There are many reasons to analyze and transform your data: + +- To find problems in your data. Common problems include: + - Missing data, such as features with empty values. + - Labels treated as features, so that your model gets to peek at + the right answer during training. + - Features with values outside the range you expect. + - Data anomalies. + - Transfer learned model has preprocessing that does not match the + training data. +- To engineer more effective feature sets. For example, you can + identify: + - Especially informative features. + - Redundant features. + - Features that vary so widely in scale that they may slow + learning. + - Features with little or no unique predictive information. + +TFX tools can both help find data bugs, and help with feature +engineering. + +## TensorFlow Data Validation + +- [Overview](#overview) +- [Schema Based Example Validation](#schema-based-example-validation) +- [Training-Serving Skew Detection](#training-serving-skew-detection) +- [Drift Detection](#drift-detection) + +### Overview + +TensorFlow Data Validation identifies anomalies in training and serving +data, and can automatically create a schema by examining the data. The +component can be configured to detect different classes of anomalies in +the data. It can + +1. Perform validity checks by comparing data statistics against a + schema that codifies expectations of the user. +2. 
Detect training-serving skew by comparing examples in training and + serving data. +3. Detect data drift by looking at a series of data. + +We document each of these functionalities independently: + +- [Schema Based Example Validation](#schema-based-example-validation) +- [Training-Serving Skew Detection](#training-serving-skew-detection) +- [Drift Detection](#drift-detection) + +### Schema Based Example Validation + +TensorFlow Data Validation identifies any anomalies in the input data by +comparing data statistics against a schema. The schema codifies +properties which the input data is expected to satisfy, such as data +types or categorical values, and can be modified or replaced by the +user. + +Tensorflow Data Validation is typically invoked multiple times within +the context of the TFX pipeline: (i) for every split obtained from +ExampleGen, (ii) for all pre-transformed data used by Transform and +(iii) for all post-transform data generated by Transform. When invoked +in the context of Transform (ii-iii), statistics options and +schema-based constraints can be set by defining the +[`stats_options_updater_fn`](https://tensorflow.github.io/transform). +This is particularly useful when validating unstructured data (e.g. text +features). See the [user +code](https://github.com/tensorflow/tfx/blob/master/tfx/examples/bert/mrpc/bert_mrpc_utils.py) +for an example. + +#### Advanced Schema Features + +This section covers more advanced schema configuration that can help +with special setups. + +##### Sparse Features + +Encoding sparse features in Examples usually introduces multiple +Features that are expected to have the same valency for all Examples. 
+For example the sparse feature: + +```python +WeightedCategories = [('CategoryA', 0.3), ('CategoryX', 0.7)] +``` + +would be encoded using separate Features for index and value: + +``` python +WeightedCategoriesIndex = ['CategoryA', 'CategoryX'] +WeightedCategoriesValue = [0.3, 0.7] +``` + +with the restriction that the valency of the index and value feature +should match for all Examples. This restriction can be made explicit in +the schema by defining a sparse_feature: + + +```python +sparse_feature { + name: 'WeightedCategories' + index_feature { name: 'WeightedCategoriesIndex' } + value_feature { name: 'WeightedCategoriesValue' } +} +``` + +The sparse feature definition requires one or more index and one value +feature which refer to features that exist in the schema. Explicitly +defining sparse features enables TFDV to check that the valencies of all +referred features match. + +Some use cases introduce similar valency restrictions between Features, +but do not necessarily encode a sparse feature. Using sparse feature +should unblock you, but is not ideal. + +##### Schema Environments + +By default validations assume that all Examples in a pipeline adhere to +a single schema. In some cases introducing slight schema variations is +necessary, for instance features used as labels are required during +training (and should be validated), but are missing during serving. +Environments can be used to express such requirements, in particular +`default_environment()`, +`in_environment()`, and +`not_in_environment()`. + +For example, assume a feature named `'LABEL'` is required for training, +but is expected to be missing from serving. This can be expressed by: + +- Define two distinct environments in the schema: `["SERVING", + "TRAINING"]` and associate `'LABEL'` only with environment + `"TRAINING"`. +- Associate the training data with environment `"TRAINING"` and the + serving data with environment `"SERVING"`. 
+ +##### Schema Generation + +The input data schema is specified as an instance of the TensorFlow +[Schema](https://github.com/tensorflow/metadata/blob/master/tensorflow_metadata/proto/v0/schema.proto). + +Instead of constructing a schema manually from scratch, a developer can +rely on TensorFlow Data Validation's automatic schema construction. +Specifically, TensorFlow Data Validation automatically constructs an +initial schema based on statistics computed over training data available +in the pipeline. Users can simply review this autogenerated schema, +modify it as needed, check it into a version control system, and push it +explicitly into the pipeline for further validation. + +TFDV includes `infer_schema()` to generate a +schema automatically. For example: + +```python +schema = tfdv.infer_schema(statistics=train_stats) +tfdv.display_schema(schema=schema) +``` + +This triggers an automatic schema generation based on the following +rules: + +- If a schema has already been auto-generated then it is used as is. + +- Otherwise, TensorFlow Data Validation examines the available data + statistics and computes a suitable schema for the data. + +*Note: The auto-generated schema is best-effort and only tries to infer +basic properties of the data. It is expected that users review and +modify it as needed.* + +### Training-Serving Skew Detection + +#### Overview + +TensorFlow Data Validation can detect distribution skew between training +and serving data. Distribution skew occurs when the distribution of +feature values for training data is significantly different from serving +data. One of the key causes for distribution skew is using either a +completely different corpus for training data generation to overcome +lack of initial data in the desired corpus. Another reason is a faulty +sampling mechanism that only chooses a subsample of the serving data to +train on. 
+ +##### Example Scenario + +**Note:** For instance, in order to compensate for an underrepresented +slice of data, if a biased sampling is used without upweighting the +downsampled examples appropriately, the distribution of feature values +between training and serving data gets artificially skewed. + +See the [TensorFlow Data Validation Get Started +Guide](get_started#checking-data-skew-and-drift) +for information about configuring training-serving skew detection. + +### Drift Detection + +Drift detection is supported between consecutive spans of data (i.e., +between span N and span N+1), such as between different days of training +data. We express drift in terms of [L-infinity +distance](https://en.wikipedia.org/wiki/Chebyshev_distance) for +categorical features and approximate [Jensen-Shannon +divergence](https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence) +for numeric features. You can set the threshold distance so that you +receive warnings when the drift is higher than is acceptable. Setting +the correct distance is typically an iterative process requiring domain +knowledge and experimentation. + +See the [TensorFlow Data Validation Get Started +Guide](get_started#checking-data-skew-and-drift) +for information about configuring drift detection. + +## Using Visualizations to Check Your Data + +TensorFlow Data Validation provides tools for visualizing the +distribution of feature values. By examining these distributions in a +Jupyter notebook using [Facets](https://pair-code.github.io/facets/) you +can catch common problems with data. + +![Feature stats](images/feature_stats.png) + +### Identifying Suspicious Distributions + +You can identify common bugs in your data by using a Facets Overview +display to look for suspicious distributions of feature values. + +#### Unbalanced Data + +An unbalanced feature is a feature for which one value predominates. 
+Unbalanced features can occur naturally, but if a feature always has the +same value you may have a data bug. To detect unbalanced features in a +Facets Overview, choose "Non-uniformity" from the "Sort by" +dropdown. + +The most unbalanced features will be listed at the top of each +feature-type list. For example, the following screenshot shows one +feature that is all zeros, and a second that is highly unbalanced, at +the top of the "Numeric Features" list: + +![Visualization of unbalanced +data](images/unbalanced.png) + +#### Uniformly Distributed Data + +A uniformly distributed feature is one for which all possible values +appear with close to the same frequency. As with unbalanced data, this +distribution can occur naturally, but can also be produced by data bugs. + +To detect uniformly distributed features in a Facets Overview, choose +"Non-uniformity" from the "Sort by" dropdown and check the +"Reverse order" checkbox: + +![Histogram of uniform data](images/uniform.png) + +String data is represented using bar charts if there are 20 or fewer +unique values, and as a cumulative distribution graph if there are more +than 20 unique values. So for string data, uniform distributions can +appear as either flat bar graphs like the one above or straight lines +like the one below: + +![Line graph: cumulative distribution of uniform +data](images/uniform_cumulative.png) + +##### Bugs That Can Produce Uniformly Distributed Data + +Here are some common bugs that can produce uniformly distributed data: + +- Using strings to represent non-string data types such as dates. For + example, you will have many unique values for a datetime feature + with representations like `2017-03-01-11-45-03`. Unique values + will be distributed uniformly. + +- Including indices like "row number" as features. Here again you + have many unique values. + +#### Missing Data + +To check whether a feature is missing values entirely: + +1. Choose "Amount missing/zero" from the "Sort by" drop-down. 
+2. Check the "Reverse order" checkbox. +3. Look at the "missing" column to see the percentage of instances + with missing values for a feature. + +A data bug can also cause incomplete feature values. For example you may +expect a feature's value list to always have three elements and +discover that sometimes it only has one. To check for incomplete values +or other cases where feature value lists don't have the expected number +of elements: + +1. Choose "Value list length" from the "Chart to show" drop-down + menu on the right. + +2. Look at the chart to the right of each feature row. The chart shows + the range of value list lengths for the feature. For example, the + highlighted row in the screenshot below shows a feature that has + some zero-length value lists: + +![Facets Overview display with feature with zero-length feature value +lists](images/zero_length.png) + +#### Large Differences in Scale Between Features + +If your features vary widely in scale, then the model may have +difficulties learning. For example, if some features vary from 0 to 1 +and others vary from 0 to 1,000,000,000, you have a big difference in +scale. Compare the "max" and "min" columns across features to find +widely varying scales. + +Consider normalizing feature values to reduce these wide variations. + +#### Labels with Invalid Labels + +TensorFlow's Estimators have restrictions on the type of data they +accept as labels. For example, binary classifiers typically only work +with {0, 1} labels. + +Review the label values in the Facets Overview and make sure they +conform to the [requirements of +Estimators](https://github.com/tensorflow/docs/blob/master/site/en/r1/guide/feature_columns.md). 
diff --git a/g3doc/install.md b/docs/install.md similarity index 97% rename from g3doc/install.md rename to docs/install.md index a3702f7a..10997e88 100644 --- a/g3doc/install.md +++ b/docs/install.md @@ -1,7 +1,3 @@ - -{% setvar github_path %}tensorflow/data-validation{% endsetvar %} -{% include "_templates/github-bug.html" %} - # TensorFlow Data Validation *TensorFlow Data Validation* (TFDV) is a library for exploring and validating @@ -23,7 +19,7 @@ TF Data Validation includes: learn more in order to correct them. For instructions on using TFDV, see the -[get started guide](https://github.com/tensorflow/data-validation/blob/master/g3doc/get_started.md) +[get started guide](../get_started) and try out the [example notebook](https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/data_validation/tfdv_basic.ipynb). Some of the techniques implemented in TFDV are described in a @@ -233,9 +229,9 @@ tag. ## Links -* [TensorFlow Data Validation Getting Started Guide](https://www.tensorflow.org/tfx/data_validation/get_started) +* [TensorFlow Data Validation Getting Started Guide](../get_started) * [TensorFlow Data Validation Notebook](https://colab.research.google.com/github/tensorflow/tfx/blob/master/docs/tutorials/data_validation/tfdv_basic.ipynb) -* [TensorFlow Data Validation API Documentation](https://www.tensorflow.org/tfx/data_validation/api_docs/python/tfdv) +* [TensorFlow Data Validation API Documentation](../api) * [TensorFlow Data Validation Blog Post](https://medium.com/tensorflow/introducing-tensorflow-data-validation-data-understanding-validation-and-monitoring-at-scale-d38e3952c2f0) * [TensorFlow Data Validation PyPI](https://pypi.org/project/tensorflow-data-validation/) * [TensorFlow Data Validation Paper](https://www.sysml.cc/doc/2019/167.pdf) diff --git a/docs/javascripts/mathjax.js b/docs/javascripts/mathjax.js new file mode 100644 index 00000000..0be88e04 --- /dev/null +++ b/docs/javascripts/mathjax.js @@ -0,0 +1,19 @@ 
+window.MathJax = { + tex: { + inlineMath: [["\\(", "\\)"]], + displayMath: [["\\[", "\\]"]], + processEscapes: true, + processEnvironments: true + }, + options: { + ignoreHtmlClass: ".*|", + processHtmlClass: "arithmatex" + } +}; + +document$.subscribe(() => { + MathJax.startup.output.clearCache() + MathJax.typesetClear() + MathJax.texReset() + MathJax.typesetPromise() +}) diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css new file mode 100644 index 00000000..21c97aa9 --- /dev/null +++ b/docs/stylesheets/extra.css @@ -0,0 +1,42 @@ +:root { + --md-primary-fg-color: #FFA800; + --md-primary-fg-color--light: #CCCCCC; + --md-primary-fg-color--dark: #425066; +} + +.video-wrapper { + max-width: 240px; + display: flex; + flex-direction: row; +} +.video-wrapper > iframe { + width: 100%; + aspect-ratio: 16 / 9; +} + +.buttons-wrapper { + flex-wrap: wrap; + gap: 1em; + display: flex; + /* flex-grow: 1; */ + /* justify-content: center; */ + /* align-content: center; */ +} + +.buttons-wrapper > a { + justify-content: center; + align-content: center; + flex-wrap: nowrap; + /* gap: 1em; */ + align-items: center; + text-align: center; + flex: 1 1 30%; + display: flex; +} + +.md-button > .buttons-content { + align-items: center; + justify-content: center; + display: flex; + gap: 1em; +} diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 00000000..86d9c904 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,120 @@ +site_name: TensorFlow Data Validation +repo_name: "data-validation" +repo_url: + +theme: + logo: images/tf_full_color_primary_icon.svg + name: material + palette: + # Palette toggle for automatic mode + - media: "(prefers-color-scheme)" + primary: custom + accent: custom + toggle: + icon: material/brightness-auto + name: Switch to light mode + + # Palette toggle for light mode + - media: "(prefers-color-scheme: light)" + primary: custom + accent: custom + scheme: default + toggle: + icon: material/brightness-7 + name: Switch to dark mode + + # 
Palette toggle for dark mode + - media: "(prefers-color-scheme: dark)" + primary: custom + accent: custom + scheme: slate + toggle: + icon: material/brightness-4 + name: Switch to system preference + favicon: images/tf_full_color_primary_icon.svg + + features: + - content.code.copy + - content.code.select + - content.action.edit + +plugins: + - search + - autorefs + - mkdocstrings: + default_handler: python + handlers: + python: + options: + show_source: true + show_root_heading: true + unwrap_annotated: true + show_symbol_type_toc: true + show_if_no_docstring: true + show_symbol_type_heading: true + merge_init_into_class: true + show_signature_annotations: true + separate_signature: true + signature_crossrefs: true + group_by_category: true + show_category_heading: true + show_submodules: false + show_root_full_path: true + docstring_section_style: "spacy" + inherited_members: true + summary: false + filters: + - "!^_" + - "^__init__$" + - "^__call__$" + - "^__version__$" + - "!^logger" + - "!^test_" + - "!_test$" + extensions: + - griffe_inherited_docstrings + import: + - https://docs.python.org/3/objects.inv + +extra_css: + - stylesheets/extra.css + +extra_javascript: + - javascripts/mathjax.js + - https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js + +markdown_extensions: + - admonition + - attr_list + - def_list + - tables + - toc: + permalink: true + - pymdownx.highlight: + anchor_linenums: true + linenums: false + line_spans: __span + pygments_lang_class: true + - pymdownx.inlinehilite + - pymdownx.snippets + - pymdownx.superfences + - pymdownx.arithmatex: + generic: true + - pymdownx.critic + - pymdownx.caret + - pymdownx.keys + - pymdownx.mark + - pymdownx.tilde + - md_in_html + - pymdownx.emoji: + emoji_index: !!python/name:material.extensions.emoji.twemoji + emoji_generator: !!python/name:material.extensions.emoji.to_svg +watch: + - tensorflow_data_validation + +nav: + - Home: index.md + - Install: install.md + - Getting Started: get_started.md + - 
Anomalies: anomalies.md + - API: api.md diff --git a/requirements-docs.txt b/requirements-docs.txt new file mode 100644 index 00000000..15b34146 --- /dev/null +++ b/requirements-docs.txt @@ -0,0 +1,6 @@ +mkdocs +mkdocs-material +mkdocstrings[python] +griffe-inherited-docstrings +mkdocs-autorefs +ruff diff --git a/setup.py b/setup.py index f63f2b91..9b2cc5ba 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,7 @@ # limitations under the License. """Package Setup script for TensorFlow Data Validation.""" import os +from pathlib import Path import platform import shutil import subprocess @@ -114,10 +115,22 @@ def _make_visualization_requirements(): 'ipython>=7,<8', ] +def _make_docs_requirements(): + return [ + req for req in Path("./requirements-docs.txt") + .expanduser() + .resolve() + .read_text() + .splitlines() + if req + ] def _make_all_extra_requirements(): - return (_make_mutual_information_requirements() + - _make_visualization_requirements()) + return ( + *_make_mutual_information_requirements(), + *_make_visualization_requirements(), + *_make_docs_requirements(), + ) def select_constraint(default, nightly=None, git_master=None): @@ -204,6 +217,7 @@ def select_constraint(default, nightly=None, git_master=None): extras_require={ 'mutual-information': _make_mutual_information_requirements(), 'visualization': _make_visualization_requirements(), + 'docs': _make_docs_requirements(), 'all': _make_all_extra_requirements(), }, python_requires='>=3.9,<4', diff --git a/tensorflow_data_validation/__init__.py b/tensorflow_data_validation/__init__.py index a414de20..17d5c1cc 100644 --- a/tensorflow_data_validation/__init__.py +++ b/tensorflow_data_validation/__init__.py @@ -90,3 +90,56 @@ # Import version string. 
from tensorflow_data_validation.version import __version__ + +__all__ = [ + '__version__', + 'CombinerStatsGenerator', + 'compare_slices', + 'CrossFeatureView', + 'DatasetListView', + 'DatasetView', + 'default_sharded_output_suffix', + 'default_sharded_output_supported', + 'DetectFeatureSkew', + 'display_anomalies', + 'display_schema', + 'experimental_get_feature_value_slicer', + 'FeaturePath', + 'FeatureView', + 'generate_dummy_schema_with_paths', + 'generate_statistics_from_csv', + 'generate_statistics_from_dataframe', + 'generate_statistics_from_tfrecord', + 'GenerateStatistics', + 'get_confusion_count_dataframes', + 'get_domain', + 'get_feature', + 'get_feature_stats', + 'get_match_stats_dataframe', + 'get_skew_result_dataframe', + 'get_slice_stats', + 'get_statistics_html', + 'infer_schema', + 'load_anomalies_text', + 'load_schema_text', + 'load_sharded_statistics', + 'load_statistics', + 'load_stats_binary', + 'load_stats_text', + 'MergeDatasetFeatureStatisticsList', + 'set_domain', + 'StatsOptions', + 'TransformStatsGenerator', + 'update_schema', + 'validate_corresponding_slices', + 'validate_examples_in_csv', + 'validate_examples_in_tfrecord', + 'validate_statistics', + 'visualize_statistics', + 'write_anomalies_text', + 'write_schema_text', + 'write_stats_text', + 'WriteStatisticsToBinaryFile', + 'WriteStatisticsToRecordsAndBinaryFile', + 'WriteStatisticsToTFRecord' +]