tensorflow · peytondmurray · Jun 20, 2025 · May 7, 2025 · May 12, 2025 · May 13, 2025
diff --git a/.bazelrc b/.bazelrc
@@ -8,4 +8,3 @@ build --protocopt=--experimental_allow_proto3_optional
 # parameter 'user_link_flags' is deprecated and will be removed soon.
 # It may be temporarily re-enabled by setting --incompatible_require_linker_input_cc_api=false
 build --incompatible_require_linker_input_cc_api=false
-
diff --git a/.bazelversion b/.bazelversion
@@ -1 +1 @@
-6.5.0
+6.5.0
diff --git a/.github/workflows/ci-lint.yml b/.github/workflows/ci-lint.yml
@@ -0,0 +1,21 @@
+name: pre-commit
+
+on:
+  pull_request:
+  push:
+     branches: [master]
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4.1.7
+      with:
+        # Ensure the full history is fetched
+        # This is required to run pre-commit on a specific set of commits
+        # TODO: Remove this when all the pre-commit issues are fixed
+        fetch-depth: 0
+    - uses: actions/setup-python@v5.1.1
+      with:
+        python-version: 3.13
+    - uses: pre-commit/action@v3.0.1
diff --git a/.gitignore b/.gitignore
@@ -126,4 +126,4 @@ dmypy.json
 .pyre/
 
 # pb2.py files
-*_pb2.py
+*_pb2.py
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,39 @@
+# pre-commit is a tool to perform a predefined set of tasks manually and/or
+# automatically before git commits are made.
+#
+# Config reference: https://pre-commit.com/#pre-commit-configyaml---top-level
+#
+# Common tasks
+#
+# - Register git hooks: pre-commit install --install-hooks
+# - Run on all files:   pre-commit run --all-files
+#
+# These pre-commit hooks are run as CI.
+#
+# NOTE: if it can be avoided, add configs/args in pyproject.toml or below instead of creating a new `.config.file`.
+# https://pre-commit.ci/#configuration
+ci:
+  autoupdate_schedule: monthly
+  autofix_commit_msg: |
+    [pre-commit.ci] Apply automatic pre-commit fixes
+
+repos:
+  # general
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.6.0
+    hooks:
+      - id: end-of-file-fixer
+        exclude: '\.svg$|\.patch$'
+      - id: trailing-whitespace
+        exclude: '\.svg$|\.patch$'
+      - id: check-json
+      - id: check-yaml
+        args: [--allow-multiple-documents, --unsafe]
+      - id: check-toml
+
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.5.6
+    hooks:
+      - id: ruff
+        args: ["--fix"]
+      - id: ruff-format
diff --git a/README.md b/README.md
@@ -238,4 +238,3 @@ tag.
   * [TensorFlow Data Validation PyPI](https://pypi.org/project/tensorflow-data-validation/)
   * [TensorFlow Data Validation Paper](https://mlsys.org/Conferences/2019/doc/2019/167.pdf)
   * [TensorFlow Data Validation Slides](https://conf.slac.stanford.edu/xldb2018/sites/xldb2018.conf.slac.stanford.edu/files/Tues_09.45_NeoklisPolyzotis_Data%20Analysis%20and%20Validation%20(1).pdf)
-
diff --git a/g3doc/custom_data_validation.md b/g3doc/custom_data_validation.md
@@ -43,5 +43,3 @@ See the
 [documentation](https://github.com/tensorflow/data-validation/blob/master/tensorflow_data_validation/anomalies/proto/custom_validation_config.proto)
 in the `CustomValidationConfig` proto for example
 configurations.
-
-
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,3 +19,130 @@ requires = [
   # Required for using org_tensorflow bazel repository.
   "numpy~=1.22.0",
 ]
+
+[tool.ruff]
+line-length = 88
+
+[tool.ruff.lint]
+select = [
+    # pycodestyle
+    "E",
+    "W",
+    # Pyflakes
+    "F",
+    # pyupgrade
+    "UP",
+    # flake8-bugbear
+    "B",
+    # flake8-simplify
+    "SIM",
+    # isort
+    "I",
+    # pep8 naming
+    "N",
+    # pydocstyle
+    "D",
+    # annotations
+    "ANN",
+    # debugger
+    "T10",
+    # flake8-pytest
+    "PT",
+    # flake8-return
+    "RET",
+    # flake8-unused-arguments
+    "ARG",
+    # flake8-fixme
+    "FIX",
+    # flake8-eradicate
+    "ERA",
+    # pandas-vet
+    "PD",
+    # numpy-specific rules
+    "NPY",
+]
+
+ignore = [
+    "D104",   # Missing docstring in public package
+    "D100",   # Missing docstring in public module
+    "D211",   # No blank line before class
+    "PD901",  # Avoid using 'df' for pandas dataframes. Perfectly fine in functions with limited scope
+    "ANN201", # Missing return type annotation for public function (makes no sense for NoneType return types...)
+    "ANN101", # Missing type annotation for `self`
+    "ANN204", # Missing return type annotation for special method
+    "ANN002", # Missing type annotation for `*args`
+    "ANN003", # Missing type annotation for `**kwargs`
+    "D105",   # Missing docstring in magic method
+    "D203",   # 1 blank line required before class docstring
+    "D204",   # 1 blank line required after class docstring
+    "D413",   # Missing blank line after last section
+    "SIM108", # Simplify if/else to one line; not always clearer
+    "D206",   # Docstrings should be indented with spaces; unnecessary when running ruff-format
+    "E501",   # Line length too long; unnecessary when running ruff-format
+    "W191",   # Indentation contains tabs; unnecessary when running ruff-format
+
+    # REMOVE AFTER FIXING
+    # ANN rules (flake8-annotations)
+    "ANN001", # Missing type annotation for function argument `args`
+    "ANN102", # Missing type annotation for `cls` in classmethod
+    "ANN202", # Missing return type annotation for private function
+    "ANN205", # Missing return type annotation for staticmethod
+    "ANN206", # Missing return type annotation for classmethod
+    "ANN401", # Dynamically typed expressions (typing.Any) are disallowed in `domain`
+    # ARG rules (flake8-unused-arguments)
+    "ARG001", # Unused function argument
+    "ARG002", # Unused method argument
+    # B rules (flake8-bugbear)
+    "B005",   # Using `.strip()` with multi-character strings is misleading
+    "B007",   # Loop control variable not used within loop body
+    "B008",   # Do not perform function call in argument defaults; instead, perform the call within the function, or read the default from a module-level singleton variable
+    "B904",   # Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling
+    # D rules (pydocstyle)
+    "D101",   # Missing docstring in public class
+    "D102",   # Missing docstring in public method
+    "D103",   # Missing docstring in public function
+    "D107",   # Missing docstring in `__init__`
+    "D401",   # First line of docstring should be in imperative mood: "Loads the vocabulary from the specified path."
+    "D404",   # First word of the docstring should not be "This"
+    "D417",   # Missing argument descriptions in the docstring
+    # E rules (pycodestyle)
+    "E731",   # Do not assign a `lambda` expression, use a `def`
+    "E741",   # Ambiguous variable name
+    # ERA rules (flake8-eradicate)
+    "ERA001", # Found commented-out code
+    # F rules (Pyflakes)
+    "F821",   # Undefined name
+    # FIX rules (flake8-fixme)
+    "FIX002", # Line contains TODO, consider resolving the issue
+    # N rules (pep8-naming)
+    "N802",   # Function name should be lowercase
+    # NPY rules (numpy-specific rules)
+    "NPY002", # Replace legacy `np.random` call with `np.random.Generator`
+    # PD rules (pandas-vet)
+    "PD002",  # `inplace=True` should be avoided; it has inconsistent behavior
+    "PD003",  # `.isna` is preferred to `.isnull`; functionality is equivalent
+    "PD011",  # Use `.to_numpy()` instead of `.values`
+    "PD015",  # Use `.merge` method instead of `pd.merge` function
+    # PT rules (flake8-pytest-style)
+    "PT009",  # Use a regular `assert` instead of unittest-style `assertEqual`
+    "PT018",  # Assertion should be broken down into multiple parts
+    "PT027",  # Use `pytest.raises` instead of unittest-style `assertRaisesRegex`
+    # RET rules (flake8-return)
+    "RET504", # Unnecessary assignment to variable before `return` statement
+    "RET505", # Unnecessary `elif` after `return` statement
+    # SIM rules (flake8-simplify)
+    "SIM101", # Multiple `isinstance` calls for `maybe_collection`, merge into a single call
+    "SIM102", # Use a single `if` statement instead of nested `if` statements
+    "SIM103", # Return the condition directly
+    "SIM105", # Use `contextlib.suppress(...)` instead of `try`-`except`-`pass`
+    "SIM117", # Use a single `with` statement with multiple contexts instead of nested `with` statements
+    "SIM211", # Use `not ...` instead of `False if ... else True`
+    # UP rules (pyupgrade)
+    "UP008",  # Use `super()` instead of `super(__class__, self)`
+    "UP028",  # Replace `yield` over `for` loop with `yield from`
+    "UP031",  # Use format specifiers instead of percent format
+]
+
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["F401"]
Original file line number	Diff line number	Diff line change
Expand Up		@@ -8,4 +8,3 @@ build --protocopt=--experimental_allow_proto3_optional
		# parameter 'user_link_flags' is deprecated and will be removed soon.
		# It may be temporarily re-enabled by setting --incompatible_require_linker_input_cc_api=false
		build --incompatible_require_linker_input_cc_api=false
Original file line number	Diff line number	Diff line change
Expand Up		@@ -238,4 +238,3 @@ tag.
		* [TensorFlow Data Validation PyPI](https://pypi.org/project/tensorflow-data-validation/)
		* [TensorFlow Data Validation Paper](https://mlsys.org/Conferences/2019/doc/2019/167.pdf)
		* [TensorFlow Data Validation Slides](https://conf.slac.stanford.edu/xldb2018/sites/xldb2018.conf.slac.stanford.edu/files/Tues_09.45_NeoklisPolyzotis_Data%20Analysis%20and%20Validation%20(1).pdf)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -43,5 +43,3 @@ See the
		[documentation](https://github.com/tensorflow/data-validation/blob/master/tensorflow_data_validation/anomalies/proto/custom_validation_config.proto)
		in the `CustomValidationConfig` proto for example
		configurations.