Start working on mutation issues in validate.

Carreau · Carreau · commit 43ea02cf72d0 · 2021-11-23T09:03:19.000-08:00
We change the validation logic and separate the normalisation from
the validation step.

We make sure that if a notebook is normalized, it emits a warning.
In the future we will turn the warning into an error.

We add a test for the current behavior and an xfail test for the future behavior.
diff --git a/nbformat/json_compat.py b/nbformat/json_compat.py
@@ -76,6 +76,8 @@ def _validator_for_name(validator_name):
     for (name, module, validator_cls) in _VALIDATOR_MAP:
         if module and validator_name == name:
             return validator_cls
+    # No validator matched: raise instead of implicitly returning None.
+    raise ValueError(f"Missing validator for {validator_name!r}")
 
 
 def get_current_validator():
diff --git a/nbformat/tests/test_validator.py b/nbformat/tests/test_validator.py
@@ -6,6 +6,9 @@
 import os
 import re
 
+from nbformat.warnings import MissingIDFieldWarning
+from copy import deepcopy
+
 from .base import TestsBase
 from jsonschema import ValidationError
 from nbformat import read
@@ -14,6 +17,8 @@
 
 import pytest
 
+nb4 = ("test4.ipynb", "test4.5.ipynb")
+
 
 # Fixtures
 @pytest.fixture(autouse=True)
@@ -29,6 +34,49 @@ def set_validator(validator_name):
     os.environ["NBFORMAT_VALIDATOR"] = validator_name
 
 
+@pytest.mark.parametrize("validator_name", VALIDATORS)
+def test_should_warn(validator_name):
+    """Test that validating a v4.5 notebook with a cell lacking an id warns."""
+    set_validator(validator_name)
+    with TestsBase.fopen("test4.5.ipynb", "r") as f:
+        nb = read(f, as_version=4)
+
+    del nb.cells[3]["id"]
+    assert nb.cells[3].get("id") is None
+    assert nb.cells[3]["cell_type"] == "code"
+
+    # validate() currently fills in the missing id itself, but has to warn
+    # about it because this mutation is scheduled to become a hard error.
+    with pytest.warns(MissingIDFieldWarning):
+        validate(nb)
+    assert isvalid(nb) == True
+
+
+@pytest.mark.xfail(reason="In the future we want to stop warning, and raise an error")
+@pytest.mark.parametrize("validator_name", VALIDATORS)
+def test_should_not_mutate(validator_name):
+    """Test that a v4 notebook without id raises an error and is not mutated.
+
+    Probably should be 2 tests. To enable in the future.
+    """
+    set_validator(validator_name)
+    with TestsBase.fopen("test4.5.ipynb", "r") as f:
+        nb = read(f, as_version=4)
+
+    del nb.cells[3]["id"]
+    assert nb.cells[3].get("id") is None
+    assert nb.cells[3]["cell_type"] == "code"
+
+    nb_deep_copy = deepcopy(nb)
+    # No parentheses around the managers: on Python < 3.9 that is one tuple.
+    with pytest.raises(MissingIDFieldWarning), pytest.warns(None):
+        validate(nb)
+
+    assert nb == nb_deep_copy
+
+    assert isvalid(nb) == True
+
+
 @pytest.mark.parametrize("validator_name", VALIDATORS)
 def test_nb2(validator_name):
     """Test that a v2 notebook converted to current passes validation"""
@@ -50,10 +98,11 @@ def test_nb3(validator_name):
 
 
 @pytest.mark.parametrize("validator_name", VALIDATORS)
-def test_nb4(validator_name):
+@pytest.mark.parametrize("nbfile", nb4)
+def test_nb4(validator_name, nbfile):
     """Test that a v4 notebook passes validation"""
     set_validator(validator_name)
-    with TestsBase.fopen(u'test4.ipynb', u'r') as f:
+    with TestsBase.fopen(nbfile, u"r") as f:
         nb = read(f, as_version=4)
     validate(nb)
     assert isvalid(nb) == True
diff --git a/nbformat/validator.py b/nbformat/validator.py
@@ -7,11 +7,13 @@
 import pprint
 import sys
 import warnings
+from copy import deepcopy
 
 from ipython_genutils.importstring import import_item
 from .json_compat import get_current_validator, ValidationError
 from .reader import get_version, reads
 from .corpus.words import generate_corpus_id
+from .warnings import MissingIDFieldWarning
 
 validators = {}
 
@@ -229,15 +231,71 @@ def better_validation_error(error, version, version_minor):
     return NotebookValidationError(error, ref)
 
 
-def validate(nbdict=None, ref=None, version=None, version_minor=None,
-             relax_add_props=False, nbjson=None):
+def normalize(nbdict, version, version_minor):
+    """
+    EXPERIMENTAL
+
+    Normalise a notebook prior to validation.
+
+    This tries to implement a couple of normalisation steps to standardise
+    notebooks and make validation easier.
+
+    You should in general not rely on this function and make sure the notebooks
+    that reach nbformat are already in a normal form.
+
+    Parameters
+    ----------
+    nbdict : dict
+        notebook document
+    version : int
+    version_minor : int
+
+    Returns
+    -------
+    changes : int
+        number of changes in the notebooks
+    notebook : dict
+        deep-copy of the original object with relevant changes.
+
+    """
+    nbdict = deepcopy(nbdict)  # work on a copy so the caller's notebook is untouched
+    return _normalize(nbdict, version, version_minor)
+
+def _normalize(nbdict, version, version_minor):
+    """Fill in missing cell ids of ``nbdict`` in place; return ``(changes, nbdict)``."""
+    changes = 0
+    if (version, version_minor) >= (4, 5):
+        # Formats from 4.5 on support cell ids; the tuple compare also covers 5.0+.
+        for cell in nbdict["cells"]:
+            if "id" not in cell:
+                changes += 1
+                warnings.warn(
+                    "Cell is missing an id field, this will become"
+                    " a hard error in future nbformat versions. You may want"
+                    " to use `normalize()` on your notebooks before validations"
+                    " (available since nbformat 5.1.4). Previous versions of nbformat"
+                    " are also mutating their arguments, and will stop doing so"
+                    " in the future.",
+                    MissingIDFieldWarning,
+                    stacklevel=3,  # point at the caller of validate()/normalize()
+                )
+                # Generate an id for the cell that lacks one.
+                cell["id"] = generate_corpus_id()
+    return changes, nbdict
+
+def validate(nbdict=None, ref:str=None, version=None, version_minor=None,
+             relax_add_props=False, nbjson=None) -> None:
     """Checks whether the given notebook dict-like object
     conforms to the relevant notebook format schema.
 
-
+    Parameters
+    ----------
+    ref : optional, str
+        reference to the subset of the schema we want to validate against.
+        for example ``"markdown_cell"``, `"code_cell"` ....
     Raises ValidationError if not valid.
     """
-
+    assert isinstance(ref, str) or ref is None
     # backwards compatibility for nbjson argument
     if nbdict is not None:
         pass
@@ -257,13 +315,8 @@ def validate(nbdict=None, ref=None, version=None, version_minor=None,
         # if ref is specified, and we don't have a version number, assume we're validating against 1.0
         if version is None:
             version, version_minor = 1, 0
-
-    if ref is None and version >= 4 and version_minor >= 5:
-        # if we support cell ids ensure default ids are provided
-        for cell in nbdict['cells']:
-            if 'id' not in cell:
-                # Generate cell ids if any are missing
-                cell['id'] = generate_corpus_id()
+    if ref is None:
+        _normalize(nbdict, version, version_minor)
 
     for error in iter_validate(nbdict, ref=ref, version=version,
                                version_minor=version_minor,
diff --git a/nbformat/warnings.py b/nbformat/warnings.py
@@ -0,0 +1,18 @@
+"""
+Warnings that can be emitted by nbformat.
+"""
+
+
+class MissingIDFieldWarning(FutureWarning):
+    """
+
+    This warning is emitted in the validation step of nbformat, as we used to
+    mutate the structure, which causes signature issues.
+
+    This will be turned into an error at a later point.
+
+    We subclass FutureWarning as we will change the behavior in the future.
+
+    """
+
+    pass