feat: Enhance Sentry scrubber with more flexible markers

drew2a · drew2a · commit abc3c3c8d63a · 2025-02-27T12:49:05.000+01:00
The Sentry scrubber now supports a wider range of marker types for identifying sections to redact. In addition to an exact value match, a marker in `dict_markers_to_scrub` may be a list, tuple, or set of values: a dictionary is redacted when the value stored under the marker key either equals the marker or is a member of that collection. This allows more granular control over what gets redacted.

A new private method, `_is_dict_should_be_scrubbed`, encapsulates the logic for deciding whether a dictionary should be scrubbed. It checks one key-value pair at a time against the configured markers.

The README.md file has also been updated to reflect these changes and provide examples of how to use the new features.

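Parametrized tests have been added covering keys without a marker, exact matches, non-matching values, and membership in list, tuple, and set markers. Two further tests cover a scrubber with no markers configured and a `None` marker, which never matches.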
diff --git a/README.md b/README.md
@@ -133,7 +133,10 @@ from sentry_scrubber.scrubber import SentryScrubber
 
 # Define markers that indicate sections to be removed
 dict_markers = {
-    'visibility': 'private'
+    'visibility': 'private',
+    'status': ['error', 'failure'],  # List of values to match
+    'level': ('warning', 'critical'),  # Tuple of values to match
+    'environment': {'staging', 'production'}  # Set of values to match
 }
 
 scrubber = SentryScrubber(dict_markers_to_scrub=dict_markers)
@@ -144,11 +147,15 @@ event = {
     'private_section': {
         'visibility': 'private',  # This will cause the entire 'private_section' to be redacted
         'secret_data': 'sensitive information'
+    },
+    'error_section': {
+        'status': 'error',  # This will cause the entire 'error_section' to be redacted
+        'details': 'Error details'
     }
 }
 
 scrubbed = scrubber.scrub_event(event)
-# Result: {'public_info': 'This is public', 'private_section': '<redacted>'}
+# Result: {'public_info': 'This is public', 'private_section': '<redacted>', 'error_section': '<redacted>'}
 ```
 
 ### Exclusions
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sentry-scrubber"
-version = "2.1.0"
+version = "2.2.0"
 description = "A lightweight and flexible Python library for scrubbing sensitive information from Sentry events before they are sent to the server."
 authors = ["Andrei Andreev"]
 readme = "README.md"
diff --git a/sentry_scrubber/scrubber.py b/sentry_scrubber/scrubber.py
@@ -233,11 +233,9 @@ def scrub_entity_recursively(self, entity: Union[str, Dict, List, Any], sensitiv
                     result[key] = value
                     continue
 
-                if marker_value := self.dict_markers_to_scrub.get(key):
-                    should_be_scrubbed = value == marker_value
-                    if should_be_scrubbed:
-                        result = self.placeholder
-                        break
+                if self._is_dict_should_be_scrubbed(key, value):
+                    result = self.placeholder
+                    break
 
                 if key in self.dict_keys_for_scrub:
                     if isinstance(value, str):
@@ -255,3 +253,12 @@ def scrub_entity_recursively(self, entity: Union[str, Dict, List, Any], sensitiv
             return tuple(self.scrub_entity_recursively(item, sensitive_strings, depth) for item in entity)
 
         return entity
+
+    def _is_dict_should_be_scrubbed(self, key: str, value: Any):
+        if marker_value := self.dict_markers_to_scrub.get(key):
+            should_be_scrubbed = value == marker_value
+            if should_be_scrubbed:
+                return True
+            if isinstance(marker_value, (list, tuple, set)):
+                return value in marker_value
+        return False
diff --git a/sentry_scrubber/tests/test_scrubber.py b/sentry_scrubber/tests/test_scrubber.py
@@ -426,3 +426,48 @@ def test_scrub_list(scrubber):
     actual = scrubber.scrub_entity_recursively(['/home/username/some/'], sensitive_string)
     assert actual == ['/home/<redacted>/some/']
     assert 'username' in sensitive_string
+
+
+@pytest.mark.parametrize(
+    "key, value, dict_markers_to_scrub, expected",
+    [
+        # Test case 1: Key not in dict_markers_to_scrub
+        ("unknown_key", "value", {}, False),
+
+        # Test case 2: Key in dict_markers_to_scrub, value matches exactly
+        ("api_key", "secret123", {"api_key": "secret123"}, True),
+
+        # Test case 3: Key in dict_markers_to_scrub, value doesn't match
+        ("api_key", "different_value", {"api_key": "secret123"}, False),
+
+        # Test case 4: Key in dict_markers_to_scrub, value in list of marker values
+        ("status", "error", {"status": ["error", "failure"]}, True),
+
+        # Test case 5: Key in dict_markers_to_scrub, value not in list of marker values
+        ("status", "success", {"status": ["error", "failure"]}, False),
+
+        # Test case 6: Key in dict_markers_to_scrub, value in tuple of marker values
+        ("level", "critical", {"level": ("warning", "critical")}, True),
+
+        # Test case 7: Key in dict_markers_to_scrub, value in set of marker values
+        ("environment", "production", {"environment": {"staging", "production"}}, True),
+    ],
+)
+def test_is_dict_should_be_scrubbed(key, value, dict_markers_to_scrub, expected):
+    """Test the _is_dict_should_be_scrubbed method with various inputs."""
+    scrubber = SentryScrubber(dict_markers_to_scrub=dict_markers_to_scrub)
+    result = scrubber._is_dict_should_be_scrubbed(key, value)
+    assert result == expected
+
+
+def test_is_dict_should_be_scrubbed_with_empty_markers():
+    """Test the method with empty dict_markers_to_scrub."""
+    scrubber = SentryScrubber()
+    assert not scrubber._is_dict_should_be_scrubbed("any_key", "any_value")
+
+
+def test_is_dict_should_be_scrubbed_with_none_value():
+    """Test the method with None value."""
+    scrubber = SentryScrubber(dict_markers_to_scrub={"key": None})
+    assert not scrubber._is_dict_should_be_scrubbed("key", None)
+    assert not scrubber._is_dict_should_be_scrubbed("key", "not_none")