diff --git a/README.md b/README.md index ac6dd41..b634b60 100644 --- a/README.md +++ b/README.md @@ -787,6 +787,90 @@ supporting_text: "correct quote" ## Configuration +### Configuration File + +You can create a `.linkml-reference-validator.yaml` file in your project root to configure validation behavior: + +```yaml +validation: + cache_dir: references_cache + rate_limit_delay: 0.5 + + # Skip validation for specific prefixes (useful for unsupported reference types) + skip_prefixes: + - SRA # Sequence Read Archive + - MGNIFY # MGnify database + - BIOPROJECT # NCBI BioProject (currently has API issues) + + # Control severity for unfetchable references + unknown_prefix_severity: WARNING # Options: ERROR, WARNING, INFO + + # Map alternate prefixes to canonical ones + reference_prefix_map: + geo: GEO + NCBIGeo: GEO +``` + +### Configuration Options + +#### `skip_prefixes` (list of strings) + +List of reference prefixes to skip during validation. References with these prefixes will return `is_valid=True` with `INFO` severity, allowing validation to pass without blocking your workflow. + +**Use cases:** +- Unsupported reference types (SRA, MGnify, etc.) +- References that are temporarily unavailable +- Third-party databases without registered handlers + +**Example:** +```yaml +validation: + skip_prefixes: + - SRA + - MGNIFY + - BIOPROJECT +``` + +With this configuration: +```bash +# These will pass validation with INFO severity +linkml-reference-validator validate text "some text" SRA:PRJNA290729 +# ✓ Valid: True (INFO) - Skipping validation for reference with prefix 'SRA' + +linkml-reference-validator validate text "some text" MGNIFY:MGYS00000596 +# ✓ Valid: True (INFO) - Skipping validation for reference with prefix 'MGNIFY' +``` + +#### `unknown_prefix_severity` (ERROR | WARNING | INFO) + +Control the severity level for references that cannot be fetched (unsupported prefix, network error, etc.). 
Default: `ERROR` + +**Options:** +- `ERROR` (default) - Validation fails, blocking workflow +- `WARNING` - Validation fails but with lower severity +- `INFO` - Validation fails but logged as informational + +**Note:** `skip_prefixes` takes precedence over `unknown_prefix_severity`. If a prefix is in `skip_prefixes`, it will return `is_valid=True` with `INFO` severity regardless of this setting. + +**Example:** +```yaml +validation: + skip_prefixes: + - SRA # These will be skipped (is_valid=True, INFO) + unknown_prefix_severity: WARNING # Other unfetchable refs get WARNING +``` + +With this configuration: +```bash +# SRA is skipped (from skip_prefixes) +linkml-reference-validator validate text "text" SRA:PRJNA290729 +# ✓ Valid: True (INFO) - Skipping validation + +# UNKNOWN prefix gets WARNING severity +linkml-reference-validator validate text "text" UNKNOWN:12345 +# ✗ Valid: False (WARNING) - Could not fetch reference +``` + ### Cache Directory Default: `references_cache/` in current directory diff --git a/docs/notebooks/05_geo_validation.ipynb b/docs/notebooks/05_geo_validation.ipynb index 777739b..32cb095 100644 --- a/docs/notebooks/05_geo_validation.ipynb +++ b/docs/notebooks/05_geo_validation.ipynb @@ -203,33 +203,7 @@ "id": "a3b4c5d6", "metadata": {}, "outputs": [], - "source": [ - "from Bio import Entrez\n", - "\n", - "Entrez.email = \"your-email@example.com\"\n", - "\n", - "# Step 1: Convert accession to UID via esearch\n", - "handle = Entrez.esearch(db=\"gds\", term=\"GSE67472[Accession]\")\n", - "search_result = Entrez.read(handle)\n", - "handle.close()\n", - "\n", - "print(f\"Accession: GSE67472\")\n", - "print(f\"UID(s) found: {search_result['IdList']}\")\n", - "\n", - "if search_result['IdList']:\n", - " uid = search_result['IdList'][0]\n", - " \n", - " # Step 2: Fetch summary using UID\n", - " handle = Entrez.esummary(db=\"gds\", id=uid)\n", - " summary = Entrez.read(handle)\n", - " handle.close()\n", - " \n", - " if summary:\n", - " record = 
summary[0]\n", - " print(f\"\\nDataset Title: {record.get('title')}\")\n", - " print(f\"Platform: {record.get('GPL')}\")\n", - " print(f\"Samples: {record.get('n_samples')}\")" - ] + "source": "from Bio import Entrez\n\nEntrez.email = \"your-email@example.com\"\n\n# Step 1: Convert accession to UID via esearch\nhandle = Entrez.esearch(db=\"gds\", term=\"GSE67472[Accession]\")\nsearch_result = Entrez.read(handle)\nhandle.close()\n\nprint(\"Accession: GSE67472\")\nprint(f\"UID(s) found: {search_result['IdList']}\")\n\nif search_result['IdList']:\n uid = search_result['IdList'][0]\n \n # Step 2: Fetch summary using UID\n handle = Entrez.esummary(db=\"gds\", id=uid)\n summary = Entrez.read(handle)\n handle.close()\n \n if summary:\n record = summary[0]\n print(f\"\\nDataset Title: {record.get('title')}\")\n print(f\"Platform: {record.get('GPL')}\")\n print(f\"Samples: {record.get('n_samples')}\")" }, { "cell_type": "markdown", @@ -358,4 +332,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/src/linkml_reference_validator/models.py b/src/linkml_reference_validator/models.py index e1e4ae7..0dbc02b 100644 --- a/src/linkml_reference_validator/models.py +++ b/src/linkml_reference_validator/models.py @@ -344,6 +344,14 @@ class ReferenceValidationConfig(BaseModel): ... ) >>> config.reference_prefix_map["geo"] 'GEO' + >>> config = ReferenceValidationConfig( + ... skip_prefixes=["SRA", "MGNIFY"], + ... unknown_prefix_severity=ValidationSeverity.WARNING + ... ) + >>> config.skip_prefixes + ['SRA', 'MGNIFY'] + >>> config.unknown_prefix_severity + """ cache_dir: Path = Field( @@ -384,6 +392,24 @@ class ReferenceValidationConfig(BaseModel): "e.g. {'geo': 'GEO', 'NCBIGeo': 'GEO'}" ), ) + skip_prefixes: list[str] = Field( + default_factory=list, + description=( + "List of reference prefixes to skip during validation. " + "References with these prefixes will return is_valid=True with INFO severity. 
" + "Useful for unsupported or unfetchable reference types. " + "Case-insensitive. e.g. ['SRA', 'MGNIFY', 'BIOPROJECT']" + ), + ) + unknown_prefix_severity: ValidationSeverity = Field( + default=ValidationSeverity.ERROR, + description=( + "Severity level for references that cannot be fetched " + "(e.g., unsupported prefix or network error). " + "Options: ERROR (default), WARNING, INFO. " + "Does not apply to prefixes in skip_prefixes list." + ), + ) def get_cache_dir(self) -> Path: """Create and return the cache directory. diff --git a/src/linkml_reference_validator/validation/supporting_text_validator.py b/src/linkml_reference_validator/validation/supporting_text_validator.py index 809eda7..8615e2b 100644 --- a/src/linkml_reference_validator/validation/supporting_text_validator.py +++ b/src/linkml_reference_validator/validation/supporting_text_validator.py @@ -145,6 +145,20 @@ def validate( >>> # With title validation: >>> # result = validator.validate("quote", "PMID:12345678", expected_title="Study Title") """ + # Check if this prefix should be skipped + prefix = reference_id.split(":")[0].upper() if ":" in reference_id else "" + skip_prefixes_upper = [p.upper() for p in self.config.skip_prefixes] + + if prefix and prefix in skip_prefixes_upper: + return ValidationResult( + is_valid=True, + reference_id=reference_id, + supporting_text=supporting_text, + severity=ValidationSeverity.INFO, + message=f"Skipping validation for reference with prefix '{prefix}': {reference_id}", + path=path, + ) + reference = self.fetcher.fetch(reference_id) if not reference: @@ -152,7 +166,7 @@ def validate( is_valid=False, reference_id=reference_id, supporting_text=supporting_text, - severity=ValidationSeverity.ERROR, + severity=self.config.unknown_prefix_severity, message=f"Could not fetch reference: {reference_id}", path=path, ) diff --git a/tests/test_supporting_text_validator.py b/tests/test_supporting_text_validator.py index d78534b..d8d8a9d 100644 --- 
a/tests/test_supporting_text_validator.py +++ b/tests/test_supporting_text_validator.py @@ -317,3 +317,226 @@ def test_validate_full_text_no_abstract_context_in_failure_message(validator, mo assert result.is_valid is False assert "only abstract available" not in result.message + + +def test_skip_prefixes_single_prefix(tmp_path, mocker): + """Test that references with skipped prefixes return INFO severity. + + When a reference prefix is in the skip_prefixes list, validation should + return is_valid=True with INFO severity instead of ERROR. + """ + config = ReferenceValidationConfig( + cache_dir=tmp_path / "cache", + rate_limit_delay=0.0, + skip_prefixes=["SRA"], + ) + validator = SupportingTextValidator(config) + + mock_fetch = mocker.patch.object(validator.fetcher, "fetch") + mock_fetch.return_value = None # Simulate unfetchable reference + + result = validator.validate( + "some supporting text", + "SRA:PRJNA290729", + ) + + assert result.is_valid is True + assert result.severity == ValidationSeverity.INFO + assert "Skipping validation" in result.message + assert "SRA:PRJNA290729" in result.message + + +def test_skip_prefixes_multiple_prefixes(tmp_path, mocker): + """Test that multiple prefixes can be skipped.""" + config = ReferenceValidationConfig( + cache_dir=tmp_path / "cache", + rate_limit_delay=0.0, + skip_prefixes=["SRA", "MGNIFY", "BIOPROJECT"], + ) + validator = SupportingTextValidator(config) + + mock_fetch = mocker.patch.object(validator.fetcher, "fetch") + mock_fetch.return_value = None + + # Test SRA + result_sra = validator.validate("text", "SRA:PRJNA290729") + assert result_sra.is_valid is True + assert result_sra.severity == ValidationSeverity.INFO + + # Test MGNIFY + result_mgnify = validator.validate("text", "MGNIFY:MGYS00000596") + assert result_mgnify.is_valid is True + assert result_mgnify.severity == ValidationSeverity.INFO + + # Test BIOPROJECT + result_bioproject = validator.validate("text", "BIOPROJECT:PRJNA566284") + assert 
result_bioproject.is_valid is True + assert result_bioproject.severity == ValidationSeverity.INFO + + +def test_skip_prefixes_case_insensitive(tmp_path, mocker): + """Test that prefix matching is case-insensitive.""" + config = ReferenceValidationConfig( + cache_dir=tmp_path / "cache", + rate_limit_delay=0.0, + skip_prefixes=["sra"], # lowercase in config + ) + validator = SupportingTextValidator(config) + + mock_fetch = mocker.patch.object(validator.fetcher, "fetch") + mock_fetch.return_value = None + + # Test with uppercase prefix in reference + result = validator.validate("text", "SRA:PRJNA290729") + assert result.is_valid is True + assert result.severity == ValidationSeverity.INFO + + +def test_skip_prefixes_not_skipped(tmp_path, mocker): + """Test that references NOT in skip list still get ERROR.""" + config = ReferenceValidationConfig( + cache_dir=tmp_path / "cache", + rate_limit_delay=0.0, + skip_prefixes=["SRA"], + ) + validator = SupportingTextValidator(config) + + mock_fetch = mocker.patch.object(validator.fetcher, "fetch") + mock_fetch.return_value = None + + # MGNIFY is not in skip list, should get ERROR + result = validator.validate("text", "MGNIFY:MGYS00000596") + assert result.is_valid is False + assert result.severity == ValidationSeverity.ERROR + + +def test_unknown_prefix_severity_warning(tmp_path, mocker): + """Test that unknown_prefix_severity=WARNING downgrades unfetchable references. + + When a reference cannot be fetched and unknown_prefix_severity is WARNING, + the validation should return is_valid=False with WARNING severity. 
+ """ + config = ReferenceValidationConfig( + cache_dir=tmp_path / "cache", + rate_limit_delay=0.0, + unknown_prefix_severity=ValidationSeverity.WARNING, + ) + validator = SupportingTextValidator(config) + + mock_fetch = mocker.patch.object(validator.fetcher, "fetch") + mock_fetch.return_value = None + + result = validator.validate("text", "UNKNOWN:12345") + assert result.is_valid is False + assert result.severity == ValidationSeverity.WARNING + assert "Could not fetch reference" in result.message + + +def test_unknown_prefix_severity_info(tmp_path, mocker): + """Test that unknown_prefix_severity=INFO further downgrades unfetchable references.""" + config = ReferenceValidationConfig( + cache_dir=tmp_path / "cache", + rate_limit_delay=0.0, + unknown_prefix_severity=ValidationSeverity.INFO, + ) + validator = SupportingTextValidator(config) + + mock_fetch = mocker.patch.object(validator.fetcher, "fetch") + mock_fetch.return_value = None + + result = validator.validate("text", "BIOPROJECT:PRJNA566284") + assert result.is_valid is False + assert result.severity == ValidationSeverity.INFO + + +def test_unknown_prefix_severity_default_error(tmp_path, mocker): + """Test that default behavior is ERROR for unfetchable references.""" + config = ReferenceValidationConfig( + cache_dir=tmp_path / "cache", + rate_limit_delay=0.0, + # No unknown_prefix_severity specified, should default to ERROR + ) + validator = SupportingTextValidator(config) + + mock_fetch = mocker.patch.object(validator.fetcher, "fetch") + mock_fetch.return_value = None + + result = validator.validate("text", "UNKNOWN:12345") + assert result.is_valid is False + assert result.severity == ValidationSeverity.ERROR + + +def test_skip_prefixes_takes_precedence_over_unknown_severity(tmp_path, mocker): + """Test that skip_prefixes takes precedence over unknown_prefix_severity. + + When a prefix is in skip_prefixes, it should return INFO with is_valid=True, + regardless of the unknown_prefix_severity setting. 
+ """ + config = ReferenceValidationConfig( + cache_dir=tmp_path / "cache", + rate_limit_delay=0.0, + skip_prefixes=["SRA"], + unknown_prefix_severity=ValidationSeverity.ERROR, # This should be ignored for SRA + ) + validator = SupportingTextValidator(config) + + mock_fetch = mocker.patch.object(validator.fetcher, "fetch") + mock_fetch.return_value = None + + result = validator.validate("text", "SRA:PRJNA290729") + assert result.is_valid is True # skip_prefixes makes it valid + assert result.severity == ValidationSeverity.INFO + + +def test_combined_skip_and_severity_config(tmp_path, mocker): + """Test that skip_prefixes and unknown_prefix_severity work together. + + Skipped prefixes get INFO with is_valid=True. + Non-skipped unfetchable references get the configured severity. + """ + config = ReferenceValidationConfig( + cache_dir=tmp_path / "cache", + rate_limit_delay=0.0, + skip_prefixes=["SRA"], + unknown_prefix_severity=ValidationSeverity.WARNING, + ) + validator = SupportingTextValidator(config) + + mock_fetch = mocker.patch.object(validator.fetcher, "fetch") + mock_fetch.return_value = None + + # SRA is skipped + result_sra = validator.validate("text", "SRA:PRJNA290729") + assert result_sra.is_valid is True + assert result_sra.severity == ValidationSeverity.INFO + + # MGNIFY is not skipped, gets WARNING + result_mgnify = validator.validate("text", "MGNIFY:MGYS00000596") + assert result_mgnify.is_valid is False + assert result_mgnify.severity == ValidationSeverity.WARNING + + +def test_skip_prefixes_with_fetchable_reference(tmp_path, mocker): + """Test that skip_prefixes is checked before attempting fetch. + + Even if a source exists for a skipped prefix, it should be skipped. 
+ """ + config = ReferenceValidationConfig( + cache_dir=tmp_path / "cache", + rate_limit_delay=0.0, + skip_prefixes=["PMID"], # Skip even valid PMID references + ) + validator = SupportingTextValidator(config) + + # Mock should not be called since we skip before fetching + mock_fetch = mocker.patch.object(validator.fetcher, "fetch") + mock_fetch.return_value = ReferenceContent( + reference_id="PMID:123", + content="Some content", + ) + + result = validator.validate("text", "PMID:123") + assert result.is_valid is True + assert result.severity == ValidationSeverity.INFO + # Note: validate() returns before fetch is called for skipped prefixes, so the mocked content is never used + assert "Skipping validation" in result.message