Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -787,6 +787,90 @@ supporting_text: "correct quote"

## Configuration

### Configuration File

You can create a `.linkml-reference-validator.yaml` file in your project root to configure validation behavior:

```yaml
validation:
  cache_dir: references_cache
  rate_limit_delay: 0.5

  # Skip validation for specific prefixes (useful for unsupported reference types)
  skip_prefixes:
    - SRA         # Sequence Read Archive
    - MGNIFY      # MGnify database
    - BIOPROJECT  # NCBI BioProject (currently has API issues)

  # Control severity for unfetchable references
  unknown_prefix_severity: WARNING  # Options: ERROR, WARNING, INFO

  # Map alternate prefixes to canonical ones
  reference_prefix_map:
    geo: GEO
    NCBIGeo: GEO
```

### Configuration Options

#### `skip_prefixes` (list of strings)

List of reference prefixes to skip during validation. Prefix matching is case-insensitive (`sra` and `SRA` are treated the same). References with these prefixes are not fetched at all; they return `is_valid=True` with `INFO` severity, so validation passes without blocking your workflow.

**Use cases:**
- Unsupported reference types (SRA, MGnify, etc.)
- References that are temporarily unavailable
- Third-party databases without registered handlers

**Example:**
```yaml
validation:
  skip_prefixes:
    - SRA
    - MGNIFY
    - BIOPROJECT
```

With this configuration:
```bash
# These will pass validation with INFO severity
linkml-reference-validator validate text "some text" SRA:PRJNA290729
# ✓ Valid: True (INFO) - Skipping validation for reference with prefix 'SRA': SRA:PRJNA290729

linkml-reference-validator validate text "some text" MGNIFY:MGYS00000596
# ✓ Valid: True (INFO) - Skipping validation for reference with prefix 'MGNIFY': MGNIFY:MGYS00000596
```

#### `unknown_prefix_severity` (ERROR | WARNING | INFO)

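Controls the severity level reported when a reference cannot be fetched. Despite its name, this setting applies to every fetch failure (unsupported prefix, network error, etc.), not only to unknown prefixes. Default: `ERROR`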

**Options:**
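- `ERROR` (default) - Validation fails (`is_valid=False`), blocking the workflow
- `WARNING` - Validation still fails (`is_valid=False`), but is reported at the lower `WARNING` severity
- `INFO` - Validation still fails (`is_valid=False`), but is reported as informational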

**Note:** `skip_prefixes` takes precedence over `unknown_prefix_severity`. If a prefix is in `skip_prefixes`, it will return `is_valid=True` with `INFO` severity regardless of this setting.

**Example:**
```yaml
validation:
  skip_prefixes:
    - SRA  # These will be skipped (is_valid=True, INFO)
  unknown_prefix_severity: WARNING  # Other unfetchable refs get WARNING
```

With this configuration:
```bash
# SRA is skipped (from skip_prefixes)
linkml-reference-validator validate text "text" SRA:PRJNA290729
# ✓ Valid: True (INFO) - Skipping validation for reference with prefix 'SRA': SRA:PRJNA290729

# UNKNOWN prefix gets WARNING severity
linkml-reference-validator validate text "text" UNKNOWN:12345
# ✗ Valid: False (WARNING) - Could not fetch reference: UNKNOWN:12345
```

### Cache Directory

Default: `references_cache/` in the current directory
Expand Down
30 changes: 2 additions & 28 deletions docs/notebooks/05_geo_validation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -203,33 +203,7 @@
"id": "a3b4c5d6",
"metadata": {},
"outputs": [],
"source": [
"from Bio import Entrez\n",
"\n",
"Entrez.email = \"your-email@example.com\"\n",
"\n",
"# Step 1: Convert accession to UID via esearch\n",
"handle = Entrez.esearch(db=\"gds\", term=\"GSE67472[Accession]\")\n",
"search_result = Entrez.read(handle)\n",
"handle.close()\n",
"\n",
"print(f\"Accession: GSE67472\")\n",
"print(f\"UID(s) found: {search_result['IdList']}\")\n",
"\n",
"if search_result['IdList']:\n",
" uid = search_result['IdList'][0]\n",
" \n",
" # Step 2: Fetch summary using UID\n",
" handle = Entrez.esummary(db=\"gds\", id=uid)\n",
" summary = Entrez.read(handle)\n",
" handle.close()\n",
" \n",
" if summary:\n",
" record = summary[0]\n",
" print(f\"\\nDataset Title: {record.get('title')}\")\n",
" print(f\"Platform: {record.get('GPL')}\")\n",
" print(f\"Samples: {record.get('n_samples')}\")"
]
"source": "from Bio import Entrez\n\nEntrez.email = \"your-email@example.com\"\n\n# Step 1: Convert accession to UID via esearch\nhandle = Entrez.esearch(db=\"gds\", term=\"GSE67472[Accession]\")\nsearch_result = Entrez.read(handle)\nhandle.close()\n\nprint(\"Accession: GSE67472\")\nprint(f\"UID(s) found: {search_result['IdList']}\")\n\nif search_result['IdList']:\n uid = search_result['IdList'][0]\n \n # Step 2: Fetch summary using UID\n handle = Entrez.esummary(db=\"gds\", id=uid)\n summary = Entrez.read(handle)\n handle.close()\n \n if summary:\n record = summary[0]\n print(f\"\\nDataset Title: {record.get('title')}\")\n print(f\"Platform: {record.get('GPL')}\")\n print(f\"Samples: {record.get('n_samples')}\")"
},
{
"cell_type": "markdown",
Expand Down Expand Up @@ -358,4 +332,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}
26 changes: 26 additions & 0 deletions src/linkml_reference_validator/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,14 @@ class ReferenceValidationConfig(BaseModel):
... )
>>> config.reference_prefix_map["geo"]
'GEO'
>>> config = ReferenceValidationConfig(
... skip_prefixes=["SRA", "MGNIFY"],
... unknown_prefix_severity=ValidationSeverity.WARNING
... )
>>> config.skip_prefixes
['SRA', 'MGNIFY']
>>> config.unknown_prefix_severity
<ValidationSeverity.WARNING: 'WARNING'>
"""

cache_dir: Path = Field(
Expand Down Expand Up @@ -384,6 +392,24 @@ class ReferenceValidationConfig(BaseModel):
"e.g. {'geo': 'GEO', 'NCBIGeo': 'GEO'}"
),
)
skip_prefixes: list[str] = Field(
default_factory=list,
description=(
"List of reference prefixes to skip during validation. "
"References with these prefixes will return is_valid=True with INFO severity. "
"Useful for unsupported or unfetchable reference types. "
"Case-insensitive. e.g. ['SRA', 'MGNIFY', 'BIOPROJECT']"
),
)
unknown_prefix_severity: ValidationSeverity = Field(
default=ValidationSeverity.ERROR,
description=(
"Severity level for references that cannot be fetched "
"(e.g., unsupported prefix or network error). "
"Options: ERROR (default), WARNING, INFO. "
"Does not apply to prefixes in skip_prefixes list."
),
)

def get_cache_dir(self) -> Path:
"""Create and return the cache directory.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,14 +145,28 @@ def validate(
>>> # With title validation:
>>> # result = validator.validate("quote", "PMID:12345678", expected_title="Study Title")
"""
# Check if this prefix should be skipped
prefix = reference_id.split(":")[0].upper() if ":" in reference_id else ""
skip_prefixes_upper = [p.upper() for p in self.config.skip_prefixes]

if prefix and prefix in skip_prefixes_upper:
return ValidationResult(
is_valid=True,
reference_id=reference_id,
supporting_text=supporting_text,
severity=ValidationSeverity.INFO,
message=f"Skipping validation for reference with prefix '{prefix}': {reference_id}",
path=path,
)

reference = self.fetcher.fetch(reference_id)

if not reference:
return ValidationResult(
is_valid=False,
reference_id=reference_id,
supporting_text=supporting_text,
severity=ValidationSeverity.ERROR,
severity=self.config.unknown_prefix_severity,
message=f"Could not fetch reference: {reference_id}",
path=path,
)
Expand Down
Loading