Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 43 additions & 16 deletions src/linkml_reference_validator/etl/sources/pmid.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import logging
import re
import time
from typing import Optional
from typing import Any, Optional

from Bio import Entrez # type: ignore
from bs4 import BeautifulSoup # type: ignore
Expand Down Expand Up @@ -118,13 +118,20 @@ def fetch(

record = records[0] if isinstance(records, list) else records

if not isinstance(record, dict):
logger.warning(
"Unexpected record format for PMID:%s: %s", pmid, type(record))
return None

record_dict: dict[str, Any] = record

# Convert Entrez StringElement objects to plain strings
title = str(record.get("Title", ""))
authors = self._parse_authors(record.get("AuthorList", []))
journal = str(record.get("Source", ""))
pub_date = record.get("PubDate", "")
title = str(record_dict.get("Title", ""))
authors = self._parse_authors(record_dict.get("AuthorList", []))
journal = str(record_dict.get("Source", ""))
pub_date = record_dict.get("PubDate", "")
year = str(pub_date)[:4] if pub_date else ""
doi = str(record.get("DOI", "")) if record.get("DOI") else ""
doi = str(record_dict.get("DOI", "")) if record_dict.get("DOI") else ""

abstract = self._fetch_abstract(pmid, config)
full_text, content_type = self._fetch_pmc_fulltext(pmid, config)
Expand Down Expand Up @@ -178,7 +185,8 @@ def _fetch_abstract(
"""
time.sleep(config.rate_limit_delay)

handle = Entrez.efetch(db="pubmed", id=pmid, rettype="abstract", retmode="text")
handle = Entrez.efetch(db="pubmed", id=pmid,
rettype="abstract", retmode="text")
abstract_text = handle.read()
handle.close()

Expand Down Expand Up @@ -206,7 +214,8 @@ def _fetch_mesh_terms(
"""
time.sleep(config.rate_limit_delay)

handle = Entrez.efetch(db="pubmed", id=pmid, rettype="xml", retmode="xml")
handle = Entrez.efetch(db="pubmed", id=pmid,
rettype="xml", retmode="xml")
xml_content = handle.read()
handle.close()

Expand Down Expand Up @@ -271,14 +280,31 @@ def _get_pmcid(self, pmid: str, config: ReferenceValidationConfig) -> Optional[s
"""
time.sleep(config.rate_limit_delay)

handle = Entrez.elink(dbfrom="pubmed", db="pmc", id=pmid, linkname="pubmed_pmc")
result = Entrez.read(handle)
handle.close()
try:
handle = Entrez.elink(
dbfrom="pubmed", db="pmc", id=pmid, linkname="pubmed_pmc"
)
except Exception as exc:
logger.warning("Failed to link PMID:%s to PMC: %s", pmid, exc)
return None

try:
result = Entrez.read(handle)
except Exception as exc:
logger.warning(
"Failed to read PMC link for PMID:%s: %s", pmid, exc)
return None
finally:
handle.close()

if result and result[0].get("LinkSetDb"):
links = result[0]["LinkSetDb"][0].get("Link", [])
if links:
return links[0]["Id"]
if isinstance(result, list) and result and isinstance(result[0], dict):
link_set_db = result[0].get("LinkSetDb", [])
if isinstance(link_set_db, list) and link_set_db:
links = link_set_db[0].get("Link", [])
if isinstance(links, list) and links:
first_link = links[0]
if isinstance(first_link, dict) and "Id" in first_link:
return str(first_link["Id"])

return None

Expand All @@ -296,7 +322,8 @@ def _fetch_pmc_xml(
"""
time.sleep(config.rate_limit_delay)

handle = Entrez.efetch(db="pmc", id=pmcid, rettype="xml", retmode="xml")
handle = Entrez.efetch(
db="pmc", id=pmcid, rettype="xml", retmode="xml")
xml_content = handle.read()
handle.close()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,16 @@ def __init__(
self.validator = SupportingTextValidator(config)
self.schema_view: Optional[SchemaView] = None

@property
def cache_dir(self) -> Path:
    """Cache directory in use by this plugin (delegates to ``self.config``)."""
    return self.config.cache_dir

@cache_dir.setter
def cache_dir(self, value: Path) -> None:
    """Point the plugin at a different cache directory on its config."""
    self.config.cache_dir = value

def pre_process(self, context: ValidationContext) -> None:
"""Pre-process hook called before validation.

Expand Down Expand Up @@ -293,17 +303,16 @@ def _get_converter(self) -> Optional[Converter]:
return None
schema = self.schema_view.schema
if schema and schema.prefixes:
# schema.prefixes is a dict of prefix name -> Prefix object
# The Prefix object has a prefix_reference attribute with the URI
prefix_map = {
name: (
prefix.prefix_reference
if hasattr(prefix, "prefix_reference")
else str(prefix)
)
for name, prefix in schema.prefixes.items()
}
return Converter.from_prefix_map(prefix_map)
# schema.prefixes is a list of Prefix objects
# Each Prefix object has a prefix_name and prefix_reference attribute
prefix_map: dict[str, str] = {}
for prefix in schema.prefixes:
prefix_name = getattr(prefix, "prefix_name", None)
prefix_reference = getattr(prefix, "prefix_reference", None)
if isinstance(prefix_name, str) and prefix_reference is not None:
prefix_map[prefix_name] = str(prefix_reference)
if prefix_map:
return Converter.from_prefix_map(prefix_map)
return None

# type: ignore
Expand Down
3 changes: 2 additions & 1 deletion tests/test_plugin_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ def test_plugin_initialization_with_params():
plugin = ReferenceValidationPlugin(
cache_dir="/tmp/cache",
)
assert plugin.config.cache_dir.as_posix() == "/tmp/cache"
# Access the cache_dir attribute directly if config is not exposed
assert plugin.cache_dir.as_posix() == "/tmp/cache"


def test_extract_reference_id_string(plugin):
Expand Down
40 changes: 34 additions & 6 deletions tests/test_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ def test_fetch_relative_path_with_base_dir(self, tmp_path):
result = source.fetch("notes.md", config)

assert result is not None
assert result.content is not None
assert "Some notes here." in result.content

def test_fetch_relative_path_cwd_fallback(self, source, config, tmp_path, monkeypatch):
Expand All @@ -154,7 +155,8 @@ def test_fetch_nonexistent_file(self, source, config):
def test_extract_title_from_markdown(self, source, config, tmp_path):
"""Should extract title from first heading."""
test_file = tmp_path / "titled.md"
test_file.write_text("Some preamble\n\n# The Real Title\n\nContent here.")
test_file.write_text(
"Some preamble\n\n# The Real Title\n\nContent here.")

result = source.fetch(str(test_file), config)

Expand All @@ -164,7 +166,8 @@ def test_extract_title_from_markdown(self, source, config, tmp_path):
def test_html_content_preserved(self, source, config, tmp_path):
"""HTML content should be preserved as-is."""
test_file = tmp_path / "test.html"
test_file.write_text("<html><body><p>Test &amp; content</p></body></html>")
test_file.write_text(
"<html><body><p>Test &amp; content</p></body></html>")

result = source.fetch(str(test_file), config)

Expand Down Expand Up @@ -280,6 +283,27 @@ def test_can_handle_pmid(self, source):
assert source.can_handle("PMID 12345678")
assert not source.can_handle("DOI:10.1234/test")

@patch("linkml_reference_validator.etl.sources.pmid.Entrez.read")
@patch("linkml_reference_validator.etl.sources.pmid.Entrez.elink")
def test_get_pmcid_handles_entrez_error(self, mock_elink, mock_read, source, config):
    """_get_pmcid should swallow an Entrez.read failure: return None, still close the handle."""
    fake_handle = MagicMock()
    mock_elink.return_value = fake_handle
    # Simulate the NCBI-side parse error observed in the wild.
    mock_read.side_effect = RuntimeError(
        "Couldn't resolve #exLinkSrv2, the address table is empty."
    )

    assert source._get_pmcid("12112053", config) is None
    # The finally-block must have released the network handle exactly once.
    fake_handle.close.assert_called_once()


class TestDOISource:
"""Tests for DOISource (refactored from ReferenceFetcher)."""
Expand Down Expand Up @@ -470,7 +494,8 @@ def config(self, tmp_path):
"Project_Description",
"bioproject",
),
(BioSampleSource, "biosample:SAMN00000001", "Title", "Description", "biosample"),
(BioSampleSource, "biosample:SAMN00000001",
"Title", "Description", "biosample"),
],
)
@patch("linkml_reference_validator.etl.sources.entrez.Entrez.read")
Expand Down Expand Up @@ -505,7 +530,8 @@ def test_fetch_entrez_summary(
assert result.content == "Example content summary."
assert result.content_type == "summary"
assert result.metadata["entrez_db"] == db_name
mock_esummary.assert_called_once_with(db=db_name, id=reference_id.split(":", 1)[1])
mock_esummary.assert_called_once_with(
db=db_name, id=reference_id.split(":", 1)[1])
mock_handle.close.assert_called_once()

@pytest.mark.parametrize(
Expand Down Expand Up @@ -578,7 +604,8 @@ def test_fetch_geo_converts_accession_to_uid(
# Configure mock_read to return different values for esearch vs esummary
mock_read.side_effect = [
{"IdList": ["200067472"]}, # esearch result
[{"title": "GEO Dataset Title", "summary": "GEO dataset summary."}], # esummary result
# esummary result
[{"title": "GEO Dataset Title", "summary": "GEO dataset summary."}],
]

result = source.fetch("GSE67472", config)
Expand All @@ -592,7 +619,8 @@ def test_fetch_geo_converts_accession_to_uid(
assert result.metadata["entrez_uid"] == "200067472"

# Verify esearch was called with accession
mock_esearch.assert_called_once_with(db="gds", term="GSE67472[Accession]")
mock_esearch.assert_called_once_with(
db="gds", term="GSE67472[Accession]")
# Verify esummary was called with UID, not accession
mock_esummary.assert_called_once_with(db="gds", id="200067472")

Expand Down