diff --git a/src/linkml_reference_validator/etl/sources/pmid.py b/src/linkml_reference_validator/etl/sources/pmid.py index fe1d7d5..d6933a7 100644 --- a/src/linkml_reference_validator/etl/sources/pmid.py +++ b/src/linkml_reference_validator/etl/sources/pmid.py @@ -13,7 +13,7 @@ import logging import re import time -from typing import Optional +from typing import Any, Optional from Bio import Entrez # type: ignore from bs4 import BeautifulSoup # type: ignore @@ -118,13 +118,20 @@ def fetch( record = records[0] if isinstance(records, list) else records + if not isinstance(record, dict): + logger.warning( + "Unexpected record format for PMID:%s: %s", pmid, type(record)) + return None + + record_dict: dict[str, Any] = record + # Convert Entrez StringElement objects to plain strings - title = str(record.get("Title", "")) - authors = self._parse_authors(record.get("AuthorList", [])) - journal = str(record.get("Source", "")) - pub_date = record.get("PubDate", "") + title = str(record_dict.get("Title", "")) + authors = self._parse_authors(record_dict.get("AuthorList", [])) + journal = str(record_dict.get("Source", "")) + pub_date = record_dict.get("PubDate", "") year = str(pub_date)[:4] if pub_date else "" - doi = str(record.get("DOI", "")) if record.get("DOI") else "" + doi = str(record_dict.get("DOI", "")) if record_dict.get("DOI") else "" abstract = self._fetch_abstract(pmid, config) full_text, content_type = self._fetch_pmc_fulltext(pmid, config) @@ -178,7 +185,8 @@ def _fetch_abstract( """ time.sleep(config.rate_limit_delay) - handle = Entrez.efetch(db="pubmed", id=pmid, rettype="abstract", retmode="text") + handle = Entrez.efetch(db="pubmed", id=pmid, + rettype="abstract", retmode="text") abstract_text = handle.read() handle.close() @@ -206,7 +214,8 @@ def _fetch_mesh_terms( """ time.sleep(config.rate_limit_delay) - handle = Entrez.efetch(db="pubmed", id=pmid, rettype="xml", retmode="xml") + handle = Entrez.efetch(db="pubmed", id=pmid, + rettype="xml", retmode="xml") xml_content = handle.read() handle.close() @@ -271,14 +280,31 @@ def _get_pmcid(self, pmid: str, config: ReferenceValidationConfig) -> Optional[s """ time.sleep(config.rate_limit_delay) - handle = Entrez.elink(dbfrom="pubmed", db="pmc", id=pmid, linkname="pubmed_pmc") - result = Entrez.read(handle) - handle.close() + try: + handle = Entrez.elink( + dbfrom="pubmed", db="pmc", id=pmid, linkname="pubmed_pmc" + ) + except Exception as exc: + logger.warning("Failed to link PMID:%s to PMC: %s", pmid, exc) + return None + + try: + result = Entrez.read(handle) + except Exception as exc: + logger.warning( + "Failed to read PMC link for PMID:%s: %s", pmid, exc) + return None + finally: + handle.close() - if result and result[0].get("LinkSetDb"): - links = result[0]["LinkSetDb"][0].get("Link", []) - if links: - return links[0]["Id"] + if isinstance(result, list) and result and isinstance(result[0], dict): + link_set_db = result[0].get("LinkSetDb", []) + if isinstance(link_set_db, list) and link_set_db: + links = link_set_db[0].get("Link", []) + if isinstance(links, list) and links: + first_link = links[0] + if isinstance(first_link, dict) and "Id" in first_link: + return str(first_link["Id"]) return None @@ -296,7 +322,8 @@ def _fetch_pmc_xml( """ time.sleep(config.rate_limit_delay) - handle = Entrez.efetch(db="pmc", id=pmcid, rettype="xml", retmode="xml") + handle = Entrez.efetch( + db="pmc", id=pmcid, rettype="xml", retmode="xml") xml_content = handle.read() handle.close() diff --git a/src/linkml_reference_validator/plugins/reference_validation_plugin.py b/src/linkml_reference_validator/plugins/reference_validation_plugin.py index 1b07570..bcf8477 100644 --- a/src/linkml_reference_validator/plugins/reference_validation_plugin.py +++ b/src/linkml_reference_validator/plugins/reference_validation_plugin.py @@ -94,6 +94,16 @@ def __init__( self.validator = SupportingTextValidator(config) self.schema_view: Optional[SchemaView] = None + @property + def cache_dir(self) -> Path: + """Return the cache directory for this plugin.""" + return self.config.cache_dir + + @cache_dir.setter + def cache_dir(self, value: Path) -> None: + """Update the cache directory for this plugin.""" + self.config.cache_dir = value + def pre_process(self, context: ValidationContext) -> None: """Pre-process hook called before validation. @@ -293,17 +303,16 @@ def _get_converter(self) -> Optional[Converter]: return None schema = self.schema_view.schema if schema and schema.prefixes: - # schema.prefixes is a dict of prefix name -> Prefix object - # The Prefix object has a prefix_reference attribute with the URI - prefix_map = { - name: ( - prefix.prefix_reference - if hasattr(prefix, "prefix_reference") - else str(prefix) - ) - for name, prefix in schema.prefixes.items() - } - return Converter.from_prefix_map(prefix_map) + # schema.prefixes is a list of Prefix objects + # Each Prefix object has a prefix_name and prefix_reference attribute + prefix_map: dict[str, str] = {} + for prefix in schema.prefixes: + prefix_name = getattr(prefix, "prefix_name", None) + prefix_reference = getattr(prefix, "prefix_reference", None) + if isinstance(prefix_name, str) and prefix_reference is not None: + prefix_map[prefix_name] = str(prefix_reference) + if prefix_map: + return Converter.from_prefix_map(prefix_map) return None # type: ignore diff --git a/tests/test_plugin_integration.py b/tests/test_plugin_integration.py index 2183b02..12b13ec 100644 --- a/tests/test_plugin_integration.py +++ b/tests/test_plugin_integration.py @@ -38,7 +38,8 @@ def test_plugin_initialization_with_params(): plugin = ReferenceValidationPlugin( cache_dir="/tmp/cache", ) - assert plugin.config.cache_dir.as_posix() == "/tmp/cache" + # Access the cache_dir attribute directly if config is not exposed + assert plugin.cache_dir.as_posix() == "/tmp/cache" def test_extract_reference_id_string(plugin): diff --git a/tests/test_sources.py b/tests/test_sources.py index 95abb3c..750a46e 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -130,6 +130,7 @@ def test_fetch_relative_path_with_base_dir(self, tmp_path): result = source.fetch("notes.md", config) assert result is not None + assert result.content is not None assert "Some notes here." in result.content def test_fetch_relative_path_cwd_fallback(self, source, config, tmp_path, monkeypatch): @@ -154,7 +155,8 @@ def test_fetch_nonexistent_file(self, source, config): def test_extract_title_from_markdown(self, source, config, tmp_path): """Should extract title from first heading.""" test_file = tmp_path / "titled.md" - test_file.write_text("Some preamble\n\n# The Real Title\n\nContent here.") + test_file.write_text( + "Some preamble\n\n# The Real Title\n\nContent here.") result = source.fetch(str(test_file), config) @@ -164,7 +166,8 @@ def test_extract_title_from_markdown(self, source, config, tmp_path): def test_html_content_preserved(self, source, config, tmp_path): """HTML content should be preserved as-is.""" test_file = tmp_path / "test.html" - test_file.write_text("

Test & content

") + test_file.write_text( + "

Test & content

") result = source.fetch(str(test_file), config) @@ -280,6 +283,27 @@ def test_can_handle_pmid(self, source): assert source.can_handle("PMID 12345678") assert not source.can_handle("DOI:10.1234/test") + @patch("linkml_reference_validator.etl.sources.pmid.Entrez.read") + @patch("linkml_reference_validator.etl.sources.pmid.Entrez.elink") + def test_get_pmcid_handles_entrez_error( + self, + mock_elink, + mock_read, + source, + config, + ): + """Should return None when Entrez.read raises an error.""" + handle = MagicMock() + mock_elink.return_value = handle + mock_read.side_effect = RuntimeError( + "Couldn't resolve #exLinkSrv2, the address table is empty." + ) + + result = source._get_pmcid("12112053", config) + + assert result is None + handle.close.assert_called_once() + class TestDOISource: """Tests for DOISource (refactored from ReferenceFetcher).""" @@ -470,7 +494,8 @@ def config(self, tmp_path): "Project_Description", "bioproject", ), - (BioSampleSource, "biosample:SAMN00000001", "Title", "Description", "biosample"), + (BioSampleSource, "biosample:SAMN00000001", + "Title", "Description", "biosample"), ], ) @patch("linkml_reference_validator.etl.sources.entrez.Entrez.read") @@ -505,7 +530,8 @@ def test_fetch_entrez_summary( assert result.content == "Example content summary." assert result.content_type == "summary" assert result.metadata["entrez_db"] == db_name - mock_esummary.assert_called_once_with(db=db_name, id=reference_id.split(":", 1)[1]) + mock_esummary.assert_called_once_with( + db=db_name, id=reference_id.split(":", 1)[1]) mock_handle.close.assert_called_once() @pytest.mark.parametrize( @@ -578,7 +604,8 @@ def test_fetch_geo_converts_accession_to_uid( # Configure mock_read to return different values for esearch vs esummary mock_read.side_effect = [ {"IdList": ["200067472"]}, # esearch result - [{"title": "GEO Dataset Title", "summary": "GEO dataset summary."}], # esummary result + # esummary result + [{"title": "GEO Dataset Title", "summary": "GEO dataset summary."}], ] result = source.fetch("GSE67472", config) @@ -592,7 +619,8 @@ def test_fetch_geo_converts_accession_to_uid( assert result.metadata["entrez_uid"] == "200067472" # Verify esearch was called with accession - mock_esearch.assert_called_once_with(db="gds", term="GSE67472[Accession]") + mock_esearch.assert_called_once_with( + db="gds", term="GSE67472[Accession]") # Verify esummary was called with UID, not accession mock_esummary.assert_called_once_with(db="gds", id="200067472")