Use importlib_resources instead of pkg_resources and prepare for later versions of pandas (#492)

hrshdhgd · web-flow · commit 96b3ab95ce7a · 2024-02-08T16:25:05.000-06:00
Fixes #491 - [x] Updated to use `importlib_resources` instead of `pkg_resources` - Reason: `pkg_resources` is being deprecated. - [x] Refactored `pandas`-related code to transition smoothly to future versions and handle deprecation warnings. - [Pandas PR](https://github.com/pandas-dev/pandas/pull/54710/files#diff-55001624a0932c1b6cee2e6ddb65dea85c1faf0dee84812c0ca0c32916a71438): ``` "Downcasting behavior in `replace` is deprecated and " "will be removed in a future version. To retain the old " "behavior, explicitly call " "`result.infer_objects(copy=False)`. " "To opt-in to the future " "behavior, set " "`pd.set_option('future.no_silent_downcasting', True)`", ``` - `A value is trying to be set on a copy of a slice from a DataFrame` - `.apply(max)` => `.apply(np.maximum.reduce)` - `UserWarning: Boolean Series key will be reindexed to match DataFrame index.`
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -25,6 +25,7 @@ deprecation = "^2.1.0"
 pyyaml = "^6.0.1"
 rdflib = ">=6.0.0"
 scipy = {version = "*", extras = ["scipy"]}
+importlib-resources = "^6.1.1"
 
 [tool.poetry.group.dev.dependencies]
 pytest = {version = ">=7.1.2"}
diff --git a/src/sssom/constants.py b/src/sssom/constants.py
@@ -6,14 +6,14 @@
 from functools import cached_property, lru_cache
 from typing import Any, Dict, List, Literal, Set
 
-import pkg_resources
+import importlib_resources
 import yaml
 from linkml_runtime.utils.schema_as_dict import schema_as_dict
 from linkml_runtime.utils.schemaview import SchemaView
 
 HERE = pathlib.Path(__file__).parent.resolve()
 
-SCHEMA_YAML = pkg_resources.resource_filename("sssom_schema", "schema/sssom_schema.yaml")
+SCHEMA_YAML = importlib_resources.files("sssom_schema").joinpath("schema/sssom_schema.yaml")
 EXTENDED_PREFIX_MAP = HERE / "obo.epm.json"
 
 OWL_EQUIV_CLASS_URI = "http://www.w3.org/2002/07/owl#equivalentClass"
diff --git a/src/sssom/context.py b/src/sssom/context.py
@@ -5,7 +5,7 @@
 from typing import Mapping, Union
 
 import curies
-import pkg_resources
+import importlib_resources
 from curies import Converter
 from rdflib.namespace import is_ncname
 
@@ -19,8 +19,8 @@
 ]
 
 SSSOM_BUILT_IN_PREFIXES = ("sssom", "owl", "rdf", "rdfs", "skos", "semapv")
-SSSOM_CONTEXT = pkg_resources.resource_filename(
-    "sssom_schema", "context/sssom_schema.context.jsonld"
+SSSOM_CONTEXT = importlib_resources.files("sssom_schema").joinpath(
+    "context/sssom_schema.context.jsonld"
 )
 
 
diff --git a/src/sssom/util.py b/src/sssom/util.py
@@ -80,6 +80,13 @@
 KEY_FEATURES = [SUBJECT_ID, PREDICATE_ID, OBJECT_ID, PREDICATE_MODIFIER]
 TRIPLES_IDS = [SUBJECT_ID, PREDICATE_ID, OBJECT_ID]
 
+# ! This will be unnecessary when pandas >= 3.0.0 is released
+# ! https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.infer_objects.html#
+# A value is trying to be set on a copy of a slice from a DataFrame
+pd.options.mode.copy_on_write = True
+# Get the version of pandas as a tuple of integers
+pandas_version = tuple(map(int, pd.__version__.split(".")))
+
 
 @dataclass
 class MappingSetDataFrame:
@@ -151,6 +158,12 @@ def from_mapping_set_document(cls, doc: MappingSetDocument) -> "MappingSetDataFr
         df = pd.DataFrame(get_dict_from_mapping(mapping) for mapping in doc.mapping_set.mappings)
         meta = _extract_global_metadata(doc)
 
+        if pandas_version >= (2, 0, 0):
+            # For pandas >= 2.0.0, use the 'copy' parameter
+            df = df.infer_objects(copy=False)
+        else:
+            # For pandas < 2.0.0, call 'infer_objects()' without any parameters
+            df = df.infer_objects()
         # remove columns where all values are blank.
         df.replace("", np.nan, inplace=True)
         df.dropna(axis=1, how="all", inplace=True)  # remove columns with all row = 'None'-s.
@@ -160,6 +173,14 @@ def from_mapping_set_document(cls, doc: MappingSetDocument) -> "MappingSetDataFr
             slot for slot, slot_metadata in slots.items() if slot_metadata["range"] == "double"
         }
         non_double_cols = df.loc[:, ~df.columns.isin(slots_with_double_as_range)]
+
+        if pandas_version >= (2, 0, 0):
+            # For pandas >= 2.0.0, use the 'copy' parameter
+            non_double_cols = non_double_cols.infer_objects(copy=False)
+        else:
+            # For pandas < 2.0.0, call 'infer_objects()' without any parameters
+            non_double_cols = non_double_cols.infer_objects()
+
         non_double_cols.replace(np.nan, "", inplace=True)
         df.update(non_double_cols)
 
@@ -1397,18 +1418,26 @@ def invert_mappings(
         non_predicate_modified_df = df
 
     if subject_prefix:
-        subject_starts_with_prefix_condition = df[SUBJECT_ID].str.startswith(subject_prefix + ":")
-        object_starts_with_prefix_condition = df[OBJECT_ID].str.startswith(subject_prefix + ":")
+        # Filter rows where 'SUBJECT_ID' starts with the prefix but 'OBJECT_ID' does not
         prefixed_subjects_df = pd.DataFrame(
             non_predicate_modified_df[
-                (subject_starts_with_prefix_condition & ~object_starts_with_prefix_condition)
+                (
+                    non_predicate_modified_df[SUBJECT_ID].str.startswith(subject_prefix + ":")
+                    & ~non_predicate_modified_df[OBJECT_ID].str.startswith(subject_prefix + ":")
+                )
             ]
         )
+
+        # Filter rows where 'SUBJECT_ID' does not start with the prefix but 'OBJECT_ID' does
         non_prefix_subjects_df = pd.DataFrame(
             non_predicate_modified_df[
-                (~subject_starts_with_prefix_condition & object_starts_with_prefix_condition)
+                (
+                    ~non_predicate_modified_df[SUBJECT_ID].str.startswith(subject_prefix + ":")
+                    & non_predicate_modified_df[OBJECT_ID].str.startswith(subject_prefix + ":")
+                )
             ]
         )
+
         df_to_invert = non_prefix_subjects_df.loc[
             non_prefix_subjects_df[PREDICATE_ID].isin(list(predicate_invert_map.keys()))
         ]

Original file line number	Diff line number	Diff line change
`@@ -5,7 +5,7 @@`
`5`	`5`	`from typing import Mapping, Union`
`6`	`6`
`7`	`7`	`import curies`
`8`		`-import pkg_resources`
	`8`	`+import importlib_resources`
`9`	`9`	`from curies import Converter`
`10`	`10`	`from rdflib.namespace import is_ncname`
`11`	`11`
`@@ -19,8 +19,8 @@`
`19`	`19`	`]`
`20`	`20`
`21`	`21`	`SSSOM_BUILT_IN_PREFIXES = ("sssom", "owl", "rdf", "rdfs", "skos", "semapv")`
`22`		`-SSSOM_CONTEXT = pkg_resources.resource_filename(`
`23`		`- "sssom_schema", "context/sssom_schema.context.jsonld"`
	`22`	`+SSSOM_CONTEXT = importlib_resources.files("sssom_schema").joinpath(`
	`23`	`+ "context/sssom_schema.context.jsonld"`
`24`	`24`	`)`
`25`	`25`
`26`	`26`