Skip to content

Commit 20a00be

Browse files
hrshdhgdcthoytmatentzn
authored
Optimization of some functions (#462)
Addresses #202 - [x] Ran `poetry update` - [x] Call `_get_sssom_schema_object()` once in the function `get_dict_from_mapping()` rather than multiple times in a for loop that is inefficient. - [x] Instead of `pandas.iterrows()` use `pandas.apply()` in `_get_mapping_set_from_df()` - [x] Use dict/list comprehensions instead of for loops - [x] Use sets instead of lists where lookups are done and sequence of elements don't matter. - [x] Improve `SchemaView` object instantiation and persistence - [x] Use `@cached_property` thank you @cthoyt --------- Co-authored-by: Charles Tapley Hoyt <[email protected]> Co-authored-by: Nico Matentzoglu <[email protected]>
1 parent da2a250 commit 20a00be

File tree

5 files changed

+207
-110
lines changed

5 files changed

+207
-110
lines changed

poetry.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/sssom/constants.py

Lines changed: 33 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
import pathlib
44
import uuid
55
from enum import Enum
6-
from functools import lru_cache
7-
from typing import Any, Dict, List, Literal
6+
from functools import cached_property, lru_cache
7+
from typing import Any, Dict, List, Literal, Set
88

99
import pkg_resources
1010
import yaml
@@ -213,48 +213,56 @@ class SSSOMSchemaView(object):
213213
Implemented via PR: https://github.com/mapping-commons/sssom-py/pull/323
214214
"""
215215

216-
_view = None
217-
_dict = None
218-
219216
def __new__(cls):
220217
"""Create a instance of the SSSOM schema view if non-existent."""
221218
if not hasattr(cls, "instance"):
222219
cls.instance = super(SSSOMSchemaView, cls).__new__(cls)
223-
return cls.instance
220+
return cls.instance
224221

225-
@property
222+
@cached_property
226223
def view(self) -> SchemaView:
227224
"""Return SchemaView object."""
228-
if self._view is None:
229-
self._view = SchemaView(SCHEMA_YAML)
230-
return self._view
225+
return SchemaView(SCHEMA_YAML)
231226

232-
@property
227+
@cached_property
233228
def dict(self) -> dict:
234229
"""Return SchemaView as a dictionary."""
235-
if self._dict is None:
236-
self._dict = schema_as_dict(self.view.schema)
237-
return self._dict
230+
return schema_as_dict(self.view.schema)
238231

239-
@property
232+
@cached_property
240233
def mapping_slots(self) -> List[str]:
241234
"""Return list of mapping slots."""
242235
return self.view.get_class("mapping").slots
243236

244-
@property
237+
@cached_property
245238
def mapping_set_slots(self) -> List[str]:
246239
"""Return list of mapping set slots."""
247240
return self.view.get_class("mapping set").slots
248241

249-
@property
250-
def multivalued_slots(self) -> List[str]:
251-
"""Return list of multivalued slots."""
252-
return [c for c in self.view.all_slots() if self.view.get_slot(c).multivalued]
253-
254-
@property
255-
def entity_reference_slots(self) -> List[str]:
256-
"""Return list of entity reference slots."""
257-
return [c for c in self.view.all_slots() if self.view.get_slot(c).range == ENTITY_REFERENCE]
242+
@cached_property
243+
def multivalued_slots(self) -> Set[str]:
244+
"""Return set of multivalued slots."""
245+
return {c for c in self.view.all_slots() if self.view.get_slot(c).multivalued}
246+
247+
@cached_property
248+
def entity_reference_slots(self) -> Set[str]:
249+
"""Return set of entity reference slots."""
250+
return {c for c in self.view.all_slots() if self.view.get_slot(c).range == ENTITY_REFERENCE}
251+
252+
@cached_property
253+
def mapping_enum_keys(self) -> Set[str]:
254+
"""Return a set of mapping enum keys."""
255+
return set(_get_sssom_schema_object().dict["enums"].keys())
256+
257+
@cached_property
258+
def slots(self) -> Dict[str, str]:
259+
"""Return the slots for SSSOMSchemaView object."""
260+
return self.dict["slots"]
261+
262+
@cached_property
263+
def double_slots(self) -> Set[str]:
264+
"""Return the slot names for SSSOMSchemaView object."""
265+
return {k for k, v in self.dict["slots"].items() if v["range"] == "double"}
258266

259267

260268
@lru_cache(1)

src/sssom/parsers.py

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,7 @@ def _get_prefix_map_and_metadata(
313313

314314

315315
def _address_multivalued_slot(k: str, v: Any) -> Union[str, List[str]]:
316-
if is_multivalued_slot(k) and v is not None and isinstance(v, str):
316+
if isinstance(v, str) and is_multivalued_slot(k):
317317
# IF k is multivalued, then v = List[values]
318318
return [s.strip() for s in v.split("|")]
319319
else:
@@ -329,20 +329,28 @@ def _init_mapping_set(meta: Optional[MetadataType]) -> MappingSet:
329329
return mapping_set
330330

331331

332+
MAPPING_SLOTS = set(_get_sssom_schema_object().mapping_slots)
333+
334+
332335
def _get_mapping_dict(row: pd.Series, bad_attrs: Counter) -> Dict[str, Any]:
333-
mdict = {}
334-
sssom_schema_object = _get_sssom_schema_object()
335-
for k, v in row.items():
336-
if not v or pd.isna(v):
337-
continue
338-
k = cast(str, k)
339-
if k in sssom_schema_object.mapping_slots:
340-
mdict[k] = _address_multivalued_slot(k, v)
341-
else:
342-
# There's the possibility that the key is in
343-
# sssom_schema_object.mapping_set_slots, but
344-
# this is skipped for now
345-
bad_attrs[k] += 1
336+
"""Generate a mapping dictionary from a given row of data.
337+
338+
It also updates the 'bad_attrs' counter for keys that are not present
339+
in the sssom_schema_object's mapping_slots.
340+
"""
341+
# Populate the mapping dictionary with key-value pairs from the row,
342+
# only if the value exists, is not NaN, and the key is in the schema's mapping slots.
343+
# The value could be a string or a list and is handled accordingly via _address_multivalued_slot().
344+
mdict = {
345+
k: _address_multivalued_slot(k, v)
346+
for k, v in row.items()
347+
if v and pd.notna(v) and k in MAPPING_SLOTS
348+
}
349+
350+
# Update bad_attrs for keys not in mapping_slots
351+
bad_keys = set(row.keys()) - MAPPING_SLOTS
352+
for bad_key in bad_keys:
353+
bad_attrs[bad_key] += 1
346354
return mdict
347355

348356

@@ -795,9 +803,14 @@ def to_mapping_set_document(msdf: MappingSetDataFrame) -> MappingSetDocument:
795803
def _get_mapping_set_from_df(df: pd.DataFrame, meta: Optional[MetadataType] = None) -> MappingSet:
796804
mapping_set = _init_mapping_set(meta)
797805
bad_attrs: Counter = Counter()
798-
for _, row in df.iterrows():
799-
mapping_dict = _get_mapping_dict(row, bad_attrs)
800-
_add_valid_mapping_to_list(mapping_dict, mapping_set.mappings)
806+
807+
df.apply(
808+
lambda row: _add_valid_mapping_to_list(
809+
_get_mapping_dict(row, bad_attrs), mapping_set.mappings
810+
),
811+
axis=1,
812+
)
813+
801814
for k, v in bad_attrs.items():
802815
logging.warning(f"No attr for {k} [{v} instances]")
803816
return mapping_set

src/sssom/util.py

Lines changed: 45 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -156,14 +156,13 @@ def from_mapping_set_document(cls, doc: MappingSetDocument) -> "MappingSetDataFr
156156
df.replace("", np.nan, inplace=True)
157157
df.dropna(axis=1, how="all", inplace=True) # remove columns with all row = 'None'-s.
158158

159+
slots = _get_sssom_schema_object().dict["slots"]
159160
slots_with_double_as_range = {
160-
slot
161-
for slot, slot_metadata in _get_sssom_schema_object().dict["slots"].items()
162-
if slot_metadata["range"] == "double"
161+
slot for slot, slot_metadata in slots.items() if slot_metadata["range"] == "double"
163162
}
164163
non_double_cols = df.loc[:, ~df.columns.isin(slots_with_double_as_range)]
165-
non_double_cols = non_double_cols.replace(np.nan, "")
166-
df[non_double_cols.columns] = non_double_cols
164+
non_double_cols.replace(np.nan, "", inplace=True)
165+
df.update(non_double_cols)
167166

168167
df = sort_df_rows_columns(df)
169168
return cls.with_converter(df=df, converter=doc.converter, metadata=meta)
@@ -1044,46 +1043,35 @@ def get_dict_from_mapping(map_obj: Union[Any, Dict[Any, Any], SSSOM_Mapping]) ->
10441043
:return: Dictionary
10451044
"""
10461045
map_dict = {}
1047-
slots_with_double_as_range = [
1048-
s
1049-
for s in _get_sssom_schema_object().dict["slots"].keys()
1050-
if _get_sssom_schema_object().dict["slots"][s]["range"] == "double"
1051-
]
1046+
sssom_schema_object = _get_sssom_schema_object()
10521047
for property in map_obj:
1053-
if map_obj[property] is not None:
1054-
if isinstance(map_obj[property], list):
1055-
# IF object is an enum
1056-
if (
1057-
_get_sssom_schema_object().dict["slots"][property]["range"]
1058-
in _get_sssom_schema_object().dict["enums"].keys()
1059-
):
1060-
# IF object is a multivalued enum
1061-
if _get_sssom_schema_object().dict["slots"][property]["multivalued"]:
1062-
map_dict[property] = "|".join(
1063-
enum_value.code.text for enum_value in map_obj[property]
1064-
)
1065-
# If object is NOT multivalued BUT an enum.
1066-
else:
1067-
map_dict[property] = map_obj[property].code.text
1068-
# IF object is NOT an enum but a list
1069-
else:
1070-
map_dict[property] = "|".join(enum_value for enum_value in map_obj[property])
1071-
# IF object NOT a list
1048+
mapping_property = map_obj[property]
1049+
if mapping_property is None:
1050+
map_dict[property] = np.nan if property in sssom_schema_object.double_slots else ""
1051+
continue
1052+
1053+
slot_of_interest = sssom_schema_object.slots[property]
1054+
is_enum = slot_of_interest["range"] in sssom_schema_object.mapping_enum_keys # type:ignore
1055+
1056+
# Check if the mapping_property is a list
1057+
if isinstance(mapping_property, list):
1058+
# If the property is an enumeration and it allows multiple values
1059+
if is_enum and slot_of_interest["multivalued"]: # type:ignore
1060+
# Join all the enum values into a string separated by '|'
1061+
map_dict[property] = "|".join(
1062+
enum_value.code.text for enum_value in mapping_property
1063+
)
10721064
else:
1073-
# IF object is an enum
1074-
if (
1075-
_get_sssom_schema_object().dict["slots"][property]["range"]
1076-
in _get_sssom_schema_object().dict["enums"].keys()
1077-
):
1078-
map_dict[property] = map_obj[property].code.text
1079-
else:
1080-
map_dict[property] = map_obj[property]
1065+
# If the property is not an enumeration or doesn't allow multiple values,
1066+
# join all the values into a string separated by '|'
1067+
map_dict[property] = "|".join(enum_value for enum_value in mapping_property)
1068+
elif is_enum:
1069+
# Assign the text of the enumeration code to the property in the dictionary
1070+
map_dict[property] = mapping_property.code.text
10811071
else:
1082-
# IF map_obj[property] is None:
1083-
if property in slots_with_double_as_range:
1084-
map_dict[property] = np.nan
1085-
else:
1086-
map_dict[property] = ""
1072+
# If the mapping_property is neither a list nor an enumeration,
1073+
# assign the value directly to the property in the dictionary
1074+
map_dict[property] = mapping_property
10871075

10881076
return map_dict
10891077

@@ -1139,18 +1127,21 @@ def get_prefixes_used_in_table(df: pd.DataFrame, converter: Converter) -> Set[st
11391127
prefixes = set(SSSOM_BUILT_IN_PREFIXES)
11401128
if df.empty:
11411129
return prefixes
1142-
for col in _get_sssom_schema_object().entity_reference_slots:
1143-
if col not in df.columns:
1144-
continue
1145-
prefixes.update(
1146-
converter.parse_curie(row).prefix
1147-
for row in df[col]
1148-
# we don't use the converter here since get_prefixes_used_in_table
1149-
# is often used to identify prefixes that are not properly registered
1150-
# in the converter
1151-
if not _is_iri(row) and _is_curie(row)
1152-
)
1153-
return set(prefixes)
1130+
sssom_schema_object = _get_sssom_schema_object()
1131+
entity_reference_slots = sssom_schema_object.entity_reference_slots & set(df.columns)
1132+
new_prefixes = {
1133+
converter.parse_curie(row).prefix
1134+
for col in entity_reference_slots
1135+
for row in df[col]
1136+
if not _is_iri(row) and _is_curie(row)
1137+
# we don't use the converter here since get_prefixes_used_in_table
1138+
# is often used to identify prefixes that are not properly registered
1139+
# in the converter
1140+
}
1141+
1142+
prefixes.update(new_prefixes)
1143+
1144+
return prefixes
11541145

11551146

11561147
def get_prefixes_used_in_metadata(meta: MetadataType) -> Set[str]:

0 commit comments

Comments
 (0)