Skip to content

Commit 1891bb6

Browse files
author
Glass
authored
filter prefixes - option to require all prefixes (#379)
* nothing commit
* committing so i can install in koza
* add to filter_out_prefix as well
* add extra tests for new options
* revert to two methods, but keep require_all_prefixes option
* bump version
* uncomment tests (#2)
* match filter out and filter return
* fix lint issues i think
* bumped version too far
* Remove unnecessary Args: from filter_prefixes
1 parent 90ca6a5 commit 1891bb6

File tree

5 files changed

+63
-28
lines changed

5 files changed

+63
-28
lines changed

.gitignore

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
venv/
1+
.venv
2+
venv
23
sssom/__pycache__/
34
.idea
45
sssom.egg-info
@@ -24,4 +25,4 @@ schema/sssom.schema.json
2425
schema/sssom.yaml
2526
schema/sssom_datamodel.py
2627
sssom/internal_context.py
27-
sssom/sssom_datamodel.py
28+
sssom/sssom_datamodel.py

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@
2323
SSSOM (Simple Standard for Sharing Ontology Mappings) is a TSV and RDF/OWL standard for ontology mappings
2424

2525
```
26-
WARNING: the export formats (json, rdf) of sssom-py are not yet finalised! Please expect changes in future releases!
26+
WARNING:
27+
The export formats (json, rdf) of sssom-py are not yet finalised!
28+
Please expect changes in future releases!
2729
```
2830

2931
See https://github.com/OBOFoundry/SSSOM

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
##########################
44
[metadata]
55
name = sssom
6-
version = 0.3.30
6+
version = 0.3.31
77
description = Operations on SSSOM mapping tables
88
long_description = file: README.md
99
long_description_content_type = text/markdown

sssom/util.py

Lines changed: 22 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1170,55 +1170,57 @@ def get_prefixes_used_in_metadata(meta: MetadataType) -> List[str]:
11701170

11711171

11721172
def filter_out_prefixes(
1173-
df: pd.DataFrame, filter_prefixes: List[str], features: list = KEY_FEATURES
1173+
df: pd.DataFrame,
1174+
filter_prefixes: List[str],
1175+
features: list = KEY_FEATURES,
1176+
require_all_prefixes: bool = False,
11741177
) -> pd.DataFrame:
1175-
"""Filter any row where a CURIE in one of the key column uses one of the given prefixes.
1178+
"""Filter out rows which contain a CURIE with a prefix in the filter_prefixes list.
11761179
1177-
:param df: Pandas DataFrame
1180+
:param df: Pandas DataFrame of SSSOM Mapping
11781181
:param filter_prefixes: List of prefixes
11791182
:param features: List of dataframe column names to consider
1183+
:param require_all_prefixes: If True, all prefixes must be present in a row to be filtered out
11801184
:return: Pandas Dataframe
11811185
"""
11821186
filter_prefix_set = set(filter_prefixes)
11831187
rows = []
1188+
selection = all if require_all_prefixes else any
11841189

11851190
for _, row in df.iterrows():
11861191
prefixes = {get_prefix_from_curie(curie) for curie in row[features]}
1187-
# Confirm if none of the CURIEs in the list above appear in the filter_prefixes list.
1188-
# If TRUE, append row.
1189-
if not any(prefix in prefixes for prefix in filter_prefix_set):
1192+
if not selection(prefix in prefixes for prefix in filter_prefix_set):
11901193
rows.append(row)
1191-
if rows:
1192-
return pd.DataFrame(rows)
1193-
else:
1194-
return pd.DataFrame(columns=features)
1194+
1195+
return pd.DataFrame(rows) if rows else pd.DataFrame(columns=features)
11951196

11961197

11971198
def filter_prefixes(
1198-
df: pd.DataFrame, filter_prefixes: List[str], features: list = KEY_FEATURES
1199+
df: pd.DataFrame,
1200+
filter_prefixes: List[str],
1201+
features: list = KEY_FEATURES,
1202+
require_all_prefixes: bool = True,
11991203
) -> pd.DataFrame:
1200-
"""Filter any row where a CURIE in one of the key column uses one of the given prefixes.
1204+
"""Filter out rows which do NOT contain a CURIE with a prefix in the filter_prefixes list.
12011205
1202-
:param df: Pandas DataFrame
1206+
:param df: Pandas DataFrame of SSSOM Mapping
12031207
:param filter_prefixes: List of prefixes
12041208
:param features: List of dataframe column names to consider
1209+
:param require_all_prefixes: If True, every prefix found in a row must be in filter_prefixes for the row to be kept; if False, a single matching prefix is enough
12051210
:return: Pandas Dataframe
12061211
"""
12071212
filter_prefix_set = set(filter_prefixes)
12081213
rows = []
1214+
selection = all if require_all_prefixes else any
12091215

12101216
for _, row in df.iterrows():
12111217
prefixes = {
12121218
get_prefix_from_curie(curie) for curie in row[features] if curie is not None
12131219
}
1214-
# Confirm if all of the CURIEs in the list above appear in the filter_prefixes list.
1215-
# If TRUE, append row.
1216-
if all(prefix in filter_prefix_set for prefix in prefixes):
1220+
if selection(prefix in filter_prefix_set for prefix in prefixes):
12171221
rows.append(row)
1218-
if rows:
1219-
return pd.DataFrame(rows)
1220-
else:
1221-
return pd.DataFrame(columns=features)
1222+
1223+
return pd.DataFrame(rows) if rows else pd.DataFrame(columns=features)
12221224

12231225

12241226
# TODO this is not used anywhere

tests/test_utils.py

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,24 +34,54 @@ def test_broken_predicate_list(self):
3434
iri_list.extend(p_iri)
3535
self.assertEqual(3, len(iri_list))
3636

37-
def test_filter_prefixes(self):
37+
def test_filter_prefixes_any(self):
3838
"""Test filtering MSDF.df by prefixes provided."""
3939
prefix_filter_list = ["x", "y"]
4040
original_msdf = self.msdf
4141
filtered_df = filter_prefixes(
42-
original_msdf.df, prefix_filter_list, self.features
42+
original_msdf.df,
43+
prefix_filter_list,
44+
self.features,
45+
require_all_prefixes=False,
46+
)
47+
self.assertEqual(len(filtered_df), 136)
48+
49+
def test_filter_prefixes_all(self):
50+
"""Test filtering MSDF.df by prefixes provided."""
51+
prefix_filter_list = ["x", "y"]
52+
original_msdf = self.msdf
53+
filtered_df = filter_prefixes(
54+
original_msdf.df,
55+
prefix_filter_list,
56+
self.features,
57+
require_all_prefixes=True,
4358
)
4459
self.assertEqual(len(filtered_df), 40)
4560

46-
def test_filter_out_prefixes(self):
61+
def test_filter_out_prefixes_any(self):
4762
"""Test filtering MSDF.df by prefixes provided."""
4863
prefix_filter_list = ["x", "y"]
4964
original_msdf = self.msdf
5065
filtered_df = filter_out_prefixes(
51-
original_msdf.df, prefix_filter_list, self.features
66+
original_msdf.df,
67+
prefix_filter_list,
68+
self.features,
69+
require_all_prefixes=False,
5270
)
5371
self.assertEqual(len(filtered_df), 5)
5472

73+
def test_filter_out_prefixes_all(self):
74+
"""Test filtering MSDF.df by prefixes provided."""
75+
prefix_filter_list = ["x", "y"]
76+
original_msdf = self.msdf
77+
filtered_df = filter_out_prefixes(
78+
original_msdf.df,
79+
prefix_filter_list,
80+
self.features,
81+
require_all_prefixes=True,
82+
)
83+
self.assertEqual(len(filtered_df), 101)
84+
5585
def test_remove_mappings(self):
5686
"""Test remove mappings."""
5787
prefix_filter_list = ["x", "y"]

0 commit comments

Comments
 (0)