Skip to content

Commit 354dccd

Browse files
committed
gentle smiles parsing for lookup
1 parent f79c959 commit 354dccd

File tree

3 files changed

+37
-13
lines changed

3 files changed

+37
-13
lines changed

chebifier/prediction_models/c3p_predictor.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from pathlib import Path
22
from typing import List, Optional
33

4+
import tqdm
5+
46
from chebifier import modelwise_smiles_lru_cache
57
from chebifier.prediction_models import BasePredictor
68

@@ -26,14 +28,22 @@ def __init__(
2628
def predict_smiles_list(self, smiles_list: list[str]) -> list:
2729
from c3p import classifier as c3p_classifier
2830

29-
result_list = c3p_classifier.classify(
30-
list(smiles_list),
31-
self.program_directory,
32-
self.chemical_classes,
33-
strict=False,
34-
)
31+
result_list = []
32+
for batch_start in tqdm.tqdm(
33+
range(0, len(smiles_list), 32), desc="Classifying with C3P"
34+
):
35+
batch_end = min(batch_start + 32, len(smiles_list))
36+
result_list.extend(
37+
c3p_classifier.classify(
38+
smiles_list[batch_start:batch_end],
39+
self.program_directory,
40+
self.chemical_classes,
41+
strict=False,
42+
)
43+
)
44+
3545
result_reformatted = [dict() for _ in range(len(smiles_list))]
36-
for result in result_list:
46+
for result in tqdm.tqdm(result_list, desc="Reformatting C3P results"):
3747
chebi_id = result.class_id.split(":")[1]
3848
result_reformatted[smiles_list.index(result.input_smiles)][
3949
chebi_id
@@ -61,13 +71,13 @@ def explain_smiles(self, smiles):
6171
highlights.append(
6272
(
6373
"text",
64-
f"For class {result.class_name} ({result.class_id}), C3P gave the following explanation: {result.reason}",
74+
f"For {result.class_name} ({result.class_id}), C3P gave the following explanation: {result.reason}",
6575
)
6676
)
6777
highlights = [
6878
(
6979
"text",
70-
f"C3P made positive predictions for {len(highlights)} classes. The explanations are as follows:",
80+
f"C3P made positive predictions for {len(highlights)} classes. {'The explanations are as follows:' if len(highlights) > 0 else ''}",
7181
)
7282
] + highlights
7383

chebifier/prediction_models/chebi_lookup.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from chebifier import modelwise_smiles_lru_cache
88
from chebifier.prediction_models import BasePredictor
9-
from chebifier.utils import load_chebi_graph
9+
from chebifier.utils import _smiles_to_mol, load_chebi_graph
1010

1111

1212
class ChEBILookupPredictor(BasePredictor):
@@ -50,7 +50,7 @@ def build_smiles_lookup(self):
5050
).items():
5151
if smiles is not None:
5252
try:
53-
mol = Chem.MolFromSmiles(smiles)
53+
mol = _smiles_to_mol(smiles)
5454
if mol is None:
5555
print(
5656
f"Failed to parse SMILES {smiles} for ChEBI ID {chebi_id}"
@@ -72,7 +72,7 @@ def build_smiles_lookup(self):
7272
def predict_smiles(self, smiles: str) -> Optional[dict]:
7373
if not smiles:
7474
return None
75-
mol = Chem.MolFromSmiles(smiles)
75+
mol = _smiles_to_mol(smiles)
7676
if mol is None:
7777
return None
7878
canonical_smiles = Chem.MolToSmiles(mol)
@@ -110,7 +110,7 @@ def info_text(self):
110110
return self._description
111111

112112
def explain_smiles(self, smiles: str) -> dict:
113-
mol = Chem.MolFromSmiles(smiles)
113+
mol = _smiles_to_mol(smiles)
114114
if mol is None:
115115
return {
116116
"highlights": [

chebifier/utils.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import functools
12
import importlib.resources
23
import os
34
import pickle
@@ -6,6 +7,7 @@
67
import networkx as nx
78
import requests
89
import yaml
10+
from rdkit import Chem
911

1012
from chebifier.hugging_face import download_model_files
1113

@@ -156,6 +158,18 @@ def process_config(config, model_registry):
156158
return new_config
157159

158160

161+
@functools.lru_cache(maxsize=128)
def _smiles_to_mol(smiles: str):
    """Leniently parse a SMILES string into an RDKit molecule.

    The SMILES is parsed with sanitization switched off, after which
    kekulization is attempted on the result. A kekulization failure is
    reported via ``print`` and is not fatal: the molecule is returned in
    whatever state the parser and the failed kekulization left it.

    Results are memoized (up to 128 distinct SMILES strings), so repeated
    calls with the same string return the same molecule object while it
    stays in the cache.

    Args:
        smiles: The SMILES string to parse.

    Returns:
        The parsed molecule, or ``None`` if ``Chem.MolFromSmiles`` could
        not parse the input.
    """
    molecule = Chem.MolFromSmiles(smiles, sanitize=False)
    if molecule is None:
        return None

    try:
        # Rewrite aromatic bond types as explicit single/double bonds.
        Chem.Kekulize(molecule)
    except Chem.KekulizeException as e:
        # Best effort: keep the molecule even though kekulization failed.
        print(f"Failed to Kekulize {smiles}: {e}")
    return molecule
171+
172+
159173
if __name__ == "__main__":
160174
chebi_graph = build_chebi_graph(chebi_version=244)
161175
os.makedirs(os.path.join("data", "chebi_v244"), exist_ok=True)

0 commit comments

Comments
 (0)