Skip to content

Commit f63576a

Browse files
Speedup for molecule isomorphism checking (#2114)
* potentially quick speedup for molecule isomorphism * silence yappy test and handle nx graphs * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix atom counting with negative numbered Hs * change test to expect new (equivalent) ordering for atom mapping * add new error, test, and update releasenotes * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix greater-than/less-than mismatch * privatize Molecule.are_isomorphic.to_networks, unconditionally deepcopy input, polish docstrings * Update releasenotes for 0.18.0 release --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 7febf6d commit f63576a

File tree

6 files changed

+58
-11
lines changed

6 files changed

+58
-11
lines changed

docs/releasehistory.md

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,20 +7,19 @@ Releases follow the `major.minor.micro` scheme recommended by [PEP440](https://w
77
* `micro` increments represent bugfix releases or improvements in documentation
88

99

10-
## Current development
10+
## 0.18.0
1111

1212
### API-breaking changes
13+
- [PR #2114](https://github.com/openforcefield/openff-toolkit/pull/2114): Removes `Molecule.are_isomorphic.to_networkx` from the public API (by renaming to `_to_networkx`). We are nearly certain that nobody is using this, and it is DIFFERENT from the much more useful `Molecule.to_networkx`. This change is motivated by the observation that further behavior changes to this method can relieve critical performance bottlenecks, and that we shouldn't repeatedly change the behavior of public methods.
1314

1415
### Behavior changes
16+
- [PR #2114](https://github.com/openforcefield/openff-toolkit/pull/2114): Fixes [Issue #2035](https://github.com/openforcefield/openff-toolkit/issues/2035) (and touches on several others) by improving the runtime of Molecule.are_isomorphic/is_isomorphic_with, which can greatly improve interchange creation and export runtime.
17+
- [PR #2114](https://github.com/openforcefield/openff-toolkit/pull/2114): May change the specific mapping of symmetric atoms returned by `Molecule.are_isomorphic` (to a different but chemically equivalent mapping)
1518

1619
### Bugfixes
1720

1821
- [PR #2115](https://github.com/openforcefield/openff-toolkit/pull/2115): Avoid unnecessary conformer coordinate lookups which were causing performance issues with large molecules. Fixes [Issue #1855](https://github.com/openforcefield/openff-toolkit/issues/1844).
1922

20-
### New features
21-
22-
### Improved documentation and warnings
23-
2423

2524
## 0.17.1
2625

openff/toolkit/_tests/test_molecule.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959
from openff.toolkit.utils import get_data_file_path
6060
from openff.toolkit.utils.exceptions import (
6161
AtomMappingWarning,
62+
BadMoleculeAssumptionError,
6263
HierarchyIteratorNameConflictError,
6364
IncompatibleShapeError,
6465
IncompatibleTypeError,
@@ -1521,7 +1522,7 @@ def test_isomorphic_general(self):
15211522
ethanol_reverse = create_reversed_ethanol()
15221523
assert ethanol.is_isomorphic_with(ethanol_reverse) is True
15231524
# check a reference mapping between ethanol and ethanol_reverse matches that calculated
1524-
ref_mapping = {0: 8, 1: 7, 2: 6, 3: 3, 4: 4, 5: 5, 6: 1, 7: 2, 8: 0}
1525+
ref_mapping = {0: 8, 1: 7, 2: 6, 3: 5, 4: 4, 5: 3, 6: 2, 7: 1, 8: 0}
15251526
assert Molecule.are_isomorphic(ethanol, ethanol_reverse, return_atom_map=True)[1] == ref_mapping
15261527
# check matching with nx.Graph atomic numbers and connectivity only
15271528
assert (
@@ -1669,6 +1670,16 @@ def test_isomorphic_perumtations(self, inputs):
16691670
is inputs["result"]
16701671
)
16711672

1673+
def test_isomorphic_speedup_assumptions(self):
1674+
"""
1675+
This test ensures that the assumptions needed by the isomorphism checking speedup in
1676+
https://github.com/openforcefield/openff-toolkit/pull/2114 are policed
1677+
"""
1678+
mol1 = Molecule.from_mapped_smiles("[F:1][H-:2][H-:3][Cl:4]")
1679+
mol2 = Molecule.from_mapped_smiles("[Cl:1][H-:2][H-:3][F:4]")
1680+
with pytest.raises(BadMoleculeAssumptionError):
1681+
assert mol1.is_isomorphic_with(mol2)
1682+
16721683
@requires_openeye
16731684
def test_strip_atom_stereochemistry(self):
16741685
"""Test the basic behavior of strip_atom_stereochemistry"""

openff/toolkit/topology/molecule.py

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
from openff.toolkit.utils.constants import DEFAULT_AROMATICITY_MODEL
5656
from openff.toolkit.utils.exceptions import (
5757
AtomMappingWarning,
58+
BadMoleculeAssumptionError,
5859
BondExistsError,
5960
HierarchyIteratorNameConflictError,
6061
HierarchySchemeNotFoundException,
@@ -2131,20 +2132,44 @@ def edge_match_func(x, y):
21312132
edge_match_func = None # type: ignore
21322133

21332134
# Here we should work out what data type we have, also deal with lists?
2134-
def to_networkx(data: FrozenMolecule | nx.Graph) -> nx.Graph:
2135-
"""For the given data type, return the networkx graph"""
2135+
def _to_networkx(data: FrozenMolecule | nx.Graph) -> nx.Graph:
2136+
"""
2137+
For the given data type, return the networkx graph. Note that the graphs returned from this method
2138+
will have negative-numbered hydrogens (eg. the first hydrogen on a heavy atom will have atomic number
2139+
-1, the second -2, etc). This is a performance hack to reduce the number of potential symmetric matches
2140+
that must be evaluated during isomorphism checks.
2141+
"""
21362142
if strip_pyrimidal_n_atom_stereo:
21372143
SMARTS = "[N+0X3:1](-[*])(-[*])(-[*])"
21382144

2145+
data = deepcopy(data)
21392146
if isinstance(data, FrozenMolecule):
21402147
# Molecule class instance
21412148
if strip_pyrimidal_n_atom_stereo:
21422149
# Make a copy of the molecule so we don't modify the original
2143-
data = deepcopy(data)
21442150
data.strip_atom_stereochemistry(SMARTS, toolkit_registry=toolkit_registry)
2151+
for atom in data.atoms:
2152+
h_counter = -1
2153+
for neighbor in atom.bonded_atoms:
2154+
if neighbor.atomic_number < 0:
2155+
raise BadMoleculeAssumptionError(
2156+
f"Molecule {data} appears to violate an assumption of the OpenFF Toolkit "
2157+
f"(likely that an H can't have two bonds). Please check that your molecule is "
2158+
f"what you think it is, and if it is indeed what you intend, open an issue at "
2159+
f"https://github.com/openforcefield/openff-toolkit/issues"
2160+
)
2161+
if neighbor.atomic_number == 1:
2162+
neighbor._atomic_number = h_counter
2163+
h_counter -= 1
21452164
return data.to_networkx()
21462165

21472166
elif isinstance(data, nx.Graph):
2167+
for node in data:
2168+
h_counter = -1
2169+
for neighbor in data.neighbors(node):
2170+
if data.nodes[neighbor]["atomic_number"] == 1:
2171+
data.nodes[neighbor]["atomic_number"] = h_counter
2172+
h_counter -= 1
21482173
return data
21492174

21502175
else:
@@ -2154,8 +2179,8 @@ def to_networkx(data: FrozenMolecule | nx.Graph) -> nx.Graph:
21542179
f"or networkx.Graph representation of the molecule."
21552180
)
21562181

2157-
mol1_netx = to_networkx(mol1)
2158-
mol2_netx = to_networkx(mol2)
2182+
mol1_netx = _to_networkx(mol1)
2183+
mol2_netx = _to_networkx(mol2)
21592184

21602185
from networkx.algorithms.isomorphism import GraphMatcher
21612186

@@ -5665,6 +5690,11 @@ def _atom_nums_to_hill_formula(atom_nums: list[int]) -> str:
56655690
SYMBOLS_ = deepcopy(SYMBOLS)
56665691
SYMBOLS_[0] = "X"
56675692

5693+
atom_nums = deepcopy(atom_nums)
5694+
for idx in range(len(atom_nums)):
5695+
if atom_nums[idx] < 0:
5696+
atom_nums[idx] = 1
5697+
56685698
atom_symbol_counts = Counter(SYMBOLS_[atom_num] for atom_num in atom_nums)
56695699

56705700
formula = []

openff/toolkit/utils/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
RDKIT_AVAILABLE,
1616
AmberToolsToolkitWrapper,
1717
AntechamberNotFoundError,
18+
BadMoleculeAssumptionError,
1819
BuiltInToolkitWrapper,
1920
ChargeCalculationError,
2021
ChargeMethodUnavailableError,

openff/toolkit/utils/exceptions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -760,3 +760,7 @@ def __init__(self, res_name, mol_bond, query_bond, reason):
760760
msg += f"\t{reason}"
761761
super().__init__(msg)
762762
self.msg = msg
763+
764+
765+
class BadMoleculeAssumptionError(OpenFFToolkitException):
766+
"""Exception raised when a molecule violates assumptions made by the OpenFF Toolkit's implementation"""

openff/toolkit/utils/toolkits.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
"RDKIT_AVAILABLE",
3535
"AmberToolsToolkitWrapper",
3636
"AntechamberNotFoundError",
37+
"BadMoleculeAssumptionError",
3738
"BuiltInToolkitWrapper",
3839
"ChargeCalculationError",
3940
"ChargeMethodUnavailableError",
@@ -72,6 +73,7 @@
7273
)
7374
from openff.toolkit.utils.exceptions import (
7475
AntechamberNotFoundError,
76+
BadMoleculeAssumptionError,
7577
ChargeCalculationError,
7678
ChargeMethodUnavailableError,
7779
GAFFAtomTypeWarning,

0 commit comments

Comments
 (0)