Skip to content

Isotopes not converted correctly in inchitomol() #137

@hhaensel

Description

@hhaensel

Here's a demo of H2O, which works fine:

julia> mol = smilestomol("O")
{1, 0} simple molecular graph SMILESMolGraph

julia> inchitosdf(inchi(mol)) |> println
Structure #1.
  InChIV10

  1  0  0  0  0  0  0  0  0  0  1 V2000
    0.0000    0.0000    0.0000 O   0  0  0     0  0  0  0  0  0
M  END
$$$$

but D2O fails:

julia> mol = smilestomol("[D]O[D]")
{3, 2} simple molecular graph SMILESMolGraph

julia> inchitosdf(inchi(mol)) |> println
Structure #1.
  InChIV10

  1  0  0  0  0  0  0  0  0  0  3 V2000
    0.0000    0.0000    0.0000 C   0  0  0     0  0  0  0  0  0
A  1
OD2
M  END
$$$$

The D#s are omitted and the O is replaced by a C - very strange.

Together with ChatGPT I arrived at a different implementation, that doesn't use the OutputSDF flag, but handles the output generation in julia:

using Printf
using MolecularGraph

import MolecularGraph: libinchi, inchi_OutputStructEx, inchi_InputINCHI, inchi_Atom, NUM_H_ISOTOPES

function atom_symbol(a::inchi_Atom)
    # extract main atom symbol
    s = String(reinterpret(UInt8, collect(Iterators.takewhile(!=(0x00), a.elname))))
    return s
end

function bonds_from_atoms(atoms::Vector{inchi_Atom})
    bonds = Set{Tuple{Int,Int,Int}}()
    for (i, a) in enumerate(atoms)
        for k in 1:a.num_bonds
            j = a.neighbor[k] + 1        # C -> Julia 1-based
            btype = a.bond_type[k]
            if j > 0  # skip invalid neighbor
                push!(bonds, (min(i,j), max(i,j), btype))
            end
        end
    end
    return collect(bonds)
end

function expand_atoms(atoms::Vector{inchi_Atom})
    expanded_atoms = Vector{String}()
    bonds = Vector{Tuple{Int,Int,Int}}()

    for (i, a) in enumerate(atoms)
        # Add main atom
        push!(expanded_atoms, atom_symbol(a))

        # Keep track of next atom index
        parent_idx = length(expanded_atoms)

        # Add implicit H/D/T as explicit atoms
        for iso_index in 1:NUM_H_ISOTOPES
            n = a.num_iso_H[iso_index+1]
            iso_symbol = ["H", "D", "T"][iso_index]
            for _ in 1:n
                child_idx = length(expanded_atoms) + 1
                push!(expanded_atoms, iso_symbol)
                push!(bonds, (parent_idx, child_idx, 1))  # single bond to parent
            end
        end
    end

    # Add original bonds between heavy atoms
    heavy_bonds = bonds_from_atoms(atoms)
    append!(bonds, heavy_bonds)

    return expanded_atoms, bonds
end

function expand_atoms(atoms::Vector{inchi_Atom})
    expanded_atoms = Vector{String}()
    bonds = Vector{Tuple{Int,Int,Int}}()

    for (i, a) in enumerate(atoms)
        # Add main atom
        push!(expanded_atoms, atom_symbol(a))

        # Keep track of next atom index
        parent_idx = length(expanded_atoms)

        # Add implicit H/D/T as explicit atoms
        for iso_index in 1:NUM_H_ISOTOPES
            n = a.num_iso_H[iso_index+1]
            iso_symbol = ["H", "D", "T"][iso_index]
            for _ in 1:n
                child_idx = length(expanded_atoms) + 1
                push!(expanded_atoms, iso_symbol)
                push!(bonds, (parent_idx, child_idx, 1))  # single bond to parent
            end
        end
    end

    # Add original bonds between heavy atoms
    heavy_bonds = bonds_from_atoms(atoms)
    append!(bonds, heavy_bonds)

    return expanded_atoms, bonds
end

function struct_to_sdf(structure::inchi_OutputStructEx)
    atoms = unsafe_wrap(Vector{inchi_Atom}, structure.atom, structure.num_atoms)
    expanded_atoms, bonds = expand_atoms(atoms)

    io = IOBuffer()
    println(io, "Generated by libinchi + Julia")
    println(io)
    println(io, "  Julia libinchi export")

    @printf(io, "%3d%3d  0  0  0  0            999 V2000\n",
            length(expanded_atoms), length(bonds))

    # Dummy coordinates (can use atom coords for heavy atoms, 0 for H/D/T)
    for (i, sym) in enumerate(expanded_atoms)
        x = y = z = 0.0
        if i <= length(atoms)  # heavy atoms
            x, y, z = atoms[i].x, atoms[i].y, atoms[i].z
        end
        @printf(io, "%10.4f%10.4f%10.4f %-3s 0  0  0  0  0  0  0  0  0  0\n",
                x, y, z, sym)
    end

    # Write bonds
    for (i,j,btype) in bonds
        @printf(io, "%3d%3d%3d  0  0  0  0\n", i, j, btype)
    end

    println(io, "M  END")
    println(io, raw"$$$$")

    return String(take!(io))
end

function inchitosdf2(inchi::String; options::String = "")
    structure = inchi_OutputStructEx()
    inchi_input = inchi_InputINCHI(Base.unsafe_convert(Cstring, inchi),
                                   Base.unsafe_convert(Cstring, options))

    ret = @ccall libinchi.GetStructFromINCHIEx(
        inchi_input::Ref{inchi_InputINCHI},
        structure::Ref{inchi_OutputStructEx}
    )::Int32

    if ret < 0
        error("libinchi failed with code $ret")
    end

    sdf = struct_to_sdf(structure)

    # cleanup
    @ccall libinchi.FreeStructFromINCHIEx(structure::Ref{inchi_OutputStructEx})::Cvoid

    return sdf
end

mol = smilestomol("[D]O[D]")
i = inchi(mol)
inchitosdf(i) |> println
inchitosdf2(i) |> println

Is that interesting for you?
I'm not sure whether the algorithm covers all special cases that your might cover. In that case one could test for existence of isotopes and only in that case apply this method. Happy to hear from you.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions