diff --git a/doc/apidoc.json b/doc/apidoc.json index b68573551..0d3eb7798 100644 --- a/doc/apidoc.json +++ b/doc/apidoc.json @@ -356,7 +356,6 @@ "set_component", "list_assemblies", "get_assembly" - ], "CIF format" : [ "CIFFile", @@ -382,5 +381,17 @@ "StringArrayEncoding", "TypeCode" ] + }, + "biotite.structure.alphabet" : { + "Structural alphabets": [ + "I3DSequence", + "ProteinBlocksSequence", + "ClepapsSequence" + ], + "Conversion functions": [ + "to_3di", + "to_protein_blocks", + "to_clepaps" + ] } } diff --git a/doc/references.bib b/doc/references.bib index 4a08c984c..bfa6b180f 100644 --- a/doc/references.bib +++ b/doc/references.bib @@ -742,7 +742,7 @@ @article{Steele2021 eprint = {2001.05304}, primaryclass = {cs}, doi = {10.48550/arXiv.2001.05304}, - archiveprefix = {arxiv} + archiveprefix = {arXiv} } @article{Steinegger2017, @@ -838,6 +838,35 @@ @article{VanHerk1992 doi = {10.1016/0167-8655(92)90069-C} } +@article{VanKempen2024, + title = {Fast and Accurate Protein Structure Search with {{Foldseek}}}, + author = {{van Kempen}, Michel and Kim, Stephanie S. and Tumescheit, Charlotte and Mirdita, Milot and Lee, Jeongjae and Gilchrist, Cameron L. M. 
and Söding, Johannes and Steinegger, Martin}, + year = {2024}, + month = feb, + journal = {Nature Biotechnology}, + volume = {42}, + number = {2}, + pages = {243--246}, + publisher = {Nature Publishing Group}, + issn = {1546-1696}, + doi = {10.1038/s41587-023-01773-0} +} + +@article{Wang2008, + title = {{{CLePAPS}}: {{FAST PAIR ALIGNMENT OF PROTEIN STRUCTURES BASED ON CONFORMATIONAL LETTERS}}}, + shorttitle = {{{CLePAPS}}}, + author = {Wang, Sheng and Zheng, Wei-Mou}, + year = {2008}, + month = apr, + journal = {Journal of Bioinformatics and Computational Biology}, + volume = {06}, + number = {02}, + pages = {347--366}, + publisher = {World Scientific Publishing Co.}, + issn = {0219-7200}, + doi = {10.1142/S0219720008003461} +} + @article{Westbrook2015, title = {The Chemical Component Dictionary: Complete Descriptions of Constituent Molecules in Experimentally Determined {{3D}} Macromolecules in the {{Protein Data Bank}}}, shorttitle = {The Chemical Component Dictionary}, diff --git a/src/biotite/sequence/align/matrix.py b/src/biotite/sequence/align/matrix.py index 2a7d23437..fd5f02e9a 100644 --- a/src/biotite/sequence/align/matrix.py +++ b/src/biotite/sequence/align/matrix.py @@ -2,14 +2,17 @@ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further # information. 
+__all__ = ["SubstitutionMatrix"] __name__ = "biotite.sequence.align" __author__ = "Patrick Kunzmann" -import os +import functools +from pathlib import Path import numpy as np from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence -__all__ = ["SubstitutionMatrix"] +# Directory of matrix files +_DB_DIR = Path(__file__).parent / "matrix_data" class SubstitutionMatrix(object): @@ -59,6 +62,12 @@ class SubstitutionMatrix(object): - **RBLOSUM_** - **CorBLOSUM_** + - Structural alphabet substitution matrices + + - **3Di** - For 3Di alphabet from ``foldseek`` :footcite:`VanKempen2024` + - **PB** - For Protein Blocks alphabet from *PBxplore* :footcite:`Barnoud2017` + - **CLESUM** - For CLePAPS alphabet :footcite:`Wang2008` + A list of all available matrix names is returned by :meth:`list_db()`. @@ -124,9 +133,6 @@ class SubstitutionMatrix(object): >>> matrix = SubstitutionMatrix(alph, alph, "BLOSUM50") """ - # Directory of matrix files - _db_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "matrix_data") - def __init__(self, alphabet1, alphabet2, score_matrix): self._alph1 = alphabet1 self._alph2 = alphabet2 @@ -350,7 +356,7 @@ def dict_from_db(matrix_name): matrix_dict : dict A dictionary representing the substitution matrix. """ - filename = SubstitutionMatrix._db_dir + os.sep + matrix_name + ".mat" + filename = _DB_DIR / f"{matrix_name}.mat" with open(filename, "r") as f: return SubstitutionMatrix.dict_from_str(f.read()) @@ -364,11 +370,10 @@ def list_db(): db_list : list List of matrix names in the internal database. """ - files = os.listdir(SubstitutionMatrix._db_dir) - # Remove '.mat' from files - return [file[:-4] for file in sorted(files)] + return sorted(path.stem for path in _DB_DIR.glob("*.mat")) @staticmethod + @functools.cache def std_protein_matrix(): """ Get the default :class:`SubstitutionMatrix` for protein sequence @@ -379,9 +384,12 @@ def std_protein_matrix(): matrix : SubstitutionMatrix Default matrix. 
""" - return _matrix_blosum62 + return SubstitutionMatrix( + ProteinSequence.alphabet, ProteinSequence.alphabet, "BLOSUM62" + ) @staticmethod + @functools.cache def std_nucleotide_matrix(): """ Get the default :class:`SubstitutionMatrix` for DNA sequence @@ -392,13 +400,107 @@ def std_nucleotide_matrix(): matrix : SubstitutionMatrix Default matrix. """ - return _matrix_nuc + return SubstitutionMatrix( + NucleotideSequence.alphabet_amb, NucleotideSequence.alphabet_amb, "NUC" + ) + + @staticmethod + @functools.cache + def std_3di_matrix(): + """ + Get the default :class:`SubstitutionMatrix` for 3Di sequence + alignments. + :footcite:`VanKempen2024` + + Returns + ------- + matrix : SubstitutionMatrix + Default matrix. + """ + # Import inside function to avoid circular import + from biotite.structure.alphabet.i3d import I3DSequence + return SubstitutionMatrix(I3DSequence.alphabet, I3DSequence.alphabet, "3Di") + + @staticmethod + @functools.cache + def std_protein_blocks_matrix(unknown_match=200, unkown_mismatch=-200): + """ + Get the default :class:`SubstitutionMatrix` for Protein Blocks sequences. -# Preformatted BLOSUM62 and NUC substitution matrix from NCBI -_matrix_blosum62 = SubstitutionMatrix( - ProteinSequence.alphabet, ProteinSequence.alphabet, "BLOSUM62" -) -_matrix_nuc = SubstitutionMatrix( - NucleotideSequence.alphabet_amb, NucleotideSequence.alphabet_amb, "NUC" -) + The matrix is adapted from *PBxplore* :footcite:`Barnoud2017`. + + Parameters + ---------- + unknown_match, unkown_mismatch : int, optional + The match and mismatch score for undefined symbols. + The default values were chosen arbitrarily. + + Returns + ------- + matrix : SubstitutionMatrix + Default matrix. + + References + ---------- + + .. 
footbibliography:: + + """ + from biotite.structure.alphabet.pb import ProteinBlocksSequence + + alphabet = ProteinBlocksSequence.alphabet + unknown_symbol = ProteinBlocksSequence.unknown_symbol + matrix_dict = SubstitutionMatrix.dict_from_db("PB") + for symbol in alphabet: + if symbol == unknown_symbol: + continue + matrix_dict[symbol, unknown_symbol] = unkown_mismatch + matrix_dict[unknown_symbol, symbol] = unkown_mismatch + matrix_dict[unknown_symbol, unknown_symbol] = unknown_match + return SubstitutionMatrix( + alphabet, + alphabet, + matrix_dict, + ) + + @staticmethod + @functools.cache + def std_clepaps_matrix(unknown_match=200, unkown_mismatch=-200): + """ + Get the default :class:`SubstitutionMatrix` for *CLePAPS* sequences. + + Parameters + ---------- + unknown_match, unkown_mismatch : int, optional + The match and mismatch score for undefined symbols. + The default values were chosen arbitrarily. + + Returns + ------- + matrix : SubstitutionMatrix + Default matrix. + + References + ---------- + + .. 
footbibliography:: + + """ + from biotite.structure.alphabet.clepaps import ClepapsSequence + + alphabet = ClepapsSequence.alphabet + unknown_symbol = ClepapsSequence.unknown_symbol + matrix_dict = SubstitutionMatrix.dict_from_db("CLESUM") + # Add match/mismatch scores for undefined symbols + for symbol in alphabet: + if symbol == unknown_symbol: + continue + matrix_dict[symbol, unknown_symbol] = unkown_mismatch + matrix_dict[unknown_symbol, symbol] = unkown_mismatch + matrix_dict[unknown_symbol, unknown_symbol] = unknown_match + return SubstitutionMatrix( + alphabet, + alphabet, + matrix_dict, + ) diff --git a/src/biotite/sequence/align/matrix_data/3Di.mat b/src/biotite/sequence/align/matrix_data/3Di.mat new file mode 100644 index 000000000..93fe4e97b --- /dev/null +++ b/src/biotite/sequence/align/matrix_data/3Di.mat @@ -0,0 +1,25 @@ +# 3Di bit/2 +# Background (precomputed optional): 0.0489372 0.0306991 0.101049 0.0329671 0.0276149 0.0416262 0.0452521 0.030876 0.0297251 0.0607036 0.0150238 0.0215826 0.0783843 0.0512926 0.0264886 0.0610702 0.0201311 0.215998 0.0310265 0.0295417 0.00001 +# Lambda (precomputed optional): 0.351568 + A C D E F G H I K L M N P Q R S T V W Y X +A 6 -3 1 2 3 -2 -2 -7 -3 -3 -10 -5 -1 1 -4 -7 -5 -6 0 -2 0 +C -3 6 -2 -8 -5 -4 -4 -12 -13 1 -14 0 0 1 -1 0 -8 1 -7 -9 0 +D 1 -2 4 -3 0 1 1 -3 -5 -4 -5 -2 1 -1 -1 -4 -2 -3 -2 -2 0 +E 2 -8 -3 9 -2 -7 -4 -12 -10 -7 -17 -8 -6 -3 -8 -10 -10 -13 -6 -3 0 +F 3 -5 0 -2 7 -3 -3 -5 1 -3 -9 -5 -2 2 -5 -8 -3 -7 4 -4 0 +G -2 -4 1 -7 -3 6 3 0 -7 -7 -1 -2 -2 -4 3 -3 4 -6 -4 -2 0 +H -2 -4 1 -4 -3 3 6 -4 -7 -6 -6 0 -1 -3 1 -3 -1 -5 -5 3 0 +I -7 -12 -3 -12 -5 0 -4 8 -5 -11 7 -7 -6 -6 -3 -9 6 -12 -5 -8 0 +K -3 -13 -5 -10 1 -7 -7 -5 9 -11 -8 -12 -6 -5 -9 -14 -5 -15 5 -8 0 +L -3 1 -4 -7 -3 -7 -6 -11 -11 6 -16 -3 -2 2 -4 -4 -9 0 -8 -9 0 +M -10 -14 -5 -17 -9 -1 -6 7 -8 -16 10 -9 -9 -10 -5 -10 3 -16 -6 -9 0 +N -5 0 -2 -8 -5 -2 0 -7 -12 -3 -9 7 0 -2 2 3 -4 0 -8 -5 0 +P -1 0 1 -6 -2 -2 -1 -6 -6 -2 -9 0 4 0 0 -2 
-4 0 -4 -5 0 +Q 1 1 -1 -3 2 -4 -3 -6 -5 2 -10 -2 0 5 -2 -4 -5 -1 -2 -5 0 +R -4 -1 -1 -8 -5 3 1 -3 -9 -4 -5 2 0 -2 6 2 0 -1 -6 -3 0 +S -7 0 -4 -10 -8 -3 -3 -9 -14 -4 -10 3 -2 -4 2 6 -6 0 -11 -9 0 +T -5 -8 -2 -10 -3 4 -1 6 -5 -9 3 -4 -4 -5 0 -6 8 -9 -5 -5 0 +V -6 1 -3 -13 -7 -6 -5 -12 -15 0 -16 0 0 -1 -1 0 -9 3 -10 -11 0 +W 0 -7 -2 -6 4 -4 -5 -5 5 -8 -6 -8 -4 -2 -6 -11 -5 -10 8 -6 0 +Y -2 -9 -2 -3 -4 -2 3 -8 -8 -9 -9 -5 -5 -5 -3 -9 -5 -11 -6 9 0 +X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 \ No newline at end of file diff --git a/src/biotite/sequence/align/matrix_data/PB.license b/src/biotite/sequence/align/matrix_data/PB.license new file mode 100644 index 000000000..688633bfa --- /dev/null +++ b/src/biotite/sequence/align/matrix_data/PB.license @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2013 Poulain, A. G. de Brevern + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/src/biotite/sequence/align/matrix_data/PB.mat b/src/biotite/sequence/align/matrix_data/PB.mat new file mode 100644 index 000000000..abb8dc293 --- /dev/null +++ b/src/biotite/sequence/align/matrix_data/PB.mat @@ -0,0 +1,18 @@ +# PB substitution matrix, adapted from PBxplore + a b c d e f g h i j k l m n o p +a 516 -59 113 -105 -411 -177 -27 -361 47 -103 -644 -259 -599 -372 -124 -83 +b -59 541 -146 -210 -155 -310 -97 90 182 -128 -30 29 -745 -242 -165 22 +c 113 -146 360 -14 -333 -240 49 -438 -269 -282 -688 -682 -608 -455 -147 6 +d -105 -210 -14 221 5 -131 -349 -278 -253 -173 -585 -670 -1573 -1048 -691 -497 +e -411 -155 -333 5 520 185 186 138 -378 -70 -112 -514 -1136 -469 -617 -632 +f -177 -310 -240 -131 185 459 -99 -45 -445 83 -214 -88 -547 -629 -406 -552 +g -27 -97 49 -349 186 -99 665 -99 -89 -118 -409 -138 -124 172 128 254 +h -361 90 -438 -278 138 -45 -99 632 -205 316 192 -108 -712 -359 95 -399 +i 47 182 -269 -253 -378 -445 -89 -205 696 186 8 15 -709 -269 -169 226 +j -103 -128 -282 -173 -70 83 -118 316 186 768 196 5 -398 -340 -117 -104 +k -644 -30 -688 -585 -112 -214 -409 192 8 196 568 -65 -270 -231 -471 -382 +l -259 29 -682 -670 -514 -88 -138 -108 15 5 -65 533 -131 8 -11 -316 +m -599 -745 -608 -1573 -1136 -547 -124 -712 -709 -398 -270 -131 241 -4 -190 -155 +n -372 -242 -455 -1048 -469 -629 172 -359 -269 -340 -231 8 -4 703 88 146 +o -124 -165 -147 -691 -617 -406 128 95 -169 -117 -471 -11 -190 88 716 58 +p -83 22 6 -497 -632 -552 254 -399 226 -104 -382 -316 -155 146 58 609 \ No newline at end of file diff --git a/src/biotite/structure/alphabet/__init__.py b/src/biotite/structure/alphabet/__init__.py new file mode 100644 index 000000000..f517b9ed9 --- /dev/null +++ b/src/biotite/structure/alphabet/__init__.py @@ -0,0 +1,14 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further +# information. 
+ +""" +A subpackage for converting structures to structural alphabet sequences. +""" + +__name__ = "biotite.structure.alphabet" +__author__ = "Martin Larralde, Patrick Kunzmann" + +from .clepaps import * +from .i3d import * +from .pb import * diff --git a/src/biotite/structure/alphabet/clepaps.py b/src/biotite/structure/alphabet/clepaps.py new file mode 100644 index 000000000..70bc01f2d --- /dev/null +++ b/src/biotite/structure/alphabet/clepaps.py @@ -0,0 +1,156 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further +# information. + +""" +Conversion of structures into the *CLePAPS* structural alphabet. +""" + +__name__ = "biotite.structure.alphabet" +__author__ = "Patrick Kunzmann" +__all__ = ["ClepapsSequence", "to_clepaps"] + +import numpy as np +from biotite.sequence.alphabet import LetterAlphabet +from biotite.sequence.sequence import Sequence +from biotite.structure.chains import get_chain_starts +from biotite.structure.filter import filter_amino_acids +from biotite.structure.geometry import angle, dihedral +from biotite.structure.util import coord_for_atom_name_per_residue + +# CLePAPS reference angles; columns are (theta_1, tau, theta_2) +CLEPAPS_CENTERS = np.array( + [ + [ 1.02, -2. , 1.55], + [ 1.06, -2.94, 1.34], + [ 1.01, -1.88, 1.14], + [ 0.79, -2.3 , 1.03], + [ 1.02, -2.98, 0.95], + [ 1.09, -2.72, 0.91], + [ 1.49, 2.09, 1.05], + [ 1.55, 0.88, 1.55], + [ 1.52, 0.83, 1.52], + [ 1.58, 1.05, 1.55], + [ 1.48, 0.7 , 1.43], + [ 1.4 , 0.75, 0.84], + [ 1.47, 1.64, 1.44], + [ 1.12, 0.14, 1.49], + [ 1.54, -1.89, 1.48], + [ 1.24, -2.98, 1.49], + [ 0.86, -0.37, 1.01], + ] +) # fmt: skip + + +class ClepapsSequence(Sequence): + """ + Representation of a structure in the *CLePAPS* structural alphabet. + :footcite:`Wang2008` + + Parameters + ---------- + sequence : iterable object, optional + The *CLePAPS* sequence. + This may either be a list or a string. 
+ May take upper or lower case letters. 
+ By default the sequence is empty. + + See also + -------- + to_clepaps : Create *CLePAPS* sequences from a structure. + + References + ---------- + + .. footbibliography:: + + """ + + alphabet = LetterAlphabet("ABCDEFGHIJKLMNOPQR") + unknown_symbol = "R" + + def get_alphabet(self): + return ClepapsSequence.alphabet + + +def to_clepaps(atoms): + """ + Encode each chain in the given structure to the *CLePAPS* structural + alphabet. + :footcite:`Wang2008` + + Parameters + ---------- + atoms : AtomArray + The atom array to encode. + May contain multiple chains. + + Returns + ------- + sequences : list of Sequence, length=n + The encoded *CLePAPS* sequence for each peptide chain in the structure. + chain_start_indices : ndarray, shape=(n,), dtype=int + The atom index where each chain starts. + + References + ---------- + + .. footbibliography:: + + Examples + -------- + + >>> sequences, chain_starts = to_clepaps(atom_array) + >>> print(sequences[0]) + """ + sequences = [] + chain_start_indices = get_chain_starts(atoms, add_exclusive_stop=True) + for i in range(len(chain_start_indices) - 1): + start = chain_start_indices[i] + stop = chain_start_indices[i + 1] + chain = atoms[start:stop] + sequences.append(_to_clepaps(chain)) + return sequences, chain_start_indices[:-1] + + +def _to_clepaps(chain): + amino_acid_mask = filter_amino_acids(chain) + + # Coordinates for dihedral angle calculation + (coord_ca,) = coord_for_atom_name_per_residue( + chain, + ("CA",), + amino_acid_mask, + ) + + bending = angle(coord_ca[:-2], coord_ca[1:-1], coord_ca[2:]) + theta_1 = bending[:-1] + theta_2 = bending[1:] + tau = dihedral(coord_ca[:-3], coord_ca[1:-2], coord_ca[2:-1], coord_ca[3:]) + clepaps_angles = np.stack([theta_1, tau, theta_2], axis=-1) + + # Angle RMSD of all reference angles with all actual angles + rmsda = np.sum( + (CLEPAPS_CENTERS[:, np.newaxis] - clepaps_angles[np.newaxis, :]) ** 2, + axis=-1, + ) + # Where RMSDA is NaN, (missing atoms/residues or chain ends) set symbol 
to unknown + clepaps_seq_code = np.full( + len(clepaps_angles), + ClepapsSequence.alphabet.encode(ClepapsSequence.unknown_symbol), + ) + available_mask = ~np.isnan(rmsda).any(axis=0) + # Choose the symbol, where the RMSDA to the reference angle is lowest + # Due to the definition of Biotite symbol codes + # the index of the chosen reference center is directly the symbol code + clepaps_seq_code[available_mask] = np.argmin(rmsda[:, available_mask], axis=0) + # Put the array of symbol codes into actual sequence objects + clepaps_sequence = ClepapsSequence() + # Since every symbol comprises 4 residues, the sequence length is shortened by 3 + # By definition of CLePAPS, the first two and the last residue are undefined + clepaps_sequence.code = np.full( + coord_ca.shape[0], + ClepapsSequence.alphabet.encode(ClepapsSequence.unknown_symbol), + ) + clepaps_sequence.code[2:-1] = clepaps_seq_code + return clepaps_sequence diff --git a/src/biotite/structure/alphabet/encoder.py b/src/biotite/structure/alphabet/encoder.py new file mode 100644 index 000000000..9793a59f4 --- /dev/null +++ b/src/biotite/structure/alphabet/encoder.py @@ -0,0 +1,332 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further +# information. 
+ +""" +Implementation of the encoder neural network adapted from ``foldseek``. +""" + +__name__ = "biotite.structure.alphabet" +__author__ = "Martin Larralde" +__all__ = ["Encoder", "VirtualCenterEncoder", "PartnerIndexEncoder", "FeatureEncoder"] + +import abc +from importlib.resources import files as resource_files +import numpy +import numpy.ma +from biotite.structure.alphabet.layers import CentroidLayer, Model +from biotite.structure.alphabet.unkerasify import load_kerasify + + +class _BaseEncoder(abc.ABC): + @abc.abstractmethod + def encode(self, ca, cb, n, c): + """ + Encode the given atom coordinates to a different representation. 
+ + Parameters + ---------- + ca, cb, n, c : ndarray, shape=(n, 3), dtype=float + The coordinates of the ``CA``, ``CB``, ``N`` and ``C`` atoms for each + residue. + *NaN* if missing, e.g. ``CB`` for glycine. + + Returns + ------- + encoded : MaskedArray, shape=(n, m), dtype=float + The encoded representation. + """ + raise NotImplementedError + + +class VirtualCenterEncoder(_BaseEncoder): + r""" + An encoder for converting a protein structure to a virtual center. + + For each residue, the coordinates of the virtual center are computed + from the coordinates of the ``CA``, ``CB`` and ``N`` atoms. The virtual center + :math:`V` is defined by the angle :math:`\theta = \angle V C_{\alpha} C_{\beta}`, + the dihedral angle :math:`\tau = \angle V C_{\alpha} C_{\beta} N` and the length + :math:`l = |V - C_{\alpha}|`. The default parameters used + in ``foldseek`` were selected after optimization on a validation set. + + Parameters + ---------- + distance_alpha_beta : float + The default distance between the ``CA`` and ``CB`` atoms to use when + reconstructing missing *Cβ* coordinates. + distance_alpha_v : float + The distance between the virtual center *V* and the ``CA`` atom, used to compute + the virtual center coordinates. + theta : float + The angle θ between the virtual center *V*, the ``CA`` and ``CB`` atoms, used to + compute the virtual center coordinates. + tau : float + The dihedral angle τ between the virtual center *V* and the ``CA``, ``CB`` + and ``N`` atoms, used to compute the virtual center coordinates. 
+ """ + + _DISTANCE_ALPHA_BETA = 1.5336 + + def __init__( + self, + *, + distance_alpha_beta=_DISTANCE_ALPHA_BETA, + distance_alpha_v=2.0, + theta=270.0, + tau=0.0, + ): + self.theta = theta + self.tau = tau + self.distance_alpha_v = distance_alpha_v + self.distance_alpha_beta = distance_alpha_beta + + @property + def theta(self): + return numpy.rad2deg(self._theta) + + @theta.setter + def theta(self, theta): + self._theta = numpy.deg2rad(theta) + self._cos_theta = numpy.cos(self._theta) + self._sin_theta = numpy.sin(self._theta) + + @property + def tau(self): + return numpy.rad2deg(self._tau) + + @tau.setter + def tau(self, tau): + self._tau = numpy.deg2rad(tau) + self._cos_tau = numpy.cos(self._tau) + self._sin_tau = numpy.sin(self._tau) + + def _compute_virtual_center(self, ca, cb, n): + assert ca.shape == n.shape + assert ca.shape == cb.shape + v = cb - ca + a = cb - ca + b = n - ca + # normal angle + k = _normalize(numpy.cross(a, b, axis=-1), inplace=True) + v = ( + v * self._cos_theta + + numpy.cross(k, v) * self._sin_theta + + k * (k * v).sum(axis=-1).reshape(-1, 1) * (1 - self._cos_theta) + ) + # dihedral angle + k = _normalize(n - ca, inplace=True) + v = ( + v * self._cos_tau + + numpy.cross(k, v) * self._sin_tau + + k * (k * v).sum(axis=-1).reshape(-1, 1) * (1 - self._cos_tau) + ) + # apply final vector to Cα + v *= self.distance_alpha_v + v += ca + return v + + def _approximate_cb_position(self, ca, n, c): + """ + Approximate the position of ``CB`` from the backbone atoms. 
+ """ + assert ca.shape == n.shape + assert ca.shape == c.shape + v1 = _normalize(c - ca, inplace=True) + v2 = _normalize(n - ca, inplace=True) + v3 = v1 / 3.0 + + b1 = numpy.add(v2, v3, out=v2) + b2 = numpy.cross(v1, b1, axis=-1) + u1 = _normalize(b1, inplace=True) + u2 = _normalize(b2, inplace=True) + + out = (numpy.sqrt(8) / 3.0) * ((-u1 / 2.0) - (u2 * numpy.sqrt(3) / 2.0)) - v3 + out *= self.distance_alpha_beta + out += ca + return out + + def _create_nan_mask(self, ca, n, c): + """ + Mask any column which contains at least one *NaN* value. + """ + mask_ca = numpy.isnan(ca).max(axis=1) + mask_n = numpy.isnan(n).max(axis=1) + mask_c = numpy.isnan(c).max(axis=1) + return (mask_ca | mask_n | mask_c).repeat(3).reshape(-1, 3) + + def encode(self, ca, cb, n, c): + ca = numpy.asarray(ca) + cb = numpy.asarray(cb) + n = numpy.asarray(n) + c = numpy.asarray(c) + + assert ca.shape == cb.shape + assert ca.shape == c.shape + assert ca.shape == n.shape + + # fix CB positions if needed + nan_indices = numpy.isnan(cb) + if numpy.any(nan_indices): + cb_approx = self._approximate_cb_position(ca, n, c) + # avoid writing to CB directly since it should be callee-save + cb_approx[~nan_indices] = cb[~nan_indices] + cb = cb_approx + # compute virtual center + vc = self._compute_virtual_center(ca, cb, n) + # mask residues without coordinates + return numpy.ma.masked_array( + vc, + mask=self._create_nan_mask(ca, n, c), + fill_value=numpy.nan, + ) + + +class PartnerIndexEncoder(_BaseEncoder): + """ + An encoder for converting a protein structure to partner indices. + + For each residue, the coordinates of the virtual center are computed from the + coordinates of the ``CA``, ``CB`` and ``N`` atoms. + A pairwise distance matrix is then created, and the index of the closest partner + residue is extracted for each position. 
+ """ + + def __init__(self): + self.vc_encoder = VirtualCenterEncoder() + + def _find_residue_partners( + self, + x, + ): + # compute pairwise squared distance matrix + r = numpy.sum(x * x, axis=-1).reshape(-1, 1) + r[0] = r[-1] = numpy.nan + D = r - 2 * numpy.ma.dot(x, x.T) + r.T + # avoid selecting residue itself as the best + D[numpy.diag_indices_from(D)] = numpy.inf + # get the closest non-masked residue + return numpy.nan_to_num(D, copy=False, nan=numpy.inf).argmin(axis=1) + + def encode(self, ca, cb, n, c): + # encode backbone atoms to virtual center + vc = self.vc_encoder.encode(ca, cb, n, c) + # find closest neighbor for each residue + return self._find_residue_partners(vc) + + +class FeatureEncoder(_BaseEncoder): + """ + An encoder for converting a protein structure to structural descriptors. + """ + + def __init__(self): + self.partner_index_encoder = PartnerIndexEncoder() + self.vc_encoder = self.partner_index_encoder.vc_encoder + + def _calc_conformation_descriptors(self, ca, partner_index, dtype=numpy.float32): + # build arrays of indices to use for vectorized angles + i = numpy.arange(1, ca.shape[-2] - 1) + j = partner_index[i] + # compute conformational descriptors + u1 = _normalize(ca[..., i, :] - ca[..., i - 1, :], inplace=True) + u2 = _normalize(ca[..., i + 1, :] - ca[..., i, :], inplace=True) + u3 = _normalize(ca[..., j, :] - ca[..., j - 1, :], inplace=True) + u4 = _normalize(ca[..., j + 1, :] - ca[..., j, :], inplace=True) + u5 = _normalize(ca[..., j, :] - ca[..., i, :], inplace=True) + desc = numpy.zeros((ca.shape[0], 10), dtype=dtype) + desc[i, 0] = numpy.sum(u1 * u2, axis=-1) + desc[i, 1] = numpy.sum(u3 * u4, axis=-1) + desc[i, 2] = numpy.sum(u1 * u5, axis=-1) + desc[i, 3] = numpy.sum(u3 * u5, axis=-1) + desc[i, 4] = numpy.sum(u1 * u4, axis=-1) + desc[i, 5] = numpy.sum(u2 * u3, axis=-1) + desc[i, 6] = numpy.sum(u1 * u3, axis=-1) + desc[i, 7] = numpy.linalg.norm(ca[i] - ca[j], axis=-1) + desc[i, 8] = numpy.clip(j - i, -4, 4) + desc[i, 9] = 
numpy.copysign(numpy.log(numpy.abs(j - i) + 1), j - i) + return desc + + def _create_descriptor_mask(self, mask, partner_index): + i = numpy.arange(1, mask.shape[0] - 1) + j = partner_index[i] + out = numpy.zeros((mask.shape[0], 10), dtype=numpy.bool_) + out[1:-1, :] |= ( + mask[i - 1] | mask[i] | mask[i + 1] | mask[j - 1] | mask[j] | mask[j + 1] + ).reshape(mask.shape[0] - 2, 1) + out[0] = out[-1] = True + return out + + def encode(self, ca, cb, n, c): + # encode backbone atoms to virtual center + vc = self.vc_encoder.encode(ca, cb, n, c) + # find closest neighbor for each residue + partner_index = self.partner_index_encoder._find_residue_partners(vc) + # build position features from residue angles + descriptors = self._calc_conformation_descriptors(ca, partner_index) + # create mask + mask = self._create_descriptor_mask(vc.mask[:, 0], partner_index) + return numpy.ma.masked_array( + descriptors, + mask=mask, + fill_value=numpy.nan, + ) + + +class Encoder(_BaseEncoder): + """ + An encoder for converting a protein structure to 3di states. 
+ """ + + _INVALID_STATE = 2 + _CENTROIDS = numpy.array( + [ + [-1.0729, -0.3600], + [-0.1356, -1.8914], + [0.4948, -0.4205], + [-0.9874, 0.8128], + [-1.6621, -0.4259], + [2.1394, 0.0486], + [1.5558, -0.1503], + [2.9179, 1.1437], + [-2.8814, 0.9956], + [-1.1400, -2.0068], + [3.2025, 1.7356], + [1.7769, -1.3037], + [0.6901, -1.2554], + [-1.1061, -1.3397], + [2.1495, -0.8030], + [2.3060, -1.4988], + [2.5522, 0.6046], + [0.7786, -2.1660], + [-2.3030, 0.3813], + [1.0290, 0.8772], + ] + ) + + def __init__(self): + self.feature_encoder = FeatureEncoder() + layers = load_kerasify( + resource_files(__package__).joinpath("encoder_weights_3di.kerasify") + ) + self.vae_encoder = Model(layers + (CentroidLayer(self._CENTROIDS),)) + + def encode( + self, + ca, + cb, + n, + c, + ): + descriptors = self.feature_encoder.encode(ca, cb, n, c) + states = self.vae_encoder(descriptors.data) + return numpy.ma.masked_array( + states, + mask=descriptors.mask[:, 0], + fill_value=self._INVALID_STATE, + ) + + +def _normalize(x, *, inplace=False): + norm = numpy.linalg.norm(x, axis=-1).reshape(*x.shape[:-1], 1) + return numpy.divide(x, norm, out=x if inplace else None, where=norm != 0) diff --git a/src/biotite/structure/alphabet/encoder_weights_3di.kerasify b/src/biotite/structure/alphabet/encoder_weights_3di.kerasify new file mode 100644 index 000000000..cfec8fbe4 Binary files /dev/null and b/src/biotite/structure/alphabet/encoder_weights_3di.kerasify differ diff --git a/src/biotite/structure/alphabet/i3d.py b/src/biotite/structure/alphabet/i3d.py new file mode 100644 index 000000000..1f295d1e1 --- /dev/null +++ b/src/biotite/structure/alphabet/i3d.py @@ -0,0 +1,131 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further +# information. + +""" +NumPy port of the ``foldseek`` code for encoding structures to 3di. 
+""" + +__name__ = "biotite.structure.alphabet" +__author__ = "Martin Larralde" +__all__ = ["I3DSequence", "to_3di"] + +from biotite.sequence.alphabet import LetterAlphabet +from biotite.sequence.sequence import Sequence +from biotite.structure.alphabet.encoder import Encoder +from biotite.structure.chains import get_chain_starts +from biotite.structure.util import coord_for_atom_name_per_residue + + +class I3DSequence(Sequence): + """ + Representation of a structure in the 3Di structural alphabet. + :footcite:`VanKempen2024` + + Parameters + ---------- + sequence : iterable object, optional + The 3Di sequence. + This may either be a list or a string. + May take upper or lower case letters. + By default the sequence is empty. + + See also + -------- + to_3di : Create 3Di sequences from a structure. + + References + ---------- + + .. footbibliography:: + + """ + + alphabet = LetterAlphabet( + [ + "A", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "K", + "L", + "M", + "N", + "P", + "Q", + "R", + "S", + "T", + "V", + "W", + "Y", + ] + ) + unknown_symbol = "D" + + def __init__(self, sequence=""): + if isinstance(sequence, str): + sequence = sequence.upper() + else: + sequence = [symbol.upper() for symbol in sequence] + seq_code = I3DSequence.alphabet.encode_multiple(sequence) + super().__init__() + self.code = seq_code + + def get_alphabet(self): + return I3DSequence.alphabet + + def __repr__(self): + return f'I3DSequence("{"".join(self.symbols)}")' + + +def to_3di(atoms): + """ + Encode each chain in the given structure to the 3Di structure alphabet. + :footcite:`VanKempen2024` + + Parameters + ---------- + atoms : AtomArray + The atom array to encode. + May contain multiple chains. + + Returns + ------- + sequences : list of Sequence, length=n + The encoded 3Di sequence for each peptide chain in the structure. + chain_start_indices : ndarray, shape=(n,), dtype=int + The atom index where each chain starts. + + References + ---------- + + .. 
footbibliography:: + + Examples + -------- + + >>> sequences, chain_starts = to_3di(atom_array) + >>> print(sequences[0]) + DQQVVCVVCPNVVNVDHGDD + """ + sequences = [] + chain_start_indices = get_chain_starts(atoms, add_exclusive_stop=True) + for i in range(len(chain_start_indices) - 1): + start = chain_start_indices[i] + stop = chain_start_indices[i + 1] + chain = atoms[start:stop] + sequence = I3DSequence() + sequence.code = ( + Encoder() + .encode( + *coord_for_atom_name_per_residue(chain, ["CA", "CB", "N", "C"]), + ) + .filled() + ) + sequences.append(sequence) + return sequences, chain_start_indices[:-1] diff --git a/src/biotite/structure/alphabet/layers.py b/src/biotite/structure/alphabet/layers.py new file mode 100644 index 000000000..63279cd15 --- /dev/null +++ b/src/biotite/structure/alphabet/layers.py @@ -0,0 +1,86 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further +# information. + +""" +Implementation of the neural network layers used in ``foldseek``. 
+""" + +__name__ = "biotite.structure.alphabet" +__author__ = "Martin Larralde" +__all__ = ["Layer", "DenseLayer", "CentroidLayer", "Model"] + +import abc +import functools +import numpy + + +class Layer(abc.ABC): + @abc.abstractmethod + def __call__(self, x): + raise NotImplementedError + + +class DenseLayer(Layer): + def __init__(self, weights, biases=None, activation: bool = True): + self.activation = activation + self.weights = numpy.asarray(weights) + if biases is None: + self.biases = numpy.zeros(self.weights.shape[1]) + else: + self.biases = numpy.asarray(biases) + + def __call__(self, x): + x = numpy.asarray(x) + out = x @ self.weights + out += self.biases + + if self.activation: + return _relu(out, out=out) + else: + return out + + +class CentroidLayer(Layer): + def __init__(self, centroids) -> None: + self.centroids = numpy.asarray(centroids) + self.r2 = numpy.sum(self.centroids**2, axis=1).reshape(-1, 1).T + + def __call__(self, x): + # compute pairwise squared distance matrix + r1 = numpy.sum(x**2, axis=1).reshape(-1, 1) + D = r1 - 2 * x @ self.centroids.T + self.r2 + # find closest centroid + states = numpy.empty(D.shape[0], dtype=numpy.uint8) + D.argmin(axis=1, out=states) + return states + + +class Model: + def __init__(self, layers=()): + self.layers = list(layers) + + def __call__(self, x): + return functools.reduce(lambda x, f: f(x), self.layers, x) + + +def _relu( + x, + out=None, + *, + where=True, + casting="same_kind", + order="K", + dtype=None, + subok=True, +): + return numpy.maximum( + 0.0, + x, + out=out, + where=where, + casting=casting, + order=order, + dtype=dtype, + subok=subok, + ) diff --git a/src/biotite/structure/alphabet/pb.license b/src/biotite/structure/alphabet/pb.license new file mode 100644 index 000000000..688633bfa --- /dev/null +++ b/src/biotite/structure/alphabet/pb.license @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2013 Poulain, A. G. 
de Brevern + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/src/biotite/structure/alphabet/pb.py b/src/biotite/structure/alphabet/pb.py new file mode 100644 index 000000000..e2c527cca --- /dev/null +++ b/src/biotite/structure/alphabet/pb.py @@ -0,0 +1,143 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further +# information. + +""" +Conversion of structures into the *Protein Blocks* structural alphabet. 
# Reference dihedral angles (in degrees) of the 16 protein blocks,
# one row per block; adapted from PBxplore
PB_ANGLES = np.array(
    [
        [41.14, 75.53, 13.92, -99.80, 131.88, -96.27, 122.08, -99.68],
        [108.24, -90.12, 119.54, -92.21, -18.06, -128.93, 147.04, -99.90],
        [-11.61, -105.66, 94.81, -106.09, 133.56, -106.93, 135.97, -100.63],
        [141.98, -112.79, 132.20, -114.79, 140.11, -111.05, 139.54, -103.16],
        [133.25, -112.37, 137.64, -108.13, 133.00, -87.30, 120.54, 77.40],
        [116.40, -105.53, 129.32, -96.68, 140.72, -74.19, -26.65, -94.51],
        [0.40, -81.83, 4.91, -100.59, 85.50, -71.65, 130.78, 84.98],
        [119.14, -102.58, 130.83, -67.91, 121.55, 76.25, -2.95, -90.88],
        [130.68, -56.92, 119.26, 77.85, 10.42, -99.43, 141.40, -98.01],
        [114.32, -121.47, 118.14, 82.88, -150.05, -83.81, 23.35, -85.82],
        [117.16, -95.41, 140.40, -59.35, -29.23, -72.39, -25.08, -76.16],
        [139.20, -55.96, -32.70, -68.51, -26.09, -74.44, -22.60, -71.74],
        [-39.62, -64.73, -39.52, -65.54, -38.88, -66.89, -37.76, -70.19],
        [-35.34, -65.03, -38.12, -66.34, -29.51, -89.10, -2.91, 77.90],
        [-45.29, -67.44, -27.72, -87.27, 5.13, 77.49, 30.71, -93.23],
        [-27.09, -86.14, 0.30, 59.85, 21.51, -96.30, 132.67, -92.91],
    ]
)  # fmt: skip


class ProteinBlocksSequence(Sequence):
    """
    A sequence of symbols from the *Protein Blocks* structural alphabet.
    :footcite:`Brevern2000`

    Parameters
    ----------
    sequence : iterable object, optional
        The *Protein Blocks* symbols, given either as string or as list.
        May take upper or lower case letters.
        If omitted, the sequence is empty.

    See also
    --------
    to_protein_blocks : Create *Protein Blocks* sequences from a structure.

    References
    ----------

    .. footbibliography::

    """

    # The 16 protein blocks 'a' to 'p' plus 'Z' for positions without
    # an assignable block
    # NOTE(review): no case conversion is visible in this class, although the
    # docstring mentions upper and lower case input - confirm against
    # `Sequence`/`LetterAlphabet`
    alphabet = LetterAlphabet("abcdefghijklmnopZ")
    unknown_symbol = "Z"

    def get_alphabet(self):
        return ProteinBlocksSequence.alphabet


def to_protein_blocks(atoms):
    """
    Convert every chain of a structure into its *Protein Blocks*
    sequence.
    :footcite:`Brevern2000`

    Parameters
    ----------
    atoms : AtomArray
        The structure to be encoded.
        It is allowed to comprise more than one chain.

    Returns
    -------
    sequences : list of Sequence, length=n
        One *Protein Blocks* sequence per chain of the structure.
    chain_start_indices : ndarray, shape=(n,), dtype=int
        The index of the first atom of each chain.

    References
    ----------

    .. footbibliography::

    Examples
    --------

    >>> sequences, chain_starts = to_protein_blocks(atom_array)
    >>> print(sequences[0])
    ZZmmmmmnopjmnopacdZZ
    """
    # The trailing exclusive stop closes the last chain,
    # so each chain is delimited by two consecutive entries
    boundaries = get_chain_starts(atoms, add_exclusive_stop=True)
    chain_start_indices = boundaries[:-1]
    sequences = [
        _to_protein_blocks(atoms[first:last])
        for first, last in zip(chain_start_indices, boundaries[1:])
    ]
    return sequences, chain_start_indices
class LayerType(enum.IntEnum):
    """Identifiers of the layer types in a ``kerasify`` file."""

    DENSE = 1
    CONVOLUTION2D = 2
    FLATTEN = 3
    ELU = 4
    ACTIVATION = 5
    MAXPOOLING2D = 6
    LSTM = 7
    EMBEDDING = 8


class ActivationType(enum.IntEnum):
    """Identifiers of the activation functions in a ``kerasify`` file."""

    LINEAR = 1
    RELU = 2
    SOFTPLUS = 3
    SIGMOID = 4
    TANH = 5
    HARD_SIGMOID = 6


class KerasifyParser:
    """An incomplete parser for model files serialized with `kerasify`.

    The parser is an iterator over the layers stored in the file.

    Parameters
    ----------
    file : file-like object
        The file opened in binary mode.
        It must provide a ``readinto()`` method.

    Raises
    ------
    EOFError
        If the file ends before the requested data was read completely.

    Notes
    -----
    Only dense layers are supported, since the ``foldseek`` VQ-VAE model
    is only using 3 dense layers.
    """

    def __init__(self, file) -> None:
        self.file = file
        # Reusable scratch buffer, that is grown on demand in `_read()`
        self.buffer = bytearray(1024)
        # The file starts with the number of layers
        (self.n_layers,) = self._get("I")

    def read(self):
        """
        Read the next layer from the file.

        Returns
        -------
        layer : DenseLayer or None
            The parsed layer or ``None``, if all layers were already read.

        Raises
        ------
        NotImplementedError
            If the layer type or its activation function is not supported.
        """
        if self.n_layers == 0:
            return None

        self.n_layers -= 1
        layer_type = LayerType(self._get("I")[0])
        if layer_type == LayerType.DENSE:
            (w0,) = self._get("I")
            (w1,) = self._get("I")
            (b0,) = self._get("I")
            # The scratch buffer is overwritten by the next read,
            # hence the arrays need to be copied
            weights = (
                np.frombuffer(self._read(f"={w0 * w1}f"), dtype="f4")
                .reshape(w0, w1)
                .copy()
            )
            biases = np.frombuffer(self._read(f"={b0}f"), dtype="f4").copy()
            activation = ActivationType(self._get("I")[0])
            if activation not in (ActivationType.LINEAR, ActivationType.RELU):
                raise NotImplementedError(
                    f"Unsupported activation type: {activation!r}"
                )
            return DenseLayer(weights, biases, activation == ActivationType.RELU)
        else:
            raise NotImplementedError(f"Unsupported layer type: {layer_type!r}")

    def __iter__(self):
        return self

    def __next__(self) -> "Layer":
        layer = self.read()
        if layer is None:
            raise StopIteration
        return layer

    def _read(self, format: str) -> memoryview:
        """
        Read as many bytes from the file as the given :mod:`struct` format
        requires.

        The returned view refers to the internal scratch buffer:
        Its content is overwritten by the next read and it must be released
        before then, as the buffer cannot be resized while it is exported.
        """
        n = struct.calcsize(format)
        if len(self.buffer) < n:
            self.buffer.extend(bytes(n - len(self.buffer)))
        v = memoryview(self.buffer)[:n]
        # `readinto()` may deliver less bytes than requested:
        # Continue until the view is filled and fail at a premature file end
        # instead of silently parsing stale buffer content
        n_filled = 0
        while n_filled < n:
            n_read = self.file.readinto(v[n_filled:])  # type: ignore
            if not n_read:
                raise EOFError(
                    f"Unexpected end of file: expected {n} bytes, "
                    f"but got only {n_filled}"
                )
            n_filled += n_read
        return v

    def _get(self, format: str):
        """
        Read and unpack the values described by the given :mod:`struct`
        format.
        """
        v = self._read(format)
        return struct.unpack(format, v)


@functools.cache
def load_kerasify(file_path):
    """
    Load the model layers from a ``.kerasify`` file.

    The result is cached, i.e. repeated calls with the same path return
    the same tuple without reading the file again.

    Parameters
    ----------
    file_path : str
        The path to the ``.kerasify`` file.

    Returns
    -------
    layers : tuple of Layer
        The model layers.
    """
    with open(file_path, "rb") as file:
        return tuple(KerasifyParser(file))
- - Raises - ------ - BadStructureError - If the amount of backbone atoms is not equal to amount of - residues times 3 (for N, CA and C). - - See Also - -------- - dihedral - - Examples - -------- - - >>> phi, psi, omega = dihedral_backbone(atom_array) - >>> print(np.stack([np.rad2deg(phi), np.rad2deg(psi)]).T) - [[ nan -56.145] - [ -43.980 -51.309] - [ -66.466 -30.898] - [ -65.219 -45.945] - [ -64.747 -30.346] - [ -73.136 -43.425] - [ -64.882 -43.255] - [ -59.509 -25.698] - [ -77.989 -8.823] - [ 110.784 8.079] - [ 55.244 -124.371] - [ -57.983 -28.766] - [ -81.834 19.125] - [-124.057 13.401] - [ 67.931 25.218] - [-143.952 131.297] - [ -70.100 160.068] - [ -69.484 145.669] - [ -77.264 124.223] - [ -78.100 nan]] + An array containing the 3 backbone dihedral angles for every CA atom. + `phi` is not defined at the N-terminus, `psi` and `omega` are not defined at the + C-terminus. + In these places the arrays have *NaN* values. + If an :class:`AtomArrayStack` is given, the output angles are 2-dimensional, + the first dimension corresponds to the model number. 
""" - bb_filter = filter_peptide_backbone(atom_array) - backbone = atom_array[..., bb_filter] - - if ( - backbone.array_length() % 3 != 0 - or (backbone.atom_name[0::3] != "N").any() - or (backbone.atom_name[1::3] != "CA").any() - or (backbone.atom_name[2::3] != "C").any() - ): - raise BadStructureError( - "The backbone is invalid, must be repeats of (N, CA, C), " - "maybe a backbone atom is missing" - ) - phis = [] - psis = [] - omegas = [] - for chain_bb in chain_iter(backbone): - phi, psi, omega = _dihedral_backbone(chain_bb) - phis.append(phi) - psis.append(psi) - omegas.append(omega) - return ( - np.concatenate(phis, axis=-1), - np.concatenate(psis, axis=-1), - np.concatenate(omegas, axis=-1), - ) + amino_acid_mask = filter_amino_acids(atom_array) + # Coordinates for dihedral angle calculation + coord_n, coord_ca, coord_c = coord_for_atom_name_per_residue( + atom_array, + ("N", "CA", "C"), + amino_acid_mask, + ) + n_residues = coord_n.shape[-2] -def _dihedral_backbone(chain_bb): - bb_coord = chain_bb.coord # Coordinates for dihedral angle calculation # Dim 0: Model index (only for atom array stacks) # Dim 1: Angle index # Dim 2: X, Y, Z coordinates # Dim 3: Atoms involved in dihedral angle - if isinstance(chain_bb, AtomArray): - angle_coord_shape = (len(bb_coord) // 3, 3, 4) - elif isinstance(chain_bb, AtomArrayStack): - angle_coord_shape = (bb_coord.shape[0], bb_coord.shape[1] // 3, 3, 4) - phi_coord = np.full(angle_coord_shape, np.nan) - psi_coord = np.full(angle_coord_shape, np.nan) - omega_coord = np.full(angle_coord_shape, np.nan) - - # Indices for coordinates of CA atoms - ca_i = np.arange(bb_coord.shape[-2] // 3) * 3 + 1 + if isinstance(atom_array, AtomArray): + angle_coord_shape: tuple[int, ...] 
= (n_residues, 3, 4) + elif isinstance(atom_array, AtomArrayStack): + angle_coord_shape = (atom_array.stack_depth(), n_residues, 3, 4) + coord_for_phi = np.full(angle_coord_shape, np.nan, dtype=np.float32) + coord_for_psi = np.full(angle_coord_shape, np.nan, dtype=np.float32) + coord_for_omg = np.full(angle_coord_shape, np.nan, dtype=np.float32) + # fmt: off - phi_coord [..., 1:, :, 0] = bb_coord[..., ca_i[1: ]-2, :] - phi_coord [..., 1:, :, 1] = bb_coord[..., ca_i[1: ]-1, :] - phi_coord [..., 1:, :, 2] = bb_coord[..., ca_i[1: ], :] - phi_coord [..., 1:, :, 3] = bb_coord[..., ca_i[1: ]+1, :] - psi_coord [..., :-1, :, 0] = bb_coord[..., ca_i[:-1]-1, :] - psi_coord [..., :-1, :, 1] = bb_coord[..., ca_i[:-1], :] - psi_coord [..., :-1, :, 2] = bb_coord[..., ca_i[:-1]+1, :] - psi_coord [..., :-1, :, 3] = bb_coord[..., ca_i[:-1]+2, :] - omega_coord[..., :-1, :, 0] = bb_coord[..., ca_i[:-1], :] - omega_coord[..., :-1, :, 1] = bb_coord[..., ca_i[:-1]+1, :] - omega_coord[..., :-1, :, 2] = bb_coord[..., ca_i[:-1]+2, :] - omega_coord[..., :-1, :, 3] = bb_coord[..., ca_i[:-1]+3, :] + coord_for_phi[..., 1:, :, 0] = coord_c[..., 0:-1, :] + coord_for_phi[..., 1:, :, 1] = coord_n[..., 1:, :] + coord_for_phi[..., 1:, :, 2] = coord_ca[..., 1:, :] + coord_for_phi[..., 1:, :, 3] = coord_c[..., 1:, :] + + coord_for_psi[..., 0:-1, :, 0] = coord_n[..., 0:-1, :] + coord_for_psi[..., 0:-1, :, 1] = coord_ca[..., 0:-1, :] + coord_for_psi[..., 0:-1, :, 2] = coord_c[..., 0:-1, :] + coord_for_psi[..., 0:-1, :, 3] = coord_n[..., 1:, :] + + coord_for_omg[..., 0:-1, :, 0] = coord_ca[..., 0:-1, :] + coord_for_omg[..., 0:-1, :, 1] = coord_c[..., 0:-1, :] + coord_for_omg[..., 0:-1, :, 2] = coord_n[..., 1:, :] + coord_for_omg[..., 0:-1, :, 3] = coord_ca[..., 1:, :] # fmt: on phi = dihedral( - phi_coord[..., 0], phi_coord[..., 1], phi_coord[..., 2], phi_coord[..., 3] + coord_for_phi[..., 0], + coord_for_phi[..., 1], + coord_for_phi[..., 2], + coord_for_phi[..., 3], ) psi = dihedral( - psi_coord[..., 
0], psi_coord[..., 1], psi_coord[..., 2], psi_coord[..., 3] + coord_for_psi[..., 0], + coord_for_psi[..., 1], + coord_for_psi[..., 2], + coord_for_psi[..., 3], ) - omega = dihedral( - omega_coord[..., 0], - omega_coord[..., 1], - omega_coord[..., 2], - omega_coord[..., 3], + omg = dihedral( + coord_for_omg[..., 0], + coord_for_omg[..., 1], + coord_for_omg[..., 2], + coord_for_omg[..., 3], ) - return phi, psi, omega + return phi, psi, omg def centroid(atoms): diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py index 551155f5c..eef76924f 100644 --- a/src/biotite/structure/io/pdbx/convert.py +++ b/src/biotite/structure/io/pdbx/convert.py @@ -600,7 +600,7 @@ def _parse_inter_residue_bonds(atom_site, struct_conn): def _find_matches(query_arrays, reference_arrays): """ For each index in the `query_arrays` find the indices in the - `reference_arrays` where all query values the reference counterpart. + `reference_arrays` where all query values match the reference counterpart. If no match is found for a query, the corresponding index is -1. """ match_masks_for_all_columns = np.stack( diff --git a/src/biotite/structure/segments.py b/src/biotite/structure/segments.py index 5841346b3..f67c24d21 100644 --- a/src/biotite/structure/segments.py +++ b/src/biotite/structure/segments.py @@ -16,7 +16,7 @@ import numpy as np -def apply_segment_wise(starts, data, function, axis): +def apply_segment_wise(starts, data, function, axis=None): """ Generalized version of :func:`apply_residue_wise()` for residues and chains. 
def coord_for_atom_name_per_residue(atoms, atom_names, mask=None):
    """
    Get the coordinates of the atoms with the given names for every
    residue.

    If a residue does not contain the specified atom, the coordinates are `NaN`.
    If a residue contains multiple atoms with the specified name, an exception is
    raised.

    Parameters
    ----------
    atoms : AtomArray, shape=(n,) or AtomArrayStack, shape=(m,n)
        The atom array or stack to get the residue-wise coordinates from.
    atom_names : sequence of str, length=k
        The atom names to get the coordinates for.
    mask : ndarray, shape=(n,), dtype=bool, optional
        A boolean mask to further select valid atoms from `atoms`.

    Returns
    -------
    coord: ndarray, shape=(k, m, r, 3) or shape=(k, r, 3)
        The coordinates of the specified atom for each residue.

    Raises
    ------
    BadStructureError
        If a residue contains more than one atom with the same specified
        name.
    """
    residue_starts = get_residue_starts(atoms)
    all_residue_masks = get_residue_masks(atoms, residue_starts)

    if isinstance(atoms, AtomArray):
        coord_shape = (len(atom_names), len(residue_starts), 3)
    else:
        coord_shape = (
            len(atom_names),
            atoms.stack_depth(),
            len(residue_starts),
            3,
        )
    # Residues lacking the respective atom keep the NaN values
    coord = np.full(coord_shape, np.nan, dtype=np.float32)

    for i, atom_name in enumerate(atom_names):
        specified_atom_mask = atoms.atom_name == atom_name
        if mask is not None:
            specified_atom_mask &= mask
        all_residue_masks_for_specified_atom = all_residue_masks & specified_atom_mask
        number_of_specified_atoms_per_residue = np.count_nonzero(
            all_residue_masks_for_specified_atom, axis=-1
        )
        if np.any(number_of_specified_atoms_per_residue > 1):
            raise BadStructureError(f"Multiple '{atom_name}' atoms per residue")
        residues_with_specified_atom = number_of_specified_atoms_per_residue == 1
        # Select the slice for this atom name via basic indexing first:
        # In 'coord[i, ..., bool_mask, :]' the integer and the boolean mask
        # would be two advanced indices separated by an ellipsis,
        # so NumPy would move the residue dimension in front of the model
        # dimension, which does not fit the '(m, r, 3)' shape of the
        # coordinates selected from an atom array stack
        coord_for_atom_name = coord[i]
        coord_for_atom_name[..., residues_with_specified_atom, :] = atoms.coord[
            ..., specified_atom_mask, :
        ]

    return coord
+ """ + align.SubstitutionMatrix(alphabet, alphabet, matrix_name) + + +@pytest.mark.parametrize( + "method_name", + [ + "std_protein_matrix", + "std_nucleotide_matrix", + "std_3di_matrix", + "std_protein_blocks_matrix", + "std_clepaps_matrix", + ], +) +def test_default_matrices(method_name): + """ + Test for exceptions when using the static methods for getting default matrices. + """ + matrix = getattr(align.SubstitutionMatrix, method_name)() + assert isinstance(matrix, align.SubstitutionMatrix) + + def test_matrix_str(): """ Test conversion of substitution matrix to string via a small diff --git a/tests/structure/data/alphabet/1ay7.bcif b/tests/structure/data/alphabet/1ay7.bcif new file mode 100644 index 000000000..3ce454e2e Binary files /dev/null and b/tests/structure/data/alphabet/1ay7.bcif differ diff --git a/tests/structure/data/alphabet/1cew.bcif b/tests/structure/data/alphabet/1cew.bcif new file mode 100644 index 000000000..f19f878e8 Binary files /dev/null and b/tests/structure/data/alphabet/1cew.bcif differ diff --git a/tests/structure/data/alphabet/1mol.bcif b/tests/structure/data/alphabet/1mol.bcif new file mode 100644 index 000000000..82787d42f Binary files /dev/null and b/tests/structure/data/alphabet/1mol.bcif differ diff --git a/tests/structure/data/alphabet/README.rst b/tests/structure/data/alphabet/README.rst new file mode 100644 index 000000000..afb292bd2 --- /dev/null +++ b/tests/structure/data/alphabet/README.rst @@ -0,0 +1,31 @@ +Structural alphabet sequences +============================== + +This directory contains structural alphabet sequences for the test structure files +from the `tests/structure/data/` directory, generated with the respective reference +implementation. + +3Di sequences +------------- + +The 3Di sequences in `i3d.fasta` were generated with `foldseek` according to +`these instructions `_: + +.. 
code-block:: console + + $ foldseek createdb --chain-name-mode 1 tests/structure/data/*.cif /tmp/biotite_3di + $ foldseek lndb /tmp/biotite_3di_h /tmp/biotite_3di_ss_h + $ foldseek convert2fasta /tmp/biotite_3di_ss tests/structure/data/alphabet/i3d.fasta + +Protein Blocks sequences +------------------------ + +Only one sequence is available in `pb.fasta`, that is taken from +`https://pbxplore.readthedocs.io/en/latest/PBassign.html`. +`1ay7.bcif` contains the corresponding structure. + +CLePAPS sequences +----------------- + +The CLePAPS sequences in `clepaps.fasta` were taken from +`presentation slides `_. diff --git a/tests/structure/data/alphabet/clepaps.fasta b/tests/structure/data/alphabet/clepaps.fasta new file mode 100644 index 000000000..7edd7291c --- /dev/null +++ b/tests/structure/data/alphabet/clepaps.fasta @@ -0,0 +1,4 @@ +>1mol_A +RRFEDECCGAIHHHHHHHHHHHHHHHOMICQEECBLDFQNBFEEEEFEQNNGCPLDDEEEDEEENOGCEDEEEEEEPKKOGFEDPLDEQBGCCR +>1cew_I +RRCECECAJGBIHHHHHHHHIHHHIGGBLDFFCPLDPLEEFEDPOLCEEEEEEDEFDEAGCAKLAJGKHHIMNGKLQQQDEEEDEEEEEBPKKOGEEDPLEEER diff --git a/tests/structure/data/alphabet/i3d.fasta b/tests/structure/data/alphabet/i3d.fasta new file mode 100644 index 000000000..a931e9059 --- /dev/null +++ b/tests/structure/data/alphabet/i3d.fasta @@ -0,0 +1,216 @@ +>1aki_A THE STRUCTURE OF THE ORTHORHOMBIC FORM OF HEN EGG-WHITE LYSOZYME AT 1.5 ANGSTROMS RESOLUTION +DADDLQRVLVLCVVLPPACQVHDHSLLVSQQLCQPPVRAQADWDADPQQWIQGGSNRHIQQEDAPQPRGDNGNNPVVYHSVVSSDPRCPSVVSRVSVVCVPPVHVVVRVSSVVPPPPDPSVVSCPPHDD +>1crr_MODEL_21_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEAAEDFDQPQCVVLLVVCVAVNDRDPPPCLPDFDWDWHWDCLVHDRYTYTYGDHHPPDDDDDVNLVCLLPGFFYLQEYEQAVCVRVVVSVPVQVSSCVNVVHLDTQHAYEYEPPPDHPRNDDVVVVVVVVVVSVYHYWYYYSVPRPGSCVRVSSRVVSVVVD +>1crr_MODEL_22_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. 
GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEALEEWDAQLCLVQLLVCVPVVDGDNCPPQADFDWRKDWDAAPNRIYIYGYGNGGDHPDDDPVSLVVLVVHFQYAQEYAQLVCVRVVVSQVVQVSSCVNVVHLQTLYAYEYEPPPDDRGPHPVVVCVVRCVVSPHYYWYYYSNPRPGSCVRVRSRVVSVVVD +>1crr_MODEL_23_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEALEAFDDPQCLVLLQVCVAVVDGDPDDALLDADWDWHWDQQDPGIYTYGYHDHGDHPPDDDVSLVVLQVGFFYELEAAQLPCVRVVVVQVVQPSSCVSVVHLDTLYAYEYENPPDDNGNNPPPPVCVRCVVSVHHYWYAYSNPRPGSCCRVSSRVVSVVPD +>1crr_MODEL_24_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEAEEDWAQFLCLVLLLVCVPVVDGDGCPDQADFDWRWHWDAAPHDIYIYTYGDGHDHDDDDVVSLVCLQVHFFYQLRAAQQPVVRVVRVVVVQVSSCVSVVHLQTQYEDEHEPPPDDDGNHDPPVQCCVQVVSVHHYWYAYSPPRRGSVCRVNVRVVSVVPD +>1crr_MODEL_25_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEAQEAWDLPQCRVLLLVCVAVNDRDPCPPLPDFDWRWDWDCLPPNIYIYGYGDGRDDDPDDPPSLVVLQVHQFYAQEAAQAPCVRVVVVVVVQPSSCVSVVHLDTQYEYEYEPPPDDRGRRDVVNVCVVVVVSPYHYWYYYSVPRRGSCCRVSVRCVSVVVD +>1crr_MODEL_26_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEQLEAFDPPLCLPLLQCCQPVNDRDPPPDAQDADWHWDWDAQPNPIYIYIYGSGGDDDDPDDVSLVCLQPHFNYAHEAAFAPCVRVVVVQVVQVSSCVSVVHQQTLHAYEYENPPDDDGNDDPVRVCVVCVVSVHYYWYAYSVPRRGSCVRVNSRVVSVVPD +>1crr_MODEL_27_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEEQEDWDQPLCLVQLVVCVPVPAGDPDPDQADADFDKDWDALDPRIYIYRYGSGGPHDDPPCVSLVCLQVHFFYAYEAEQLPVVRVVVCLPVVVCSCVSVVHLDTLYAHEYENPPDPDGNRDPPVVCCRQVVSPHHYFYAYSNVRHGSSVRVSSRCVSVVVD +>1crr_MODEL_28_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. 
GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DEEAEAEEAWAAPLCPVLLVVCVPVVDRDDPDQLQDADWDWDFDPLVDDTYIYTYGDHRHDCPPPDVSLLSLQVHQQYALEAAQAPVVRVVRVVVVQPSSCVRVVHRDGQHAYEHEPPPDDDGPHDPVVQCCVQVVSVHYYYYAYSPPRRGSSVRVSVRVVSVVPD +>1crr_MODEL_29_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DEEAEALEEFAQPQCLVLLLCCVPVNDRDDRVQLQDFDKDWDWDQQPNPIYIYIYGNGRHDPDDDCVSLVCLQVHFNYALEAAQVDCVRVVVSVVVQVSSCVNVVHLDTQYEYEHENPPDDPGPRDPVNVCVPCVVSVYYYFYYYSVPRHGSCVRVNSSVVSVVVD +>1crr_MODEL_30_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEALEAWDQPQCPVLLQVCVPVVAGDPCPPLVDFDWGWDWDPQVDDIYTYTYGDDHDDPPDPVVSLVVLQGGFNYALEAAQQDVVRVVRVVVVQVSSCVSVVHLDTQYEYEYEDPVDDDGPHDPVRQCCVQVVSVHHYFYAYSNHTPRSCVRVNSRVVVVVVD +>1crr_MODEL_31_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEALEEFDLPLCQVLLLVCVAPNDRDPCDPQANWDWDWHWDPLPPDIYTYTYGYDGHDDDDDDVSLVVLQVGFQYEQEYAQLVCVRVVVCVPVVVSSCVNVVDLQTLYAYEHEDPPDDRGDHPVPVSCVVQVVSVHHYFYAYRNVGGRSSVRVSSRVVSVVPD +>1crr_MODEL_32_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DEEAEQLEAWDQPQCLVQLQVCQPVNDGDPPPPLQDFDWGKDWDDQPPDIYIYTYGDDHDDPPPRVVSLVCLQVGFNYAYEAEQVPCVRVVVVVCVQPSSCVNVVHQDTQHAYEYENCPDDHGNDDPVNVCVVCVVSVHYYFYYYSVVRHRSSVRVNSSCVSVVVD +>1crr_MODEL_33_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DEEAEQLEAFDQPQQPVLLLVCLVPNAGDPPPDLPDFDWDKDWDCLDNRIHIYTYGDGRQPPDDDDVSLVRLQGGFLYALRAAQAPVVRVVRVQCVQPSSCVNVVHLQTLYADEHEPPPDPDGPHPVVVVCVVQVVSPHYYYYYYRRPRRRSSVRVNSRVVSVVVD +>1crr_MODEL_34_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. 
GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEEQEAWDPQLCLVQLQVCVVPNAGDNPPPLQDFDFGWDQDAFPNHTYTYGYGSGHDHDPDDPPSLQVLQVGFFYAQEAEFAPRVRVVRCVVRQPSSCVNVVHLDTQHAYEHENPPDDHGNDPVVVSCVVVVVSPYHYYYYYSHPRRCSCVRVRSRVVSVVVD +>1crr_MODEL_35_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DEEFEAAEAFDDPLQPPLLQVCQAPNARDPCPDLPDAFWGWHWDAQVNRIYTYTYGDGRDCPDDNVDNLVCLLGGFLYALRAEQQDVVRVVRVQCVQVSSCVSVVHNQGLYAYEHEPPPDPDGPHPVVNSVVRCVVSPHHYYYAYNNVGRRSCVSVNSSVVSVVVD +>1crr_MODEL_36_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DEEAEALEDFDLPQCLVQLLVCVVVVAGDDDDPQQDFDWRWDWDDQPPHTYIYTYGNGGDNPDDDDVSLVVLQVHFNYALEAAQQDVVRVVVVVVVQVSSCVSVVHLDTQYAYEHENPPDDPGNHDPVVVCVVCVVSVHYYFYYYSNPRGRSSVRVNSRVVSVVVD +>1crr_MODEL_37_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEALEAFDQPLCPVQLQVCVVVNDGDDDDQLQDADWGWDFDDQDPDTHIYTYGDGRDDPPPPDVSLVVLQVHQNYAHEAEFAPCVRVVVVLCVLVSSCVSVVPLQGLYAYEHEPPVDPDGNDDPVNVVVVCVVSPHDYFYAYSVHRPRSSVRVNVSVVSVVVD +>1crr_MODEL_38_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DEEAEALEAFDAPLCPPLLQVCVPVNDRDPPPPQPDFDWGKDWDALPPHIYIYTYGDGGDDPDDCVVSLVVLQVHFFYAHEAAQLDVVRVVRVVVVQVSSCVSVVHLQTLYEHEYENPPDDNGNHDPPVQCVVCVVSVHHYFYYYSVVRHGSNCRVSVRVVSVVVD +>1crr_MODEL_39_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DAEAEQLEAFDQPQCPVQLVVCLVPNAGDPDPPLPDFDWDKDFDQQVNRTYIYTYGDHRDDDPDDPPSLQVLQVGFQYAHEAAQVPVVRVVVVVVVLVVSCVNVVHLDGLYAYEHEPPPDDDGNRPVVVVCVVCVVSVHYYWYAYSNHRPGSCVRVSSRVVSVVVD +>1crr_MODEL_40_A THE SOLUTION STRUCTURE AND DYNAMICS OF RAS P21. 
GDP DETERMINED BY HETERONUCLEAR THREE AND FOUR DIMENSIONAL NMR SPECTROSCOPY +DEEAEALEEFDAPLCVVQLVVCQPVNDRDPDDDLVDFDWAWHWDAQPPHIYIYIYGDHRDDDDDDDVSLVCLQVGFQYALEAAQAPCVSVVVSVVVLVSSCVNVVHLQTLYAYEHEPCPDPHGNCPPVNCCVSAVVSPHHYYYYYSVVRPCSCVRVNSRVVSVVVD +>1dix_A CRYSTAL STRUCTURE OF RNASE LE +DDDPQLAWWKKWKWWFVLQVDQAPDHWAFAVVDRADRFIATQAIAGHHLVQHGAWQQPPPPFDDCVVVVVCVVVCQHHHGGRHPPHYNCVVRLRRRCRTGVVSLCVQQVDPNSRVVLSSVVCVVQRLQVLCVVLVHHLPWFWAFPVSSQVSSCVVPVEGKDFAWDAGHVGFTATTMIIWMGGSNSPDTDHDSDDHDHDRDTIHTRHGD +>1f2n_A RICE YELLOW MOTTLE VIRUS +DDDFKDKDWDKAFQWFQKFALPAQFWDKAFPAQLSGVVSLLVQLQFFKKAWPKKKKWWAFDDDPPDFWKKKKAKALFQPDDTDSADVSSVPHHQIFMDTLHDANVCVCCNPPHCCPRIGMRIDDRDPGDIATHANDAPVVDDRVVRSRHTRIMMIMGINDNDNHIDGGTTMMITIMMMGGHGDDSVPGD +>1f2n_B RICE YELLOW MOTTLE VIRUS +DDDQKAKDKDKAFQWFQKFAQPALWWDKAFPAQLSGVVSLLVQLQFFKKAWDKKKKWKAFDDDPPDFWKKWKAKALFLVDDTDSADVSRVPHHQIFMGGRRFANVQVVCNPPRDPDSHGMRIDPRVPQDMATHANDAPNVDDRVVRSSHGRIMMTMGINHRDNHIDGGITMMITIIMMGGHGDDSVVGD +>1f2n_C RICE YELLOW MOTTLE VIRUS +DDDDDDDDDPDDDDDDPADPDDDPGDQKDKDWAKAFQWFQKFAQPALFWDKAFPAQLSGVVSVVNQLQFFKKAWDKKKKWKAFPDDPVDAWKKWKAKDLFQPDDTDSADVSSVVGHQIFMGGLRFANCCVVCNPPHCCVGIGMRIDDRVPGDMATHANDAPNPDPSVVSSSHTRIMMIMGINHRDNHMDGGTTMMITIIMMGGHGDDSVVGD +>1gya_MODEL_1_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDFDEDPAEDAAQFKFKADFDPDDDDDFFAKKFKAFVVPRHGAFMTGHPPDGDHPDLCWDADDSGMIIGHGDDPVSWGWMWIFTGGPVPDSPDIHTYGYYYDDD +>1gya_MODEL_2_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDFEAEAEDAFPFKWKADDDPDFDDDFFWKKWKAFVPPRGTQFITADPPDGDHDDLCWYADNGQMIIHGGAHQVSWTWMKMWTGGNPPDTDDIGTYGGDYDDD +>1gya_MODEL_3_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDQEDPAEDEAFDKDKADDDPDFDDDWFFKKWKAFVPPTGTQAITADPPDGPDPDLCWYADPRGMTIHHGDDVVSWTWMKIFTGTNVPHRPDIGTYGYDYDDD +>1gya_MODEL_4_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 
+DDDDFEDDDEDAFQFKDKDAFDPDFDDDFFAKWFKAFPPRRHGCFMGDDPPDGRDPDLCWYADPRQMIMRGGHHQVSWGKMKIFTGGPVPDRPGIHTYTYHYDDD +>1gya_MODEL_5_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDFFEDEAEDAAFFKDKADDDPAFDDDFAWKWFKAFDPVRGTQFMTADPPDGHDPDQCWYADRRQMIIHGTDGQVSFGWMWIFGGGHVPDRPDIHTYGYHYDDD +>1gya_MODEL_6_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDFFEDAAEDAAQFKGKFDDDPDADDQFFQKKFKAQVPPGHTQAIGDDPPDGDDPDLCWYADPRHMIIHHGHHPVSWTWMKMFTGGPVPDSPDIGTYTYDHDDD +>1gya_MODEL_7_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDQEDPAEDAAFFKDKADDDPDFDDDFFFKKFKAFVVVRHGDFMTQPPPDTDDDDLCWYADRRRMIMGGTHGQVSWTWMKMFTGGNVPDRPDITTYTYGYDDD +>1gya_MODEL_8_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDFEDDAEDAAFAKDKADDDQDADDDFFFKKFKAFVPPGHTQAITDDPPDGHDPDLCWYADRGRMTIHHGDHVVSWGWMKIFTGGNVPDRPDIGTYGHYYDDD +>1gya_MODEL_9_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDFEDEAEDAAFFKGKAADDPDADDVWFFKKFKAQVVVRGGLFMTDDPPDGDAPDLLWHADNRNMTIHGGDDPVSWTWMKIFTGTPVPDRPDIGTYGYYHDDD +>1gya_MODEL_10_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDAADPAEDAAFFKDKFDFDPDADDDWFQKWFKFFPPVTGGAFMGDHPPDGDHPDLCWYADPRQMTMHHGDHPVSFGWMWIFTGGPVPDRPDIHTYGYYYDDD +>1gya_MODEL_11_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDFEDDAEDAAQDKDKFDFDPDADDDFFFKKFKAQVPPGHTQFITDDPPDGDHPDLCWYADPRRMIMHGGDDQVSWGWMKIFTGTPVPDRPDIGTYTYYYDDD +>1gya_MODEL_12_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDFEDDAEDAFQGKDKAADDPDADDDFFFKKFKAQVVVRHTQAMGDDPPDGHAPDQCWHADRRQMITHGGDDPVSFTWMKIFTGGPVPDRPDIGTYTYDYDDD +>1gya_MODEL_13_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDQEDEAEDAQPWKDKAAFDPDFDDDFAFKWFKFFDPVRGGQFMGDDPPDGDDDDLCWYADRRQMIIHGGQRPVSFGWMKIFGGGNVPDRPDIGTYGGYYDDD 
+>1gya_MODEL_14_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDFEDEWEDAFQFKTKAQFDPDDDDDFFFKKFKAFVPVRHGQFIGDPPPDTDDDDLCWYADPNNMIMRGTHHVVSFGWMWIFGGGNVPDSPDIGTYGGDYDDD +>1gya_MODEL_15_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDFEDEAEDAAFFWDKAAFDPDFDDPWFFKKFKAFDPVGGTQAIGDDPPDGDAPDQCWHADRGRMTIHGTGGQVSWGWMKMFTGTPVPDRPDIGTYGYHYDDD +>1gya_MODEL_16_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDFADDAEDAAQDKDKFDDDPPFDDDFFFKKFKAQVPVRHTDAIGDDPPDGDDPDQQWYADRRRMIMGGTHDPVSWTWMKMFTGGNVPDRPDIHTYGGDYDDD +>1gya_MODEL_17_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDFFADEAEDAAQFKDKDDDDPDFDDDQWFWKFKAQVVVRHGQFIGDDPPDGDDPDLCWYADNGQMTMHGGGHQVNFTWMKIFTGGNVPDGPDIHTYTYDYDDD +>1gya_MODEL_18_A N-GLYCAN AND POLYPEPTIDE NMR SOLUTION STRUCTURES OF THE ADHESION DOMAIN OF HUMAN CD2 +DDDDFEDPAEDAAQGKDKADDDPDADDDFFFKKWKAFVPPGGTQAITDPPDDGRAPDQCWHADRRRMIMGGGDDPVSFGWMKIFTGGNVPDRPDIGTYGYYYDDD +>1igy_A STRUCTURE OF IMMUNOGLOBULIN +DDKAKPAQEAEEAWQAKDKIKMFDPWFQFQAKWKWWDAPPDDIHTAAGGQWHGDPPHDPQWTWHDGTGMIMIIGRTADLVRFTKMWMWRDRDPPIDIHPIHGYQYDDDWDFWDKDWDWFDPVVLVVWKTKIKMKTAQTPDPPKDKAKDLADGHRDDQKDKDWDDQDSPSRGIIIMIMHMDTSCVSVVHFKIKTWMDDPVDPDIDIDIGTDDPD +>1igy_B STRUCTURE OF IMMUNOGLOBULIN +DDWAKDDAEEDEQFAKDKMKTFDDDDQQLQWKKWKWWAADVGDIGTAWIAGNPRPDIDGDDVQPPQKDWDADSVRGIIMIMHGGHHQVVFTWMWMAIDVPGPHIHDTDTYGHDPDDFWAWDKDWFAPQCDPCPPQKDKTKIKTWFGPDDDKDKDKPNVPDDDDKAWDDWDDDPRMTITMMMHIGGRPCPQPPWMWIWMAGVVVGDTDIHTRDHDDPDPPPPDDDDPPDKDKDKAFFALCQLQVQVHWTWIKMKIWAAAPAWPQQDKWKDFPNDIDDFPPQWPWFDCPPNHTMTMGGGTDHSVSLVVWTWIWIFTDTDPDPDRDTGIDTHDDDDWFFWPKDKADFDPVPDPDFKGKIKIKTWFTDDQNKDKFKAFLHDGDDDWDKDGWDQDDVRGTMIMIIDMDGNVSQVVFSKMKMKMADVVDDSRIDIDIDTD +>1igy_C STRUCTURE OF IMMUNOGLOBULIN 
+DAKAKPAQEAEEAWQAKDKIKMFGPWFQFQAKWKWWDAPPDDIHTAAGRQWHGDPPHDPQWTWHDGTGMIMIIGRTAALVRFTKMWMWRDRDPPIDIHPIHGYQYDDDWDFWDKDWDWFDPVVLVVWKTKIKMKTAQTPDPPKDKAKDLADGHRDDQKDKDWDDQDSPSRGIIIMIMGMDTNCVSVVHFKMKTWMDDDVDPDIDIDIGGNPPD +>1igy_D STRUCTURE OF IMMUNOGLOBULIN +DDWAKDDAEEDEQFAKDKMKTFDDDDQQLQWKKFKWWAADVGDIGTAWIAGRPRPDIDGDDVQPPQKDWDADSVRGIIMIMHGGHHQVVFTWMWMAIDVPGPHIHDTDTYGHDPDDFDAWDKDWFAPQCDDCPPQKDKTKIKTWFGPDDDKDKDKPNVPADPDKAWDDWDDDPRMIITMMMHIGGRPCPVPPWMWIWMAGVVVGDTDIHTRDHPDDDDDQPQDDDDQDKDKDKAFFALVQLQDQVHWTWTKMKIWAAAPAWDQQDKWKDFANDIDDFPPQWPWFDCPSNHTMTMGGGTDHSVSLVVWTWIWIFTDTDSDPDRDIGIDTHDDDDWFFWPKDKAFFDPVPDPDFKGKIKIKTWFTDDQSKDKFKAFLHDGDDDWDKDGWDQDDVGGTMIMIIDMDGPVSQVVFSKMKMKMADVVDDSRTDIDIDTD +>1l2y_MODEL_1_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DQQVVCVVCPNVVNVDHGDD +>1l2y_MODEL_2_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCVNPVNPDHGDD +>1l2y_MODEL_3_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCVNVVVVDHGPD +>1l2y_MODEL_4_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DQQVVCVVCPNVVNPDDGDD +>1l2y_MODEL_5_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCVNPVNVDHGDD +>1l2y_MODEL_6_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCVNPVNVDHGPD +>1l2y_MODEL_7_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPAPVNVDHGPD +>1l2y_MODEL_8_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DQQVVCVVCVNPVNPDDGPD +>1l2y_MODEL_9_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPNPVNVDDGDD +>1l2y_MODEL_10_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPNPVNPDDGDD +>1l2y_MODEL_11_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVVPNCVVVDHGDD +>1l2y_MODEL_12_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCVNPVNPDDGPD +>1l2y_MODEL_13_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCVHPPNPDDGDD +>1l2y_MODEL_14_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCCVCPNVVNPDHGDD +>1l2y_MODEL_15_A 
NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPNVVNPDDGDD +>1l2y_MODEL_16_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DQQCVCVVCPNVVVVDHGDD +>1l2y_MODEL_17_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DVQVVCVVCPAPVNVDHGDD +>1l2y_MODEL_18_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPAVVNPDHGDD +>1l2y_MODEL_19_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPNVVNVDHGDD +>1l2y_MODEL_20_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DQQVVCVVCPNVVVVDHGPD +>1l2y_MODEL_21_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DQQVVCVVCVNCVVVDHGDD +>1l2y_MODEL_22_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPNVVNPDHGPD +>1l2y_MODEL_23_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DQQVVCVVVVNCVVVDHGPD +>1l2y_MODEL_24_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCVNVVNVDDGDD +>1l2y_MODEL_25_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DVQVVCVVCVNCVVVDHGDD +>1l2y_MODEL_26_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPAVVNVDHGPD +>1l2y_MODEL_27_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCVNVVNVDHGPD +>1l2y_MODEL_28_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPNVVNPDHGDD +>1l2y_MODEL_29_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DQQVVCVVCPNPVNVDDGDD +>1l2y_MODEL_30_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPHPVNPDDGDD +>1l2y_MODEL_31_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPNVVNVDHGDD +>1l2y_MODEL_32_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCVNPVNPDDGDD +>1l2y_MODEL_33_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DQQVVCVVCPNVVNPDHGPD +>1l2y_MODEL_34_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPNPPNPDDGDD +>1l2y_MODEL_35_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPAVVNPDHGPD +>1l2y_MODEL_36_A NMR Structure of Trp-Cage Miniprotein 
Construct TC5b +DQQVVCVVCPNVVVVDHGDD +>1l2y_MODEL_37_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPNVVNVDHGPD +>1l2y_MODEL_38_A NMR Structure of Trp-Cage Miniprotein Construct TC5b +DCQVVCVVCPNVVNVDDGDD +>1o1z_A Crystal structure of glycerophosphodiester phosphodiesterase (GDPD) (TM1621) from Thermotoga maritima at 1.60 A resolution +DDDDAQEAAEQALVVVDPGVFLVRQLVLVVLPHLGYEWEWDAALVGFTWTDPDQADCPPQNDGGGRNRHHPVRVCVSVVNGIDGPVVSPVRDDLSGAYEYEYPDLSNVVVVVVVCPVGPRYEYEYCPVVSCQVRVPPHAYEYEDDPPSCPDVVSNLVVCVVRVHQEYEYELVLVVDVVSLVSLQVSVVVNRAYEYDDDDDVVSCVVCVNSHRYYHYSNSNVVVVVD +>2axd_MODEL_1_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit +DPPCCVVVPPVNVLLVVLLVLLLQQLCVVVVVPDGDPVVSQVVNPPVVSVSSVVSNVVSVVVSVVVPVPDPDDDDD +>2axd_MODEL_2_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit +DPDDCVNVPVVVVVLVVLLVLLLLLLCVVVVVDDGDNVVSPVVDDPVCVVSSVVSNVVSVVCSVVVHDPDDPPDDD +>2axd_MODEL_3_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit +DVDVPCVDDVVVVLLVVLLVLLLLLLVVVVVVPPRPNVVSVVPRDPVPVVSNVVSNVVSVVVSVCVVPDDPPDDDD +>2axd_MODEL_4_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit +DPDDCVVDPPPVVLLVQLLVQLLLVLCVVVVVDDGPPVVLVVSQDPVPVVSSVVSNVVSVVCSVCVVVPDCPPPDD +>2axd_MODEL_5_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit +DVDDDCVVPVVNCVLVVLLVLLLLLLVVVCPVPDGDNVVSLVVDDPVPSVSNVVSNVVNVVLVVVVVNPPPPDDPD +>2axd_MODEL_6_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit +DDDDPVCDDVVVCLLVVLLVLLLLLLCVVVVVPPGPVCVSVVSQDPVPNVSNVVSNVVSVCVSVLVPGNDPNDDDD +>2axd_MODEL_7_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit 
+DQDPVCCDDPVNCLLVVLLVVLLLVLCCVPVVDDRPNVCNVVSDDPVCPVSNVVSNVVSVVVSVCVVVPPDDDDDD +>2axd_MODEL_8_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit +DPPVVLVDDVCNVVLVVLLVLLLLVLCVVVPVPDGDPVVSVVVDDPVVVVSSVVSNVVSPVCSVCVPPPDPDDDDD +>2axd_MODEL_9_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit +DDDDVVVPDCVPVLLVVLLVLLLQVLCVVVVVDDGDPVVLVVSDDPVCVVSSVVSNVVNVVCVVVPVPVDDDPPDD +>2axd_MODEL_10_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit +DQCDPCSPPPVVVLLVVLLVLLLLLLCVVVVVPDRDNVVSLVVRDPVPNVSNVVSNVVSNVCSVVVCPDDDDDDDD +>2axd_MODEL_11_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit +DPPCCVVDPPVVVLLVVLLVLLLVVLVCVCVVPDDDPVVSLVVDDVVPNVSSVVSNVVSVVCSVVVVVDPDDDDDD +>2axd_MODEL_12_S solution structure of the theta subunit of escherichia coli DNA polymerase III in complex with the epsilon subunit +DPDPCVVPDCNVCLLVVLLVLLLLQLCVVVVVDDGDPPVSQVPRDPVPPVSSVVSNVVSPVVSVVVVNVDDDPDDD +>2d0f_A Crystal Structure of Thermoactinomyces vulgaris R-47 Alpha-Amylase 1 (TVAI) Mutant D356N complexed with P2, a pullulan model oligosaccharide +DALDFEFDQLFWDDDCFFVFWPCLAAAQQDKIKGKIKGFPHRADWKWKWKAWPVVRDIDIWTWAWDQAFVQNGMTIIMTIDGGGQTKMFIKMWTGRVNQIWIQALLGIGRDDDPALTAIDHHPFDAFVCLLQFAEEEDLQQFEFQQDQVLAQDFQNDDDPPATAHEDEAPDDLCDDPRHHSFRYGDQGALNGVQVCVLLVCPLQNGAEYEYQDQADDRTSRQLQAQDLPWGHSSRPTVVSLLVSLVSSCDPPSHPGHAYEYAQNQFWHFCCHLLNNPPCPDPAAHLLRDCPHPSVQQFAAPPGPPRGQADVPPPRTTGGAPPDPPDVSLQQCFDDCNHSLNVQCAPPHNHAEYEYDLLQQRYHNNGRHPDPRSLVSQLVRLCRNCVSPVRRAYEYEDQEQCLVQAVNSRGHLEYAPNQQAQQLLLQDQLCAGQQLHHHHDFLVVSVSSNSVSCSSHHQSHQSNHEHENHEQATFQSCVSNVNDVLLLQLSLLDLLQARHHRYYHPPSQLSDGAHGPPRRSHRDDVVSSHCVRVSSVSSSLSSVVSSVDPQSSRFDKAWFDRDRVQRKTWIWGDHPPKIKIKIFGSAQAKDKDWTQSVSVVDDAQDKWAFSVVRDMFGHHPSTTIGIAGHSHMTITMD +>3o5r_A Complex of Fk506 with the Fk1 domain mutant A19T of FKBP51 
+DVLVCQVVPFDQQDPVPPSQKTKDWDFAADDQDAADAFKKWWKWKWKDWPVGDTDDTPVVVVGTDIDGAPPPPAFQNVNRVSRRHHAFIKMKMWGFCNRHVAQVFDPPRGHGRITMIMIMHTHDIDHD +>5eil_A Computational design of a high-affinity metalloprotein homotrimer containing a metal chelating non-canonical amino acid +DDPLQVQLCCLLVQNPVSNVVSVVVPHDLCRADQQQDGSLLSNLQNLRLVSNVVSVVVPRQQCRQGNQQDGSLLNNLLVLNQNSNVVSLVSPHDQCRQTNQGDGSLLNNLLNLNQNNNVVSVVSPRDQCRATNVRDGSLRSNVVNVNVVSVVVSVVD +>5eil_B Computational design of a high-affinity metalloprotein homotrimer containing a metal chelating non-canonical amino acid +DDPLQVVLLVLLVQVPVVNVVSVVVPHDLCRADPQQDGSLLSNLQNLNLVSNVVSVVVPHQQCRQGNQQDGSLLNNLLNQNLNSNVVSVVSPHDQCRQGNQGDGSLLNNLLNLNQSVNVVSVVSPHAQCRATNVRDGSLNNNVVNVNVVNNVVSVPHD +>5eil_C Computational design of a high-affinity metalloprotein homotrimer containing a metal chelating non-canonical amino acid +DDPLQVQLLVLLVLPPVSNVVSVVVVHDLCRADPQQDGSLLSNLQNLNLVSNVVSVVVPRDQCGQGNQQDGSLLNNLLVQNQNSNVVSVVSPRDQCRQTNQGDGSLLNNLLNLNVVNNVVSVVSPHQQPRATNVRDGSLNSNVVNVNVVSNVVSVPVD +>5h73_A Crystal structure of human DHODH with 18F +DLLVCLQPPDLVVCVVVDFVVVSLVVVQVCLLVLVAPQDDDDQDVQQWDDAQNAIFSHQEEADEDSAAQLRRQVSRVSLPGREYEYHQAWAAADQFDDPPFKDDQVVQLKIFGQRGGHHNHLVRSLVSLVVCQVVQVVCRVVRYFYEYEYDYDQPDPDGLVRLLVCCQRCLLSGQEYEDEQAFQNRPPSVVCLAQVNVLVSVVSNVVSLVPDDVNSRHQYEYEEALPDDPVSLLSNLVSCVVRVRQAYEFHDFDQDQDPDRDGPCSPPGGGMAADVCQVSRLVVLLVNCVSNVLPHAYEGEGHDQALVSQLSSQLSAHPHYYYDSNCSNRNSVSSSRNSVRVVVVCVVVVHSHSVRNRNNVVD +>5ugo_A DNA polymerase beta nick complex with imidodiphosphate +DLPVLLLVQLQVVLLLCCQPVVNNVSNVLSNVLSVQVVPPPDNDDALVVSCVTDSRHDVSRVQRRVCNVPVDGPVVVVCCVDQLSVQRSVLCLQFPDGNVNSNVCVVVVQRDLVSCVVPLVPDDPLRNVSSVQVPLLPDKAFVVVVVVVVVVLQVQVCVVPVQKDKDWAEVVQLPDRIGSATEIEIEGQCFAPVHNNDPCSVVSSVVSCVVVQFFDFFSDDDRFKTWGWGFDDDDPPDDGRRIGIYMYGYAHVQQQLLQRVVRNADPVLVVVLQVLQVVLQWGGDSGAIFGADPVRDTDDGDDDDHNVVSCVSSVHGDDRSNVRPD +>5zng_A The crystal complex of immune receptor RGA5A_S of Pia from rice (Oryzae sativa) with rice blast (Magnaporthe oryzae) effector protein AVR1-CO39 
+DVQAFFKKKWKKFAPQQDPVSLVVLQVLLCPFPFWDHWDQDDPNSGMITTIGGPGDVVVSQVRRCVVRVDMDTDDMDGD +>5zng_C The crystal complex of immune receptor RGA5A_S of Pia from rice (Oryzae sativa) with rice blast (Magnaporthe oryzae) effector protein AVR1-CO39 +DPAFKKKFKDDPHDTPDIDGDHAQDWDDDPNDIWGQHNQRAIPPQQDPVRIGMGMHTDPVHD +>7gsa_A PanDDA Analysis group deposition -- Crystal structure of PTP1B in complex with FMOPL000260a +DQVVVVLVVCVVVVCLVVVVVVLVVPADDADQVLCPDPVNPQQAPDPPFAFHPVFAQFQPDPPRRDHRWGWQQDVVLRWIAIFHAAGDPVCLLVVVSSCVVLQFQEEEEEEDCDAPRDGLHDPSADPDQPDWDADPVQQKIKGWDDWDDDDQWIKTWMWIARNVVRDIDIHIYIYGYPAHSDFGDPACVVVLVSLVVCVVVCRGPPVGGHHYYYYRSRAAVVLLSVLLVSLLSVLVPDPHNSPRDSSVSSSSSSVITPCHHPDSRSVSSSSNSNVVVCPVVPPD diff --git a/tests/structure/data/alphabet/pb.fasta b/tests/structure/data/alphabet/pb.fasta new file mode 100644 index 000000000..3d7cb3f8c --- /dev/null +++ b/tests/structure/data/alphabet/pb.fasta @@ -0,0 +1,2 @@ +>1ay7 +ZZdddfklpcbfklmmmmmmmmnopafklgoiaklmmmmmmmmpacddddddehkllmmmmnnommmmmmmmmmmmmmnopacddddZZ \ No newline at end of file diff --git a/tests/structure/data/ids.txt b/tests/structure/data/ids.txt index ec9fcc5ab..e905de2f8 100644 --- a/tests/structure/data/ids.txt +++ b/tests/structure/data/ids.txt @@ -16,4 +16,4 @@ 5eil 4p5j 1crr -7gsa +7gsa \ No newline at end of file diff --git a/tests/structure/test_clepaps.py b/tests/structure/test_clepaps.py new file mode 100644 index 000000000..a9d74cb7a --- /dev/null +++ b/tests/structure/test_clepaps.py @@ -0,0 +1,82 @@ +from pathlib import Path +import numpy as np +import pytest +import biotite.sequence.io.fasta as fasta +import biotite.structure as struc +import biotite.structure.alphabet as strucalph +import biotite.structure.io.pdbx as pdbx +from tests.util import data_dir + + +def _get_ref_3di_sequence(pdb_id, chain_id): + """ + Get the reference CLePAPS sequence for the the structure with the given + PDB ID and chain ID. 
+ """ + ref_3di_file = fasta.FastaFile.read( + Path(data_dir("structure")) / "alphabet" / "clepaps.fasta" + ) + return strucalph.ClepapsSequence(ref_3di_file[f"{pdb_id.lower()}_{chain_id}"]) + + +@pytest.mark.parametrize("pdb_id, chain_id", [("1mol", "A"), ("1cew", "I")]) +def test_to_clepaps(pdb_id, chain_id): + """ + Test the structure conversion to CLePAPS based on a reference example from + presentation slides. + """ + pdbx_file = pdbx.BinaryCIFFile.read( + Path(data_dir("structure")) / "alphabet" / f"{pdb_id}.bcif" + ) + atoms = pdbx.get_structure(pdbx_file, model=1) + atoms = atoms[struc.filter_amino_acids(atoms)] + chain = atoms[atoms.chain_id == chain_id] + test_sequences, _ = strucalph.to_clepaps(chain) + + ref_sequence = _get_ref_3di_sequence(pdb_id, chain_id) + + # Only a single chain was used as input -> expect only one sequence + assert len(test_sequences) == 1 + assert str(test_sequences[0]) == str(ref_sequence) + + +@pytest.mark.parametrize("pdb_id, chain_id", [("1mol", "A"), ("1cew", "I")]) +def test_missing_residues(pdb_id, chain_id): + """ + Like, `test_to_clepaps()`, but in some residues backbone atoms are missing. + Expect that these and adjacent residues get the unknown symbol 'R' in the + CLePAPs sequence. 
+ """ + N_DELETIONS = 5 + # The 'R' symbol + UKNOWN_SYMBOL = strucalph.ClepapsSequence.unknown_symbol + + pdbx_file = pdbx.BinaryCIFFile.read( + Path(data_dir("structure")) / "alphabet" / f"{pdb_id}.bcif" + ) + atoms = pdbx.get_structure(pdbx_file, model=1) + atoms = atoms[struc.filter_amino_acids(atoms)] + chain = atoms[atoms.chain_id == chain_id] + + # Randomly delete some backbone atoms + rng = np.random.default_rng(1) + del_backbone_residue_ids = rng.choice( + np.unique(chain.res_id), N_DELETIONS, replace=False + ) + chain = chain[ + ~np.isin(chain.res_id, del_backbone_residue_ids) | ~(chain.atom_name == "CA") + ] + + test_sequences, _ = strucalph.to_clepaps(chain) + + # Apply the same deletions to the reference sequence + ref_sequence = _get_ref_3di_sequence(pdb_id, chain_id) + for res_id in del_backbone_residue_ids: + seq_index = res_id - chain.res_id[0] + # Convert the symbol for residue and adjacent ones to 'R' + start_index = max(0, seq_index - 2) + end_index = min(len(ref_sequence), seq_index + 1) + ref_sequence[start_index : end_index + 1] = UKNOWN_SYMBOL + + assert len(test_sequences) == 1 + assert str(test_sequences[0]) == str(ref_sequence) diff --git a/tests/structure/test_i3d.py b/tests/structure/test_i3d.py new file mode 100644 index 000000000..402554608 --- /dev/null +++ b/tests/structure/test_i3d.py @@ -0,0 +1,112 @@ +import re +from pathlib import Path +import numpy as np +import pytest +import biotite.sequence.io.fasta as fasta +import biotite.structure as struc +import biotite.structure.alphabet as strucalph +import biotite.structure.io.pdbx as pdbx +from tests.util import data_dir + + +def _get_ref_3di_sequence(pdb_id, chain_id): + """ + Get the reference 3di sequence for the first model of the structure with the given + PDB ID and chain ID.
+ """ + ref_3di_file = fasta.FastaFile.read( + Path(data_dir("structure")) / "alphabet" / "i3d.fasta" + ) + for header, seq_string in ref_3di_file.items(): + # The first model of a structure is also the first sequence to appear + # and thus to be matched + if re.match(rf"^{pdb_id}(_MODEL_\d+)?_{chain_id}", header): + ref_3di_sequence = strucalph.I3DSequence(seq_string) + break + else: + raise ValueError( + f"Reference 3Di sequence not found for {pdb_id} chain {chain_id}" + ) + return ref_3di_sequence + + +@pytest.mark.parametrize( + "path", Path(data_dir("structure")).glob("*.bcif"), ids=lambda path: path.stem +) +def test_to_3di(path): + """ + Check if the 3di sequence of a chain is correctly generated, by comparing the result + to a reference sequence generated with *foldseek*. + """ + if ( + path.stem + in [ + "1dix" # `get_chain_starts()` does not work properly here with `use_author_fields=True` + ] + ): + pytest.skip("Miscellaneous issues") + + pdbx_file = pdbx.BinaryCIFFile.read(path) + if np.any( + pdbx_file.block["atom_site"]["label_alt_id"].mask.array + == pdbx.MaskValue.PRESENT + ): + # There is some inconsistency in how foldseek and Biotite handle altloc IDs + # -> skip these cases for the sake of simplicity + pytest.skip("Structure contains altlocs") + atoms = pdbx.get_structure(pdbx_file, model=1) + atoms = atoms[struc.filter_amino_acids(atoms)] + if len(atoms) == 0: + pytest.skip("Structure contains no peptide chains") + test_3di, chain_starts = strucalph.to_3di(atoms) + + ref_3di = [ + _get_ref_3di_sequence(path.stem, chain_id) + for chain_id in atoms.chain_id[chain_starts] + ] + + for test, ref, chain_id in zip(test_3di, ref_3di, atoms.chain_id[chain_starts]): + assert str(test) == str(ref), f"3Di sequence of chain {chain_id} does not match" + + +def test_missing_residues(): + """ + Like `test_to_3di()`, but in some residues backbone atoms are missing.
+ Expect that these and adjacent residues get the unknown symbol in the + 3Di sequence. + """ + PDB_ID = "1aki" + N_DELETIONS = 5 + MAX_MISMATCH_PERCENTAGE = 0.1 + UKNOWN_SYMBOL = strucalph.I3DSequence.unknown_symbol + + pdbx_file = pdbx.BinaryCIFFile.read(Path(data_dir("structure")) / f"{PDB_ID}.bcif") + atoms = pdbx.get_structure(pdbx_file, model=1) + atoms = atoms[struc.filter_amino_acids(atoms)] + + # Randomly delete some backbone atoms + rng = np.random.default_rng(1) + del_backbone_residue_ids = rng.choice( + np.unique(atoms.res_id), N_DELETIONS, replace=False + ) + atoms = atoms[ + ~np.isin(atoms.res_id, del_backbone_residue_ids) + | ~np.isin(atoms.atom_name, ("N", "CA", "CB", "C")) + ] + test_sequences, _ = strucalph.to_3di(atoms) + + # Apply the same deletions to the reference sequence + ref_sequence = _get_ref_3di_sequence(PDB_ID, atoms.chain_id[0]) + for res_id in del_backbone_residue_ids: + seq_index = res_id - atoms.res_id[0] + # Convert the 3Di symbol for the residue and adjacent ones to the unknown symbol + start_index = max(0, seq_index - 1) + end_index = min(len(ref_sequence), seq_index + 1) + ref_sequence[start_index : end_index + 1] = UKNOWN_SYMBOL + + assert len(test_sequences) == 1 + # 3Di sequences are quite complex, i.e.
removing backbone atoms at some position + # might alter the symbols in remote positions + # -> Allow for mismatches + n_mismatches = np.count_nonzero(test_sequences[0].code != ref_sequence.code) + assert n_mismatches / len(ref_sequence) <= MAX_MISMATCH_PERCENTAGE diff --git a/tests/structure/test_pb.py b/tests/structure/test_pb.py new file mode 100644 index 000000000..5ecb57348 --- /dev/null +++ b/tests/structure/test_pb.py @@ -0,0 +1,76 @@ +from pathlib import Path +import numpy as np +import pytest +import biotite.sequence.io.fasta as fasta +import biotite.structure as struc +import biotite.structure.alphabet as strucalph +import biotite.structure.io.pdbx as pdbx +from tests.util import data_dir + + +@pytest.fixture +def reference_sequence(): + """ + Get the reference Protein Blocks sequence for the alphabet example structure. + """ + _, seq_string = next( + fasta.FastaFile.read_iter(Path(data_dir("structure")) / "alphabet" / "pb.fasta") + ) + return strucalph.ProteinBlocksSequence(seq_string) + + +@pytest.fixture +def reference_chain(): + pdbx_file = pdbx.BinaryCIFFile.read( + Path(data_dir("structure")) / "alphabet" / "1ay7.bcif" + ) + atoms = pdbx.get_structure(pdbx_file, model=1) + atoms = atoms[struc.filter_amino_acids(atoms)] + chain = atoms[atoms.chain_id == "B"] + return chain + + +def test_to_protein_blocks(reference_chain, reference_sequence): + """ + Test the structure conversion to protein blocks based on a reference example from + the PBexplore documentation + (https://pbxplore.readthedocs.io/en/latest/intro_PB.html). + """ + test_pb_sequences, _ = strucalph.to_protein_blocks(reference_chain) + + assert len(test_pb_sequences) == 1 + assert str(test_pb_sequences[0]) == str(reference_sequence) + + +def test_missing_residues(reference_chain, reference_sequence): + """ + Like, `test_to_protein_blocks()`, but in some residues backbone atoms are missing. + Expect that these and adjacent residues get the unknown symbol 'Z' in the + PB sequence. 
+ """ + N_DELETIONS = 5 + # The 'Z' symbol + UKNOWN_SYMBOL = strucalph.ProteinBlocksSequence.unknown_symbol + + # Randomly delete some backbone atoms + rng = np.random.default_rng(1) + del_backbone_residue_ids = rng.choice( + np.unique(reference_chain.res_id), N_DELETIONS, replace=False + ) + reference_chain = reference_chain[ + ~np.isin(reference_chain.res_id, del_backbone_residue_ids) + | ~np.isin(reference_chain.atom_name, ("N", "CA", "C")) + ] + + # Apply the same deletions to the reference sequence + for res_id in del_backbone_residue_ids: + seq_index = res_id - reference_chain.res_id[0] + # Convert the PB symbol for residue and adjacent ones to 'Z' + start_index = max(0, seq_index - 2) + end_index = min(len(reference_sequence), seq_index + 2) + reference_sequence[start_index : end_index + 1] = UKNOWN_SYMBOL + + test_pb_sequences, _ = strucalph.to_protein_blocks(reference_chain) + + assert len(test_pb_sequences) == 1 + assert str(test_pb_sequences[0]) == str(reference_sequence) diff --git a/tests/test_doctest.py b/tests/test_doctest.py index 8293210b6..98875124e 100644 --- a/tests/test_doctest.py +++ b/tests/test_doctest.py @@ -68,6 +68,7 @@ "biotite.structure.io.mol", ["biotite.structure", "biotite.structure.info"] ), pytest.param("biotite.structure.info", ["biotite.structure"]), + pytest.param("biotite.structure.alphabet", ["biotite.structure"]), pytest.param( "biotite.database.entrez", [], diff --git a/tests/test_repr.py b/tests/test_repr.py index f8bf319c4..7097ba072 100644 --- a/tests/test_repr.py +++ b/tests/test_repr.py @@ -20,6 +20,7 @@ ) from biotite.sequence.align import Alignment, SubstitutionMatrix from biotite.structure import Atom +from biotite.structure.alphabet import I3DSequence __author__ = "Maximilian Greil" @@ -32,6 +33,7 @@ ProteinSequence("BIQTITE"), Alphabet(["X", "Y", "Z"]), GeneralSequence(Alphabet(["X", 42, False]), ["X", 42, "X"]), + I3DSequence("ACDE"), LetterAlphabet(["X", "Y", "Z"]), Location(98, 178), Feature("CDS", [Location(98, 
178)], qual={"gene": "test1"}),