From 6bb1a85ae3503e39b379ccddaa6e0e6358d514da Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Mon, 26 Aug 2024 19:45:34 +0200 Subject: [PATCH 01/29] Create data_exploration.ipynb --- data_exploration.ipynb | 637 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 637 insertions(+) create mode 100644 data_exploration.ipynb diff --git a/data_exploration.ipynb b/data_exploration.ipynb new file mode 100644 index 00000000..6f1045a4 --- /dev/null +++ b/data_exploration.ipynb @@ -0,0 +1,637 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 16, + "id": "81559360-c8b8-462d-bfa1-6ae22bed1615", + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "# Ignore all warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b", + "metadata": {}, + "source": [ + "# Introduction\n", + "\n", + "This notebook serves as a guide for new users of the `chebai` package, which is used for working with chemical data, especially focusing on ChEBI (Chemical Entities of Biological Interest). This notebook will explain how to instantiate the main data class, how the data files are structured, and how to work with different molecule encodings.\n", + "\n", + "---\n" + ] + }, + { + "cell_type": "markdown", + "id": "33275d3c-cdbf-4c1f-aa04-f135511f3643", + "metadata": {}, + "source": [ + "# 1. Instantiation of a Data Class\r\n", + "\r\n", + "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data\n", + "### Inheritance Hierarchy\n", + "\n", + "ChEBI data classes inherit from `_DynamicDataset`, which in turn inherits from `XYBaseDataModule`. Specifically:\n", + "\n", + "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for datasets that require dynamic behavior. 
It inherits from `XYBaseDataModule`, which provides the core methods for data loading and processing.\n", + "\n", + "- **`XYBaseDataModule`**: This is the base class for data modules, providing foundational properties and methods for handling and processing datasets, including data splitting, loading, and preprocessing.\n", + "\n", + "In summary, ChEBI data classes are designed to manage and preprocess chemical data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n", + "\r\n", + "\r\n", + "### Explanation\r\n", + "A ChEBI data class can be configured with the following main parameters:\r\n", + "\r\n", + "- **chebi_version (int)**: Specifies the version of the ChEBI database to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\r\n", + "\r\n", + "- **chebi_version_train (int, optional)**: The version of ChEBI to use specifically for training and validation. If not set, the `chebi_version` specified will be used for all data splits, including training, validation, and test. Defaults to `None`.\r\n", + "\r\n", + "- **single_class (int, optional)**: The ID of the single class to predict. If not set, predictions will be made for all available labels. Defaults to `None`.\r\n", + "\r\n", + "- **dynamic_data_split_seed (int, optional)**: The seed for random data splitting, which ensures reproducibility. Defaults to `42`.\r\n", + "\r\n", + "- **splits_file_path (str, optional)**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. 
Defaults to `None`.\r\n", + "\r\n", + "- **kwargs**: Additional keyword arguments passed to `XYBaseDataModule`.\r\n", + "\r\n", + "These parameters provide flexibility in handling and processing the data, allowing you to set specific versions for different stages of analysis and manage how data is split for training and validation.\r\n", + "\r\n", + "### Additional Input Parameters\r\n", + "\r\n", + "The `XYBaseDataModule` class, which every ChEBI data class inherits from, includes several important parameters for data loading and processing:\r\n", + "\r\n", + "- **batch_size (int)**: The batch size for data loading. Default is `1`.\r\n", + "\r\n", + "- **train_split (float)**: The ratio of training data to total data and the ratio of test data to (validation + test) data. Default is `0.85`.\r\n", + "\r\n", + "- **reader_kwargs (dict)**: Additional keyword arguments to be passed to the data reader. Default is `None`.\r\n", + "\r\n", + "- **prediction_kind (str)**: Specifies the kind of prediction to be performed, relevant only for the `predict_dataloader`. Default is `\"test\"`.\r\n", + "\r\n", + "- **data_limit (Optional[int])**: The maximum number of data samples to load. If set to `None`, the complete dataset will be used. Default is `None`.\r\n", + "\r\n", + "- **label_filter (Optional[int])**: The index of the label to filter. Default is `None`.\r\n", + "\r\n", + "- **balance_after_filter (Optional[float])**: The ratio of negative samples to positive samples after filtering. Default is `None`.\r\n", + "\r\n", + "- **num_workers (int)**: The number of worker processes for data loading. Default is `1`.\r\n", + "\r\n", + "- **inner_k_folds (int)**: The number of folds for inner cross-validation. Use `-1` to disable inner cross-validation. Default is `-1`.\r\n", + "\r\n", + "- **fold_index (Optional[int])**: The index of the fold to use for training and validation. 
Default is `None`.\r\n", + "\r\n", + "- **base_dir (Optional[str])**: The base directory for storing processed and raw data. Default is `None`.\r\n", + "\r\n", + "- **kwargs**: Additional keyword arguments.\r\n", + "\r\n", + "These parameters allow you to control various aspects of data loading, processing, and splitting, providing flexibility in how datasets are managed throughout your analysis pipeline.\r\n", + "\r\n" + ] + }, + { + "cell_type": "markdown", + "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a", + "metadata": {}, + "source": [ + "# Available ChEBI Data Classes\n", + "\n", + "## `ChEBIOver100`\n", + "A class for extracting data from the ChEBI dataset with a threshold of 100 for selecting classes.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverX`.\n", + "\n", + "## `ChEBIOver50`\n", + "A class for extracting data from the ChEBI dataset with a threshold of 50 for selecting classes.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverX`.\n", + "\n", + "## `ChEBIOver100DeepSMILES`\n", + "A class for extracting data from the ChEBI dataset using the DeepChem SMILES reader with a threshold of 100.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverXDeepSMILES` and `ChEBIOver100`.\n", + "\n", + "## `ChEBIOver100SELFIES`\n", + "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 100.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver100`.\n", + "\n", + "## `ChEBIOver50SELFIES`\n", + "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 50.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver50`.\n", + "\n", + "## `ChEBIOver50Partial`\n", + "A dataset class that extracts a part of ChEBI based on subclasses of a given top class, with a threshold of 50 for selecting classes.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverXPartial` and `ChEBIOver50`.\n" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 18, + "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22", + "metadata": {}, + "outputs": [], + "source": [ + "from chebai.preprocessing.datasets.chebi import ChEBIOver50" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "a71b7301-6195-4155-a439-f5eb3183d0f3", + "metadata": {}, + "outputs": [], + "source": [ + "chebi_class = ChEBIOver50(chebi_version=231)" + ] + }, + { + "cell_type": "markdown", + "id": "8456b545-88c5-401d-baa5-47e8ae710f04", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "1655d489-25fe-46de-9feb-eeca5d36936f", + "metadata": {}, + "source": [ + "# 2. Preparation / Setup Methods\r\n", + "\r\n", + "Once a ChEBI data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\r\n", + "\r\n", + "### Why is Preparation Needed?\r\n", + "\r\n", + "- **Data Availability**: The preparation step ensures that the required ChEBI data files are downloaded or loaded, which are essential for analysis.\r\n", + "- **Data Integrity**: It ensures that the data files are up-to-date and compatible with the specified ChEBI version.\r\n", + "\r\n", + "### Main Methods for Data Preprocessing\r\n", + "\r\n", + "The data preprocessing in a data class involves two main methods:\r\n", + "\r\n", + "1. **`prepare_data` Method**:\r\n", + " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\r\n", + " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\r\n", + "\r\n", + "2. 
**`setup` Method**:\r\n", + " - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\r\n", + " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\r\n", + " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\r\n", + "\r\n", + "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes.\r\n", + "\r\n", + "\r\n" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "f2df4bd1-cf34-4414-bce4-54379ffac006", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\\smiles_token\n", + "Cross-validation enabled: False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\n", + "saving 771 tokens to G:\\github-aditya0by0\\python-chebai\\chebai\\preprocessing\\bin\\smiles_token\\tokens.txt...\n", + "first 10 tokens: ['[*-]', '[Al-]', '[F-]', '.', '[H]', '[N]', '(', ')', '[Ag+]', 'C']\n" + ] + } + ], + "source": [ + "chebi_class.prepare_data()\n", + "chebi_class.setup()" + ] + }, + { + "cell_type": "markdown", + "id": "f5aaa12d-5f01-4b74-8b59-72562af953bf", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "8ababadb-003a-4c86-b92d-10e7bd1fba5e", + "metadata": {}, + 
"source": [ + "# 3. Different Data Files Created and their Structure\n", + "\r\n", + "\r\n", + "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their structures.\r\n", + "\r\n", + "### Data Files\r\n", + "\r\n", + "1. **`Raw Data Files`**: (e.g., `.obo` file)\r\n", + " - **Description**: Contains the raw ChEBI ontology data, downloaded directly from the ChEBI website. This file serves as the foundation for data processing.\r\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\r\n", + "\r\n", + "2. **`data.pkl`**\r\n", + " - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\r\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\r\n", + "\r\n", + "3. **`data.pt`**\r\n", + " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input.\r\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\r\n", + "\r\n", + "4. **`classes.txt`**\r\n", + " - **Description**: A file containing the list of selected ChEBI classes based on the specified threshold. This file is crucial for ensuring that only relevant classes are included in the dataset.\r\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/classes.txt`\r\n", + "\r\n", + "5. **`splits.csv`**\r\n", + " - **Description**: Contains saved data splits from previous runs. 
During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\r\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/splits.csv`\r\n", + "\r\n", + "### File Structure and Preprocessing Stages\r\n", + "\r\n", + "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\r\n", + "\r\n", + "1. **Raw Data Stage**:\r\n", + " - **File**: `chebi.obo`\r\n", + " - **Description**: This stage contains the raw ChEBI ontology data, serving as the initial input for further processing.\r\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\r\n", + "\r\n", + "2. **Processed Data Stage 1**:\r\n", + " - **File**: `data.pkl`\r\n", + " - **Description**: This stage includes the data after initial processing. It contains SMILES strings, class columns, and metadata but lacks data splits.\r\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\r\n", + " - **Additional File**: `classes.txt` - A file listing the relevant ChEBI classes.\r\n", + "\r\n", + "3. **Processed Data Stage 2**:\r\n", + " - **File**: `data.pt`\r\n", + " - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. 
This stage also references data splits when available.\r\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\r\n", + " - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\r\n", + "\r\n", + "### Data Splits\r\n", + "\r\n", + "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\r\n", + "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\r\n", + "\r\n", + "### Summary of File Paths\r\n", + "\r\n", + "- **Raw Data**: `data/${chebi_version}/${dataset_name}/raw`\r\n", + "- **Processed Data 1**: `data/${chebi_version}/${dataset_name}/processed`\r\n", + "- **Processed Data 2**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}`\r\n", + "\r\n", + "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. It also facilitates reproducibility and traceability across different experiments.\r\n", + "\r\n", + "\r\n" + ] + }, + { + "cell_type": "markdown", + "id": "a35c1d2b-9d6b-4c10-828b-b5912752c757", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "74adb549-9e02-472d-a535-78a584853b52", + "metadata": {}, + "source": [ + "# 4. 
Information Stored in the Files\n" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "fd490270-59b8-4c1c-8b09-204defddf592", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "322bc926-69ff-4b93-9e95-5e8b85869c38", + "metadata": {}, + "source": [ + "\n", + "## data.pkl\n", + "\n", + "The `data.pkl` file, generated during the preprocessing stage, contains the processed ChEBI data in a dataframe format. Below is an example of how this data is structured:\n", + "\n", + "\n", + "\n", + "### Structure of `data.pkl`\n", + "`data.pkl` has the following structure: \n", + "- **Column 0**: Contains the ID of each ChEBI data instance.\n", + "- **Column 1**: Contains the name of each ChEBI data instance.\n", + "- **Column 2**: Contains the SMILES representation of the chemical.\n", + "- **Column 3 and onwards**: Contains the labels, starting from column 3.\n", + "\n", + "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "d7d16247-092c-4e8d-96c2-ab23931cf766", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Size of the data (rows x columns): (129184, 1335)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnameSMILES1722246825712580263430983992...143017143212143813146180147334156473166828166904167497167559
033429monoatomic monoanion[*-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
130151aluminide(1-)[Al-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
216042halide anion[*-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
317051fluoride[F-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
428741sodium fluoride[F-].[Na+]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
\n", + "

5 rows × 1335 columns

\n", + "
" + ], + "text/plain": [ + " id name SMILES 1722 2468 2571 2580 2634 \\\n", + "0 33429 monoatomic monoanion [*-] False False False False False \n", + "1 30151 aluminide(1-) [Al-] False False False False False \n", + "2 16042 halide anion [*-] False False False False False \n", + "3 17051 fluoride [F-] False False False False False \n", + "4 28741 sodium fluoride [F-].[Na+] False False False False False \n", + "\n", + " 3098 3992 ... 143017 143212 143813 146180 147334 156473 166828 \\\n", + "0 False False ... False False False False False False False \n", + "1 False False ... False False False False False False False \n", + "2 False False ... False False False False False False False \n", + "3 False False ... False False False False False False False \n", + "4 False False ... False False False False False False False \n", + "\n", + " 166904 167497 167559 \n", + "0 False False False \n", + "1 False False False \n", + "2 False False False \n", + "3 False False False \n", + "4 False False False \n", + "\n", + "[5 rows x 1335 columns]" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/chebi_v200/ChEBI50/processed/data.pkl\"))\n", + "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n", + "pkl_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "ab110764-216d-4d52-a9d1-4412c8ac8c9d", + "metadata": {}, + "source": [ + "# 6. Example Molecule: Different Encodings\n", + "\n", + "`chebai` supports various encodings for molecules, such as SMILES and SELFIES. 
Let's take an example molecule and explore its different encodings.\n", + "\n", + "### Explanation:\n", + "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n", + "- **SELFIES (SELF-referencIng Embedded Strings)**: A more robust encoding that can handle a broader range of chemical structures.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "5b0f7974-f262-429c-b064-4207277e22ad", + "metadata": {}, + "source": [ + "# 7. Additional Useful Features\n", + "\n", + "- **Substructure Search**: `chebai` allows you to perform substructure searches within the ChEBI database.\n", + "- **Property Filters**: You can filter molecules based on specific properties, such as molecular weight or charge.\n", + "- **Visualization**: `chebai` provides tools for visualizing molecular structures directly within the notebook.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "314801c7-9a1c-4247-9809-497f8481ac90", + "metadata": {}, + "source": [ + "# Conclusion\n", + "\n", + "This notebook provided an introduction to the `chebai` package, focusing on how data is structured and utilized. With this knowledge, you can start exploring chemical data more effectively using `chebai`." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (env_chebai)", + "language": "python", + "name": "env_chebai" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 830184f6886a42f293c2ff702c0509aff29ca9cb Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 27 Aug 2024 00:04:40 +0200 Subject: [PATCH 02/29] added information stored in files --- data_exploration.ipynb | 289 +++++++++++++++++++++++++++++++++++------ 1 file changed, 251 insertions(+), 38 deletions(-) diff --git a/data_exploration.ipynb b/data_exploration.ipynb index 6f1045a4..c4d60ab2 100644 --- a/data_exploration.ipynb +++ b/data_exploration.ipynb @@ -1,18 +1,5 @@ { "cells": [ - { - "cell_type": "code", - "execution_count": 16, - "id": "81559360-c8b8-462d-bfa1-6ae22bed1615", - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "\n", - "# Ignore all warnings\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, { "cell_type": "markdown", "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b", @@ -314,13 +301,51 @@ ] }, { - "cell_type": "code", - "execution_count": 49, - "id": "fd490270-59b8-4c1c-8b09-204defddf592", + "cell_type": "markdown", + "id": "43329709-5134-4ce5-88e7-edd2176bf84d", "metadata": {}, - "outputs": [], "source": [ - "import pandas as pd" + "## chebi.obo\n", + "\n", + "The `chebi.obo` file is a key resource in the ChEBI (Chemical Entities of Biological Interest) dataset, containing the ontology data that defines various chemical entities and their relationships. 
This file is downloaded directly from the ChEBI database and serves as the foundational raw data for further processing in `chebai`.\n", + "\n", + "### Structure of `chebi.obo`\n", + "\n", + "The `chebi.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific chemical entity within the ChEBI ontology. These attributes include identifiers, names, relationships to other entities, and more.\n", + "\n", + "#### Example of a Term Document\n", + "\n", + "```plaintext\n", + "[Term]\n", + "id: CHEBI:24867\n", + "name: monoatomic ion\n", + "subset: 3_STAR\n", + "synonym: \"monoatomic ions\" RELATED [ChEBI]\n", + "is_a: CHEBI:24870\n", + "is_a: CHEBI:33238\n", + "```\r\n", + "\r\n", + "\r\n", + "\r\n", + "### Breakdown of Attributes\r\n", + "\r\n", + "Each term document in the `chebi.obo` file consists of the following key attributes:\r\n", + "\r\n", + "- **`[Term]`**: \r\n", + " - **Description**: Indicates the beginning of a new term in the ontology. Each term represents a distinct chemical entity.\r\n", + "\r\n", + "- **`id: CHEBI:24867`**: \r\n", + " - **Description**: A unique identifier for the chemical entity within the ChEBI database.\r\n", + " - **Example**: `CHEBI:24867` refers to the entity \"monoatomic ion.\"\r\n", + "\r\n", + "- **`name: monoatomic ion`**: \r\n", + " - **Description**: The common name of the chemical entity. This is the main descriptor used to identify the term.\r\n", + " - **Example**: \"monoatomic ion\" is the name of this entity; the `synonym` line lists the alternative name \"monoatomic ions\", marked `RELATED` to indicate a related term within the ChEBI ontology.\r\n", + "\r\n", + "- **`is_a: CHEBI:24870`** and **`is_a: CHEBI:33238`**: \r\n", + " - **Description**: Defines hierarchical relationships to other terms within the ontology. 
The `is_a` attribute indicates that the current entity is a subclass or specific instance of the referenced term.\r\n", + " - **Example**: The entity `CHEBI:24867` (\"monoatomic ion\") is a subclass of both `CHEBI:24870` and `CHEBI:33238`, meaning that every monoatomic ion is also an instance of both of these parent classes.\r\n", + "" ] }, { "cell_type": "markdown", "id": "322bc926-69ff-4b93-9e95-5e8b85869c38", @@ -345,6 +370,16 @@ "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n" ] }, + { + "cell_type": "code", + "execution_count": 49, + "id": "fd490270-59b8-4c1c-8b09-204defddf592", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, { "cell_type": "code", "execution_count": 53, @@ -566,50 +601,228 @@ }, { "cell_type": "markdown", - "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee", + "id": "0d80ffbb-5f1e-4489-9bc8-d688c9be1d07", "metadata": {}, "source": [ - "---" + "## `data.pt` File\n", + "\n", + "The `data.pt` file is an important output of the preprocessing stage in `chebai`. It contains data in a format compatible with PyTorch, specifically as a list of dictionaries. Each dictionary in this list is structured to hold key information used for model training and evaluation.\n", + "\n", + "### Structure of `data.pt`\n", + "\n", + "The `data.pt` file is a list where each element is a dictionary with the following keys:\n", + "\n", + "- **`features`**: \n", + " - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n", + "\n", + "- **`labels`**: \n", + " - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n", + "\n", + "- **`ident`**: \n", + " - **Description**: This key holds identifiers for each data instance. 
These identifiers help track and reference the individual samples in the dataset.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "977ddd83-b469-4b58-ab1a-8574fb8769b4", + "metadata": {}, + "outputs": [], + "source": [ + "import torch" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "3266ade9-efdc-49fe-ae07-ed52b2eb52d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Type of loaded data: \n" + ] + } + ], + "source": [ + "data_pt = torch.load(r\"data/chebi_v200/ChEBI50/processed/smiles_token/data.pt\")\n", + "print(\"Type of loaded data:\", type(data_pt))" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "84cfa3e6-f60d-47c0-9f82-db3d5673d1e7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'features': [10], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 33429, 'group': None}\n", + "{'features': [11], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 30151, 'group': None}\n", + "{'features': [10], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 16042, 'group': None}\n", + "{'features': [12], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 17051, 'group': None}\n", + "{'features': [12, 13, 32], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 28741, 'group': None}\n" + ] + } + ], + "source": [ + "for i in range(5):\n", + " print(data_pt[i])" ] }, { "cell_type": "markdown", - "id": "ab110764-216d-4d52-a9d1-4412c8ac8c9d", + "id": "861da1c3-0401-49f0-a22f-109814ed95d5", "metadata": {}, "source": [ - "# 6. Example Molecule: Different Encodings\n", + "## `classes.txt` File\n", "\n", - "`chebai` supports various encodings for molecules, such as SMILES and SELFIES. 
Let's take an example molecule and explore its different encodings.\n", + "The `classes.txt` file lists selected ChEBI (Chemical Entities of Biological Interest) classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique ChEBI class ID, identifying specific chemical entities within the ChEBI ontology.\n", "\n", - "### Explanation:\n", - "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n", - "- **SELFIES (SELF-referencIng Embedded Strings)**: A more robust encoding that can handle a broader range of chemical structures.\n", - "\n", - "---" + "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "8d1fbe6c-beb8-4038-93d4-c56bc7628716", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1722\n", + "2468\n", + "2571\n", + "2580\n", + "2634\n" + ] + } + ], + "source": [ + "with open(r\"data/chebi_v200/ChEBI50/processed/classes.txt\", \"r\") as file:\n", + " for i in range(5):\n", + " line = file.readline()\n", + " print(line.strip())" ] }, { "cell_type": "markdown", - "id": "5b0f7974-f262-429c-b064-4207277e22ad", + "id": "b058714f-e434-4367-89b9-74c129ac727f", + "metadata": {}, + "source": [ + "## `splits.csv`\r\n", + "\r\n", + "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. 
This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different run.\r\n" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "3ebdcae4-4344-46bd-8fc0-a82ef5d40da5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idsplit
033429train
130151train
217051train
332129train
430340train
\n", + "
" + ], + "text/plain": [ + " id split\n", + "0 33429 train\n", + "1 30151 train\n", + "2 17051 train\n", + "3 32129 train\n", + "4 30340 train" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "csv_df = pd.read_csv(r\"data/chebi_v231/ChEBI50/processed/splits.csv\")\n", + "csv_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee", "metadata": {}, "source": [ - "# 7. Additional Useful Features\n", - "\n", - "- **Substructure Search**: `chebai` allows you to perform substructure searches within the ChEBI database.\n", - "- **Property Filters**: You can filter molecules based on specific properties, such as molecular weight or charge.\n", - "- **Visualization**: `chebai` provides tools for visualizing molecular structures directly within the notebook.\n", - "\n", "---" ] }, { "cell_type": "markdown", - "id": "314801c7-9a1c-4247-9809-497f8481ac90", + "id": "ab110764-216d-4d52-a9d1-4412c8ac8c9d", "metadata": {}, "source": [ - "# Conclusion\n", + "# 6. Example Molecule: Different Encodings\n", + "\n", + "`chebai` supports various encodings for molecules, such as SMILES and SELFIES. Let's take an example molecule and explore its different encodings.\n", "\n", - "This notebook provided an introduction to the `chebai` package, focusing on how data is structured and utilized. With this knowledge, you can start exploring chemical data more effectively using `chebai`." 
+ "### Explanation:\n", + "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n", + "- **SELFIES (SELF-referencIng Embedded Strings)**: A more robust encoding that can handle a broader range of chemical structures.\n", + "\n", + "---" ] } ], From 7005a69c420b95cfe4e0ad4a23414ccc90858199 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 27 Aug 2024 00:29:31 +0200 Subject: [PATCH 03/29] Molecule: Different Encodings --- data_exploration.ipynb | 42 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/data_exploration.ipynb b/data_exploration.ipynb index c4d60ab2..e36fc1fe 100644 --- a/data_exploration.ipynb +++ b/data_exploration.ipynb @@ -353,8 +353,7 @@ "id": "322bc926-69ff-4b93-9e95-5e8b85869c38", "metadata": {}, "source": [ - "\n", - "## data.pkl\n", + "## `data.pkl` File\n", "\n", "The `data.pkl` file, generated during the preprocessing stage, contains the processed ChEBI data in a dataframe format. Below is an example of how this data is structured:\n", "\n", @@ -716,7 +715,7 @@ "id": "b058714f-e434-4367-89b9-74c129ac727f", "metadata": {}, "source": [ - "## `splits.csv`\r\n", + "## `splits.csv` File\r\n", "\r\n", "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different run.\r\n" ] @@ -814,7 +813,7 @@ "id": "ab110764-216d-4d52-a9d1-4412c8ac8c9d", "metadata": {}, "source": [ - "# 6. Example Molecule: Different Encodings\n", + "# 5. Example Molecule: Different Encodings\n", "\n", "`chebai` supports various encodings for molecules, such as SMILES and SELFIES. 
Let's take an example molecule and explore its different encodings.\n", "\n", @@ -822,7 +821,40 @@ "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n", "- **SELFIES (SELF-referencIng Embedded Strings)**: A more robust encoding that can handle a broader range of chemical structures.\n", "\n", - "---" + "To illustrate different encodings of a molecule, let's consider the molecule **benzene**, which has the chemical formula **C₆H₆**. Here are the different encodings for benzene:\r\n", + "\r\n", + "### 1. **SMILES (Simplified Molecular Input Line Entry System)**\r\n", + " - **Benzene SMILES**: `c1ccccc1`\r\n", + " - **Explanation**: \r\n", + " - `c1ccccc1` represents a six-membered aromatic ring, with lowercase `c` indicating aromatic carbon atoms.\r\n", + "\r\n", + "### 2. **SELFIES (SELF-referencIng Embedded Strings)**\r\n", + " - **Benzene SELFIES**: `[C][=C][C][=C][C][=C]`\r\n", + " - **Explanation**: \r\n", + " - Each `[C]` represents a carbon atom, and `[=C]` represents a carbon atom with a double bond.\r\n", + " - SELFIES encodes the alternating single and double bonds in benzene's aromatic ring.\r\n", + "\r\n", + "### 3. **InChI (IUPAC International Chemical Identifier)**\r\n", + " - **Benzene InChI**: `InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H`\r\n", + " - **Explanation**: \r\n", + " - This InChI string provides a systematic representation of benzene's structure, showing the connections between the carbon and hydrogen atoms.\r\n", + "\r\n", + "### 4. **InChIKey**\r\n", + " - **Benzene InChIKey**: `UHOVQNZJYSORNB-UHFFFAOYSA-N`\r\n", + " - **Explanation**: \r\n", + " - A hashed, fixed-length version of the InChI string, used for easier database searching and indexing.\r\n", + "\r\n", + "### 5. 
**Canonical SMILES**\r\n", + " - **Benzene Canonical SMILES**: `c1ccccc1`\r\n", + " - **Explanation**:\r\n", + " - The canonical SMILES for benzene is identical to the regular SMILES, ensuring a unique and consistent representation for database use.\r\n", + "\r\n", + "### 6. **SMARTS (SMILES Arbitrary Target Specification)**\r\n", + " - **Benzene SMARTS**: `[c]1[c][c][c][c][c]1`\r\n", + " - **Explanation**: \r\n", + " - This SMARTS pattern represents the benzene ring structure, which can be used for substructure searching in larger molecules.\r\n", + "\r\n", + "These different encodings provide various ways to represent the structure and properties of benzene, each suited to different computational tasks such as molecule identification, database searches, and pattern recognition in cheminformatics." ] } ], From 13aa945938079e265aa28947e9509a5484d03a2d Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 27 Aug 2024 11:24:05 +0200 Subject: [PATCH 04/29] add info related to protein dataset --- data_exploration.ipynb | 418 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 418 insertions(+) diff --git a/data_exploration.ipynb b/data_exploration.ipynb index e36fc1fe..b0c9e78f 100644 --- a/data_exploration.ipynb +++ b/data_exploration.ipynb @@ -856,6 +856,424 @@ "\r\n", "These different encodings provide various ways to represent the structure and properties of benzene, each suited to different computational tasks such as molecule identification, database searches, and pattern recognition in cheminformatics." 
] + }, + { + "cell_type": "markdown", + "id": "93e328cf-09f9-4694-b175-28320590937d", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "92e059c6-36a4-482d-bd0b-a8bd9b10ccde", + "metadata": {}, + "source": [ + "# Information for Protein Dataset\r\n", + "\r\n", + "The protein dataset follows a similar file structure, class inheritance hierarchy, and methods as described for the ChEBI dataset.\r\n", + "\r\n", + "### Configuration Parameters\r\n", + "\r\n", + "Data classes related to proteins can be configured using the following main parameters:\r\n", + "\r\n", + "- **`go_branch (str)`**: The Gene Ontology (GO) branch. The default value is `\"all\"`, which includes all branches of GO in the dataset.\r\n", + "\r\n", + "- **`dynamic_data_split_seed (int, optional)`**: The seed for random data splitting, ensuring reproducibility. The default is `42`.\r\n", + "\r\n", + "- **`splits_file_path (str, optional)`**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. 
The default is `None`.\r\n", + "\r\n", + "- **`kwargs`**: Additional keyword arguments passed to `XYBaseDataModule`.\r\n", + "\r\n", + "### Available GOUniProt Data Classes\r\n", + "\r\n", + "#### `GOUniProtOver250`\r\n", + "\r\n", + "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 250 for selecting classes.\r\n", + "\r\n", + "- **Inheritance**: Inherits from `_GOUniProtOverX`.\r\n", + "\r\n", + "#### `GOUniProtOver50`\r\n", + "\r\n", + "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 50 for selecting classes.\r\n", + "\r\n", + "- **Inheritance**: Inherits from `_GOUniProtOverX`.\r\n", + "\r\n", + "### Instantiation Example\r\n", + "\r\n", + "```python\r\n", + "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250\r\n", + "go_class = GOUniProtOver250()\r\n" + ] + }, + { + "cell_type": "markdown", + "id": "2ffca830-bc0b-421c-8054-0860c95c10f2", + "metadata": {}, + "source": [ + "## GOUniProt Data File Structure\r\n", + "\r\n", + "1. **`Raw Data Files`**: (e.g., `.obo` file and `.dat` file)\r\n", + " - **Description**: These files contain the raw GO ontology and Swiss UniProt data, which are downloaded directly from their respective websites. They serve as the foundation for data processing. Since there are no versions associated with this dataset, common raw files are used for all subsets of the data.\r\n", + " - **File Paths**:\r\n", + " - `data/GO_UniProt/raw/${filename}.obo`\r\n", + " - `data/GO_UniProt/raw/${filename}.dat`\r\n", + "\r\n", + "2. **`data.pkl`**\r\n", + " - **Description**: This file is generated by the `prepare_data` method and contains the processed data in a dataframe format. It includes protein IDs, data representations (such as amino acid sequences), and class columns with boolean values.\r\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\r\n", + "\r\n", + "3. 
**`data.pt`**\r\n", + " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, making it ready for model input.\r\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\r\n", + "\r\n", + "4. **`classes.txt`**\r\n", + " - **Description**: This file lists the selected GO or UniProt classes based on a specified threshold. It ensures that only the relevant classes are included in the dataset for analysis.\r\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\r\n", + "\r\n", + "5. **`splits.csv`**\r\n", + " - **Description**: This file contains saved data splits from previous runs. During subsequent runs, it is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\r\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\r\n", + "\r\n", + "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. Otherwise, it will just be `${dataset_name}`.\r\n", + "}/processed/splits.csv`\r\n" + ] + }, + { + "cell_type": "markdown", + "id": "61bc261e-2328-4968-aca6-14c48bb24348", + "metadata": {}, + "source": [ + "## data.pkl" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "id": "31df4ee7-4c03-4ea2-9798-5e5082a74c2b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Size of the data (rows x columns): (27459, 1050)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
swiss_idaccessiongo_idssequence4175122165209226...2000145200014620001472000241200024320003772001020200114120012332001234
814331_ARATHP42643,Q945M2,Q9M0S7[19222]MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
914331_CAEELP41932,Q21537[132, 1708, 5634, 5737, 5938, 6611, 7346, 8340...MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1014331_MAIZEP49106[3677, 5634, 10468, 44877]MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1314332_MAIZEQ01526[3677, 5634, 10468, 44877]MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1414333_ARATHP42644,F4KBI7,Q945L2[5634, 5737, 6995, 9409, 9631, 16036, 19222, 5...MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
\n", + "

5 rows × 1050 columns

\n", + "
" + ], + "text/plain": [ + " swiss_id accession \\\n", + "8 14331_ARATH P42643,Q945M2,Q9M0S7 \n", + "9 14331_CAEEL P41932,Q21537 \n", + "10 14331_MAIZE P49106 \n", + "13 14332_MAIZE Q01526 \n", + "14 14333_ARATH P42644,F4KBI7,Q945L2 \n", + "\n", + " go_ids \\\n", + "8 [19222] \n", + "9 [132, 1708, 5634, 5737, 5938, 6611, 7346, 8340... \n", + "10 [3677, 5634, 10468, 44877] \n", + "13 [3677, 5634, 10468, 44877] \n", + "14 [5634, 5737, 6995, 9409, 9631, 16036, 19222, 5... \n", + "\n", + " sequence 41 75 122 \\\n", + "8 MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT... False False False \n", + "9 MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL... False False False \n", + "10 MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE... False False False \n", + "13 MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE... False False False \n", + "14 MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL... False False False \n", + "\n", + " 165 209 226 ... 2000145 2000146 2000147 2000241 2000243 \\\n", + "8 False False False ... False False False False False \n", + "9 False False False ... False False False False False \n", + "10 False False False ... False False False False False \n", + "13 False False False ... False False False False False \n", + "14 False False False ... 
False False False False False \n", + "\n", + " 2000377 2001020 2001141 2001233 2001234 \n", + "8 False False False False False \n", + "9 False False False False False \n", + "10 False False False False False \n", + "13 False False False False False \n", + "14 False False False False False \n", + "\n", + "[5 rows x 1050 columns]" + ] + }, + "execution_count": 123, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/GO_UniProt/GO250_BP/processed/data.pkl\"))\n", + "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n", + "pkl_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "be0078fd-bcf1-4d4c-b8c6-c84e3aeac99c", + "metadata": {}, + "source": [ + "## data.pt" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "id": "a70f9c35-daca-4728-a9ea-b1212866f421", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Type of loaded data: \n", + "{'features': [10, 14, 15, 23, 13, 14, 11, 11, 14, 16, 20, 27, 25, 28, 22, 10, 14, 21, 17, 14, 27, 18, 14, 27, 16, 22, 27, 27, 10, 28, 27, 25, 10, 27, 21, 28, 14, 21, 14, 28, 20, 21, 20, 27, 17, 15, 28, 27, 27, 16, 19, 17, 17, 11, 28, 14, 22, 21, 19, 28, 12, 13, 14, 16, 16, 14, 11, 26, 16, 12, 12, 11, 11, 12, 27, 18, 21, 27, 27, 11, 16, 13, 19, 20, 20, 29, 28, 11, 17, 12, 16, 20, 22, 16, 11, 21, 12, 27, 15, 27, 17, 11, 20, 12, 24, 20, 13, 12, 17, 21, 17, 17, 20, 15, 12, 17, 28, 23, 14, 14, 14, 11, 13, 20, 11, 21, 28, 25, 22, 17, 21, 10, 21, 13, 20, 22, 29, 16, 22, 17, 14, 27, 25, 21, 11, 13, 18, 27, 16, 21, 20, 14, 14, 27, 29, 15, 17, 15, 14, 22, 21, 14, 14, 18, 20, 12, 14, 19, 11, 27, 17, 14, 23, 15, 29, 23, 12, 16, 17, 13, 17, 14, 17, 19, 25, 11, 28, 25, 22, 22, 27, 12, 17, 19, 11, 23, 20, 16, 14, 24, 19, 17, 14, 21, 18, 14, 25, 20, 27, 14, 12, 14, 27, 17, 20, 15, 17, 13, 27, 27, 11, 22, 21, 20, 11, 15, 17, 12, 10, 18, 17, 17, 16, 20, 19, 17, 15, 17, 26, 15, 11, 20, 10, 18, 20, 20, 28, 
14, 20, 20, 12, 21, 27, 14, 14, 23, 14, 14, 14, 21, 23, 14, 20, 27, 18, 18, 11], 'labels': array([False, False, False, ..., False, False, False]), 'ident': '14331_ARATH', 'group': None}\n" + ] + } + ], + "source": [ + "data_pt = torch.load(r\"data/GO_UniProt/GO250_BP/processed/protein_token/data.pt\")\n", + "print(\"Type of loaded data:\", type(data_pt))\n", + "for i in range(1):\n", + " print(data_pt[i])" + ] + }, + { + "cell_type": "markdown", + "id": "380049c1-2963-4223-b698-a7b59b9fe595", + "metadata": {}, + "source": [ + "## Protein Representation Using Amino Acid Sequence Notation\n", + "\n", + "Proteins are composed of chains of amino acids, and these sequences can be represented using a one-letter notation for each amino acid. This notation provides a concise way to describe the primary structure of a protein.\n", + "\n", + "### Example Protein Sequence\n", + "\n", + "Protein: **Lysozyme C** from **Gallus gallus** (Chicken). \n", + "[Lysozyme C - UniProtKB P00698](https://www.uniprot.org/uniprotkb/P00698/entry#function)\n", + "\n", + "- **Sequence**: `MRSLLILVLCFLPLAALGKVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL`\n", + "- **Sequence Length**: 147\n", + "\n", + "In this sequence, each letter corresponds to a specific amino acid. This notation is widely used in bioinformatics and molecular biology to represent protein sequences.\n", + "\n", + "### The 20 Amino Acids and Their One-Letter Notations\n", + "\n", + "Here is a list of the 20 standard amino acids, along with their one-letter notations and descriptions:\n", + "\n", + "| One-Letter Notation | Amino Acid Name | Description |\n", + "|---------------------|----------------------|---------------------------------------------------------|\n", + "| **A** | Alanine | Non-polar, aliphatic amino acid. |\n", + "| **C** | Cysteine | Polar, contains a thiol group, forms disulfide bonds. 
|\n", + "| **D** | Aspartic Acid | Acidic, negatively charged at physiological pH. |\n", + "| **E** | Glutamic Acid | Acidic, negatively charged at physiological pH. |\n", + "| **F** | Phenylalanine | Aromatic, non-polar. |\n", + "| **G** | Glycine | Smallest amino acid, non-polar. |\n", + "| **H** | Histidine | Polar, positively charged, can participate in enzyme active sites. |\n", + "| **I** | Isoleucine | Non-polar, aliphatic. |\n", + "| **K** | Lysine | Basic, positively charged at physiological pH. |\n", + "| **L** | Leucine | Non-polar, aliphatic. |\n", + "| **M** | Methionine | Non-polar, contains sulfur, start codon in mRNA translation. |\n", + "| **N** | Asparagine | Polar, uncharged. |\n", + "| **P** | Proline | Non-polar, introduces kinks in protein chains. |\n", + "| **Q** | Glutamine | Polar, uncharged. |\n", + "| **R** | Arginine | Basic, positively charged, involved in binding phosphate groups. |\n", + "| **S** | Serine | Polar, can be phosphorylated. |\n", + "| **T** | Threonine | Polar, can be phosphorylated. |\n", + "| **V** | Valine | Non-polar, aliphatic. |\n", + "| **W** | Tryptophan | Aromatic, non-polar, largest amino acid. |\n", + "| **Y** | Tyrosine | Aromatic, polar, can be phosphorylated. |\n", + "\n", + "### Understanding Protein Sequences\n", + "\n", + "In a sequence such as `MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQGQL`, each letter represents one of the above amino acids. 
The sequence reflects the specific order of amino acids in the protein, which is critical for its structure and function.\n", + "\n", + "This notation is used extensively in various bioinformatics tools and databases to study protein structure, function, and interactions.\n", + "\n", + "\n", + "_Note_: Refer for amino acid sequence: https://en.wikipedia.org/wiki/Protein_primary_structure" + ] + }, + { + "cell_type": "markdown", + "id": "702359d6-5338-4391-b196-2328ba5676a1", + "metadata": {}, + "source": [ + "---" + ] } ], "metadata": { From 0e4814fde3f5b365587912729eba6ef5aba131c6 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 27 Aug 2024 12:33:37 +0200 Subject: [PATCH 05/29] fix - jupyter markdown cells formatting issue - https://github.com/jupyter/notebook/issues/7002 - Fix using notebook formatter provided by pycharm professional --- data_exploration.ipynb | 512 ++++++++++++++++++++--------------------- 1 file changed, 252 insertions(+), 260 deletions(-) diff --git a/data_exploration.ipynb b/data_exploration.ipynb index b0c9e78f..8cd834b1 100644 --- a/data_exploration.ipynb +++ b/data_exploration.ipynb @@ -14,11 +14,11 @@ }, { "cell_type": "markdown", - "id": "33275d3c-cdbf-4c1f-aa04-f135511f3643", + "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d", "metadata": {}, "source": [ - "# 1. Instantiation of a Data Class\r\n", - "\r\n", + "# 1. Instantiation of a Data Class\n", + "\n", "To start working with `chebai`, you first need to instantiate a ChEBI data class. 
This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data\n", "### Inheritance Hierarchy\n", "\n", @@ -29,55 +29,54 @@ "- **`XYBaseDataModule`**: This is the base class for data modules, providing foundational properties and methods for handling and processing datasets, including data splitting, loading, and preprocessing.\n", "\n", "In summary, ChEBI data classes are designed to manage and preprocess chemical data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n", - ".\r\n", - "\r\n", - "### Explanation\r\n", - "a ChEBI data classiData` class can be configured with the following main parameters:\r\n", - "\r\n", - "- **chebi_version (int)**: Specifies the version of the ChEBI database to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\r\n", - "\r\n", - "- **chebi_version_train (int, optional)**: The version of ChEBI to use specifically for training and validation. If not set, the `chebi_version` specified will be used for all data splits, including training, validation, and test. Defaults to `None`.\r\n", - "\r\n", - "- **single_class (int, optional)**: The ID of the single class to predict. If not set, predictions will be made for all available labels. Defaults to `None`.\r\n", - "\r\n", - "- **dynamic_data_split_seed (int, optional)**: The seed for random data splitting, which ensures reproducibility. Defaults to `42`.\r\n", - "\r\n", - "- **splits_file_path (str, optional)**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. 
Defaults to `None`.\r\n", - "\r\n", - "- **kwargs**: Additional keyword arguments passed to `XYBaseDataModule`.\r\n", - "\r\n", - "These parameters provide flexibility in handling and processing the data, allowing you to set specific versions for different stages of analysis and manage how data is split for training and validation.\r\n", - "\r\n", - "### Additional Input Parameters\r\n", - "\r\n", - "The `XYBaseDa ChEBI data class, whsich `ChebaiData` may use internally, includes several important parameters for data loading and processing:\r\n", - "\r\n", - "- **batch_size (int)**: The batch size for data loading. Default is `1`.\r\n", - "\r\n", - "- **train_split (float)**: The ratio of training data to total data and the ratio of test data to (validation + test) data. Default is `0.85`.\r\n", - "\r\n", - "- **reader_kwargs (dict)**: Additional keyword arguments to be passed to the data reader. Default is `None`.\r\n", - "\r\n", - "- **prediction_kind (str)**: Specifies the kind of prediction to be performed, relevant only for the `predict_dataloader`. Default is `\"test\"`.\r\n", - "\r\n", - "- **data_limit (Optional[int])**: The maximum number of data samples to load. If set to `None`, the complete dataset will be used. Default is `None`.\r\n", - "\r\n", - "- **label_filter (Optional[int])**: The index of the label to filter. Default is `None`.\r\n", - "\r\n", - "- **balance_after_filter (Optional[float])**: The ratio of negative samples to positive samples after filtering. Default is `None`.\r\n", - "\r\n", - "- **num_workers (int)**: The number of worker processes for data loading. Default is `1`.\r\n", - "\r\n", - "- **inner_k_folds (int)**: The number of folds for inner cross-validation. Use `-1` to disable inner cross-validation. Default is `-1`.\r\n", - "\r\n", - "- **fold_index (Optional[int])**: The index of the fold to use for training and validation. 
Default is `None`.\r\n", - "\r\n", - "- **base_dir (Optional[str])**: The base directory for storing processed and raw data. Default is `None`.\r\n", - "\r\n", - "- **kwargs**: Additional keyword arguments.\r\n", - "\r\n", - "These parameters allow you to control various aspects of data loading, processing, and splitting, providing flexibility in how datasets are managed throughout your analysis pipeline.\r\n", - "ining and validation.\r\n" + "\n", + "\n", + "### Explanation\n", + "A ChEBI data class can be configured with the following main parameters:\n", + "\n", + "- **chebi_version (int)**: Specifies the version of the ChEBI database to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\n", + "\n", + "- **chebi_version_train (int, optional)**: The version of ChEBI to use specifically for training and validation. If not set, the `chebi_version` specified will be used for all data splits, including training, validation, and test. Defaults to `None`.\n", + "\n", + "- **single_class (int, optional)**: The ID of the single class to predict. If not set, predictions will be made for all available labels. Defaults to `None`.\n", + "\n", + "- **dynamic_data_split_seed (int, optional)**: The seed for random data splitting, which ensures reproducibility. Defaults to `42`.\n", + "\n", + "- **splits_file_path (str, optional)**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. 
Defaults to `None`.\n", + "\n", + "- **kwargs**: Additional keyword arguments passed to `XYBaseDataModule`.\n", + "\n", + "These parameters provide flexibility in handling and processing the data, allowing you to set specific versions for different stages of analysis and manage how data is split for training and validation.\n", + "\n", + "### Additional Input Parameters\n", + "\n", + "The `XYBaseDataModule` class, from which ChEBI data classes inherit, includes several important parameters for data loading and processing:\n", + "\n", + "- **batch_size (int)**: The batch size for data loading. Default is `1`.\n", + "\n", + "- **train_split (float)**: The ratio of training data to total data and the ratio of test data to (validation + test) data. Default is `0.85`.\n", + "\n", + "- **reader_kwargs (dict)**: Additional keyword arguments to be passed to the data reader. Default is `None`.\n", + "\n", + "- **prediction_kind (str)**: Specifies the kind of prediction to be performed, relevant only for the `predict_dataloader`. Default is `\"test\"`.\n", + "\n", + "- **data_limit (Optional[int])**: The maximum number of data samples to load. If set to `None`, the complete dataset will be used. Default is `None`.\n", + "\n", + "- **label_filter (Optional[int])**: The index of the label to filter. Default is `None`.\n", + "\n", + "- **balance_after_filter (Optional[float])**: The ratio of negative samples to positive samples after filtering. Default is `None`.\n", + "\n", + "- **num_workers (int)**: The number of worker processes for data loading. Default is `1`.\n", + "\n", + "- **inner_k_folds (int)**: The number of folds for inner cross-validation. Use `-1` to disable inner cross-validation. Default is `-1`.\n", + "\n", + "- **fold_index (Optional[int])**: The index of the fold to use for training and validation. Default is `None`.\n", + "\n", + "- **base_dir (Optional[str])**: The base directory for storing processed and raw data. 
Default is `None`.\n", + "\n", + "- **kwargs**: Additional keyword arguments.\n", + "\n", + "These parameters allow you to control various aspects of data loading, processing, and splitting, providing flexibility in how datasets are managed throughout your analysis pipeline.\n" ] }, { @@ -151,31 +150,29 @@ "id": "1655d489-25fe-46de-9feb-eeca5d36936f", "metadata": {}, "source": [ - "# 2. Preparation / Setup Methods\r\n", - "\r\n", - "Once a ChEBI data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\r\n", - "\r\n", - "### Why is Preparation Needed?\r\n", - "\r\n", - "- **Data Availability**: The preparation step ensures that the required ChEBI data files are downloaded or loaded, which are essential for analysis.\r\n", - "- **Data Integrity**: It ensures that the data files are up-to-date and compatible with the specified ChEBI version.\r\n", - "\r\n", - "### Main Methods for Data Preprocessing\r\n", - "\r\n", - "The data preprocessing in a data class involves two main methods:\r\n", - "\r\n", - "1. **`prepare_data` Method**:\r\n", - " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\r\n", - " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\r\n", - "\r\n", - "2. **`setup` Method**:\r\n", - " - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. 
It also handles cross-validation settings if enabled.\r\n", - " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\r\n", - " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\r\n", - "\r\n", - "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes.\r\n", - "alidation processes.\r\n", - "processed(data_df, processed_name)\r\n" + "# 2. Preparation / Setup Methods\n", + "\n", + "Once a ChEBI data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n", + "\n", + "### Why is Preparation Needed?\n", + "\n", + "- **Data Availability**: The preparation step ensures that the required ChEBI data files are downloaded or loaded, which are essential for analysis.\n", + "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n", + "\n", + "### Main Methods for Data Preprocessing\n", + "\n", + "The data preprocessing in a data class involves two main methods:\n", + "\n", + "1. **`prepare_data` Method**:\n", + " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\n", + " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n", + "\n", + "2. 
**`setup` Method**:\n", + " - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n", + " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\n", + " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n", + "\n", + "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes." ] }, { @@ -221,67 +218,65 @@ "metadata": {}, "source": [ "# 3. Different Data Files Created and their Structure\n", - "\r\n", - "\r\n", - "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their structures.\r\n", - "\r\n", - "### Data Files\r\n", - "\r\n", - "1. **`Raw Data Files`**: (e.g., `.obo` file)\r\n", - " - **Description**: Contains the raw ChEBI ontology data, downloaded directly from the ChEBI website. This file serves as the foundation for data processing.\r\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\r\n", - "\r\n", - "2. **`data.pkl`**\r\n", - " - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\r\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\r\n", - "\r\n", - "3. 
**`data.pt`**\r\n", - " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input.\r\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\r\n", - "\r\n", - "4. **`classes.txt`**\r\n", - " - **Description**: A file containing the list of selected ChEBI classes based on the specified threshold. This file is crucial for ensuring that only relevant classes are included in the dataset.\r\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/classes.txt`\r\n", - "\r\n", - "5. **`splits.csv`**\r\n", - " - **Description**: Contains saved data splits from previous runs. During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\r\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/splits.csv`\r\n", - "\r\n", - "### File Structure and Preprocessing Stages\r\n", - "\r\n", - "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\r\n", - "\r\n", - "1. **Raw Data Stage**:\r\n", - " - **File**: `chebi.obo`\r\n", - " - **Description**: This stage contains the raw ChEBI ontology data, serving as the initial input for further processing.\r\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\r\n", - "\r\n", - "2. **Processed Data Stage 1**:\r\n", - " - **File**: `data.pkl`\r\n", - " - **Description**: This stage includes the data after initial processing. It contains SMILES strings, class columns, and metadata but lacks data splits.\r\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\r\n", - " - **Additional File**: `classes.txt` - A file listing the relevant ChEBI classes.\r\n", - "\r\n", - "3. 
**Processed Data Stage 2**:\r\n", - " - **File**: `data.pt`\r\n", - " - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\r\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\r\n", - " - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\r\n", - "\r\n", - "### Data Splits\r\n", - "\r\n", - "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\r\n", - "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\r\n", - "\r\n", - "### Summary of File Paths\r\n", - "\r\n", - "- **Raw Data**: `data/${chebi_version}/${dataset_name}/raw`\r\n", - "- **Processed Data 1**: `data/${chebi_version}/${dataset_name}/processed`\r\n", - "- **Processed Data 2**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}`\r\n", - "\r\n", - "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. It also facilitates reproducibility and traceability across different experiments.\r\n", - "that each step is well-documented and reproducible.\r\n", - "sing, from raw input to model-ready formats.\r\n" + "\n", + "\n", + "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their structures.\n", + "\n", + "### Data Files\n", + "\n", + "1. **`Raw Data Files`**: (e.g., `.obo` file)\n", + " - **Description**: Contains the raw ChEBI ontology data, downloaded directly from the ChEBI website. 
This file serves as the foundation for data processing.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n", + "\n", + "2. **`data.pkl`**\n", + " - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n", + "\n", + "3. **`data.pt`**\n", + " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", + "\n", + "4. **`classes.txt`**\n", + " - **Description**: A file containing the list of selected ChEBI classes based on the specified threshold. This file is crucial for ensuring that only relevant classes are included in the dataset.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/classes.txt`\n", + "\n", + "5. **`splits.csv`**\n", + " - **Description**: Contains saved data splits from previous runs. During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/splits.csv`\n", + "\n", + "### File Structure and Preprocessing Stages\n", + "\n", + "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\n", + "\n", + "1. 
**Raw Data Stage**:\n", + " - **File**: `chebi.obo`\n", + " - **Description**: This stage contains the raw ChEBI ontology data, serving as the initial input for further processing.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n", + "\n", + "2. **Processed Data Stage 1**:\n", + " - **File**: `data.pkl`\n", + " - **Description**: This stage includes the data after initial processing. It contains SMILES strings, class columns, and metadata but lacks data splits.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n", + " - **Additional File**: `classes.txt` - A file listing the relevant ChEBI classes.\n", + "\n", + "3. **Processed Data Stage 2**:\n", + " - **File**: `data.pt`\n", + " - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", + " - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n", + "\n", + "### Data Splits\n", + "\n", + "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n", + "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n", + "\n", + "### Summary of File Paths\n", + "\n", + "- **Raw Data**: `data/${chebi_version}/${dataset_name}/raw`\n", + "- **Processed Data 1**: `data/${chebi_version}/${dataset_name}/processed`\n", + "- **Processed Data 2**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}`\n", + "\n", + "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. 
It also facilitates reproducibility and traceability across different experiments." ] }, { @@ -323,29 +318,27 @@ "synonym: \"monoatomic ions\" RELATED [ChEBI]\n", "is_a: CHEBI:24870\n", "is_a: CHEBI:33238\n", - "```0\r\n", - "is_a: CHEBI:3323Relevant 8\r\n", - "```\r\n", - "\r\n", - "### Breakdown of Attributes\r\n", - "\r\n", - "Each term document in the `chebi.obo` file consists of the following key attributes:\r\n", - "\r\n", - "- **`[Term]`**: \r\n", - " - **Description**: Indicates the beginning of a new term in the ontology. Each term represents a distinct chemical entity.\r\n", - "\r\n", - "- **`id: CHEBI:24867`**: \r\n", - " - **Description**: A unique identifier for the chemical entity within the ChEBI database.\r\n", - " - **Example**: `CHEBI:24867` refers to the entity \"monoatomic ion.\"\r\n", - "\r\n", - "- **`name: monoatomic ion`**: \r\n", - " - **Description**: The common name of the chemical entity. This is the main descriptor used to identify the term.\r\n", - " - **Example**: \"monoatomic ion\" is the namcating a related term within the ChEBI ontology.\r\n", - "\r\n", - "- **`is_a: CHEBI:24870`** and **`is_a: CHEBI:33238`**: \r\n", - " - **Description**: Defines hierarchical relationships to other terms within the ontology. The `is_a` attribute indicates that the current entity is a subclass or specific instance of the referenced term.\r\n", - " - **Example**: The entity `CHEBI:24867` (\"monoatomic ion\") is a subclass of both `CHEBI:24870` and `CHEBI:33238`, meaent stages of preprocessing, from raw input files to processed, model-ready formats.\r\n", - "```" + "is_a: CHEBI:3323Relevant 8\n", + "```\n", + "\n", + "### Breakdown of Attributes\n", + "\n", + "Each term document in the `chebi.obo` file consists of the following key attributes:\n", + "\n", + "- **`[Term]`**: \n", + " - **Description**: Indicates the beginning of a new term in the ontology. 
Each term represents a distinct chemical entity.\n", + "\n", + "- **`id: CHEBI:24867`**: \n", + " - **Description**: A unique identifier for the chemical entity within the ChEBI database.\n", + " - **Example**: `CHEBI:24867` refers to the entity \"monoatomic ion.\"\n", + "\n", + "- **`name: monoatomic ion`**: \n", + " - **Description**: The common name of the chemical entity. This is the main descriptor used to identify the term.\n", + " - **Example**: \"monoatomic ion\" is the name of this chemical entity.\n", + "\n", + "- **`is_a: CHEBI:24870`** and **`is_a: CHEBI:33238`**: \n", + " - **Description**: Defines hierarchical relationships to other terms within the ontology. The `is_a` attribute indicates that the current entity is a subclass or specific instance of the referenced term.\n", + " - **Example**: The entity `CHEBI:24867` (\"monoatomic ion\") is a subclass of both `CHEBI:24870` and `CHEBI:33238`, meaning that it belongs to both of these parent classes." ] }, { @@ -715,9 +708,9 @@ "id": "b058714f-e434-4367-89b9-74c129ac727f", "metadata": {}, "source": [ - "## `splits.csv` File\r\n", - "\r\n", - "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different run.\r\n" + "## `splits.csv` File\n", + "\n", + "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. 
This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different run.\n" ] }, { @@ -821,40 +814,40 @@ "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n", "- **SELFIES (SELF-referencIng Embedded Strings)**: A more robust encoding that can handle a broader range of chemical structures.\n", "\n", - "To illustrate different encodings of a molecule, let's consider the molecule **benzene**, which has the chemical formula **C₆H₆**. Here are the different encodings for benzene:\r\n", - "\r\n", - "### 1. **SMILES (Simplified Molecular Input Line Entry System)**\r\n", - " - **Benzene SMILES**: `c1ccccc1`\r\n", - " - **Explanation**: \r\n", - " - `c1ccccc1` represents a six-membered aromatic ring, with lowercase `c` indicating aromatic carbon atoms.\r\n", - "\r\n", - "### 2. **SELFIES (SELF-referencIng Embedded Strings)**\r\n", - " - **Benzene SELFIES**: `[C][=C][C][=C][C][=C]`\r\n", - " - **Explanation**: \r\n", - " - Each `[C]` represents a carbon atom, and `[=C]` represents a carbon atom with a double bond.\r\n", - " - SELFIES encodes the alternating single and double bonds in benzene's aromatic ring.\r\n", - "\r\n", - "### 3. **InChI (IUPAC International Chemical Identifier)**\r\n", - " - **Benzene InChI**: `InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H`\r\n", - " - **Explanation**: \r\n", - " - This InChI string provides a systematic representation of benzene's structure, showing the connections between the carbon and hydrogen atoms.\r\n", - "\r\n", - "### 4. **InChIKey**\r\n", - " - **Benzene InChIKey**: `UHOVQNZJYSORNB-UHFFFAOYSA-N`\r\n", - " - **Explanation**: \r\n", - " - A hashed, fixed-length version of the InChI string, used for easier database searching and indexing.\r\n", - "\r\n", - "### 5. 
**Canonical SMILES**\r\n", - " - **Benzene Canonical SMILES**: `c1ccccc1`\r\n", - " - **Explanation**:\r\n", - " - The canonical SMILES for benzene is identical to the regular SMILES, ensuring a unique and consistent representation for database use.\r\n", - "\r\n", - "### 6. **SMARTS (SMILES Arbitrary Target Specification)**\r\n", - " - **Benzene SMARTS**: `[c]1[c][c][c][c][c]1`\r\n", - " - **Explanation**: \r\n", - " - This SMARTS pattern represents the benzene ring structure, which can be used for substructure searching in larger molecules.\r\n", - "\r\n", - "These different encodings provide various ways to represent the structure and properties of benzene, each suited to different computational tasks such as molecule identification, database searches, and pattern recognition in cheminformatics.d by different computational tools." + "To illustrate different encodings of a molecule, let's consider the molecule **benzene**, which has the chemical formula **C₆H₆**. Here are the different encodings for benzene:\n", + "\n", + "### 1. **SMILES (Simplified Molecular Input Line Entry System)**\n", + " - **Benzene SMILES**: `c1ccccc1`\n", + " - **Explanation**: \n", + " - `c1ccccc1` represents a six-membered aromatic ring, with lowercase `c` indicating aromatic carbon atoms.\n", + "\n", + "### 2. **SELFIES (SELF-referencIng Embedded Strings)**\n", + " - **Benzene SELFIES**: `[C][=C][C][=C][C][=C]`\n", + " - **Explanation**: \n", + " - Each `[C]` represents a carbon atom, and `[=C]` represents a carbon atom with a double bond.\n", + " - SELFIES encodes the alternating single and double bonds in benzene's aromatic ring.\n", + "\n", + "### 3. **InChI (IUPAC International Chemical Identifier)**\n", + " - **Benzene InChI**: `InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H`\n", + " - **Explanation**: \n", + " - This InChI string provides a systematic representation of benzene's structure, showing the connections between the carbon and hydrogen atoms.\n", + "\n", + "### 4. 
**InChIKey**\n", + " - **Benzene InChIKey**: `UHOVQNZJYSORNB-UHFFFAOYSA-N`\n", + " - **Explanation**: \n", + " - A hashed, fixed-length version of the InChI string, used for easier database searching and indexing.\n", + "\n", + "### 5. **Canonical SMILES**\n", + " - **Benzene Canonical SMILES**: `c1ccccc1`\n", + " - **Explanation**:\n", + " - The canonical SMILES for benzene is identical to the regular SMILES, ensuring a unique and consistent representation for database use.\n", + "\n", + "### 6. **SMARTS (SMILES Arbitrary Target Specification)**\n", + " - **Benzene SMARTS**: `[c]1[c][c][c][c][c]1`\n", + " - **Explanation**: \n", + " - This SMARTS pattern represents the benzene ring structure, which can be used for substructure searching in larger molecules.\n", + "\n", + "These different encodings provide various ways to represent the structure and properties of benzene, each suited to different computational tasks such as molecule identification, database searches, and pattern recognition in cheminformatics." ] }, { @@ -870,41 +863,41 @@ "id": "92e059c6-36a4-482d-bd0b-a8bd9b10ccde", "metadata": {}, "source": [ - "# Information for Protein Dataset\r\n", - "\r\n", - "The protein dataset follows thsimilarme file structure, class inheritance hierarchy, and methods as described for the ChEBI dataset.\r\n", - "\r\n", - "### Configuration Parameters\r\n", - "\r\n", - "Data classes related to proteins can be configured using the following main parameters:\r\n", - "\r\n", - "- **`go_branch (str)`**: The Gene Ontology (GO) branch. The default value is `\"all\"`, which includes all branches of GO in the dataset.\r\n", - "\r\n", - "- **`dynamic_data_split_seed (int, optional)`**: The seed for random data splitting, ensuring reproducibility. The default is `42`.\r\n", - "\r\n", - "- **`splits_file_path (str, optional)`**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. 
The default is `None`.\r\n", - "\r\n", - "- **`kwargs`**: Additional keyword arguments passed to `XYBaseDataModule`.\r\n", - "\r\n", - "### Available GOUniProt Data Classes\r\n", - "\r\n", - "#### `GOUniProtOver250`\r\n", - "\r\n", - "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 250 for selecting classes.\r\n", - "\r\n", - "- **Inheritance**: Inherits from `_GOUniProtOverX`.\r\n", - "\r\n", - "#### `GOUniProtOver50`\r\n", - "\r\n", - "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 50 for selecting classes.\r\n", - "\r\n", - "- **Inheritance**: Inherits from `_GOUniProtOverX`.\r\n", - "\r\n", - "### Instantiation Example\r\n", - "\r\n", - "```python\r\n", - "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250\r\n", - "go_class = GOUniProtOver250()\r\n" + "# Information for Protein Dataset\n", + "\n", + "The protein dataset follows the same file structure, class inheritance hierarchy, and methods as described for the ChEBI dataset.\n", + "\n", + "### Configuration Parameters\n", + "\n", + "Data classes related to proteins can be configured using the following main parameters:\n", + "\n", + "- **`go_branch (str)`**: The Gene Ontology (GO) branch. The default value is `\"all\"`, which includes all branches of GO in the dataset.\n", + "\n", + "- **`dynamic_data_split_seed (int, optional)`**: The seed for random data splitting, ensuring reproducibility. The default is `42`.\n", + "\n", + "- **`splits_file_path (str, optional)`**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. 
The default is `None`.\n", + "\n", + "- **`kwargs`**: Additional keyword arguments passed to `XYBaseDataModule`.\n", + "\n", + "### Available GOUniProt Data Classes\n", + "\n", + "#### `GOUniProtOver250`\n", + "\n", + "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 250 for selecting classes.\n", + "\n", + "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n", + "\n", + "#### `GOUniProtOver50`\n", + "\n", + "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 50 for selecting classes.\n", + "\n", + "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n", + "\n", + "### Instantiation Example\n", + "\n", + "```python\n", + "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250\n", + "go_class = GOUniProtOver250()\n" ] }, { @@ -912,32 +905,31 @@ "id": "2ffca830-bc0b-421c-8054-0860c95c10f2", "metadata": {}, "source": [ - "## GOUniProt Data File Structure\r\n", - "\r\n", - "1. **`Raw Data Files`**: (e.g., `.obo` file and `.dat` file)\r\n", - " - **Description**: These files contain the raw GO ontology and Swiss UniProt data, which are downloaded directly from their respective websites. They serve as the foundation for data processing. Since there are no versions associated with this dataset, common raw files are used for all subsets of the data.\r\n", - " - **File Paths**:\r\n", - " - `data/GO_UniProt/raw/${filename}.obo`\r\n", - " - `data/GO_UniProt/raw/${filename}.dat`\r\n", - "\r\n", - "2. **`data.pkl`**\r\n", - " - **Description**: This file is generated by the `prepare_data` method and contains the processed data in a dataframe format. It includes protein IDs, data representations (such as SMILES strings), and class columns with boolean values.\r\n", - " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\r\n", - "\r\n", - "3. 
**`data.pt`**\r\n", - " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, making it ready for model input.\r\n", - " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\r\n", - "\r\n", - "4. **`classes.txt`**\r\n", - " - **Description**: This file lists the selected GO or UniProt classes based on a specified threshold. It ensures that only the relevant classes are included in the dataset for analysis.\r\n", - " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\r\n", - "\r\n", - "5. **`splits.csv`**\r\n", - " - **Description**: This file contains saved data splits from previous runs. During subsequent runs, it is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\r\n", - " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\r\n", - "\r\n", - "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. Otherwise, it will just be `${dataset_name}`.\r\n", - "}/processed/splits.csv`\r\n" + "## GOUniProt Data File Structure\n", + "\n", + "1. **`Raw Data Files`**: (e.g., `.obo` file and `.dat` file)\n", + " - **Description**: These files contain the raw GO ontology and Swiss UniProt data, which are downloaded directly from their respective websites. They serve as the foundation for data processing. Since there are no versions associated with this dataset, common raw files are used for all subsets of the data.\n", + " - **File Paths**:\n", + " - `data/GO_UniProt/raw/${filename}.obo`\n", + " - `data/GO_UniProt/raw/${filename}.dat`\n", + "\n", + "2. 
**`data.pkl`**\n", + " - **Description**: This file is generated by the `prepare_data` method and contains the processed data in a dataframe format. It includes protein IDs, data representations (such as amino acid sequences), and class columns with boolean values.\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\n", + "\n", + "3. **`data.pt`**\n", + " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, making it ready for model input.\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\n", + "\n", + "4. **`classes.txt`**\n", + " - **Description**: This file lists the selected GO or UniProt classes based on a specified threshold. It ensures that only the relevant classes are included in the dataset for analysis.\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\n", + "\n", + "5. **`splits.csv`**\n", + " - **Description**: This file contains saved data splits from previous runs. During subsequent runs, it is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\n", + "\n", + "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. Otherwise, it will just be `${dataset_name}`.\n" ] }, { @@ -1259,7 +1251,7 @@ "\n", "### Understanding Protein Sequences\n", "\n", - "In the example sequence `MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQGQL`, each letter represents one of the above amino acids. The sequence reflects the specific order of amino acids in the protein, which is critical for its structure and function.\n", + "In the example sequence, each letter represents one of the above amino acids. 
The sequence reflects the specific order of amino acids in the protein, which is critical for its structure and function.\n", "\n", "This notation is used extensively in various bioinformatics tools and databases to study protein structure, function, and interactions.\n", "\n", From 8539f3bc3f1376dcf98eecfa06de6258f7a0b77a Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Tue, 27 Aug 2024 12:34:47 +0200 Subject: [PATCH 06/29] move to tutorials dir --- data_exploration.ipynb => tutorials/data_exploration.ipynb | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename data_exploration.ipynb => tutorials/data_exploration.ipynb (100%) diff --git a/data_exploration.ipynb b/tutorials/data_exploration.ipynb similarity index 100% rename from data_exploration.ipynb rename to tutorials/data_exploration.ipynb From 6b9024b088e244c13bf74f3be797fcb2154077d9 Mon Sep 17 00:00:00 2001 From: sfluegel Date: Tue, 24 Sep 2024 18:10:06 +0200 Subject: [PATCH 07/29] minor changes to texts --- tutorials/data_exploration.ipynb | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/tutorials/data_exploration.ipynb b/tutorials/data_exploration.ipynb index 8cd834b1..fce3a9f7 100644 --- a/tutorials/data_exploration.ipynb +++ b/tutorials/data_exploration.ipynb @@ -19,14 +19,16 @@ "source": [ "# 1. Instantiation of a Data Class\n", "\n", - "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data\n", + "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data.\n", "### Inheritance Hierarchy\n", "\n", "ChEBI data classes inherit from `_DynamicDataset`, which in turn inherits from `XYBaseDataModule`. 
Specifically:\n", "\n", - "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for datasets that require dynamic behavior. It inherits from `XYBaseDataModule`, which provides the core methods for data loading and processing.\n", + "- **`XYBaseDataModule`**: This is the base class for all data modules in `chebai`, providing foundational properties and methods for handling and processing datasets, including loading a stored dataset and creating a `DataLoader`.\n", + "\n", + "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for some datasets (e.g. the ChEBI and Gene Ontology datasets). The defining feature is the dynamically created data split into training, validation and test sets. It inherits from `XYBaseDataModule`.\n", + "\n", "\n", - "- **`XYBaseDataModule`**: This is the base class for data modules, providing foundational properties and methods for handling and processing datasets, including data splitting, loading, and preprocessing.\n", "\n", "In summary, ChEBI data classes are designed to manage and preprocess chemical data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n", "\n", @@ -34,7 +36,7 @@ "### Explanation\n", "A ChEBI data class can be configured with the following main parameters:\n", "\n", - "- **chebi_version (int)**: Specifies the version of the ChEBI database to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\n", + "- **chebi_version (int)**: Specifies the version of the ChEBI dataset to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\n", "\n", "- **chebi_version_train (int, optional)**: The version of ChEBI to use specifically for training and validation. 
If not set, the `chebi_version` specified will be used for all data splits, including training, validation, and test. Defaults to `None`.\n", "\n", @@ -50,7 +52,7 @@ "\n", "### Additional Input Parameters\n", "\n", - "The `XYBaseDa ChEBI data class, whsich `ChebaiData` may use internally, includes several important parameters for data loading and processing:\n", + "The `XYBaseDataModule` class, which ChEBI data classes inherit from, includes several important parameters for data loading and processing:\n", "\n", "- **batch_size (int)**: The batch size for data loading. Default is `1`.\n", "\n", @@ -225,11 +227,11 @@ "### Data Files\n", "\n", "1. **`Raw Data Files`**: (e.g., `.obo` file)\n", - " - **Description**: Contains the raw ChEBI ontology data, downloaded directly from the ChEBI website. This file serves as the foundation for data processing.\n", + " - **Description**: Contains the raw ChEBI ontology data in OBO format, downloaded directly from the ChEBI website. This file serves as the foundation for data processing.\n", " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n", "\n", "2. **`data.pkl`**\n", - " - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\n", + " - **Description**: Generated by the `prepare_data` method, this file contains processed data in a Pandas dataframe format. It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\n", " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n", "\n", "3. **`data.pt`**\n", @@ -261,7 +263,7 @@ "\n", "3. **Processed Data Stage 2**:\n", " - **File**: `data.pt`\n", - " - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. 
This stage also references data splits when available.\n", + " - **Description**: This final stage includes the tokenized data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\n", " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", " - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n", "\n", From 4fc31dab7716a54f05666f9bd0d5fe51d066e647 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Mon, 30 Sep 2024 16:13:40 +0200 Subject: [PATCH 08/29] chebi notebook : suggested changes - https://github.com/ChEB-AI/python-chebai/pull/46#pullrequestreview-2325741708 --- tutorials/data_exploration_chebi.ipynb | 836 +++++++++++++++++++++++++ 1 file changed, 836 insertions(+) create mode 100644 tutorials/data_exploration_chebi.ipynb diff --git a/tutorials/data_exploration_chebi.ipynb b/tutorials/data_exploration_chebi.ipynb new file mode 100644 index 00000000..17c3ae33 --- /dev/null +++ b/tutorials/data_exploration_chebi.ipynb @@ -0,0 +1,836 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b", + "metadata": {}, + "source": [ + "# Introduction\n", + "\n", + "This notebook serves as a guide for new users of the `chebai` package, which is used for working with chemical data, especially focusing on ChEBI (Chemical Entities of Biological Interest). This notebook will explain how to instantiate the main data class, how the data files are structured, and how to work with different molecule encodings.\n", + "\n", + "One key aspect of the package is its **dataset management system**. In the training process, chemical datasets play a critical role by providing the necessary data for model learning and validation. The chebai package simplifies the handling of these datasets by **automatically creating** them as needed. 
This means that users do not have to manually prepare datasets before running models; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly.\n", + "\n", + "---\n" + ] + }, + { + "cell_type": "markdown", + "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d", + "metadata": {}, + "source": [ + "# 1. Instantiation of a Data Class\n", + "\n", + "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data\n", + "### Inheritance Hierarchy\n", + "\n", + "ChEBI data classes inherit from [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), which in turn inherits from [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22). Specifically:\n", + "\n", + "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for datasets that require dynamic behavior. It inherits from `XYBaseDataModule`, which provides the core methods for data loading and processing.\n", + "\n", + "- **`XYBaseDataModule`**: This is the base class for data modules, providing foundational properties and methods for handling and processing datasets, including data splitting, loading, and preprocessing.\n", + "\n", + "In summary, ChEBI data classes are designed to manage and preprocess chemical data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n", + "\n", + "\n", + "### Explanation\n", + "A ChEBI data class can be configured with the following main parameters:\n", + "\n", + "- **chebi_version (int)**: Specifies the version of the ChEBI database to be used. The default is `200`. 
Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\n", + "\n", + "- **chebi_version_train (int, optional)**: The version of ChEBI to use specifically for training and validation. If not set, the `chebi_version` specified will be used for all data splits, including training, validation, and test. Defaults to `None`.\n", + "\n", + "- **splits_file_path (str, optional)**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. Defaults to `None`.\n", + "\n", + "### Additional Input Parameters\n", + "\n", + "To get more control over various aspects of data loading, processing, and splitting, you can refer to documentation of additional parameters in docstrings of the respective classes: [`_ChEBIDataExtractor`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/chebi.py#L108), [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22), [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), etc.\n" + ] + }, + { + "cell_type": "markdown", + "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a", + "metadata": {}, + "source": [ + "# Available ChEBI Data Classes\n", + "\n", + "__Note__: Check the code implementation of classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/chebi.py):\n", + "\n", + "## `ChEBIOver100`\n", + "A class for extracting data from the ChEBI dataset with a threshold of 100 for selecting classes.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverX`.\n", + "\n", + "## `ChEBIOver50`\n", + "A class for extracting data from the ChEBI dataset with a threshold of 50 for selecting classes.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverX`.\n", + "\n", + "## `ChEBIOver100DeepSMILES`\n", + "A class for extracting data from the ChEBI dataset using the DeepChem SMILES reader with 
a threshold of 100.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverXDeepSMILES` and `ChEBIOver100`.\n", + "\n", + "## `ChEBIOver100SELFIES`\n", + "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 100.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver100`.\n", + "\n", + "## `ChEBIOver50SELFIES`\n", + "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 50.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver50`.\n", + "\n", + "## `ChEBIOver50Partial`\n", + "A dataset class that extracts a part of ChEBI based on subclasses of a given top class, with a threshold of 50 for selecting classes.\n", + "\n", + "- **Inheritance**: Inherits from `ChEBIOverXPartial` and `ChEBIOver50`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22", + "metadata": {}, + "outputs": [], + "source": [ + "from chebai.preprocessing.datasets.chebi import ChEBIOver50" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "a71b7301-6195-4155-a439-f5eb3183d0f3", + "metadata": {}, + "outputs": [], + "source": [ + "chebi_class = ChEBIOver50(chebi_version=231)" + ] + }, + { + "cell_type": "markdown", + "id": "8456b545-88c5-401d-baa5-47e8ae710f04", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "1655d489-25fe-46de-9feb-eeca5d36936f", + "metadata": {}, + "source": [ + "# 2. Preparation / Setup Methods\n", + "\n", + "Once a ChEBI data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n", + "### Automatic Execution: \n", + "These methods are executed automatically within the data class instance. 
Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n", + "\n", + "\n", + "### Why is Preparation Needed?\n", + "\n", + "- **Data Availability**: The preparation step ensures that the required ChEBI data files are downloaded or loaded, which are essential for analysis.\n", + "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n", + "\n", + "### Main Methods for Data Preprocessing\n", + "\n", + "The data preprocessing in a data class involves two main methods:\n", + "\n", + "1. **`prepare_data` Method**:\n", + " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\n", + " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n", + "\n", + "2. **`setup` Method**:\n", + " - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n", + " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. 
This method uses a subclass of Data Reader to perform the transformation.\n", + " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n", + "\n", + "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "f2df4bd1-cf34-4414-bce4-54379ffac006", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\\smiles_token\n", + "Cross-validation enabled: False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\n", + "saving 771 tokens to G:\\github-aditya0by0\\python-chebai\\chebai\\preprocessing\\bin\\smiles_token\\tokens.txt...\n", + "first 10 tokens: ['[*-]', '[Al-]', '[F-]', '.', '[H]', '[N]', '(', ')', '[Ag+]', 'C']\n" + ] + } + ], + "source": [ + "chebi_class.prepare_data()\n", + "chebi_class.setup()" + ] + }, + { + "cell_type": "markdown", + "id": "f5aaa12d-5f01-4b74-8b59-72562af953bf", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "8ababadb-003a-4c86-b92d-10e7bd1fba5e", + "metadata": {}, + "source": [ + "# 3. Different Data Files Created and their Structure\n", + "\n", + "\n", + "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their structures.\n", + "\n", + "### Data Files\n", + "\n", + "1. **`Raw Data Files`**: (e.g., `.obo` file)\n", + " - **Description**: Contains the raw ChEBI ontology data, downloaded directly from the ChEBI website. 
This file serves as the foundation for data processing.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n", + "\n", + "2. **`data.pkl`**\n", + " - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n", + "\n", + "3. **`data.pt`**\n", + " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", + "\n", + "4. **`classes.txt`**\n", + " - **Description**: A file containing the list of selected ChEBI classes based on the specified threshold. This file is crucial for ensuring that only relevant classes are included in the dataset.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/classes.txt`\n", + "\n", + "5. **`splits.csv`**\n", + " - **Description**: Contains saved data splits from previous runs. During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/splits.csv`\n", + "\n", + "### File Structure and Preprocessing Stages\n", + "\n", + "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\n", + "\n", + "1. 
**Raw Data Stage**:\n", + " - **File**: `chebi.obo`\n", + " - **Description**: This stage contains the raw ChEBI ontology data, serving as the initial input for further processing.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n", + "\n", + "2. **Processed Data Stage 1**:\n", + " - **File**: `data.pkl`\n", + " - **Description**: This stage includes the data after initial processing. It contains SMILES strings, class columns, and metadata but lacks data splits.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n", + " - **Additional File**: `classes.txt` - A file listing the relevant ChEBI classes.\n", + "\n", + "3. **Processed Data Stage 2**:\n", + " - **File**: `data.pt`\n", + " - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\n", + " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", + " - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n", + "\n", + "### Data Splits\n", + "\n", + "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n", + "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n", + "\n", + "### Summary of File Paths\n", + "\n", + "- **Raw Data**: `data/${chebi_version}/${dataset_name}/raw`\n", + "- **Processed Data 1**: `data/${chebi_version}/${dataset_name}/processed`\n", + "- **Processed Data 2**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}`\n", + "\n", + "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. 
It also facilitates reproducibility and traceability across different experiments." + ] + }, + { + "cell_type": "markdown", + "id": "a35c1d2b-9d6b-4c10-828b-b5912752c757", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "74adb549-9e02-472d-a535-78a584853b52", + "metadata": {}, + "source": [ + "# 4. Information Stored in the Files\n" + ] + }, + { + "cell_type": "markdown", + "id": "43329709-5134-4ce5-88e7-edd2176bf84d", + "metadata": {}, + "source": [ + "## chebi.obo\n", + "\n", + "The `chebi.obo` file is a key resource in the ChEBI (Chemical Entities of Biological Interest) dataset, containing the ontology data that defines various chemical entities and their relationships. This file is downloaded directly from the ChEBI database and serves as the foundational raw data for further processing in `chebai`.\n", + "\n", + "### Structure of `chebi.obo`\n", + "\n", + "The `chebi.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific chemical entity within the ChEBI ontology. These attributes include identifiers, names, relationships to other entities, and more.\n", + "\n", + "#### Example of a Term Document\n", + "\n", + "```plaintext\n", + "[Term]\n", + "id: CHEBI:24867\n", + "name: monoatomic ion\n", + "subset: 3_STAR\n", + "synonym: \"monoatomic ions\" RELATED [ChEBI]\n", + "is_a: CHEBI:24870\n", + "is_a: CHEBI:33238\n", + "```\n", + "\n", + "### Breakdown of Attributes\n", + "\n", + "Each term document in the `chebi.obo` file consists of the following key attributes:\n", + "\n", + "- **`[Term]`**: \n", + " - **Description**: Indicates the beginning of a new term in the ontology. 
Each term represents a distinct chemical entity.\n", + "\n", + "- **`id: CHEBI:24867`**: \n", + " - **Description**: A unique identifier for the chemical entity within the ChEBI database.\n", + " - **Example**: `CHEBI:24867` refers to the entity \"monoatomic ion.\"\n", + "\n", + "- **`name: monoatomic ion`**: \n", + " - **Description**: The common name of the chemical entity. This is the main descriptor used to identify the term.\n", + " - **Example**: \"monoatomic ion\" is the name of this chemical entity.\n", + "\n", + "- **`synonym: \"monoatomic ions\" RELATED [ChEBI]`**: \n", + " - **Description**: An alternative name for the entity. The `RELATED` scope marks the synonym as loosely equivalent, indicating a related term within the ChEBI ontology.\n", + "\n", + "- **`is_a: CHEBI:24870`** and **`is_a: CHEBI:33238`**: \n", + " - **Description**: Defines hierarchical relationships to other terms within the ontology. The `is_a` attribute indicates that the current entity is a subclass or specific instance of the referenced term.\n", + " - **Example**: The entity `CHEBI:24867` (\"monoatomic ion\") is a subclass of both `CHEBI:24870` and `CHEBI:33238`, meaning that it inherits the properties of both parent terms.\n", + "\n", + "The `chebi.obo` file is the starting point for the subsequent stages of preprocessing, from raw input files to processed, model-ready formats." + ] + }, + { + "cell_type": "markdown", + "id": "322bc926-69ff-4b93-9e95-5e8b85869c38", + "metadata": {}, + "source": [ + "## `data.pkl` File\n", + "\n", + "The `data.pkl` file, generated during the preprocessing stage, contains the processed ChEBI data in a dataframe format. 
Below is an example of how this data is structured:\n", + "\n", + "\n", + "\n", + "### Structure of `data.pkl`\n", + "`data.pkl` has the following structure: \n", + "- **Column 0**: Contains the ID of each ChEBI data instance.\n", + "- **Column 1**: Contains the name of each ChEBI data instance.\n", + "- **Column 2**: Contains the SMILES representation of the chemical.\n", + "- **Column 3 and onwards**: Contain the labels, one boolean column per selected ChEBI class.\n", + "\n", + "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "fd490270-59b8-4c1c-8b09-204defddf592", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "d7d16247-092c-4e8d-96c2-ab23931cf766", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Size of the data (rows x columns): (129184, 1335)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnameSMILES1722246825712580263430983992...143017143212143813146180147334156473166828166904167497167559
033429monoatomic monoanion[*-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
130151aluminide(1-)[Al-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
216042halide anion[*-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
317051fluoride[F-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
428741sodium fluoride[F-].[Na+]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
\n", + "

5 rows × 1335 columns

\n", + "
" + ], + "text/plain": [ + " id name SMILES 1722 2468 2571 2580 2634 \\\n", + "0 33429 monoatomic monoanion [*-] False False False False False \n", + "1 30151 aluminide(1-) [Al-] False False False False False \n", + "2 16042 halide anion [*-] False False False False False \n", + "3 17051 fluoride [F-] False False False False False \n", + "4 28741 sodium fluoride [F-].[Na+] False False False False False \n", + "\n", + " 3098 3992 ... 143017 143212 143813 146180 147334 156473 166828 \\\n", + "0 False False ... False False False False False False False \n", + "1 False False ... False False False False False False False \n", + "2 False False ... False False False False False False False \n", + "3 False False ... False False False False False False False \n", + "4 False False ... False False False False False False False \n", + "\n", + " 166904 167497 167559 \n", + "0 False False False \n", + "1 False False False \n", + "2 False False False \n", + "3 False False False \n", + "4 False False False \n", + "\n", + "[5 rows x 1335 columns]" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/chebi_v200/ChEBI50/processed/data.pkl\"))\n", + "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n", + "pkl_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "0d80ffbb-5f1e-4489-9bc8-d688c9be1d07", + "metadata": {}, + "source": [ + "## `data.pt` File\n", + "\n", + "The `data.pt` file is an important output of the preprocessing stage in `chebai`. It contains data in a format compatible with PyTorch, specifically as a list of dictionaries. 
Each dictionary in this list is structured to hold key information used for model training and evaluation.\n", + "\n", + "### Structure of `data.pt`\n", + "\n", + "The `data.pt` file is a list where each element is a dictionary with the following keys:\n", + "\n", + "- **`features`**: \n", + " - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n", + "\n", + "- **`labels`**: \n", + " - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n", + "\n", + "- **`ident`**: \n", + " - **Description**: This key holds identifiers for each data instance. These identifiers help track and reference the individual samples in the dataset.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "977ddd83-b469-4b58-ab1a-8574fb8769b4", + "metadata": {}, + "outputs": [], + "source": [ + "import torch" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "3266ade9-efdc-49fe-ae07-ed52b2eb52d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Type of loaded data: \n" + ] + } + ], + "source": [ + "data_pt = torch.load(r\"data/chebi_v200/ChEBI50/processed/smiles_token/data.pt\")\n", + "print(\"Type of loaded data:\", type(data_pt))" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "84cfa3e6-f60d-47c0-9f82-db3d5673d1e7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'features': [10], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 33429, 'group': None}\n", + "{'features': [11], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 30151, 'group': None}\n", + "{'features': [10], 'labels': array([False, False, False, ..., 
False, False, False]), 'ident': 16042, 'group': None}\n", + "{'features': [12], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 17051, 'group': None}\n", + "{'features': [12, 13, 32], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 28741, 'group': None}\n" + ] + } + ], + "source": [ + "for i in range(5):\n", + " print(data_pt[i])" + ] + }, + { + "cell_type": "markdown", + "id": "861da1c3-0401-49f0-a22f-109814ed95d5", + "metadata": {}, + "source": [ + "## `classes.txt` File\n", + "\n", + "The `classes.txt` file lists selected ChEBI (Chemical Entities of Biological Interest) classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique ChEBI class ID, identifying specific chemical entities within the ChEBI ontology.\n", + "\n", + "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "8d1fbe6c-beb8-4038-93d4-c56bc7628716", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1722\n", + "2468\n", + "2571\n", + "2580\n", + "2634\n" + ] + } + ], + "source": [ + "with open(r\"data/chebi_v200/ChEBI50/processed/classes.txt\", \"r\") as file:\n", + " for i in range(5):\n", + " line = file.readline()\n", + " print(line.strip())" + ] + }, + { + "cell_type": "markdown", + "id": "b058714f-e434-4367-89b9-74c129ac727f", + "metadata": {}, + "source": [ + "## `splits.csv` File\n", + "\n", + "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. 
This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different runs.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "3ebdcae4-4344-46bd-8fc0-a82ef5d40da5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idsplit
033429train
130151train
217051train
332129train
430340train
\n", + "
" + ], + "text/plain": [ + " id split\n", + "0 33429 train\n", + "1 30151 train\n", + "2 17051 train\n", + "3 32129 train\n", + "4 30340 train" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "csv_df = pd.read_csv(r\"data/chebi_v231/ChEBI50/processed/splits.csv\")\n", + "csv_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "ab110764-216d-4d52-a9d1-4412c8ac8c9d", + "metadata": {}, + "source": [ + "# 5. Example Molecule: Different Encodings\n", + "\n", + "`chebai` supports various encodings for molecules, such as SMILES and SELFIES. Let's take an example molecule and explore its different encodings.\n", + "\n", + "### Explanation:\n", + "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n", + "- **SELFIES (SELF-referencIng Embedded Strings)**: A more robust encoding that can handle a broader range of chemical structures.\n", + "\n", + "To illustrate different encodings of a molecule, let's consider the molecule **benzene**, which has the chemical formula **C₆H₆**. Here are the different encodings for benzene:\n", + "\n", + "### 1. **SMILES (Simplified Molecular Input Line Entry System)**\n", + " - **Benzene SMILES**: `c1ccccc1`\n", + " - **Explanation**: \n", + " - `c1ccccc1` represents a six-membered aromatic ring, with lowercase `c` indicating aromatic carbon atoms.\n", + "\n", + "### 2. 
**SELFIES (SELF-referencIng Embedded Strings)**\n", + " - **Benzene SELFIES**: `[C][=C][C][=C][C][=C][Ring1][=Branch1]`\n", + " - **Explanation**: \n", + " - Each `[C]` represents a carbon atom, `[=C]` represents a carbon atom attached by a double bond, and the trailing `[Ring1][=Branch1]` tokens close the six-membered ring.\n", + " - SELFIES encodes benzene's aromatic ring in kekulized form, i.e. as alternating single and double bonds.\n", + "\n", + "These different encodings provide various ways to represent the structure and properties of benzene, each suited to different computational tasks such as molecule identification, database searches, and pattern recognition in cheminformatics." + ] + }, + { + "cell_type": "markdown", + "id": "93e328cf-09f9-4694-b175-28320590937d", + "metadata": {}, + "source": [ + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (env_chebai)", + "language": "python", + "name": "env_chebai" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 587c0264b6a9c79a7d2b6be490c03486acc197f8 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Mon, 30 Sep 2024 16:14:31 +0200 Subject: [PATCH 09/29] go_notebook: data exploration --- tutorials/data_exploration_go.ipynb | 551 ++++++++++++++++++++++++++++ 1 file changed, 551 insertions(+) create mode 100644 tutorials/data_exploration_go.ipynb diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb new file mode 100644 index 00000000..391192a1 --- /dev/null +++ b/tutorials/data_exploration_go.ipynb @@ -0,0 +1,551 @@ +{ + "cells": [ + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "# Introduction\n", + "\n", + "This notebook serves as a guide for new users of the `chebai` package, which is used for working with chemical data, especially focusing on Gene Ontology (GO) and Swiss UniProt 
Protein data. This notebook will explain how to instantiate the main data class, how the data files are structured, and how to work with different molecule encodings.\n", + "\n", + "One key aspect of the package is its **dataset management system**. In the training process, chemical datasets play a critical role by providing the necessary data for model learning and validation. The chebai package simplifies the handling of these datasets by **automatically creating** them as needed. This means that users do not have to manually prepare datasets before running models; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly.\n", + "\n", + "---" + ], + "id": "da687d32ba48b188" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "# Information for Protein Dataset\n", + "\n", + "# 1. Instantiation of a Data Class\n", + "\n", + "To start working with `chebai`, you first need to instantiate a GO_UniProt data class. This class is responsible for managing, interacting with, and preprocessing the GO and UniProt data\n", + "### Inheritance Hierarchy\n", + "\n", + "GO_UniProt data classes inherit from [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), which in turn inherits from [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22). Specifically:\n", + "\n", + "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for datasets that require dynamic behavior. 
It inherits from `XYBaseDataModule`, which provides the core methods for data loading and processing.\n", + "\n", + "- **`XYBaseDataModule`**: This is the base class for data modules, providing foundational properties and methods for handling and processing datasets, including data splitting, loading, and preprocessing.\n", + "\n", + "In summary, GO_UniProt data classes are designed to manage and preprocess protein data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n", + "\n", + "\n", + "### Configuration Parameters\n", + "\n", + "Data classes related to proteins can be configured using the following main parameters:\n", + "\n", + "- **`go_branch (str)`**: The Gene Ontology (GO) branch. The default value is `\"all\"`, which includes all branches of GO in the dataset.\n", + "\n", + "- **`splits_file_path (str, optional)`**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. 
The default is `None`.\n", + "\n", + "### Additional Input Parameters\n", + "\n", + "To get more control over various aspects of data loading, processing, and splitting, you can refer to documentation of additional parameters in docstrings of the respective classes: [`_GOUniProtDataExtractor`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py#L33), [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22), [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), etc.\n", + "\n", + "### Available GOUniProt Data Classes\n", + "\n", + "__Note__: Check the code implementation of classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py):\n", + "\n", + "#### `GOUniProtOver250`\n", + "\n", + "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 250 for selecting classes.\n", + "\n", + "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n", + "\n", + "#### `GOUniProtOver50`\n", + "\n", + "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 50 for selecting classes.\n", + "\n", + "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n" + ], + "id": "64585012b0d7f66f" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "### Instantiation Example", + "id": "605bbca601037df2" + }, + { + "metadata": {}, + "cell_type": "code", + "source": "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250", + "id": "440f203ceaf7e4b7", + "outputs": [], + "execution_count": null + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T14:08:21.236447Z", + "start_time": "2024-09-30T14:08:21.130242Z" + } + }, + "cell_type": "code", + "source": "go_class = GOUniProtOver250()", + "id": "a648346d81d0dc5e", + "outputs": [], + "execution_count": 2 + }, + { + 
"metadata": {}, + "cell_type": "markdown", + "source": [ + "## GOUniProt Data File Structure\n", + "\n", + "1. **`Raw Data Files`**: (e.g., `.obo` file and `.dat` file)\n", + " - **Description**: These files contain the raw GO ontology and Swiss UniProt data, which are downloaded directly from their respective websites. They serve as the foundation for data processing. Since there are no versions associated with this dataset, common raw files are used for all subsets of the data.\n", + " - **File Paths**:\n", + " - `data/GO_UniProt/raw/${filename}.obo`\n", + " - `data/GO_UniProt/raw/${filename}.dat`\n", + "\n", + "2. **`data.pkl`**\n", + " - **Description**: This file is generated by the `prepare_data` method and contains the processed data in a dataframe format. It includes protein IDs, data representations (such as amino acid sequences), and class columns with boolean values.\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\n", + "\n", + "3. **`data.pt`**\n", + " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, making it ready for model input.\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\n", + "\n", + "4. **`classes.txt`**\n", + " - **Description**: This file lists the selected GO or UniProt classes based on a specified threshold. It ensures that only the relevant classes are included in the dataset for analysis.\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\n", + "\n", + "5. **`splits.csv`**\n", + " - **Description**: This file contains saved data splits from previous runs. 
During subsequent runs, it is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n", + " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\n", + "\n", + "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. Otherwise, it will just be `${dataset_name}`.\n" + ], + "id": "ee174b61b36c71aa" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "# 2. Preparation / Setup Methods\n", + "\n", + "Once a GO_UniProt data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n", + "### Automatic Execution: \n", + "These methods are executed automatically within the data class instance. Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n", + "\n", + "\n", + "### Why is Preparation Needed?\n", + "\n", + "- **Data Availability**: The preparation step ensures that the required GO and UniProt data files are downloaded or loaded, which are essential for analysis.\n", + "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n", + "\n", + "### Main Methods for Data Preprocessing\n", + "\n", + "The data preprocessing in a data class involves two main methods:\n", + "\n", + "1. **`prepare_data` Method**:\n", + " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). 
The dataframe includes columns such as IDs, data representations, and labels.\n", + " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n", + "\n", + "2. **`setup` Method**:\n", + " - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n", + " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\n", + " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n", + "\n", + "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes." 
+ ], + "id": "2328e824c4dafb2d" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "go_class.prepare_data()\n", + "go_class.setup()" + ], + "id": "9f77351090560bc4" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "## data.pkl", + "id": "735844f0b2474ad6" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T14:08:33.990378Z", + "start_time": "2024-09-30T14:08:33.959459Z" + } + }, + "cell_type": "code", + "source": "import pandas as pd", + "id": "b4da7e73e251e1d1", + "outputs": [], + "execution_count": 3 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T14:10:12.796911Z", + "start_time": "2024-09-30T14:10:06.052276Z" + } + }, + "cell_type": "code", + "source": [ + "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/GO_UniProt/GO250_BP/processed/data.pkl\"))\n", + "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n", + "pkl_df.head()" + ], + "id": "b66fbb9b720d053c", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Size of the data (rows x columns): (27459, 1050)\n" + ] + }, + { + "data": { + "text/plain": [ + " swiss_id accession \\\n", + "8 14331_ARATH P42643,Q945M2,Q9M0S7 \n", + "9 14331_CAEEL P41932,Q21537 \n", + "10 14331_MAIZE P49106 \n", + "13 14332_MAIZE Q01526 \n", + "14 14333_ARATH P42644,F4KBI7,Q945L2 \n", + "\n", + " go_ids \\\n", + "8 [19222] \n", + "9 [132, 1708, 5634, 5737, 5938, 6611, 7346, 8340... \n", + "10 [3677, 5634, 10468, 44877] \n", + "13 [3677, 5634, 10468, 44877] \n", + "14 [5634, 5737, 6995, 9409, 9631, 16036, 19222, 5... \n", + "\n", + " sequence 41 75 122 \\\n", + "8 MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT... False False False \n", + "9 MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL... False False False \n", + "10 MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE... False False False \n", + "13 MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE... 
False False False \n", + "14 MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL... False False False \n", + "\n", + " 165 209 226 ... 2000145 2000146 2000147 2000241 2000243 \\\n", + "8 False False False ... False False False False False \n", + "9 False False False ... False False False False False \n", + "10 False False False ... False False False False False \n", + "13 False False False ... False False False False False \n", + "14 False False False ... False False False False False \n", + "\n", + " 2000377 2001020 2001141 2001233 2001234 \n", + "8 False False False False False \n", + "9 False False False False False \n", + "10 False False False False False \n", + "13 False False False False False \n", + "14 False False False False False \n", + "\n", + "[5 rows x 1050 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
swiss_idaccessiongo_idssequence4175122165209226...2000145200014620001472000241200024320003772001020200114120012332001234
814331_ARATHP42643,Q945M2,Q9M0S7[19222]MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
914331_CAEELP41932,Q21537[132, 1708, 5634, 5737, 5938, 6611, 7346, 8340...MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1014331_MAIZEP49106[3677, 5634, 10468, 44877]MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1314332_MAIZEQ01526[3677, 5634, 10468, 44877]MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1414333_ARATHP42644,F4KBI7,Q945L2[5634, 5737, 6995, 9409, 9631, 16036, 19222, 5...MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
\n", + "

5 rows × 1050 columns

\n", + "
" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 7 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "## data.pt", + "id": "2c9f23883c66b48d" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T14:10:35.034002Z", + "start_time": "2024-09-30T14:10:35.018342Z" + } + }, + "cell_type": "code", + "source": "import torch", + "id": "85b097601fb242d6", + "outputs": [], + "execution_count": 8 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T14:11:36.443693Z", + "start_time": "2024-09-30T14:11:34.199285Z" + } + }, + "cell_type": "code", + "source": [ + "data_pt = torch.load(r\"data/GO_UniProt/GO250_BP/processed/protein_token/data.pt\")\n", + "print(\"Type of loaded data:\", type(data_pt))\n", + "for i in range(1):\n", + " print(data_pt[i])" + ], + "id": "289a54a71dec20fb", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Type of loaded data: \n", + "{'features': [10, 14, 15, 23, 13, 14, 11, 11, 14, 16, 20, 27, 25, 28, 22, 10, 14, 21, 17, 14, 27, 18, 14, 27, 16, 22, 27, 27, 10, 28, 27, 25, 10, 27, 21, 28, 14, 21, 14, 28, 20, 21, 20, 27, 17, 15, 28, 27, 27, 16, 19, 17, 17, 11, 28, 14, 22, 21, 19, 28, 12, 13, 14, 16, 16, 14, 11, 26, 16, 12, 12, 11, 11, 12, 27, 18, 21, 27, 27, 11, 16, 13, 19, 20, 20, 29, 28, 11, 17, 12, 16, 20, 22, 16, 11, 21, 12, 27, 15, 27, 17, 11, 20, 12, 24, 20, 13, 12, 17, 21, 17, 17, 20, 15, 12, 17, 28, 23, 14, 14, 14, 11, 13, 20, 11, 21, 28, 25, 22, 17, 21, 10, 21, 13, 20, 22, 29, 16, 22, 17, 14, 27, 25, 21, 11, 13, 18, 27, 16, 21, 20, 14, 14, 27, 29, 15, 17, 15, 14, 22, 21, 14, 14, 18, 20, 12, 14, 19, 11, 27, 17, 14, 23, 15, 29, 23, 12, 16, 17, 13, 17, 14, 17, 19, 25, 11, 28, 25, 22, 22, 27, 12, 17, 19, 11, 23, 20, 16, 14, 24, 19, 17, 14, 21, 18, 14, 25, 20, 27, 14, 12, 14, 27, 17, 20, 15, 17, 13, 27, 27, 11, 22, 21, 20, 11, 15, 17, 12, 10, 18, 17, 17, 16, 20, 19, 17, 15, 17, 26, 15, 11, 20, 10, 18, 
20, 20, 28, 14, 20, 20, 12, 21, 27, 14, 14, 23, 14, 14, 14, 21, 23, 14, 20, 27, 18, 18, 11], 'labels': array([False, False, False, ..., False, False, False]), 'ident': '14331_ARATH', 'group': None}\n" + ] + } + ], + "execution_count": 11 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Protein Representation Using Amino Acid Sequence Notation\n", + "\n", + "Proteins are composed of chains of amino acids, and these sequences can be represented using a one-letter notation for each amino acid. This notation provides a concise way to describe the primary structure of a protein.\n", + "\n", + "### Example Protein Sequence\n", + "\n", + "Protein: **Lysozyme C** from **Gallus gallus** (Chicken). \n", + "[Lysozyme C - UniProtKB P00698](https://www.uniprot.org/uniprotkb/P00698/entry#function)\n", + "\n", + "- **Sequence**: `MRSLLILVLCFLPLAALGKVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL`\n", + "- **Sequence Length**: 147\n", + "\n", + "In this sequence, each letter corresponds to a specific amino acid. This notation is widely used in bioinformatics and molecular biology to represent protein sequences.\n", + "\n", + "### The 20 Amino Acids and Their One-Letter Notations\n", + "\n", + "Here is a list of the 20 standard amino acids, along with their one-letter notations and descriptions:\n", + "\n", + "| One-Letter Notation | Amino Acid Name | Description |\n", + "|---------------------|----------------------|---------------------------------------------------------|\n", + "| **A** | Alanine | Non-polar, aliphatic amino acid. |\n", + "| **C** | Cysteine | Polar, contains a thiol group, forms disulfide bonds. |\n", + "| **D** | Aspartic Acid | Acidic, negatively charged at physiological pH. |\n", + "| **E** | Glutamic Acid | Acidic, negatively charged at physiological pH. |\n", + "| **F** | Phenylalanine | Aromatic, non-polar. 
|\n", + "| **G** | Glycine | Smallest amino acid, non-polar. |\n", + "| **H** | Histidine | Polar, positively charged, can participate in enzyme active sites. |\n", + "| **I** | Isoleucine | Non-polar, aliphatic. |\n", + "| **K** | Lysine | Basic, positively charged at physiological pH. |\n", + "| **L** | Leucine | Non-polar, aliphatic. |\n", + "| **M** | Methionine | Non-polar, contains sulfur, start codon in mRNA translation. |\n", + "| **N** | Asparagine | Polar, uncharged. |\n", + "| **P** | Proline | Non-polar, introduces kinks in protein chains. |\n", + "| **Q** | Glutamine | Polar, uncharged. |\n", + "| **R** | Arginine | Basic, positively charged, involved in binding phosphate groups. |\n", + "| **S** | Serine | Polar, can be phosphorylated. |\n", + "| **T** | Threonine | Polar, can be phosphorylated. |\n", + "| **V** | Valine | Non-polar, aliphatic. |\n", + "| **W** | Tryptophan | Aromatic, non-polar, largest amino acid. |\n", + "| **Y** | Tyrosine | Aromatic, polar, can be phosphorylated. |\n", + "\n", + "### Understanding Protein Sequences\n", + "\n", + "In the example sequence, each letter represents one of the above amino acids. 
The sequence reflects the specific order of amino acids in the protein, which is critical for its structure and function.\n", + "\n", + "This notation is used extensively in various bioinformatics tools and databases to study protein structure, function, and interactions.\n", + "\n", + "\n", + "_Note_: For more on amino acid sequences, see https://en.wikipedia.org/wiki/Protein_primary_structure" + ], + "id": "481b8c0271ec9636" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 71e9888d54276413f4d145c031ea56cd60d0f228 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Mon, 30 Sep 2024 20:29:32 +0200 Subject: [PATCH 10/29] Delete data_exploration.ipynb --- tutorials/data_exploration.ipynb | 1294 ------------------------------ 1 file changed, 1294 deletions(-) delete mode 100644 tutorials/data_exploration.ipynb diff --git a/tutorials/data_exploration.ipynb b/tutorials/data_exploration.ipynb deleted file mode 100644 index fce3a9f7..00000000 --- a/tutorials/data_exploration.ipynb +++ /dev/null @@ -1,1294 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b", - "metadata": {}, - "source": [ - "# Introduction\n", - "\n", - "This notebook serves as a guide for new users of the `chebai` package, which is used for working with chemical data, especially focusing on ChEBI (Chemical Entities of Biological Interest). 
This notebook will explain how to instantiate the main data class, how the data files are structured, and how to work with different molecule encodings.\n", - "\n", - "---\n" - ] - }, - { - "cell_type": "markdown", - "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d", - "metadata": {}, - "source": [ - "# 1. Instantiation of a Data Class\n", - "\n", - "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data.\n", - "### Inheritance Hierarchy\n", - "\n", - "ChEBI data classes inherit from `_DynamicDataset`, which in turn inherits from `XYBaseDataModule`. Specifically:\n", - "\n", - "- **`XYBaseDataModule`**: This is the base class for all data modules in `chebai`, providing foundational properties and methods for handling and processing datasets, including loading a stored dataset and creating a `DataLoader`.\n", - "\n", - "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for some datasets (e.g. the ChEBI and Gene Ontology datasets). The defining feature is the dynamically created data split into training, validation and test sets. It inherits from `XYBaseDataModule`.\n", - "\n", - "\n", - "\n", - "In summary, ChEBI data classes are designed to manage and preprocess chemical data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n", - "\n", - "\n", - "### Explanation\n", - "A ChEBI data class can be configured with the following main parameters:\n", - "\n", - "- **chebi_version (int)**: Specifies the version of the ChEBI dataset to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\n", - "\n", - "- **chebi_version_train (int, optional)**: The version of ChEBI to use specifically for training and validation. 
If not set, the `chebi_version` specified will be used for all data splits, including training, validation, and test. Defaults to `None`.\n", - "\n", - "- **single_class (int, optional)**: The ID of the single class to predict. If not set, predictions will be made for all available labels. Defaults to `None`.\n", - "\n", - "- **dynamic_data_split_seed (int, optional)**: The seed for random data splitting, which ensures reproducibility. Defaults to `42`.\n", - "\n", - "- **splits_file_path (str, optional)**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. Defaults to `None`.\n", - "\n", - "- **kwargs**: Additional keyword arguments passed to `XYBaseDataModule`.\n", - "\n", - "These parameters provide flexibility in handling and processing the data, allowing you to set specific versions for different stages of analysis and manage how data is split for training and validation.\n", - "\n", - "### Additional Input Parameters\n", - "\n", - "The `XYBaseDa ChEBI data class, which `ChebaiData` may use internally, includes several important parameters for data loading and processing:\n", - "\n", - "- **batch_size (int)**: The batch size for data loading. Default is `1`.\n", - "\n", - "- **train_split (float)**: The ratio of training data to total data and the ratio of test data to (validation + test) data. Default is `0.85`.\n", - "\n", - "- **reader_kwargs (dict)**: Additional keyword arguments to be passed to the data reader. Default is `None`.\n", - "\n", - "- **prediction_kind (str)**: Specifies the kind of prediction to be performed, relevant only for the `predict_dataloader`. Default is `\"test\"`.\n", - "\n", - "- **data_limit (Optional[int])**: The maximum number of data samples to load. If set to `None`, the complete dataset will be used. Default is `None`.\n", - "\n", - "- **label_filter (Optional[int])**: The index of the label to filter. 
Default is `None`.\n", - "\n", - "- **balance_after_filter (Optional[float])**: The ratio of negative samples to positive samples after filtering. Default is `None`.\n", - "\n", - "- **num_workers (int)**: The number of worker processes for data loading. Default is `1`.\n", - "\n", - "- **inner_k_folds (int)**: The number of folds for inner cross-validation. Use `-1` to disable inner cross-validation. Default is `-1`.\n", - "\n", - "- **fold_index (Optional[int])**: The index of the fold to use for training and validation. Default is `None`.\n", - "\n", - "- **base_dir (Optional[str])**: The base directory for storing processed and raw data. Default is `None`.\n", - "\n", - "- **kwargs**: Additional keyword arguments.\n", - "\n", - "These parameters allow you to control various aspects of data loading, processing, and splitting, providing flexibility in how datasets are managed throughout your analysis pipeline.\n" - ] - }, - { - "cell_type": "markdown", - "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a", - "metadata": {}, - "source": [ - "# Available ChEBI Data Classes\n", - "\n", - "## `ChEBIOver100`\n", - "A class for extracting data from the ChEBI dataset with a threshold of 100 for selecting classes.\n", - "\n", - "- **Inheritance**: Inherits from `ChEBIOverX`.\n", - "\n", - "## `ChEBIOver50`\n", - "A class for extracting data from the ChEBI dataset with a threshold of 50 for selecting classes.\n", - "\n", - "- **Inheritance**: Inherits from `ChEBIOverX`.\n", - "\n", - "## `ChEBIOver100DeepSMILES`\n", - "A class for extracting data from the ChEBI dataset using the DeepChem SMILES reader with a threshold of 100.\n", - "\n", - "- **Inheritance**: Inherits from `ChEBIOverXDeepSMILES` and `ChEBIOver100`.\n", - "\n", - "## `ChEBIOver100SELFIES`\n", - "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 100.\n", - "\n", - "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver100`.\n", - "\n", - "## 
`ChEBIOver50SELFIES`\n", - "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 50.\n", - "\n", - "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver50`.\n", - "\n", - "## `ChEBIOver50Partial`\n", - "A dataset class that extracts a part of ChEBI based on subclasses of a given top class, with a threshold of 50 for selecting classes.\n", - "\n", - "- **Inheritance**: Inherits from `ChEBIOverXPartial` and `ChEBIOver50`.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22", - "metadata": {}, - "outputs": [], - "source": [ - "from chebai.preprocessing.datasets.chebi import ChEBIOver50" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "a71b7301-6195-4155-a439-f5eb3183d0f3", - "metadata": {}, - "outputs": [], - "source": [ - "chebi_class = ChEBIOver50(chebi_version=231)" - ] - }, - { - "cell_type": "markdown", - "id": "8456b545-88c5-401d-baa5-47e8ae710f04", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "1655d489-25fe-46de-9feb-eeca5d36936f", - "metadata": {}, - "source": [ - "# 2. Preparation / Setup Methods\n", - "\n", - "Once a ChEBI data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n", - "\n", - "### Why is Preparation Needed?\n", - "\n", - "- **Data Availability**: The preparation step ensures that the required ChEBI data files are downloaded or loaded, which are essential for analysis.\n", - "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n", - "\n", - "### Main Methods for Data Preprocessing\n", - "\n", - "The data preprocessing in a data class involves two main methods:\n", - "\n", - "1. 
**`prepare_data` Method**:\n", - " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\n", - " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n", - "\n", - "2. **`setup` Method**:\n", - " - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n", - " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\n", - " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n", - "\n", - "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes." 
- ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "f2df4bd1-cf34-4414-bce4-54379ffac006", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\\smiles_token\n", - "Cross-validation enabled: False\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\n", - "saving 771 tokens to G:\\github-aditya0by0\\python-chebai\\chebai\\preprocessing\\bin\\smiles_token\\tokens.txt...\n", - "first 10 tokens: ['[*-]', '[Al-]', '[F-]', '.', '[H]', '[N]', '(', ')', '[Ag+]', 'C']\n" - ] - } - ], - "source": [ - "chebi_class.prepare_data()\n", - "chebi_class.setup()" - ] - }, - { - "cell_type": "markdown", - "id": "f5aaa12d-5f01-4b74-8b59-72562af953bf", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "8ababadb-003a-4c86-b92d-10e7bd1fba5e", - "metadata": {}, - "source": [ - "# 3. Different Data Files Created and their Structure\n", - "\n", - "\n", - "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their structures.\n", - "\n", - "### Data Files\n", - "\n", - "1. **`Raw Data Files`**: (e.g., `.obo` file)\n", - " - **Description**: Contains the raw ChEBI ontology data in OBO format, downloaded directly from the ChEBI website. This file serves as the foundation for data processing.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n", - "\n", - "2. **`data.pkl`**\n", - " - **Description**: Generated by the `prepare_data` method, this file contains processed data in a Pandas dataframe format. 
It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n", - "\n", - "3. **`data.pt`**\n", - " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", - "\n", - "4. **`classes.txt`**\n", - " - **Description**: A file containing the list of selected ChEBI classes based on the specified threshold. This file is crucial for ensuring that only relevant classes are included in the dataset.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/classes.txt`\n", - "\n", - "5. **`splits.csv`**\n", - " - **Description**: Contains saved data splits from previous runs. During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/splits.csv`\n", - "\n", - "### File Structure and Preprocessing Stages\n", - "\n", - "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\n", - "\n", - "1. **Raw Data Stage**:\n", - " - **File**: `chebi.obo`\n", - " - **Description**: This stage contains the raw ChEBI ontology data, serving as the initial input for further processing.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n", - "\n", - "2. **Processed Data Stage 1**:\n", - " - **File**: `data.pkl`\n", - " - **Description**: This stage includes the data after initial processing. 
It contains SMILES strings, class columns, and metadata but lacks data splits.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n", - " - **Additional File**: `classes.txt` - A file listing the relevant ChEBI classes.\n", - "\n", - "3. **Processed Data Stage 2**:\n", - " - **File**: `data.pt`\n", - " - **Description**: This final stage includes the tokenized data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", - " - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n", - "\n", - "### Data Splits\n", - "\n", - "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n", - "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n", - "\n", - "### Summary of File Paths\n", - "\n", - "- **Raw Data**: `data/${chebi_version}/${dataset_name}/raw`\n", - "- **Processed Data 1**: `data/${chebi_version}/${dataset_name}/processed`\n", - "- **Processed Data 2**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}`\n", - "\n", - "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. It also facilitates reproducibility and traceability across different experiments." - ] - }, - { - "cell_type": "markdown", - "id": "a35c1d2b-9d6b-4c10-828b-b5912752c757", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "74adb549-9e02-472d-a535-78a584853b52", - "metadata": {}, - "source": [ - "# 4. 
Information Stored in the Files\n" - ] - }, - { - "cell_type": "markdown", - "id": "43329709-5134-4ce5-88e7-edd2176bf84d", - "metadata": {}, - "source": [ - "## chebi.obo\n", - "\n", - "The `chebi.obo` file is a key resource in the ChEBI (Chemical Entities of Biological Interest) dataset, containing the ontology data that defines various chemical entities and their relationships. This file is downloaded directly from the ChEBI database and serves as the foundational raw data for further processing in `chebai`.\n", - "\n", - "### Structure of `chebi.obo`\n", - "\n", - "The `chebi.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific chemical entity within the ChEBI ontology. These attributes include identifiers, names, relationships to other entities, and more.\n", - "\n", - "#### Example of a Term Document\n", - "\n", - "```plaintext\n", - "[Term]\n", - "id: CHEBI:24867\n", - "name: monoatomic ion\n", - "subset: 3_STAR\n", - "synonym: \"monoatomic ions\" RELATED [ChEBI]\n", - "is_a: CHEBI:24870\n", - "is_a: CHEBI:33238\n", - "```\n", - "\n", - "### Breakdown of Attributes\n", - "\n", - "Each term document in the `chebi.obo` file consists of the following key attributes:\n", - "\n", - "- **`[Term]`**: \n", - " - **Description**: Indicates the beginning of a new term in the ontology. Each term represents a distinct chemical entity.\n", - "\n", - "- **`id: CHEBI:24867`**: \n", - " - **Description**: A unique identifier for the chemical entity within the ChEBI database.\n", - " - **Example**: `CHEBI:24867` refers to the entity \"monoatomic ion.\"\n", - "\n", - "- **`name: monoatomic ion`**: \n", - " - **Description**: The common name of the chemical entity. 
This is the main descriptor used to identify the term.\n", - " - **Example**: \"monoatomic ion\" is the name of this entity.\n", - "\n", - "- **`synonym: \"monoatomic ions\" RELATED [ChEBI]`**: \n", - " - **Description**: An alternative name for the entity. The `RELATED` scope marks the synonym as indicating a related term within the ChEBI ontology.\n", - "\n", - "- **`is_a: CHEBI:24870`** and **`is_a: CHEBI:33238`**: \n", - " - **Description**: Defines hierarchical relationships to other terms within the ontology. The `is_a` attribute indicates that the current entity is a subclass or specific instance of the referenced term.\n", - " - **Example**: The entity `CHEBI:24867` (\"monoatomic ion\") is a subclass of both `CHEBI:24870` and `CHEBI:33238`, meaning that it inherits the properties of both parent terms." - ] - }, - { - "cell_type": "markdown", - "id": "322bc926-69ff-4b93-9e95-5e8b85869c38", - "metadata": {}, - "source": [ - "## `data.pkl` File\n", - "\n", - "The `data.pkl` file, generated during the preprocessing stage, contains the processed ChEBI data in a dataframe format. 
Below is an example of how this data is structured:\n", - "\n", - "\n", - "\n", - "### Structure of `data.pkl`\n", - "`data.pkl` has the following structure: \n", - "- **Column 0**: Contains the ID of each ChEBI data instance.\n", - "- **Column 1**: Contains the name of each ChEBI data instance.\n", - "- **Column 2**: Contains the SMILES representation of the chemical.\n", - "- **Column 3 and onwards**: Contains the labels, starting from column 3.\n", - "\n", - "This structure ensures that the data is organized and ready for further processing, such as encoding.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "fd490270-59b8-4c1c-8b09-204defddf592", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "d7d16247-092c-4e8d-96c2-ab23931cf766", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Size of the data (rows x columns): (129184, 1335)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnameSMILES1722246825712580263430983992...143017143212143813146180147334156473166828166904167497167559
033429monoatomic monoanion[*-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
130151aluminide(1-)[Al-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
216042halide anion[*-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
317051fluoride[F-]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
428741sodium fluoride[F-].[Na+]FalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
\n", - "

5 rows × 1335 columns

\n", - "
" - ], - "text/plain": [ - " id name SMILES 1722 2468 2571 2580 2634 \\\n", - "0 33429 monoatomic monoanion [*-] False False False False False \n", - "1 30151 aluminide(1-) [Al-] False False False False False \n", - "2 16042 halide anion [*-] False False False False False \n", - "3 17051 fluoride [F-] False False False False False \n", - "4 28741 sodium fluoride [F-].[Na+] False False False False False \n", - "\n", - " 3098 3992 ... 143017 143212 143813 146180 147334 156473 166828 \\\n", - "0 False False ... False False False False False False False \n", - "1 False False ... False False False False False False False \n", - "2 False False ... False False False False False False False \n", - "3 False False ... False False False False False False False \n", - "4 False False ... False False False False False False False \n", - "\n", - " 166904 167497 167559 \n", - "0 False False False \n", - "1 False False False \n", - "2 False False False \n", - "3 False False False \n", - "4 False False False \n", - "\n", - "[5 rows x 1335 columns]" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/chebi_v200/ChEBI50/processed/data.pkl\"))\n", - "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n", - "pkl_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "0d80ffbb-5f1e-4489-9bc8-d688c9be1d07", - "metadata": {}, - "source": [ - "## `data.pt` File\n", - "\n", - "The `data.pt` file is an important output of the preprocessing stage in `chebai`. It contains data in a format compatible with PyTorch, specifically as a list of dictionaries. 
Each dictionary in this list is structured to hold key information used for model training and evaluation.\n", - "\n", - "### Structure of `data.pt`\n", - "\n", - "The `data.pt` file is a list where each element is a dictionary with the following keys:\n", - "\n", - "- **`features`**: \n", - " - **Description**: This key holds the input features for the model. In the example below, the features are stored as a list of token indices and represent the encoded input used by the model for training and evaluation.\n", - "\n", - "- **`labels`**: \n", - " - **Description**: This key contains the labels or target values associated with each instance. In the example below, the labels are stored as a boolean array and are used by the model to learn and make predictions.\n", - "\n", - "- **`ident`**: \n", - " - **Description**: This key holds identifiers for each data instance. These identifiers help track and reference the individual samples in the dataset.\n", - "\n", - "- **`group`**: \n", - " - **Description**: This key is present in every entry; it is `None` for all instances shown in the example below.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "id": "977ddd83-b469-4b58-ab1a-8574fb8769b4", - "metadata": {}, - "outputs": [], - "source": [ - "import torch" - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "id": "3266ade9-efdc-49fe-ae07-ed52b2eb52d0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Type of loaded data: \n" - ] - } - ], - "source": [ - "data_pt = torch.load(r\"data/chebi_v200/ChEBI50/processed/smiles_token/data.pt\")\n", - "print(\"Type of loaded data:\", type(data_pt))" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "id": "84cfa3e6-f60d-47c0-9f82-db3d5673d1e7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'features': [10], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 33429, 
'group': None}\n", - "{'features': [11], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 30151, 'group': None}\n", - "{'features': [10], 'labels': array([False, False, False, ..., 
False, False, False]), 'ident': 16042, 'group': None}\n", - "{'features': [12], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 17051, 'group': None}\n", - "{'features': [12, 13, 32], 'labels': array([False, False, False, ..., False, False, False]), 'ident': 28741, 'group': None}\n" - ] - } - ], - "source": [ - "for i in range(5):\n", - " print(data_pt[i])" - ] - }, - { - "cell_type": "markdown", - "id": "861da1c3-0401-49f0-a22f-109814ed95d5", - "metadata": {}, - "source": [ - "## `classes.txt` File\n", - "\n", - "The `classes.txt` file lists selected ChEBI (Chemical Entities of Biological Interest) classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique ChEBI class ID, identifying specific chemical entities within the ChEBI ontology.\n", - "\n", - "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 87, - "id": "8d1fbe6c-beb8-4038-93d4-c56bc7628716", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1722\n", - "2468\n", - "2571\n", - "2580\n", - "2634\n" - ] - } - ], - "source": [ - "with open(r\"data/chebi_v200/ChEBI50/processed/classes.txt\", \"r\") as file:\n", - " for i in range(5):\n", - " line = file.readline()\n", - " print(line.strip())" - ] - }, - { - "cell_type": "markdown", - "id": "b058714f-e434-4367-89b9-74c129ac727f", - "metadata": {}, - "source": [ - "## `splits.csv` File\n", - "\n", - "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. 
This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different runs.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "id": "3ebdcae4-4344-46bd-8fc0-a82ef5d40da5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idsplit
033429train
130151train
217051train
332129train
430340train
\n", - "
" - ], - "text/plain": [ - " id split\n", - "0 33429 train\n", - "1 30151 train\n", - "2 17051 train\n", - "3 32129 train\n", - "4 30340 train" - ] - }, - "execution_count": 98, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "csv_df = pd.read_csv(r\"data/chebi_v231/ChEBI50/processed/splits.csv\")\n", - "csv_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "ab110764-216d-4d52-a9d1-4412c8ac8c9d", - "metadata": {}, - "source": [ - "# 5. Example Molecule: Different Encodings\n", - "\n", - "`chebai` supports various encodings for molecules, such as SMILES and SELFIES. Let's take an example molecule and explore its different encodings.\n", - "\n", - "### Explanation:\n", - "- **SMILES (Simplified Molecular Input Line Entry System)**: A linear notation for representing molecular structures.\n", - "- **SELFIES (SELF-referencIng Embedded Strings)**: A more robust encoding in which every string of symbols corresponds to a valid molecule.\n", - "\n", - "To illustrate different encodings of a molecule, let's consider the molecule **benzene**, which has the chemical formula **C₆H₆**. Here are the different encodings for benzene:\n", - "\n", - "### 1. **SMILES (Simplified Molecular Input Line Entry System)**\n", - " - **Benzene SMILES**: `c1ccccc1`\n", - " - **Explanation**: \n", - " - `c1ccccc1` represents a six-membered aromatic ring, with lowercase `c` indicating aromatic carbon atoms.\n", - "\n", - "### 2. **SELFIES (SELF-referencIng Embedded Strings)**\n", - " - **Benzene SELFIES**: `[C][=C][C][=C][C][=C][Ring1][=Branch1]`\n", - " - **Explanation**: \n", - " - Each `[C]` represents a carbon atom, and `[=C]` represents a carbon atom with a double bond.\n", - " - SELFIES encodes the alternating single and double bonds of a Kekulé form of benzene's aromatic ring.\n", - " - The trailing `[Ring1][=Branch1]` symbols close the six-membered ring; without them the string would describe an open chain rather than benzene.\n", - "\n", - "### 3. 
**InChI (IUPAC International Chemical Identifier)**\n", - " - **Benzene InChI**: `InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H`\n", - " - **Explanation**: \n", - " - This InChI string provides a systematic representation of benzene's structure, showing the connections between the carbon and hydrogen atoms.\n", - "\n", - "### 4. **InChIKey**\n", - " - **Benzene InChIKey**: `UHOVQNZJYSORNB-UHFFFAOYSA-N`\n", - " - **Explanation**: \n", - " - A hashed, fixed-length version of the InChI string, used for easier database searching and indexing.\n", - "\n", - "### 5. **Canonical SMILES**\n", - " - **Benzene Canonical SMILES**: `c1ccccc1`\n", - " - **Explanation**:\n", - " - The canonical SMILES for benzene is identical to the regular SMILES, ensuring a unique and consistent representation for database use.\n", - "\n", - "### 6. **SMARTS (SMILES Arbitrary Target Specification)**\n", - " - **Benzene SMARTS**: `[c]1[c][c][c][c][c]1`\n", - " - **Explanation**: \n", - " - This SMARTS pattern represents the benzene ring structure, which can be used for substructure searching in larger molecules.\n", - "\n", - "These different encodings provide various ways to represent the structure and properties of benzene, each suited to different computational tasks such as molecule identification, database searches, and pattern recognition in cheminformatics." 
- ] - }, - { - "cell_type": "markdown", - "id": "93e328cf-09f9-4694-b175-28320590937d", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "id": "92e059c6-36a4-482d-bd0b-a8bd9b10ccde", - "metadata": {}, - "source": [ - "# Information for Protein Dataset\n", - "\n", - "The protein dataset follows a similar file structure, class inheritance hierarchy, and set of methods to those described for the ChEBI dataset.\n", - "\n", - "### Configuration Parameters\n", - "\n", - "Data classes related to proteins can be configured using the following main parameters:\n", - "\n", - "- **`go_branch (str)`**: The Gene Ontology (GO) branch. 
The default value is `\"all\"`, which includes all branches of GO in the dataset.\n", - "\n", - "- **`dynamic_data_split_seed (int, optional)`**: The seed for random data splitting, ensuring reproducibility. The default is `42`.\n", - "\n", - "- **`splits_file_path (str, optional)`**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. The default is `None`.\n", - "\n", - "- **`kwargs`**: Additional keyword arguments passed to `XYBaseDataModule`.\n", - "\n", - "### Available GOUniProt Data Classes\n", - "\n", - "#### `GOUniProtOver250`\n", - "\n", - "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 250 for selecting classes.\n", - "\n", - "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n", - "\n", - "#### `GOUniProtOver50`\n", - "\n", - "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 50 for selecting classes.\n", - "\n", - "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n", - "\n", - "### Instantiation Example\n", - "\n", - "```python\n", - "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250\n", - "go_class = GOUniProtOver250()\n" - ] - }, - { - "cell_type": "markdown", - "id": "2ffca830-bc0b-421c-8054-0860c95c10f2", - "metadata": {}, - "source": [ - "## GOUniProt Data File Structure\n", - "\n", - "1. **`Raw Data Files`**: (e.g., `.obo` file and `.dat` file)\n", - " - **Description**: These files contain the raw GO ontology and Swiss UniProt data, which are downloaded directly from their respective websites. They serve as the foundation for data processing. Since there are no versions associated with this dataset, common raw files are used for all subsets of the data.\n", - " - **File Paths**:\n", - " - `data/GO_UniProt/raw/${filename}.obo`\n", - " - `data/GO_UniProt/raw/${filename}.dat`\n", - "\n", - "2. 
**`data.pkl`**\n", - " - **Description**: This file is generated by the `prepare_data` method and contains the processed data in a dataframe format. It includes protein IDs, data representations (such as amino acid sequences), and class columns with boolean values.\n", - " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\n", - "\n", - "3. **`data.pt`**\n", - " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, making it ready for model input.\n", - " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\n", - "\n", - "4. **`classes.txt`**\n", - " - **Description**: This file lists the selected GO or UniProt classes based on a specified threshold. It ensures that only the relevant classes are included in the dataset for analysis.\n", - " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\n", - "\n", - "5. **`splits.csv`**\n", - " - **Description**: This file contains saved data splits from previous runs. During subsequent runs, it is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n", - " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\n", - "\n", - "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. 
Otherwise, it will just be `${dataset_name}`.\n" - ] - }, - { - "cell_type": "markdown", - "id": "61bc261e-2328-4968-aca6-14c48bb24348", - "metadata": {}, - "source": [ - "## data.pkl" - ] - }, - { - "cell_type": "code", - "execution_count": 123, - "id": "31df4ee7-4c03-4ea2-9798-5e5082a74c2b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Size of the data (rows x columns): (27459, 1050)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
swiss_idaccessiongo_idssequence4175122165209226...2000145200014620001472000241200024320003772001020200114120012332001234
814331_ARATHP42643,Q945M2,Q9M0S7[19222]MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
914331_CAEELP41932,Q21537[132, 1708, 5634, 5737, 5938, 6611, 7346, 8340...MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1014331_MAIZEP49106[3677, 5634, 10468, 44877]MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1314332_MAIZEQ01526[3677, 5634, 10468, 44877]MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1414333_ARATHP42644,F4KBI7,Q945L2[5634, 5737, 6995, 9409, 9631, 16036, 19222, 5...MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
\n", - "

5 rows × 1050 columns

\n", - "
" - ], - "text/plain": [ - " swiss_id accession \\\n", - "8 14331_ARATH P42643,Q945M2,Q9M0S7 \n", - "9 14331_CAEEL P41932,Q21537 \n", - "10 14331_MAIZE P49106 \n", - "13 14332_MAIZE Q01526 \n", - "14 14333_ARATH P42644,F4KBI7,Q945L2 \n", - "\n", - " go_ids \\\n", - "8 [19222] \n", - "9 [132, 1708, 5634, 5737, 5938, 6611, 7346, 8340... \n", - "10 [3677, 5634, 10468, 44877] \n", - "13 [3677, 5634, 10468, 44877] \n", - "14 [5634, 5737, 6995, 9409, 9631, 16036, 19222, 5... \n", - "\n", - " sequence 41 75 122 \\\n", - "8 MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT... False False False \n", - "9 MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL... False False False \n", - "10 MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE... False False False \n", - "13 MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE... False False False \n", - "14 MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL... False False False \n", - "\n", - " 165 209 226 ... 2000145 2000146 2000147 2000241 2000243 \\\n", - "8 False False False ... False False False False False \n", - "9 False False False ... False False False False False \n", - "10 False False False ... False False False False False \n", - "13 False False False ... False False False False False \n", - "14 False False False ... 
False False False False False \n", - "\n", - " 2000377 2001020 2001141 2001233 2001234 \n", - "8 False False False False False \n", - "9 False False False False False \n", - "10 False False False False False \n", - "13 False False False False False \n", - "14 False False False False False \n", - "\n", - "[5 rows x 1050 columns]" - ] - }, - "execution_count": 123, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/GO_UniProt/GO250_BP/processed/data.pkl\"))\n", - "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n", - "pkl_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "be0078fd-bcf1-4d4c-b8c6-c84e3aeac99c", - "metadata": {}, - "source": [ - "## data.pt" - ] - }, - { - "cell_type": "code", - "execution_count": 127, - "id": "a70f9c35-daca-4728-a9ea-b1212866f421", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Type of loaded data: \n", - "{'features': [10, 14, 15, 23, 13, 14, 11, 11, 14, 16, 20, 27, 25, 28, 22, 10, 14, 21, 17, 14, 27, 18, 14, 27, 16, 22, 27, 27, 10, 28, 27, 25, 10, 27, 21, 28, 14, 21, 14, 28, 20, 21, 20, 27, 17, 15, 28, 27, 27, 16, 19, 17, 17, 11, 28, 14, 22, 21, 19, 28, 12, 13, 14, 16, 16, 14, 11, 26, 16, 12, 12, 11, 11, 12, 27, 18, 21, 27, 27, 11, 16, 13, 19, 20, 20, 29, 28, 11, 17, 12, 16, 20, 22, 16, 11, 21, 12, 27, 15, 27, 17, 11, 20, 12, 24, 20, 13, 12, 17, 21, 17, 17, 20, 15, 12, 17, 28, 23, 14, 14, 14, 11, 13, 20, 11, 21, 28, 25, 22, 17, 21, 10, 21, 13, 20, 22, 29, 16, 22, 17, 14, 27, 25, 21, 11, 13, 18, 27, 16, 21, 20, 14, 14, 27, 29, 15, 17, 15, 14, 22, 21, 14, 14, 18, 20, 12, 14, 19, 11, 27, 17, 14, 23, 15, 29, 23, 12, 16, 17, 13, 17, 14, 17, 19, 25, 11, 28, 25, 22, 22, 27, 12, 17, 19, 11, 23, 20, 16, 14, 24, 19, 17, 14, 21, 18, 14, 25, 20, 27, 14, 12, 14, 27, 17, 20, 15, 17, 13, 27, 27, 11, 22, 21, 20, 11, 15, 17, 12, 10, 18, 17, 17, 16, 20, 19, 17, 15, 17, 26, 15, 11, 20, 10, 18, 20, 20, 28, 
14, 20, 20, 12, 21, 27, 14, 14, 23, 14, 14, 14, 21, 23, 14, 20, 27, 18, 18, 11], 'labels': array([False, False, False, ..., False, False, False]), 'ident': '14331_ARATH', 'group': None}\n" - ] - } - ], - "source": [ - "data_pt = torch.load(r\"data/GO_UniProt/GO250_BP/processed/protein_token/data.pt\")\n", - "print(\"Type of loaded data:\", type(data_pt))\n", - "for i in range(1):\n", - " print(data_pt[i])" - ] - }, - { - "cell_type": "markdown", - "id": "380049c1-2963-4223-b698-a7b59b9fe595", - "metadata": {}, - "source": [ - "## Protein Representation Using Amino Acid Sequence Notation\n", - "\n", - "Proteins are composed of chains of amino acids, and these sequences can be represented using a one-letter notation for each amino acid. This notation provides a concise way to describe the primary structure of a protein.\n", - "\n", - "### Example Protein Sequence\n", - "\n", - "Protein: **Lysozyme C** from **Gallus gallus** (Chicken). \n", - "[Lysozyme C - UniProtKB P00698](https://www.uniprot.org/uniprotkb/P00698/entry#function)\n", - "\n", - "- **Sequence**: `MRSLLILVLCFLPLAALGKVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL`\n", - "- **Sequence Length**: 147\n", - "\n", - "In this sequence, each letter corresponds to a specific amino acid. This notation is widely used in bioinformatics and molecular biology to represent protein sequences.\n", - "\n", - "### The 20 Amino Acids and Their One-Letter Notations\n", - "\n", - "Here is a list of the 20 standard amino acids, along with their one-letter notations and descriptions:\n", - "\n", - "| One-Letter Notation | Amino Acid Name | Description |\n", - "|---------------------|----------------------|---------------------------------------------------------|\n", - "| **A** | Alanine | Non-polar, aliphatic amino acid. |\n", - "| **C** | Cysteine | Polar, contains a thiol group, forms disulfide bonds. 
|\n", - "| **D** | Aspartic Acid | Acidic, negatively charged at physiological pH. |\n", - "| **E** | Glutamic Acid | Acidic, negatively charged at physiological pH. |\n", - "| **F** | Phenylalanine | Aromatic, non-polar. |\n", - "| **G** | Glycine | Smallest amino acid, non-polar. |\n", - "| **H** | Histidine | Polar, positively charged, can participate in enzyme active sites. |\n", - "| **I** | Isoleucine | Non-polar, aliphatic. |\n", - "| **K** | Lysine | Basic, positively charged at physiological pH. |\n", - "| **L** | Leucine | Non-polar, aliphatic. |\n", - "| **M** | Methionine | Non-polar, contains sulfur, start codon in mRNA translation. |\n", - "| **N** | Asparagine | Polar, uncharged. |\n", - "| **P** | Proline | Non-polar, introduces kinks in protein chains. |\n", - "| **Q** | Glutamine | Polar, uncharged. |\n", - "| **R** | Arginine | Basic, positively charged, involved in binding phosphate groups. |\n", - "| **S** | Serine | Polar, can be phosphorylated. |\n", - "| **T** | Threonine | Polar, can be phosphorylated. |\n", - "| **V** | Valine | Non-polar, aliphatic. |\n", - "| **W** | Tryptophan | Aromatic, non-polar, largest amino acid. |\n", - "| **Y** | Tyrosine | Aromatic, polar, can be phosphorylated. |\n", - "\n", - "### Understanding Protein Sequences\n", - "\n", - "In the example sequence, each letter represents one of the above amino acids. 
The sequence reflects the specific order of amino acids in the protein, which is critical for its structure and function.\n", - "\n", - "This notation is used extensively in various bioinformatics tools and databases to study protein structure, function, and interactions.\n", - "\n", - "\n", - "_Note_: Refer for amino acid sequence: https://en.wikipedia.org/wiki/Protein_primary_structure" - ] - }, - { - "cell_type": "markdown", - "id": "702359d6-5338-4391-b196-2328ba5676a1", - "metadata": {}, - "source": [ - "---" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python (env_chebai)", - "language": "python", - "name": "env_chebai" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From c6b8d5071b16e99c9b379304ddb22829af9840cf Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Mon, 30 Sep 2024 23:35:07 +0200 Subject: [PATCH 11/29] add info on evidence codes + uniprot.data file + changes --- tutorials/data_exploration_go.ipynb | 436 +++++++++++++++++++++++++--- 1 file changed, 402 insertions(+), 34 deletions(-) diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb index 391192a1..2c789ae6 100644 --- a/tutorials/data_exploration_go.ipynb +++ b/tutorials/data_exploration_go.ipynb @@ -18,8 +18,6 @@ "metadata": {}, "cell_type": "markdown", "source": [ - "# Information for Protein Dataset\n", - "\n", "# 1. Instantiation of a Data Class\n", "\n", "To start working with `chebai`, you first need to instantiate a GO_UniProt data class. 
This class is responsible for managing, interacting with, and preprocessing the GO and UniProt data\n", @@ -71,31 +69,80 @@ "id": "605bbca601037df2" }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T21:25:03.920610Z", + "start_time": "2024-09-30T21:25:03.622407Z" + } + }, "cell_type": "code", "source": "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250", "id": "440f203ceaf7e4b7", "outputs": [], - "execution_count": null + "execution_count": 12 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-09-30T14:08:21.236447Z", - "start_time": "2024-09-30T14:08:21.130242Z" + "end_time": "2024-09-30T21:25:08.863132Z", + "start_time": "2024-09-30T21:25:08.387739Z" } }, "cell_type": "code", "source": "go_class = GOUniProtOver250()", "id": "a648346d81d0dc5e", "outputs": [], - "execution_count": 2 + "execution_count": 13 }, { "metadata": {}, "cell_type": "markdown", "source": [ - "## GOUniProt Data File Structure\n", + "# 2. Preparation / Setup Methods\n", + "\n", + "Once a GOUniProt data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n", + "### Automatic Execution: \n", + "These methods are executed automatically within the data class instance. 
Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n", + "\n", + "\n", + "### Why is Preparation Needed?\n", + "\n", + "- **Data Availability**: The preparation step ensures that the required GOUniProt data files are downloaded or loaded, which are essential for analysis.\n", + "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n", + "\n", + "### Main Methods for Data Preprocessing\n", + "\n", + "The data preprocessing in a data class involves two main methods:\n", + "\n", + "1. **`prepare_data` Method**:\n", + " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\n", + " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n", + "\n", + "2. **`setup` Method**:\n", + " - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n", + " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. 
This method uses a subclass of Data Reader to perform the transformation.\n", + " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n", + "\n", + "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes." + ], + "id": "2328e824c4dafb2d" + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "go_class.prepare_data()\n", + "go_class.setup()" + ], + "id": "9f77351090560bc4", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "# 3. GOUniProt Data File Structure\n", "\n", "1. **`Raw Data Files`**: (e.g., `.obo` file and `.dat` file)\n", " - **Description**: These files contain the raw GO ontology and Swiss UniProt data, which are downloaded directly from their respective websites. They serve as the foundation for data processing. Since there are no versions associated with this dataset, common raw files are used for all subsets of the data.\n", @@ -123,54 +170,225 @@ ], "id": "ee174b61b36c71aa" }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "# 4. Information Stored in the Files", + "id": "3f92b58e460c08fd" + }, { "metadata": {}, "cell_type": "markdown", "source": [ - "# 2. Preparation / Setup Methods\n", + "## go-basic.obo\n", "\n", - "Once a ChEBI data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n", - "### Automatic Execution: \n", - "These methods are executed automatically within the data class instance. 
Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n", + "The `go-basic.obo` file is a key resource in the Gene Ontology (GO) dataset, containing the ontology data that defines various biological processes, molecular functions, and cellular components, as well as their relationships. This file is downloaded directly from the Gene Ontology Consortium and serves as the foundational raw data for further processing in GO-based applications.\n", "\n", + "### Structure of `go-basic.obo`\n", "\n", - "### Why is Preparation Needed?\n", + "The `go-basic.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific biological process, molecular function, or cellular component within the GO ontology. These attributes include identifiers, names, relationships to other terms, and more.\n", "\n", - "- **Data Availability**: The preparation step ensures that the required ChEBI data files are downloaded or loaded, which are essential for analysis.\n", - "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n", + "#### Example of a Term Document\n", "\n", - "### Main Methods for Data Preprocessing\n", + "```plaintext\n", + "[Term]\n", + "id: GO:0000032\n", + "name: cell wall mannoprotein biosynthetic process\n", + "namespace: biological_process\n", + "def: \"The chemical reactions and pathways resulting in the formation of cell wall mannoproteins, any cell wall protein that contains covalently bound mannose residues.\" [GOC:ai]\n", + "synonym: \"cell wall mannoprotein anabolism\" EXACT []\n", + "is_a: GO:0006057 ! mannoprotein biosynthetic process\n", + "is_a: GO:0031506 ! 
cell wall glycoprotein biosynthetic process\n", + "```\n", "\n", - "The data preprocessing in a data class involves two main methods:\n", + "### Breakdown of Attributes\n", "\n", - "1. **`prepare_data` Method**:\n", - " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\n", - " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n", + "Each term document in the `go-basic.obo` file consists of the following key attributes:\n", "\n", - "2. **`setup` Method**:\n", - " - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n", - " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\n", - " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n", + "- **`[Term]`**: \n", + " - **Description**: Indicates the beginning of a new term in the ontology. Each term represents a distinct biological process, molecular function, or cellular component.\n", "\n", - "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes." 
+ "- **`id: GO:0000032`**: \n", + " - **Description**: A unique identifier for the biological term within the GO ontology.\n", + " - **Example**: `GO:0000032` refers to the term \"cell wall mannoprotein biosynthetic process.\"\n", + "\n", + "- **`name: cell wall mannoprotein biosynthetic process`**: \n", + " - **Description**: The name of the biological process, molecular function, or cellular component being described.\n", + " - **Example**: The name \"cell wall mannoprotein biosynthetic process\" is a descriptive label for the GO term with the identifier `GO:0000032`.\n", + "\n", + "- **`namespace: biological_process`**: \n", + " - **Description**: Specifies which ontology the term belongs to. The main namespaces are `biological_process`, `molecular_function`, and `cellular_component`.\n", + "\n", + "- **`is_a: GO:0006057`**: \n", + " - **Description**: Defines hierarchical relationships to other terms within the ontology. The `is_a` attribute indicates that the current term is a subclass or specific instance of the referenced term.\n", + " - **Example**: The term `GO:0000032` (\"cell wall mannoprotein biosynthetic process\") is a subclass of `GO:0006057` and subclass of `GO:0031506`.\n" ], - "id": "2328e824c4dafb2d" + "id": "cca75d881cb8bade" }, { "metadata": {}, - "cell_type": "code", - "outputs": [], - "execution_count": null, + "cell_type": "markdown", "source": [ - "go_class.prepare_data()\n", - "go_class.setup()" + "## uniprot_sprot.dat\n", + "\n", + "The `uniprot_sprot.dat` file is a key component of the UniProtKB/Swiss-Prot dataset. It contains curated protein sequences with detailed annotation. Each entry in the file corresponds to a reviewed protein sequence, complete with metadata about its biological function, taxonomy, gene name, cross-references to other databases, and more. 
Below is a breakdown of the structure and key attributes in the file, using the provided example.\n", + "\n", + "\n", + "## Structure of `uniprot_sprot.dat`\n", + "\n", + "The `uniprot_sprot.dat` file is organized into blocks of text, each representing a single protein entry. These blocks contain specific tags and fields that describe different aspects of the protein, including its sequence, function, taxonomy, and cross-references to external databases.\n", + "\n", + "### Example of a Protein Entry\n", + "\n", + "```plaintext\n", + "ID 002L_FRG3G Reviewed; 320 AA.\n", + "AC Q6GZX3;\n", + "DT 28-JUN-2011, integrated into UniProtKB/Swiss-Prot.\n", + "DT 19-JUL-2004, sequence version 1.\n", + "DT 08-NOV-2023, entry version 46.\n", + "DE RecName: Full=Uncharacterized protein 002L;\n", + "GN ORFNames=FV3-002L;\n", + "OS Frog virus 3 (isolate Goorha) (FV-3).\n", + "OC Viruses; Varidnaviria; Bamfordvirae; Nucleocytoviricota; Megaviricetes;\n", + "OX NCBI_TaxID=654924;\n", + "OH NCBI_TaxID=8404; Lithobates pipiens (Northern leopard frog) (Rana pipiens).\n", + "RN [1]\n", + "RP NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].\n", + "RX PubMed=15165820; DOI=10.1016/j.virol.2004.02.019;\n", + "RA Tan W.G., Barkman T.J., Gregory Chinchar V., Essani K.;\n", + "RT \"Comparative genomic analyses of frog virus 3, type species of the genus\n", + "RT Ranavirus (family Iridoviridae).\";\n", + "RL Virology 323:70-84(2004).\n", + "CC -!- SUBCELLULAR LOCATION: Host membrane {ECO:0000305}; Single-pass membrane\n", + "CC protein {ECO:0000305}.\n", + "DR EMBL; AY548484; AAT09661.1; -; Genomic_DNA.\n", + "DR RefSeq; YP_031580.1; NC_005946.1.\n", + "DR GeneID; 2947774; -.\n", + "DR KEGG; vg:2947774; -.\n", + "DR Proteomes; UP000008770; Segment.\n", + "DR GO; GO:0033644; C:host cell membrane; IEA:UniProtKB-SubCell.\n", + "DR GO; GO:0016020; C:membrane; IEA:UniProtKB-KW.\n", + "PE 4: Predicted;\n", + "KW Host membrane; Membrane; Reference proteome; Transmembrane;\n", + "KW Transmembrane 
helix.\n", + "FT CHAIN 1..320\n", + "FT /note=\"Uncharacterized protein 002L\"\n", + "FT /id=\"PRO_0000410509\"\n", + "SQ SEQUENCE 320 AA; 34642 MW; 9E110808B6E328E0 CRC64;\n", + " MSIIGATRLQ NDKSDTYSAG PCYAGGCSAF TPRGTCGKDW DLGEQTCASG FCTSQPLCAR\n", + " IKKTQVCGLR YSSKGKDPLV SAEWDSRGAP YVRCTYDADL IDTQAQVDQF VSMFGESPSL\n", + " AERYCMRGVK NTAGELVSRV SSDADPAGGW CRKWYSAHRG PDQDAALGSF CIKNPGAADC\n", + " KCINRASDPV YQKVKTLHAY PDQCWYVPCA ADVGELKMGT QRDTPTNCPT QVCQIVFNML\n", + " DDGSVTMDDV KNTINCDFSK YVPPPPPPKP TPPTPPTPPT PPTPPTPPTP PTPRPVHNRK\n", + " VMFFVAGAVL VAILISTVRW\n", + "//\n", + "```\n", + "\n", + "### Breakdown of Attributes\n", + "\n", + "Each protein entry in the `uniprot_sprot.dat` file is structured with specific tags and sections that describe the protein in detail. Here's a breakdown of the key attributes:\n", + "\n", + "- **`ID`**: \n", + " - **Description**: Contains the unique identifier for the protein and its status (e.g., `Reviewed` indicates the sequence has been manually curated).\n", + " - **Example**: `002L_FRG3G` is the identifier for the protein from Frog virus 3.\n", + "\n", + "- **`AC`**: \n", + " - **Description**: Accession number, a unique identifier for the protein sequence.\n", + " - **Example**: `Q6GZX3` is the accession number for this entry.\n", + "\n", + "- **`DR`**: \n", + " - **Description**: Cross-references to other databases like EMBL, RefSeq, KEGG, and GeneID.\n", + " - **Example**: This entry is cross-referenced with the EMBL database, RefSeq, GO, etc.\n", + "\n", + "- **`GO`**: \n", + " - **Description**: Gene Ontology annotations that describe the cellular component, biological process, or molecular function associated with the protein.\n", + " - **Example**: The protein is associated with the GO terms `GO:0033644` (host cell membrane) and `GO:0016020` (membrane).\n", + "\n", + "- **`SQ`**: \n", + " - **Description**: The amino acid sequence of the protein.\n", + " - **Example**: The sequence consists of 320 amino 
acids.\n", + "\n", + "The `uniprot_sprot.dat` file is an extensively curated resource, containing comprehensive protein data used for various bioinformatics applications.\n", + "\n", + "__Note__: For more detailed information, refer to [this page](https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/keywlist.txt\n", + "). \n", + "\n", + "Consider the following line from the example above: \n", + "```plaintext\n", + "DR GO; GO:0033644; C:host cell membrane; IEA:UniProtKB-SubCell.\n", + "```\n", + "\n", + "The line contains a **Gene Ontology (GO) annotation** describing the protein's subcellular location. Here's a detailed breakdown:\n", + "\n", + "- **`GO:0033644`**: This is the specific **GO term** identifier for \"host cell membrane,\" which indicates that the protein is associated with or located at the membrane of the host cell.\n", + "\n", + "- **`IEA`**: This stands for **Inferred from Electronic Annotation**, which is part of the **GO Evidence Codes**. **IEA** indicates that the annotation was automatically generated based on computational methods rather than direct experimental evidence. While **IEA** annotations are useful, they are generally considered less reliable than manually curated or experimentally verified evidence codes.\n", + "\n", + "### More on GO Evidence Codes\n", + "\n", + "The **Gene Ontology (GO) Evidence Codes** provide a way to indicate the level of evidence supporting a GO annotation. 
Here's a list of the both **experimental** and **non-experimental** GO evidence codes with brief descriptions:\n", + "\n", + "| **Evidence Code** | **Description** |\n", + "|-------------------|-----------------|\n", + "| **EXP** | Inferred from Experiment |\n", + "| **IDA** | Inferred from Direct Assay |\n", + "| **IPI** | Inferred from Physical Interaction |\n", + "| **IMP** | Inferred from Mutant Phenotype |\n", + "| **IGI** | Inferred from Genetic Interaction |\n", + "| **IEP** | Inferred from Expression Pattern |\n", + "| **TAS** | Traceable Author Statement |\n", + "| **IC** | Inferred by Curator |\n", + "| **IEA** | Inferred from Electronic Annotation (Computational) |\n", + "| **ISS** | Inferred from Sequence or Structural Similarity |\n", + "| **ISA** | Inferred from Sequence Alignment |\n", + "| **ISM** | Inferred from Sequence Model |\n", + "| **ISO** | Inferred from Sequence Orthology |\n", + "| **ISA** | Inferred from Sequence Alignment |\n", + "| **RCA** | Inferred from Reviewed Computational Analysis |\n", + "| **NAS** | Non-traceable Author Statement |\n", + "| **ND** | No Biological Data Available (placeholder) |\n", + "| **NR** | Not Recorded |\n", + "\n", + "\n", + "### Grouping of Codes:\n", + "\n", + "- **Experimental Evidence Codes**: \n", + " - **EXP**, **IDA**, **IPI**, **IMP**, **IGI**, **IEP**\n", + " \n", + "- **Author/Curator Inferred Codes**:\n", + " - **TAS**, **IC**, **NAS**\n", + "\n", + "- **Computational Evidence Codes**:\n", + " - **IEA**, **ISS**, **ISA**, **ISM**, **ISO**, **RCA**\n", + "\n", + "- **Others**:\n", + " - **ND** (No Data), **NR** (Not Recorded)\n", + "\n", + "\n", + "These evidence codes ensure transparency and give researchers an understanding of how confident they can be in a particular GO annotation." 
], - "id": "9f77351090560bc4" + "id": "87c841de7d80beef" }, { "metadata": {}, "cell_type": "markdown", - "source": "## data.pkl", + "source": [ + "## data.pkl\n", + "\n", + "The `data.pkl` file, generated during the preprocessing stage, contains the processed GO data in a dataframe format. Below is an example of how this data is structured:\n", + "\n", + "\n", + "\n", + "### Structure of `data.pkl`\n", + "`data.pkl` as following structure: \n", + "- **Column 0**: Contains the Identifier from Swiss-UniProt Dataset for each Swiss Protein data instance.\n", + "- **Column 1**: Contains the accession of each Protein data instance.\n", + "- **Column 2**: Contains the list of GO-IDs (Identifiers from Gene Ontology) which maps each Swiss Protein to the Gene Ontology instance.\n", + "- **Column 3**: Contains the sequence representation for the Swiss Protein using Amino Acid notation.\n", + "- **Column 4 and onwards**: Contains the labels, starting from column 4.\n", + "\n", + "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n" + ], "id": "735844f0b2474ad6" }, { @@ -427,7 +645,20 @@ { "metadata": {}, "cell_type": "markdown", - "source": "## data.pt", + "source": [ + "## data.pt\n", + "\n", + "The `data.pt` file is a list where each element is a dictionary with the following keys:\n", + "\n", + "- **`features`**: \n", + " - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n", + "\n", + "- **`labels`**: \n", + " - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n", + "\n", + "- **`ident`**: \n", + " - **Description**: This key holds identifiers for each data instance. 
These identifiers help track and reference the individual samples in the dataset.\n" + ], "id": "2c9f23883c66b48d" }, { @@ -470,6 +701,143 @@ ], "execution_count": 11 }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## `classes.txt` File\n", + "\n", + "The `classes.txt` file lists the selected GO classes (Gene Ontology terms) that are used as labels. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique GO class ID (the numeric part of a GO term identifier), identifying a specific term of the Gene Ontology.\n", + "\n", + "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks.\n" + ], + "id": "f69012b3540fd1b6" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T21:30:34.344202Z", + "start_time": "2024-09-30T21:30:34.328318Z" + } + }, + "cell_type": "code", + "source": [ + "with open(r\"data/GO_UniProt/GO250_BP/processed/classes.txt\", \"r\") as file:\n", + " for i in range(5):\n", + " line = file.readline()\n", + " print(line.strip())" + ], + "id": "19200f7ff9a6ebba", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "41\n", + "75\n", + "122\n", + "165\n", + "209\n" + ] + } + ], + "execution_count": 15 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## `splits.csv` File\n", + "\n", + "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different runs." 
+ ], + "id": "6661dc11247e9753" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T21:30:41.586616Z", + "start_time": "2024-09-30T21:30:39.318598Z" + } + }, + "cell_type": "code", + "source": [ + "csv_df = pd.read_csv(r\"data/GO_UniProt/GO250_BP/processed/splits.csv\")\n", + "csv_df.head()" + ], + "id": "88c3ea8f01ba9fac", + "outputs": [ + { + "data": { + "text/plain": [ + " id split\n", + "0 14331_ARATH train\n", + "1 14331_CAEEL train\n", + "2 14331_MAIZE train\n", + "3 14332_MAIZE train\n", + "4 14333_ARATH train" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idsplit
014331_ARATHtrain
114331_CAEELtrain
214331_MAIZEtrain
314332_MAIZEtrain
414333_ARATHtrain
\n", + "
" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 16 + }, { "metadata": {}, "cell_type": "markdown", From 4c55b04890861c063370345d3f7f0cc169ec88c5 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Mon, 30 Sep 2024 23:51:36 +0200 Subject: [PATCH 12/29] minor formatting changes --- tutorials/data_exploration_chebi.ipynb | 1 - tutorials/data_exploration_go.ipynb | 129 ++++++++++++++++--------- 2 files changed, 86 insertions(+), 44 deletions(-) diff --git a/tutorials/data_exploration_chebi.ipynb b/tutorials/data_exploration_chebi.ipynb index 17c3ae33..6ddd3238 100644 --- a/tutorials/data_exploration_chebi.ipynb +++ b/tutorials/data_exploration_chebi.ipynb @@ -291,7 +291,6 @@ "synonym: \"monoatomic ions\" RELATED [ChEBI]\n", "is_a: CHEBI:24870\n", "is_a: CHEBI:33238\n", - "is_a: CHEBI:3323Relevant 8\n", "```\n", "\n", "### Breakdown of Attributes\n", diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb index 2c789ae6..8dc4cb44 100644 --- a/tutorials/data_exploration_go.ipynb +++ b/tutorials/data_exploration_go.ipynb @@ -94,6 +94,12 @@ "outputs": [], "execution_count": 13 }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "---", + "id": "651ab5c39833bd2c" + }, { "metadata": {}, "cell_type": "markdown", @@ -138,6 +144,12 @@ "outputs": [], "execution_count": null }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "---", + "id": "db5b58f2d96823fc" + }, { "metadata": {}, "cell_type": "markdown", @@ -170,6 +182,12 @@ ], "id": "ee174b61b36c71aa" }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "---", + "id": "a927ad484c930960" + }, { "metadata": {}, "cell_type": "markdown", @@ -323,49 +341,7 @@ "- **`GO:0033644`**: This is the specific **GO term** identifier for \"host cell membrane,\" which indicates that the protein is associated with or located at the membrane of the host cell.\n", "\n", "- **`IEA`**: This stands for **Inferred from 
Electronic Annotation**, which is part of the **GO Evidence Codes**. **IEA** indicates that the annotation was automatically generated based on computational methods rather than direct experimental evidence. While **IEA** annotations are useful, they are generally considered less reliable than manually curated or experimentally verified evidence codes.\n", - "\n", - "### More on GO Evidence Codes\n", - "\n", - "The **Gene Ontology (GO) Evidence Codes** provide a way to indicate the level of evidence supporting a GO annotation. Here's a list of the both **experimental** and **non-experimental** GO evidence codes with brief descriptions:\n", - "\n", - "| **Evidence Code** | **Description** |\n", - "|-------------------|-----------------|\n", - "| **EXP** | Inferred from Experiment |\n", - "| **IDA** | Inferred from Direct Assay |\n", - "| **IPI** | Inferred from Physical Interaction |\n", - "| **IMP** | Inferred from Mutant Phenotype |\n", - "| **IGI** | Inferred from Genetic Interaction |\n", - "| **IEP** | Inferred from Expression Pattern |\n", - "| **TAS** | Traceable Author Statement |\n", - "| **IC** | Inferred by Curator |\n", - "| **IEA** | Inferred from Electronic Annotation (Computational) |\n", - "| **ISS** | Inferred from Sequence or Structural Similarity |\n", - "| **ISA** | Inferred from Sequence Alignment |\n", - "| **ISM** | Inferred from Sequence Model |\n", - "| **ISO** | Inferred from Sequence Orthology |\n", - "| **ISA** | Inferred from Sequence Alignment |\n", - "| **RCA** | Inferred from Reviewed Computational Analysis |\n", - "| **NAS** | Non-traceable Author Statement |\n", - "| **ND** | No Biological Data Available (placeholder) |\n", - "| **NR** | Not Recorded |\n", - "\n", - "\n", - "### Grouping of Codes:\n", - "\n", - "- **Experimental Evidence Codes**: \n", - " - **EXP**, **IDA**, **IPI**, **IMP**, **IGI**, **IEP**\n", - " \n", - "- **Author/Curator Inferred Codes**:\n", - " - **TAS**, **IC**, **NAS**\n", - "\n", - "- **Computational 
Evidence Codes**:\n", - " - **IEA**, **ISS**, **ISA**, **ISM**, **ISO**, **RCA**\n", - "\n", - "- **Others**:\n", - " - **ND** (No Data), **NR** (Not Recorded)\n", - "\n", - "\n", - "These evidence codes ensure transparency and give researchers an understanding of how confident they can be in a particular GO annotation." + "\n" ], "id": "87c841de7d80beef" }, @@ -838,6 +814,12 @@ ], "execution_count": 16 }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "---", + "id": "e6b1f184a5091b83" + }, { "metadata": {}, "cell_type": "markdown", @@ -893,6 +875,67 @@ "_Note_: Refer for amino acid sequence: https://en.wikipedia.org/wiki/Protein_primary_structure" ], "id": "481b8c0271ec9636" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "---", + "id": "db6d7f2cc446e6f9" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## More on GO Evidence Codes\n", + "\n", + "The **Gene Ontology (GO) Evidence Codes** provide a way to indicate the level of evidence supporting a GO annotation. 
Here's a list of both **experimental** and **non-experimental** GO evidence codes with brief descriptions:\n", + "\n", + "| **Evidence Code** | **Description** |\n", + "|-------------------|-----------------|\n", + "| **EXP** | Inferred from Experiment |\n", + "| **IDA** | Inferred from Direct Assay |\n", + "| **IPI** | Inferred from Physical Interaction |\n", + "| **IMP** | Inferred from Mutant Phenotype |\n", + "| **IGI** | Inferred from Genetic Interaction |\n", + "| **IEP** | Inferred from Expression Pattern |\n", + "| **TAS** | Traceable Author Statement |\n", + "| **IC** | Inferred by Curator |\n", + "| **IEA** | Inferred from Electronic Annotation (Computational) |\n", + "| **ISS** | Inferred from Sequence or Structural Similarity |\n", + "| **ISA** | Inferred from Sequence Alignment |\n", + "| **ISM** | Inferred from Sequence Model |\n", + "| **ISO** | Inferred from Sequence Orthology |\n", + "| **ISA** | Inferred from Sequence Alignment |\n", + "| **RCA** | Inferred from Reviewed Computational Analysis |\n", + "| **NAS** | Non-traceable Author Statement |\n", + "| **ND** | No Biological Data Available (placeholder) |\n", + "| **NR** | Not Recorded |\n", + "\n", + "\n", + "### Grouping of Codes:\n", + "\n", + "- **Experimental Evidence Codes**: \n", + " - **EXP**, **IDA**, **IPI**, **IMP**, **IGI**, **IEP**\n", + " \n", + "- **Author/Curator Inferred Codes**:\n", + " - **TAS**, **IC**, **NAS**\n", + "\n", + "- **Computational Evidence Codes**:\n", + " - **IEA**, **ISS**, **ISA**, **ISM**, **ISO**, **RCA**\n", + "\n", + "- **Others**:\n", + " - **ND** (No Data), **NR** (Not Recorded)\n", + "\n", + "\n", + "These evidence codes ensure transparency and give researchers an understanding of how confident they can be in a particular GO annotation." 
+ ], + "id": "7f42b928364e5cd1" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "---", + "id": "1c11d6f520b02434" } ], "metadata": { From 33a5e64a1a904b00eec1df3f1bce93f499e4fa2c Mon Sep 17 00:00:00 2001 From: sfluegel Date: Tue, 1 Oct 2024 14:43:21 +0200 Subject: [PATCH 13/29] move commands to the top, restructure section 2 --- tutorials/data_exploration_chebi.ipynb | 162 +++++++++++-------------- 1 file changed, 69 insertions(+), 93 deletions(-) diff --git a/tutorials/data_exploration_chebi.ipynb b/tutorials/data_exploration_chebi.ipynb index 6ddd3238..6a7e25ed 100644 --- a/tutorials/data_exploration_chebi.ipynb +++ b/tutorials/data_exploration_chebi.ipynb @@ -1,30 +1,58 @@ { "cells": [ { - "cell_type": "markdown", - "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b", "metadata": {}, + "cell_type": "markdown", "source": [ "# Introduction\n", "\n", - "This notebook serves as a guide for new users of the `chebai` package, which is used for working with chemical data, especially focusing on ChEBI (Chemical Entities of Biological Interest). This notebook will explain how to instantiate the main data class, how the data files are structured, and how to work with different molecule encodings.\n", + "This notebook serves as a guide for new developers using the `chebai` package. If you just want to run the experiments, you can refer to the [README.md](https://github.com/ChEB-AI/python-chebai/blob/dev/README.md) and the [wiki](https://github.com/ChEB-AI/python-chebai/wiki) for the basic commands. This notebook explains what happens under the hood for the ChEBI dataset. It covers\n", + "- how to instantiate a data class and generate data\n", + "- how the data is processed and stored\n", + "- and how to work with different molecule encodings.\n", "\n", - "One key aspect of the package is its **dataset management system**. In the training process, chemical datasets play a critical role by providing the necessary data for model learning and validation. 
The chebai package simplifies the handling of these datasets by **automatically creating** them as needed. This means that users do not have to manually prepare datasets before running models; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly.\n", + "The chebai package simplifies the handling of the required datasets by **automatically creating** them as needed. This means that you do not have to input any data manually; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly. You can, however, provide your own data files, for instance if you want to replicate a specific experiment.\n", "\n", "---\n" - ] + ], + "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b" }, { - "cell_type": "markdown", - "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d", "metadata": {}, + "cell_type": "markdown", "source": [ "# 1. Instantiation of a Data Class\n", "\n", - "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data\n", + "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data." 
+ ], + "id": "4550d01fc7af5ae4" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": 18, + "source": "from chebai.preprocessing.datasets.chebi import ChEBIOver50", + "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22" + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "a71b7301-6195-4155-a439-f5eb3183d0f3", + "metadata": {}, + "outputs": [], + "source": [ + "chebi_class = ChEBIOver50(chebi_version=231)" + ] + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "\n", "### Inheritance Hierarchy\n", "\n", - "ChEBI data classes inherit from [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), which in turn inherits from [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22). Specifically:\n", + "ChEBI data classes inherit from [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L598), which in turn inherits from [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L23). Specifically:\n", "\n", "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for datasets that require dynamic behavior. 
It inherits from `XYBaseDataModule`, which provides the core methods for data loading and processing.\n", "\n", @@ -33,8 +61,8 @@ "In summary, ChEBI data classes are designed to manage and preprocess chemical data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n", "\n", "\n", - "### Explanation\n", - "A ChEBI data class can be configured with the following main parameters:\n", + "### Input parameters\n", + "A ChEBI data class can be configured with a range of parameters, including:\n", "\n", "- **chebi_version (int)**: Specifies the version of the ChEBI database to be used. The default is `200`. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\n", "\n", @@ -45,87 +73,64 @@ "### Additional Input Parameters\n", "\n", "To get more control over various aspects of data loading, processing, and splitting, you can refer to documentation of additional parameters in docstrings of the respective classes: [`_ChEBIDataExtractor`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/chebi.py#L108), [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22), [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), etc.\n" - ] + ], + "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d" }, { - "cell_type": "markdown", - "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a", "metadata": {}, + "cell_type": "markdown", "source": [ "# Available ChEBI Data Classes\n", "\n", "__Note__: Check the code implementation of classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/chebi.py):\n", "\n", - "## `ChEBIOver100`\n", - "A class for extracting data from the ChEBI dataset with a threshold of 100 for selecting classes.\n", - "\n", - "- **Inheritance**: Inherits from `ChEBIOverX`.\n", - "\n", - "## 
`ChEBIOver50`\n", - "A class for extracting data from the ChEBI dataset with a threshold of 50 for selecting classes.\n", + "There is a range of available dataset classes for ChEBI. Usually, you want to use `ChEBIOver100` or `ChEBIOver50`. The number indicates the threshold for selecting label classes: ChEBI classes which have at least 100 / 50 SMILES-annotated subclasses will be used as labels.\n", "\n", - "- **Inheritance**: Inherits from `ChEBIOverX`.\n", + "Both inherit from `ChEBIOverX`. If you need a different threshold, you can create your own subclass. By default, `ChEBIOverX` uses the SMILES encoding (see Section 5). The other implemented encodings are SELFIES and DeepSMILES, used by the classes `ChEBIOverXSELFIES` and `ChEBIOverXDeepSMILES`, respectively. \n", + "They also have subclasses for different thresholds (`ChEBIOver50SELFIES`, `ChEBIOver100SELFIES`, `ChEBIOver100DeepSMILES`).\n", "\n", - "## `ChEBIOver100DeepSMILES`\n", - "A class for extracting data from the ChEBI dataset using the DeepChem SMILES reader with a threshold of 100.\n", - "\n", - "- **Inheritance**: Inherits from `ChEBIOverXDeepSMILES` and `ChEBIOver100`.\n", - "\n", - "## `ChEBIOver100SELFIES`\n", - "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 100.\n", - "\n", - "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver100`.\n", - "\n", - "## `ChEBIOver50SELFIES`\n", - "A class for extracting data from the ChEBI dataset using the SELFIES reader with a threshold of 50.\n", - "\n", - "- **Inheritance**: Inherits from `ChEBIOverXSELFIES` and `ChEBIOver50`.\n", - "\n", - "## `ChEBIOver50Partial`\n", - "A dataset class that extracts a part of ChEBI based on subclasses of a given top class, with a threshold of 50 for selecting classes.\n", - "\n", - "- **Inheritance**: Inherits from `ChEBIOverXPartial` and `ChEBIOver50`.\n" - ] + "Finally, `ChEBIOver50Partial` selects extracts a part of ChEBI based on a given top class, with a 
threshold of 50 for selecting labels.\n", + "This class inherits from `ChEBIOverXPartial` and `ChEBIOver50`.\n" + ], + "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a" }, { - "cell_type": "code", - "execution_count": 18, - "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22", + "cell_type": "markdown", + "id": "8456b545-88c5-401d-baa5-47e8ae710f04", "metadata": {}, - "outputs": [], "source": [ - "from chebai.preprocessing.datasets.chebi import ChEBIOver50" + "---" ] }, { - "cell_type": "code", - "execution_count": 20, - "id": "a71b7301-6195-4155-a439-f5eb3183d0f3", "metadata": {}, - "outputs": [], + "cell_type": "markdown", "source": [ - "chebi_class = ChEBIOver50(chebi_version=231)" - ] + "# 2. Preparation / Setup Methods\n", + "\n", + "Now we have a ChEBI data class with all the relevant parameters. Next, we need to generate the actual dataset." + ], + "id": "ed973fb59df11849" }, { - "cell_type": "markdown", - "id": "8456b545-88c5-401d-baa5-47e8ae710f04", "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ - "---" - ] + "chebi_class.prepare_data()\n", + "chebi_class.setup()" + ], + "id": "d0a58e2bd9c0e6d9" }, { "cell_type": "markdown", "id": "1655d489-25fe-46de-9feb-eeca5d36936f", "metadata": {}, "source": [ - "# 2. Preparation / Setup Methods\n", "\n", - "Once a ChEBI data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n", "### Automatic Execution: \n", - "These methods are executed automatically within the data class instance. Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n", - "\n", + "These methods are executed automatically when using the training command `chebai fit`. 
Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n", "\n", "### Why is Preparation Needed?\n", "\n", @@ -137,46 +142,17 @@ "The data preprocessing in a data class involves two main methods:\n", "\n", "1. **`prepare_data` Method**:\n", - " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\n", + " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels. This step is independent of input encodings and all chemicals are stored as SMILES strings.\n", " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n", "\n", "2. **`setup` Method**:\n", " - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n", - " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible for input to the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\n", + " - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), tokenizing the input according to the specified encoding. 
The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the tokenization.\n", " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n", "\n", "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes." ] }, - { - "cell_type": "code", - "execution_count": 36, - "id": "f2df4bd1-cf34-4414-bce4-54379ffac006", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\\smiles_token\n", - "Cross-validation enabled: False\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Check for processed data in data\\chebi_v231\\ChEBI50\\processed\n", - "saving 771 tokens to G:\\github-aditya0by0\\python-chebai\\chebai\\preprocessing\\bin\\smiles_token\\tokens.txt...\n", - "first 10 tokens: ['[*-]', '[Al-]', '[F-]', '.', '[H]', '[N]', '(', ')', '[Ag+]', 'C']\n" - ] - } - ], - "source": [ - "chebi_class.prepare_data()\n", - "chebi_class.setup()" - ] - }, { "cell_type": "markdown", "id": "f5aaa12d-5f01-4b74-8b59-72562af953bf", @@ -202,7 +178,7 @@ " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n", "\n", "2. **`data.pkl`**\n", - " - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes chemical IDs, data representations (such as SMILES strings), and class columns with boolean values.\n", + " - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes the CHEBI-IDs, chemical representations (SMILES strings), and columns for each label with boolean values.\n", " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n", "\n", "3. 
**`data.pt`**\n", From 242db56e2331a84a569f154a9215961ca210ad78 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sat, 5 Oct 2024 23:48:13 +0200 Subject: [PATCH 14/29] re-order section 3 and 4 as per suggestion --- tutorials/data_exploration_chebi.ipynb | 365 +++++++++++++++---------- 1 file changed, 216 insertions(+), 149 deletions(-) diff --git a/tutorials/data_exploration_chebi.ipynb b/tutorials/data_exploration_chebi.ipynb index 6a7e25ed..87818cba 100644 --- a/tutorials/data_exploration_chebi.ipynb +++ b/tutorials/data_exploration_chebi.ipynb @@ -1,8 +1,9 @@ { "cells": [ { - "metadata": {}, "cell_type": "markdown", + "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b", + "metadata": {}, "source": [ "# Introduction\n", "\n", @@ -14,40 +15,47 @@ "The chebai package simplifies the handling of these datasets by **automatically creating** them as needed. This means that you do not have to input any data manually; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly. You can however provide your own data files, for instance if you want to replicate a specific experiment.\n", "\n", "---\n" - ], - "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "4550d01fc7af5ae4", + "metadata": {}, "source": [ "# 1. Instantiation of a Data Class\n", "\n", "To start working with `chebai`, you first need to instantiate a ChEBI data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data." 
- ], - "id": "4550d01fc7af5ae4" + ] }, { - "metadata": {}, "cell_type": "code", + "execution_count": 1, + "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22", + "metadata": {}, "outputs": [], - "execution_count": 18, - "source": "from chebai.preprocessing.datasets.chebi import ChEBIOver50", - "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22" + "source": [ + "from chebai.preprocessing.datasets.chebi import ChEBIOver50" + ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 2, "id": "a71b7301-6195-4155-a439-f5eb3183d0f3", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-05T21:07:26.371796Z", + "start_time": "2024-10-05T21:07:26.058728Z" + } + }, "outputs": [], "source": [ "chebi_class = ChEBIOver50(chebi_version=231)" ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d", + "metadata": {}, "source": [ "\n", "### Inheritance Hierarchy\n", @@ -73,12 +81,12 @@ "### Additional Input Parameters\n", "\n", "To get more control over various aspects of data loading, processing, and splitting, you can refer to documentation of additional parameters in docstrings of the respective classes: [`_ChEBIDataExtractor`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/chebi.py#L108), [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22), [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), etc.\n" - ], - "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a", + "metadata": {}, "source": [ "# Available ChEBI Data Classes\n", "\n", @@ -91,8 +99,7 @@ "\n", "Finally, `ChEBIOver50Partial` selects extracts a part of ChEBI based on a given top class, with a threshold of 50 for selecting labels.\n", "This class inherits from `ChEBIOverXPartial` and `ChEBIOver50`.\n" - ], - 
"id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a" + ] }, { "cell_type": "markdown", @@ -103,25 +110,25 @@ ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "ed973fb59df11849", + "metadata": {}, "source": [ "# 2. Preparation / Setup Methods\n", "\n", "Now we have a ChEBI data class with all the relevant parameters. Next, we need to generate the actual dataset." - ], - "id": "ed973fb59df11849" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "d0a58e2bd9c0e6d9", + "metadata": {}, + "outputs": [], "source": [ "chebi_class.prepare_data()\n", "chebi_class.setup()" - ], - "id": "d0a58e2bd9c0e6d9" + ] }, { "cell_type": "markdown", @@ -163,37 +170,10 @@ }, { "cell_type": "markdown", - "id": "8ababadb-003a-4c86-b92d-10e7bd1fba5e", + "id": "bb6e9a81554368f7", "metadata": {}, "source": [ - "# 3. Different Data Files Created and their Structure\n", - "\n", - "\n", - "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their structures.\n", - "\n", - "### Data Files\n", - "\n", - "1. **`Raw Data Files`**: (e.g., `.obo` file)\n", - " - **Description**: Contains the raw ChEBI ontology data, downloaded directly from the ChEBI website. This file serves as the foundation for data processing.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n", - "\n", - "2. **`data.pkl`**\n", - " - **Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes the CHEBI-IDs, chemical representations (SMILES strings), and columns for each label with boolean values.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n", - "\n", - "3. **`data.pt`**\n", - " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. 
It includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", - "\n", - "4. **`classes.txt`**\n", - " - **Description**: A file containing the list of selected ChEBI classes based on the specified threshold. This file is crucial for ensuring that only relevant classes are included in the dataset.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/classes.txt`\n", - "\n", - "5. **`splits.csv`**\n", - " - **Description**: Contains saved data splits from previous runs. During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n", - " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/splits.csv`\n", - "\n", - "### File Structure and Preprocessing Stages\n", + "# 3. Overview of the 3 preprocessing stages\n", "\n", "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\n", "\n", @@ -214,34 +194,28 @@ " - **File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", " - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n", "\n", - "### Data Splits\n", - "\n", - "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n", - "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n", - "\n", "### Summary of File Paths\n", "\n", "- **Raw Data**: `data/${chebi_version}/${dataset_name}/raw`\n", "- **Processed Data 1**: `data/${chebi_version}/${dataset_name}/processed`\n", "- **Processed Data 2**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}`\n", "\n", - "This structured 
approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. It also facilitates reproducibility and traceability across different experiments." - ] - }, - { - "cell_type": "markdown", - "id": "a35c1d2b-9d6b-4c10-828b-b5912752c757", - "metadata": {}, - "source": [ - "---" + "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. It also facilitates reproducibility and traceability across different experiments.\n", + "\n", + "### Data Splits\n", + "\n", + "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n", + "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n" ] }, { "cell_type": "markdown", - "id": "74adb549-9e02-472d-a535-78a584853b52", + "id": "7e172c0d1e8bb93f", "metadata": {}, "source": [ - "# 4. Information Stored in the Files\n" + "# 4. Data Files and their structure\n", + "\n", + "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their content.\n" ] }, { @@ -249,13 +223,10 @@ "id": "43329709-5134-4ce5-88e7-edd2176bf84d", "metadata": {}, "source": [ - "## chebi.obo\n", + "## chebi.obo File\n", "\n", - "The `chebi.obo` file is a key resource in the ChEBI (Chemical Entities of Biological Interest) dataset, containing the ontology data that defines various chemical entities and their relationships. 
This file is downloaded directly from the ChEBI database and serves as the foundational raw data for further processing in `chebai`.\n", - "\n", - "### Structure of `chebi.obo`\n", - "\n", - "The `chebi.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific chemical entity within the ChEBI ontology. These attributes include identifiers, names, relationships to other entities, and more.\n", + "**Description**: Contains the raw ChEBI ontology data, downloaded directly from the ChEBI website. This file serves as the foundation for data processing.\n", + " \n", "\n", "#### Example of a Term Document\n", "\n", @@ -269,6 +240,14 @@ "is_a: CHEBI:33238\n", "```\n", "\n", + "**File Path**: `data/${chebi_version}/${dataset_name}/raw/${filename}.obo`\n", + "\n", + "\n", + "### Structure of `chebi.obo`\n", + "\n", + "The `chebi.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific chemical entity within the ChEBI ontology. These attributes include identifiers, names, relationships to other entities, and more.\n", + "\n", + "\n", "### Breakdown of Attributes\n", "\n", "Each term document in the `chebi.obo` file consists of the following key attributes:\n", @@ -291,46 +270,46 @@ }, { "cell_type": "markdown", - "id": "322bc926-69ff-4b93-9e95-5e8b85869c38", + "id": "558295e5a7ded456", "metadata": {}, "source": [ - "## `data.pkl` File\n", - "\n", - "The `data.pkl` file, generated during the preprocessing stage, contains the processed ChEBI data in a dataframe format. 
Below is an example of how this data is structured:\n", - "\n", - "\n", - "\n", - "### Structure of `data.pkl`\n", - "`data.pkl` as following structure: \n", - "- **Column 0**: Contains the ID of each ChEBI data instance.\n", - "- **Column 1**: Contains the name of each ChEBI data instance.\n", - "- **Column 2**: Contains the SMILES representation of the chemical.\n", - "- **Column 3 and onwards**: Contains the labels, starting from column 3.\n", + "## data.pkl File\n", "\n", - "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n" + "**Description**: Generated by the `prepare_data` method, this file contains processed data in a dataframe format. It includes the CHEBI-IDs, chemical representations (SMILES strings), and columns for each label with boolean values." ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 6, "id": "fd490270-59b8-4c1c-8b09-204defddf592", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-05T21:09:01.622317Z", + "start_time": "2024-10-05T21:09:01.606698Z" + } + }, "outputs": [], "source": [ - "import pandas as pd" + "import pandas as pd\n", + "import os" ] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 10, "id": "d7d16247-092c-4e8d-96c2-ab23931cf766", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-05T21:11:51.296162Z", + "start_time": "2024-10-05T21:11:44.559304Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Size of the data (rows x columns): (129184, 1335)\n" + "Size of the data (rows x columns): (185007, 1514)\n" ] }, { @@ -358,23 +337,23 @@ " name\n", " SMILES\n", " 1722\n", + " 2440\n", " 2468\n", " 2571\n", " 2580\n", " 2634\n", " 3098\n", - " 3992\n", " ...\n", - " 143017\n", - " 143212\n", - " 143813\n", - " 146180\n", - " 147334\n", - " 156473\n", - " 166828\n", - " 166904\n", - " 167497\n", - " 167559\n", + " 176910\n", + " 177333\n", + " 
183508\n", + " 183509\n", + " 189832\n", + " 189840\n", + " 192499\n", + " 194321\n", + " 197504\n", + " 229684\n", " \n", " \n", " \n", @@ -500,73 +479,91 @@ " \n", " \n", "\n", - "

5 rows × 1335 columns

\n", + "

5 rows × 1514 columns

\n", "" ], "text/plain": [ - " id name SMILES 1722 2468 2571 2580 2634 \\\n", + " id name SMILES 1722 2440 2468 2571 2580 \\\n", "0 33429 monoatomic monoanion [*-] False False False False False \n", "1 30151 aluminide(1-) [Al-] False False False False False \n", "2 16042 halide anion [*-] False False False False False \n", "3 17051 fluoride [F-] False False False False False \n", "4 28741 sodium fluoride [F-].[Na+] False False False False False \n", "\n", - " 3098 3992 ... 143017 143212 143813 146180 147334 156473 166828 \\\n", + " 2634 3098 ... 176910 177333 183508 183509 189832 189840 192499 \\\n", "0 False False ... False False False False False False False \n", "1 False False ... False False False False False False False \n", "2 False False ... False False False False False False False \n", "3 False False ... False False False False False False False \n", "4 False False ... False False False False False False False \n", "\n", - " 166904 167497 167559 \n", + " 194321 197504 229684 \n", "0 False False False \n", "1 False False False \n", "2 False False False \n", "3 False False False \n", "4 False False False \n", "\n", - "[5 rows x 1335 columns]" + "[5 rows x 1514 columns]" ] }, - "execution_count": 53, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/chebi_v200/ChEBI50/processed/data.pkl\"))\n", + "pkl_df = pd.DataFrame(\n", + " pd.read_pickle(\n", + " os.path.join(\n", + " chebi_class.processed_dir_main,\n", + " chebi_class.processed_dir_main_file_names_dict[\"data\"],\n", + " )\n", + " )\n", + ")\n", "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n", "pkl_df.head()" ] }, { "cell_type": "markdown", - "id": "0d80ffbb-5f1e-4489-9bc8-d688c9be1d07", + "id": "322bc926-69ff-4b93-9e95-5e8b85869c38", "metadata": {}, "source": [ - "## `data.pt` File\n", + "**File Path**: `data/${chebi_version}/${dataset_name}/processed/data.pkl`\n", "\n", - "The `data.pt` file is an 
important output of the preprocessing stage in `chebai`. It contains data in a format compatible with PyTorch, specifically as a list of dictionaries. Each dictionary in this list is structured to hold key information used for model training and evaluation.\n", "\n", - "### Structure of `data.pt`\n", - "\n", - "The `data.pt` file is a list where each element is a dictionary with the following keys:\n", + "### Structure of `data.pkl`\n", + "`data.pkl` has the following structure: \n", + "- **Column 0**: Contains the ID of each ChEBI data instance.\n", + "- **Column 1**: Contains the name of each ChEBI data instance.\n", + "- **Column 2**: Contains the SMILES representation of the chemical.\n", + "- **Column 3 and onwards**: Contains the labels, starting from column 3.\n", "\n", - "- **`features`**: \n", - " - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n", + "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n" + ] + }, + { + "cell_type": "markdown", + "id": "ba019d2d4324bd0b", + "metadata": {}, + "source": [ + "## data.pt File\n", "\n", - "- **`labels`**: \n", - " - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n", "\n", - "- **`ident`**: \n", - " - **Description**: This key holds identifiers for each data instance. These identifiers help track and reference the individual samples in the dataset.\n" + "**Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library, specifically as a list of dictionaries. Each dictionary in this list includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input."
] }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 11, "id": "977ddd83-b469-4b58-ab1a-8574fb8769b4", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-05T21:12:49.338943Z", + "start_time": "2024-10-05T21:12:49.323319Z" + } + }, "outputs": [], "source": [ "import torch" @@ -574,9 +571,14 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 13, "id": "3266ade9-efdc-49fe-ae07-ed52b2eb52d0", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-05T21:14:12.892845Z", + "start_time": "2024-10-05T21:13:59.859953Z" + } + }, "outputs": [ { "name": "stdout", @@ -587,15 +589,25 @@ } ], "source": [ - "data_pt = torch.load(r\"data/chebi_v200/ChEBI50/processed/smiles_token/data.pt\")\n", + "data_pt = torch.load(\n", + " os.path.join(\n", + " chebi_class.processed_dir, chebi_class.processed_file_names_dict[\"data\"]\n", + " ),\n", + " weights_only=False,\n", + ")\n", "print(\"Type of loaded data:\", type(data_pt))" ] }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 15, "id": "84cfa3e6-f60d-47c0-9f82-db3d5673d1e7", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-05T21:14:21.185027Z", + "start_time": "2024-10-05T21:14:21.169358Z" + } + }, "outputs": [ { "name": "stdout", @@ -616,36 +628,61 @@ }, { "cell_type": "markdown", - "id": "861da1c3-0401-49f0-a22f-109814ed95d5", + "id": "0d80ffbb-5f1e-4489-9bc8-d688c9be1d07", "metadata": {}, "source": [ - "## `classes.txt` File\n", + "**File Path**: `data/${chebi_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", "\n", - "The `classes.txt` file lists selected ChEBI (Chemical Entities of Biological Interest) classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. 
Each line in the file corresponds to a unique ChEBI class ID, identifying specific chemical entities within the ChEBI ontology.\n", "\n", - "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks.\n" + "### Structure of `data.pt`\n", + "\n", + "The `data.pt` file is a list where each element is a dictionary with the following keys:\n", + "\n", + "- **`features`**: \n", + " - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n", + "\n", + "- **`labels`**: \n", + " - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n", + "\n", + "- **`ident`**: \n", + " - **Description**: This key holds identifiers for each data instance. These identifiers help track and reference the individual samples in the dataset.\n" + ] + }, + { + "cell_type": "markdown", + "id": "186ec6f0eed6ecf7", + "metadata": {}, + "source": [ + "## classes.txt File\n", + "\n", + "**Description**: A file containing the list of selected ChEBI classes based on the specified threshold. This file is crucial for ensuring that only relevant classes are included in the dataset." 
] }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 16, "id": "8d1fbe6c-beb8-4038-93d4-c56bc7628716", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-05T21:15:19.146285Z", + "start_time": "2024-10-05T21:15:18.503284Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1722\n", + "2440\n", "2468\n", "2571\n", - "2580\n", - "2634\n" + "2580\n" ] } ], "source": [ - "with open(r\"data/chebi_v200/ChEBI50/processed/classes.txt\", \"r\") as file:\n", + "with open(os.path.join(chebi_class.processed_dir_main, \"classes.txt\"), \"r\") as file:\n", " for i in range(5):\n", " line = file.readline()\n", " print(line.strip())" @@ -653,19 +690,37 @@ }, { "cell_type": "markdown", - "id": "b058714f-e434-4367-89b9-74c129ac727f", + "id": "861da1c3-0401-49f0-a22f-109814ed95d5", "metadata": {}, "source": [ - "## `splits.csv` File\n", "\n", - "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different run.\n" + "**File Path**: `data/${chebi_version}/${dataset_name}/processed/classes.txt`\n", + "\n", + "The `classes.txt` file lists selected ChEBI (Chemical Entities of Biological Interest) classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. 
Each line in the file corresponds to a unique ChEBI class ID, identifying specific chemical entities within the ChEBI ontology.\n", + "\n", + "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks.\n" + ] + }, + { + "cell_type": "markdown", + "id": "fb72be449e52b63f", + "metadata": {}, + "source": [ + "## splits.csv File\n", + "\n", + "**Description**: Contains saved data splits from previous runs. During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`." ] }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 17, "id": "3ebdcae4-4344-46bd-8fc0-a82ef5d40da5", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-05T21:15:54.575116Z", + "start_time": "2024-10-05T21:15:53.945139Z" + } + }, "outputs": [ { "data": { @@ -731,16 +786,28 @@ "4 30340 train" ] }, - "execution_count": 98, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "csv_df = pd.read_csv(r\"data/chebi_v231/ChEBI50/processed/splits.csv\")\n", + "csv_df = pd.read_csv(os.path.join(chebi_class.processed_dir_main, \"splits.csv\"))\n", "csv_df.head()" ] }, + { + "cell_type": "markdown", + "id": "b058714f-e434-4367-89b9-74c129ac727f", + "metadata": {}, + "source": [ + "\n", + "\n", + "**File Path**: `data/${chebi_version}/${dataset_name}/processed/splits.csv`\n", + "\n", + "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. 
This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different runs.\n" + ] + }, { "cell_type": "markdown", "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee", From 748eebedc354f64c84932d3d722a4766e41edae5 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sun, 6 Oct 2024 12:02:21 +0200 Subject: [PATCH 15/29] GO: reformat section 3 and 4 as per suggestion --- tutorials/data_exploration_go.ipynb | 625 ++++++++++++++++------------ 1 file changed, 364 insertions(+), 261 deletions(-) diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb index 8dc4cb44..e60e972b 100644 --- a/tutorials/data_exploration_go.ipynb +++ b/tutorials/data_exploration_go.ipynb @@ -1,26 +1,67 @@ { "cells": [ { - "metadata": {}, "cell_type": "markdown", + "id": "da687d32ba48b188", + "metadata": {}, "source": [ "# Introduction\n", "\n", - "This notebook serves as a guide for new users of the `chebai` package, which is used for working with chemical data, especially focusing on Gene Ontology (GO) and Swiss UniProt Protein data. This notebook will explain how to instantiate the main data class, how the data files are structured, and how to work with different molecule encodings.\n", + "This notebook serves as a guide for new developers using the `chebai` package. If you just want to run the experiments, you can refer to the [README.md](https://github.com/ChEB-AI/python-chebai/blob/dev/README.md) and the [wiki](https://github.com/ChEB-AI/python-chebai/wiki) for the basic commands. This notebook explains what happens under the hood for the GO-UniProt dataset. It covers\n", + "- how to instantiate a data class and generate data\n", + "- how the data is processed and stored\n", + "- and how to work with different molecule encodings.\n", "\n", - "One key aspect of the package is its **dataset management system**. 
In the training process, chemical datasets play a critical role by providing the necessary data for model learning and validation. The chebai package simplifies the handling of these datasets by **automatically creating** them as needed. This means that users do not have to manually prepare datasets before running models; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly.\n", + "The chebai package simplifies the handling of these datasets by **automatically creating** them as needed. This means that you do not have to input any data manually; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly. You can however provide your own data files, for instance if you want to replicate a specific experiment.\n", "\n", - "---" - ], - "id": "da687d32ba48b188" + "---\n" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "0bd07c91-bb02-48d4-b759-aa35ecb224bd", + "metadata": {}, "source": [ "# 1. Instantiation of a Data Class\n", "\n", - "To start working with `chebai`, you first need to instantiate a GO_UniProt data class. This class is responsible for managing, interacting with, and preprocessing the GO and UniProt data\n", + "To start working with `chebai`, you first need to instantiate a GO-UniProt data class. 
This class is responsible for managing, interacting with, and preprocessing the GO and UniProt data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "440f203ceaf7e4b7", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T21:25:03.920610Z", + "start_time": "2024-09-30T21:25:03.622407Z" + } + }, + "outputs": [], + "source": [ + "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a648346d81d0dc5e", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T21:25:08.863132Z", + "start_time": "2024-09-30T21:25:08.387739Z" + } + }, + "outputs": [], + "source": [ + "go_class = GOUniProtOver250(go_branch=\"BP\")" + ] + }, + { + "cell_type": "markdown", + "id": "64585012b0d7f66f", + "metadata": {}, + "source": [ "### Inheritance Hierarchy\n", "\n", "GO_UniProt data classes inherit from [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), which in turn inherits from [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22). Specifically:\n", @@ -37,6 +78,11 @@ "Data classes related to proteins can be configured using the following main parameters:\n", "\n", "- **`go_branch (str)`**: The Gene Ontology (GO) branch. The default value is `\"all\"`, which includes all branches of GO in the dataset.\n", + " - **`\"BP\"`**: Biological Process branch.\n", + " - **`\"MF\"`**: Molecular Function branch.\n", + " - **`\"CC\"`**: Cellular Component branch.\n", + "\n", + "This allows for more specific datasets focused on a particular aspect of gene function.\n", "\n", "- **`splits_file_path (str, optional)`**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. 
The default is `None`.\n", "\n", @@ -44,69 +90,52 @@ "\n", "To get more control over various aspects of data loading, processing, and splitting, you can refer to documentation of additional parameters in docstrings of the respective classes: [`_GOUniProtDataExtractor`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py#L33), [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22), [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), etc.\n", "\n", - "### Available GOUniProt Data Classes\n", - "\n", - "__Note__: Check the code implementation of classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py):\n", - "\n", - "#### `GOUniProtOver250`\n", "\n", - "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 250 for selecting classes.\n", + "# Available GOUniProt Data Classes\n", "\n", - "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n", + "__Note__: Check the code implementation of classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py):\n", "\n", - "#### `GOUniProtOver50`\n", + "There is a range of available GOUniProt dataset classes. Usually, you want to use `GOUniProtOver250` or `GOUniProtOver50`. Both inherit from `_GOUniProtOverX`. The number indicates the threshold for selecting label classes. 
The selection process is based on the annotations of the GO terms and their ancestors across the dataset.\n", "\n", - "A class for extracting data from the Gene Ontology and Swiss UniProt dataset with a threshold of 50 for selecting classes.\n", + "Refer to the `select_classes` method of `_GOUniProtOverX` for more details on the selection process.\n", "\n", - "- **Inheritance**: Inherits from `_GOUniProtOverX`.\n" - ], - "id": "64585012b0d7f66f" + "If you need a different threshold, you can create your own subclass." + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "### Instantiation Example", - "id": "605bbca601037df2" + "id": "651ab5c39833bd2c", + "metadata": {}, + "source": [ + "---" + ] }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-30T21:25:03.920610Z", - "start_time": "2024-09-30T21:25:03.622407Z" - } - }, - "cell_type": "code", - "source": "from chebai.preprocessing.datasets.go_uniprot import GOUniProtOver250", - "id": "440f203ceaf7e4b7", - "outputs": [], - "execution_count": 12 + "cell_type": "markdown", + "id": "a52b4363-7398-44aa-a4cc-8bba14bdd966", + "metadata": {}, + "source": [ + "# 2. Preparation / Setup Methods\n", + "\n", + "Once a GOUniProt data class instance is created, it typically requires preparation before use. This step is to generate the actual dataset." + ] }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-30T21:25:08.863132Z", - "start_time": "2024-09-30T21:25:08.387739Z" - } - }, "cell_type": "code", - "source": "go_class = GOUniProtOver250()", - "id": "a648346d81d0dc5e", + "execution_count": null, + "id": "9f77351090560bc4", + "metadata": {}, "outputs": [], - "execution_count": 13 + "source": [ + "go_class.prepare_data()\n", + "go_class.setup()" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "---", - "id": "651ab5c39833bd2c" - }, - { + "id": "2328e824c4dafb2d", "metadata": {}, - "cell_type": "markdown", "source": [ - "# 2. 
Preparation / Setup Methods\n", - "\n", - "Once a GOUniProt data class instance is created, it typically requires preparation before use. This step is necessary to download or load the relevant data files and set up the internal data structures.\n", "### Automatic Execution: \n", "These methods are executed automatically within the data class instance. Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n", "\n", @@ -130,81 +159,86 @@ " - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n", "\n", "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes." - ], - "id": "2328e824c4dafb2d" + ] }, { + "cell_type": "markdown", + "id": "db5b58f2d96823fc", "metadata": {}, - "cell_type": "code", "source": [ - "go_class.prepare_data()\n", - "go_class.setup()" - ], - "id": "9f77351090560bc4", - "outputs": [], - "execution_count": null + "---" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "---", - "id": "db5b58f2d96823fc" - }, - { + "id": "ee174b61b36c71aa", "metadata": {}, - "cell_type": "markdown", "source": [ - "# 3. GOUniProt Data File Structure\n", + "# 3. Overview of the 3 preprocessing stages\n", "\n", - "1. **`Raw Data Files`**: (e.g., `.obo` file and `.dat` file)\n", - " - **Description**: These files contain the raw GO ontology and Swiss UniProt data, which are downloaded directly from their respective websites. They serve as the foundation for data processing. Since there are no versions associated with this dataset, common raw files are used for all subsets of the data.\n", + "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\n", + "\n", + "1. 
**Raw Data Stage**:\n", + " - **File**: `go-basic.obo` and `uniprot_sprot.dat`\n", + " - **Description**: This stage contains the raw GO ontology data and raw Swiss-UniProt data, serving as the initial input for further processing.\n", " - **File Paths**:\n", - " - `data/GO_UniProt/raw/${filename}.obo`\n", - " - `data/GO_UniProt/raw/${filename}.dat`\n", + " - `data/GO_UniProt/raw/go-basic.obo`\n", + " - `data/GO_UniProt/raw/uniprot_sprot.dat`\n", "\n", - "2. **`data.pkl`**\n", - " - **Description**: This file is generated by the `prepare_data` method and contains the processed data in a dataframe format. It includes protein IDs, data representations (such as SMILES strings), and class columns with boolean values.\n", + "2. **Processed Data Stage 1**:\n", + " - **File**: `data.pkl`\n", + " - **Description**: This stage includes the data after initial processing. It contains sequence strings, class columns, and metadata but lacks data splits.\n", " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\n", + " - **Additional File**: `classes.txt` - A file listing the relevant GO classes.\n", "\n", - "3. **`data.pt`**\n", - " - **Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, making it ready for model input.\n", + "3. **Processed Data Stage 2**:\n", + " - **File**: `data.pt`\n", + " - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\n", " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\n", + " - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n", "\n", - "4. **`classes.txt`**\n", - " - **Description**: This file lists the selected GO or UniProt classes based on a specified threshold. 
It ensures that only the relevant classes are included in the dataset for analysis.\n", - " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\n", + "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. Otherwise, it will just be `${dataset_name}`.\n", "\n", - "5. **`splits.csv`**\n", - " - **Description**: This file contains saved data splits from previous runs. During subsequent runs, it is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`.\n", - " - **File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\n", + "### Summary of File Paths\n", "\n", - "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. Otherwise, it will just be `${dataset_name}`.\n" - ], - "id": "ee174b61b36c71aa" + "- **Raw Data**: `data/GO_UniProt/raw`\n", + "- **Processed Data 1**: `data/GO_UniProt/${dataset_name}/processed`\n", + "- **Processed Data 2**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}`\n", + "\n", + "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. 
It also facilitates reproducibility and traceability across different experiments.\n", + "\n", + "### Data Splits\n", + "\n", + "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n", + "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "---", - "id": "a927ad484c930960" + "id": "a927ad484c930960", + "metadata": {}, + "source": [ + "---" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "# 4. Information Stored in the Files", - "id": "3f92b58e460c08fd" + "id": "3f92b58e460c08fd", + "metadata": {}, + "source": [ + "# 4. Data Files and their structure\n", + "\n", + "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their content.\n" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "cca75d881cb8bade", + "metadata": {}, "source": [ - "## go-basic.obo\n", + "## go-basic.obo File\n", "\n", - "The `go-basic.obo` file is a key resource in the Gene Ontology (GO) dataset, containing the ontology data that defines various biological processes, molecular functions, and cellular components, as well as their relationships. This file is downloaded directly from the Gene Ontology Consortium and serves as the foundational raw data for further processing in GO-based applications.\n", - "\n", - "### Structure of `go-basic.obo`\n", - "\n", - "The `go-basic.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific biological process, molecular function, or cellular component within the GO ontology. 
These attributes include identifiers, names, relationships to other terms, and more.\n", + "**Description**: The `go-basic.obo` file is a key resource in the Gene Ontology (GO) dataset, containing the ontology data that defines various biological processes, molecular functions, and cellular components, as well as their relationships. This file is downloaded directly from the Gene Ontology Consortium and serves as the foundational raw data for further processing in GO-based applications.\n", "\n", "#### Example of a Term Document\n", "\n", @@ -219,6 +253,14 @@ "is_a: GO:0031506 ! cell wall glycoprotein biosynthetic process\n", "```\n", "\n", + "**File Path**: `data/GO_UniProt/raw/go-basic.obo`\n", + "\n", + "### Structure of `go-basic.obo`\n", + "\n", + "The `go-basic.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific biological process, molecular function, or cellular component within the GO ontology. These attributes include identifiers, names, relationships to other terms, and more.\n", + "\n", + "\n", + "\n", "### Breakdown of Attributes\n", "\n", "Each term document in the `go-basic.obo` file consists of the following key attributes:\n", @@ -240,22 +282,18 @@ "- **`is_a: GO:0006057`**: \n", " - **Description**: Defines hierarchical relationships to other terms within the ontology. The `is_a` attribute indicates that the current term is a subclass or specific instance of the referenced term.\n", " - **Example**: The term `GO:0000032` (\"cell wall mannoprotein biosynthetic process\") is a subclass of `GO:0006057` and subclass of `GO:0031506`.\n" - ], - "id": "cca75d881cb8bade" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "87c841de7d80beef", + "metadata": {}, "source": [ - "## uniprot_sprot.dat\n", + "## uniprot_sprot.dat File\n", "\n", - "The `uniprot_sprot.dat` file is a key component of the UniProtKB/Swiss-Prot dataset. 
It contains curated protein sequences with detailed annotation. Each entry in the file corresponds to a reviewed protein sequence, complete with metadata about its biological function, taxonomy, gene name, cross-references to other databases, and more. Below is a breakdown of the structure and key attributes in the file, using the provided example.\n", + "**Description**: The `uniprot_sprot.dat` file is a key component of the UniProtKB/Swiss-Prot dataset. It contains curated protein sequences with detailed annotation. Each entry in the file corresponds to a reviewed protein sequence, complete with metadata about its biological function, taxonomy, gene name, cross-references to other databases, and more. Below is a breakdown of the structure and key attributes in the file, using the provided example.\n", "\n", "\n", - "## Structure of `uniprot_sprot.dat`\n", - "\n", - "The `uniprot_sprot.dat` file is organized into blocks of text, each representing a single protein entry. These blocks contain specific tags and fields that describe different aspects of the protein, including its sequence, function, taxonomy, and cross-references to external databases.\n", - "\n", "### Example of a Protein Entry\n", "\n", "```plaintext\n", @@ -302,6 +340,13 @@ "//\n", "```\n", "\n", + "**File Path**: `data/GO_UniProt/raw/uniprot_sprot.dat`\n", + "\n", + "\n", + "## Structure of `uniprot_sprot.dat`\n", + "\n", + "The `uniprot_sprot.dat` file is organized into blocks of text, each representing a single protein entry. These blocks contain specific tags and fields that describe different aspects of the protein, including its sequence, function, taxonomy, and cross-references to external databases.\n", + "\n", "### Breakdown of Attributes\n", "\n", "Each protein entry in the `uniprot_sprot.dat` file is structured with specific tags and sections that describe the protein in detail. 
Here's a breakdown of the key attributes:\n", @@ -341,107 +386,56 @@ "- **`GO:0033644`**: This is the specific **GO term** identifier for \"host cell membrane,\" which indicates that the protein is associated with or located at the membrane of the host cell.\n", "\n", "- **`IEA`**: This stands for **Inferred from Electronic Annotation**, which is part of the **GO Evidence Codes**. **IEA** indicates that the annotation was automatically generated based on computational methods rather than direct experimental evidence. While **IEA** annotations are useful, they are generally considered less reliable than manually curated or experimentally verified evidence codes.\n", - "\n" - ], - "id": "87c841de7d80beef" + "\n", + "__Note__: For more details on evidence codes check section 5.2" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "b7687078-f6b8-4fbf-afa7-dfda89061a5e", + "metadata": {}, "source": [ - "## data.pkl\n", - "\n", - "The `data.pkl` file, generated during the preprocessing stage, contains the processed GO data in a dataframe format. 
Below is an example of how this data is structured:\n", - "\n", + "## data.pkl File\n", "\n", - "\n", - "### Structure of `data.pkl`\n", - "`data.pkl` as following structure: \n", - "- **Column 0**: Contains the Identifier from Swiss-UniProt Dataset for each Swiss Protein data instance.\n", - "- **Column 1**: Contains the accession of each Protein data instance.\n", - "- **Column 2**: Contains the list of GO-IDs (Identifiers from Gene Ontology) which maps each Swiss Protein to the Gene Ontology instance.\n", - "- **Column 3**: Contains the sequence representation for the Swiss Protein using Amino Acid notation.\n", - "- **Column 4 and onwards**: Contains the labels, starting from column 4.\n", - "\n", - "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n" - ], - "id": "735844f0b2474ad6" + "**Description**: This file is generated by the `prepare_data` method and contains the processed GO data in a dataframe format. It includes protein IDs, data representations (such as sequence strings), and class columns with boolean values." 
+ ] }, { + "cell_type": "code", + "execution_count": 5, + "id": "b4da7e73e251e1d1", "metadata": { "ExecuteTime": { "end_time": "2024-09-30T14:08:33.990378Z", "start_time": "2024-09-30T14:08:33.959459Z" } }, - "cell_type": "code", - "source": "import pandas as pd", - "id": "b4da7e73e251e1d1", "outputs": [], - "execution_count": 3 + "source": [ + "import pandas as pd\n", + "import os" + ] }, { + "cell_type": "code", + "execution_count": 8, + "id": "b66fbb9b720d053c", "metadata": { "ExecuteTime": { "end_time": "2024-09-30T14:10:12.796911Z", "start_time": "2024-09-30T14:10:06.052276Z" } }, - "cell_type": "code", - "source": [ - "pkl_df = pd.DataFrame(pd.read_pickle(r\"data/GO_UniProt/GO250_BP/processed/data.pkl\"))\n", - "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n", - "pkl_df.head()" - ], - "id": "b66fbb9b720d053c", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Size of the data (rows x columns): (27459, 1050)\n" + "Size of the data (rows x columns): (32933, 1049)\n" ] }, { "data": { - "text/plain": [ - " swiss_id accession \\\n", - "8 14331_ARATH P42643,Q945M2,Q9M0S7 \n", - "9 14331_CAEEL P41932,Q21537 \n", - "10 14331_MAIZE P49106 \n", - "13 14332_MAIZE Q01526 \n", - "14 14333_ARATH P42644,F4KBI7,Q945L2 \n", - "\n", - " go_ids \\\n", - "8 [19222] \n", - "9 [132, 1708, 5634, 5737, 5938, 6611, 7346, 8340... \n", - "10 [3677, 5634, 10468, 44877] \n", - "13 [3677, 5634, 10468, 44877] \n", - "14 [5634, 5737, 6995, 9409, 9631, 16036, 19222, 5... \n", - "\n", - " sequence 41 75 122 \\\n", - "8 MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT... False False False \n", - "9 MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL... False False False \n", - "10 MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE... False False False \n", - "13 MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVE... False False False \n", - "14 MSTREENVYMAKLAEQAERYEEMVEFMEKVAKTVDVEELSVEERNL... False False False \n", - "\n", - " 165 209 226 ... 
2000145 2000146 2000147 2000241 2000243 \\\n", - "8 False False False ... False False False False False \n", - "9 False False False ... False False False False False \n", - "10 False False False ... False False False False False \n", - "13 False False False ... False False False False False \n", - "14 False False False ... False False False False False \n", - "\n", - " 2000377 2001020 2001141 2001233 2001234 \n", - "8 False False False False False \n", - "9 False False False False False \n", - "10 False False False False False \n", - "13 False False False False False \n", - "14 False False False False False \n", - "\n", - "[5 rows x 1050 columns]" - ], "text/html": [ "
\n", "