From 203cf744339faf1a86484f01d1730f66bb1ff1b5 Mon Sep 17 00:00:00 2001 From: Ali Maredia Date: Wed, 23 Apr 2025 10:00:15 -0400 Subject: [PATCH 1/3] Revise comments in chunking notebook Signed-off-by: Ali Maredia --- .../doc-preprocessing-to-sdg/chunking.ipynb | 29 +++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/notebooks/doc-preprocessing-to-sdg/chunking.ipynb b/notebooks/doc-preprocessing-to-sdg/chunking.ipynb index 3bb61d6..4229232 100644 --- a/notebooks/doc-preprocessing-to-sdg/chunking.ipynb +++ b/notebooks/doc-preprocessing-to-sdg/chunking.ipynb @@ -5,12 +5,20 @@ "id": "7b33678f-67d2-48a1-801f-302622e43e0f", "metadata": {}, "source": [ - "## Goal\n", + "## Chunking\n", "The goal of chunking for InstructLab SDG is to provide the teacher model small and logical pieces of the source document to generate data off of.\n", "\n", "In this notebook we are doing chunking with Docling[https://docling-project.github.io/docling/examples/hybrid_chunking/#hybrid-chunking].\n", "\n", - "First let's ensure docling is installed." + "The input to this notebook is a Docling JSON file created by a Docling conversion, or a directory of Docling JSON files." ] }, + { + "cell_type": "markdown", + "id": "d9f268fd-35d2-4c7a-8cfa-47630de00837", + "metadata": {}, + "source": [ + "### Dependencies" ] }, { "cell_type": "code", @@ -272,8 +280,7 @@ " c = dict(chunk=chunk, file=file.stem)\n", " all_chunks.append(c)\n", " except ConversionError as e:\n", - " print(f\"Skipping file {file}\")\n", - "# print(all_chunks)" + " print(f\"Skipping file {file}\")" ] }, @@ -286,6 +293,16 @@ "To view the chunks, run through the following cell. As you can see the document is broken into small pieces with metadata about the chunk based on the document's format" ] }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ff88cf5c-1315-4eca-afcd-25706eaf7d6b", + "metadata": {}, + "outputs": [], + "source": [ + "# print(all_chunks)" ] + }, { "cell_type": "markdown", "id": "84826055-a7f1-4334-a12b-bbc07a523199", "metadata": { "tags": [] }, "source": [ - "## Save the chunks to a text file each" + "## Save each chunk to a text file\n", + "\n", + "Each chunk is saved to an individual text file named `{docling-json-file-name}-{chunk #}.txt`. Saving the chunks in this format is important because these files are the input to the create-sdg-seed-dataset notebook." ] }, { From 0058f67d9225d744e440f38e5d0bced4aa628ac5 Mon Sep 17 00:00:00 2001 From: Ali Maredia Date: Wed, 23 Apr 2025 07:34:33 -0400 Subject: [PATCH 2/3] feat: Add SDG seed dataset creation notebook The create_seed_dataset.py used in this notebook is heavily inspired by docprocessor.py from the sdg-hub repo. 
Co-authored-by: Abhishek B Co-authored-by: shiv Signed-off-by: Ali Maredia --- .../create-sdg-seed-dataset.ipynb | 3508 +++++++++++++++++ .../doc-preprocessing-to-sdg/requirements.txt | 3 - .../utils/create_seed_dataset.py | 182 + 3 files changed, 3690 insertions(+), 3 deletions(-) create mode 100644 notebooks/doc-preprocessing-to-sdg/create-sdg-seed-dataset.ipynb delete mode 100644 notebooks/doc-preprocessing-to-sdg/requirements.txt create mode 100644 notebooks/doc-preprocessing-to-sdg/utils/create_seed_dataset.py diff --git a/notebooks/doc-preprocessing-to-sdg/create-sdg-seed-dataset.ipynb b/notebooks/doc-preprocessing-to-sdg/create-sdg-seed-dataset.ipynb new file mode 100644 index 0000000..b8ed47c --- /dev/null +++ b/notebooks/doc-preprocessing-to-sdg/create-sdg-seed-dataset.ipynb @@ -0,0 +1,3508 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "442ab400-7c39-4bc3-a800-c153796d9891", + "metadata": { + "tags": [] + }, + "source": [ + "# Create Seed Dataset for SDG\n", + "\n", + "This notebook combines the contents of the qna.yaml with the chunks of the source document to create a seed dataset for the synthetic data generation process.\n", + "\n", + "To run this notebook you need a directory that contains chunk files named `{original-file-name}-{N}.txt` and a `qna.yaml`.\n", + "\n", + "This notebook outputs a `seed_data.jsonl` file in the `output_dir` that you set." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "f693c458-8f24-4574-9de5-3754cfcd97b1", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: datasets in ./venv/lib/python3.12/site-packages (3.5.0)\n", + "Requirement already satisfied: transformers in ./venv/lib/python3.12/site-packages (4.51.3)\n", + "Requirement already satisfied: filelock in ./venv/lib/python3.12/site-packages (from datasets) (3.18.0)\n", + "Requirement already satisfied: numpy>=1.17 in ./venv/lib/python3.12/site-packages (from datasets) (2.2.5)\n", + "Requirement already satisfied: pyarrow>=15.0.0 in ./venv/lib/python3.12/site-packages (from datasets) (19.0.1)\n", + "Requirement already satisfied: dill<0.3.9,>=0.3.0 in ./venv/lib/python3.12/site-packages (from datasets) (0.3.8)\n", + "Requirement already satisfied: pandas in ./venv/lib/python3.12/site-packages (from datasets) (2.2.3)\n", + "Requirement already satisfied: requests>=2.32.2 in ./venv/lib/python3.12/site-packages (from datasets) (2.32.3)\n", + "Requirement already satisfied: tqdm>=4.66.3 in ./venv/lib/python3.12/site-packages (from datasets) (4.67.1)\n", + "Requirement already satisfied: xxhash in ./venv/lib/python3.12/site-packages (from datasets) (3.5.0)\n", + "Requirement already satisfied: multiprocess<0.70.17 in ./venv/lib/python3.12/site-packages (from datasets) (0.70.16)\n", + "Requirement already satisfied: fsspec<=2024.12.0,>=2023.1.0 in ./venv/lib/python3.12/site-packages (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets) (2024.12.0)\n", + "Requirement already satisfied: aiohttp in ./venv/lib/python3.12/site-packages (from datasets) (3.11.18)\n", + "Requirement already satisfied: huggingface-hub>=0.24.0 in ./venv/lib/python3.12/site-packages (from datasets) (0.30.2)\n", + "Requirement already satisfied: packaging in ./venv/lib/python3.12/site-packages (from datasets) (24.2)\n", + "Requirement already satisfied: pyyaml>=5.1 in ./venv/lib/python3.12/site-packages (from datasets) (6.0.2)\n", + "Requirement already satisfied: regex!=2019.12.17 in 
./venv/lib/python3.12/site-packages (from transformers) (2024.11.6)\n", + "Requirement already satisfied: tokenizers<0.22,>=0.21 in ./venv/lib/python3.12/site-packages (from transformers) (0.21.1)\n", + "Requirement already satisfied: safetensors>=0.4.3 in ./venv/lib/python3.12/site-packages (from transformers) (0.5.3)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in ./venv/lib/python3.12/site-packages (from aiohttp->datasets) (2.6.1)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in ./venv/lib/python3.12/site-packages (from aiohttp->datasets) (1.3.2)\n", + "Requirement already satisfied: attrs>=17.3.0 in ./venv/lib/python3.12/site-packages (from aiohttp->datasets) (25.3.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in ./venv/lib/python3.12/site-packages (from aiohttp->datasets) (1.6.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in ./venv/lib/python3.12/site-packages (from aiohttp->datasets) (6.4.3)\n", + "Requirement already satisfied: propcache>=0.2.0 in ./venv/lib/python3.12/site-packages (from aiohttp->datasets) (0.3.1)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in ./venv/lib/python3.12/site-packages (from aiohttp->datasets) (1.20.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in ./venv/lib/python3.12/site-packages (from huggingface-hub>=0.24.0->datasets) (4.13.2)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in ./venv/lib/python3.12/site-packages (from requests>=2.32.2->datasets) (3.4.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in ./venv/lib/python3.12/site-packages (from requests>=2.32.2->datasets) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in ./venv/lib/python3.12/site-packages (from requests>=2.32.2->datasets) (1.26.20)\n", + "Requirement already satisfied: certifi>=2017.4.17 in ./venv/lib/python3.12/site-packages (from requests>=2.32.2->datasets) (2025.1.31)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in ./venv/lib/python3.12/site-packages (from pandas->datasets) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in ./venv/lib/python3.12/site-packages (from pandas->datasets) (2025.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in ./venv/lib/python3.12/site-packages (from pandas->datasets) (2025.2)\n", + "Requirement already satisfied: six>=1.5 in ./venv/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.17.0)\n" + ] + } + ], + "source": [ + "!pip install datasets transformers" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "92ac4630-a4d6-4941-8df1-b5648a524d06", + "metadata": { + "papermill": { + "duration": 2.095642, + "end_time": "2025-04-10T14:51:20.049966", + "exception": false, + "start_time": "2025-04-10T14:51:17.954324", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from utils.create_seed_dataset import get_seed_dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7f32c2fc-5d3c-489c-b36c-88204709beb5", + "metadata": { + "papermill": { + "duration": 0.143124, + "end_time": "2025-04-10T14:51:20.197602", + "exception": false, + "start_time": "2025-04-10T14:51:20.054478", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "output_dir = 'output'\n", + "input_dir = 'output/chunks'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "774ac1cc-360d-4bd4-8b91-5fb96a127b64", + "metadata": { + "papermill": { + "duration": 0.11758, + "end_time": 
"2025-04-10T14:51:20.319940", + "exception": false, + "start_time": "2025-04-10T14:51:20.202360", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8fa1b08952b943b499a82afc4f8cecaa", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Map: 0%| | 0/15 [00:00 Dataset: + """ + Creates a seed dataset from a path + Args: + path (str): Path to directory of qna.yaml and chunks + Returns: + ds (Dataset): Transformers Dataset to be used to create a jsonl + of seed data for the knowledge generation pipeline in + SDG. + """ + valid_path = is_dir_valid(path) + ds = create_dataset_from_dir(valid_path) + + return ds + +def is_dir_valid(path: str) -> Path: + """ + Returns whether or not a directory contains a qna.yaml and one or more .txt chunks + Args: + path (str): Path to directory of qna.yaml and chunks + Returns: + base_path (Path): pathlib.Path to a directory that can create a jsonl + of seed data + """ + base_path = Path(path) + if not base_path.is_dir(): + raise ValueError("Base path must be a directory") + + files = list(base_path.iterdir()) + has_qna = any(f.name == 'qna.yaml' for f in files) + has_txt = any(f.suffix == '.txt' for f in files) + if not has_qna or not has_txt: + raise ValueError("Directory does not contain a qna.yaml and chunks") + + return base_path + +def read_chunks(path: Path) -> Dict[str, str]: + """ + Returns a dictionary with all of the .txt chunks in a directory + The chunks may originate from one or more different files + Args: + path (Path): Path to directory of chunks + Returns: + chunks_dict (Dict[str,str]: Dictionary with key of the original file name + and a list of chunks as the value + """ + chunk_files = path.glob('*.txt') + + chunks_dict = {} + for file in chunk_files: + chunks = [] + match = re.match(r"^(.*?)[-_]\d+\.txt$", file.name) + if match: + orig_filename = match.group(1) + + with file.open('r', encoding='utf-8') as f: + chunk = f.read() + + if orig_filename not in chunks_dict: + chunks_dict[orig_filename] = [] + chunks_dict[orig_filename].append(chunk) + + else: + print(f"Ignoring .txt file {file}, file name is not the right format") + + return chunks_dict + +def create_dataset_from_dir(path: Path) -> Dataset: + """ + Process a directory with chunks and a qna.yaml return a dataset. + Args: + path (Path): Path to directory of chunks and qna.yaml. + Returns: + Dataset: Dataset object. 
+ """ + + qna_yaml_path = path / "qna.yaml" + + with open(qna_yaml_path, 'r') as f: + qna_yaml = yaml.safe_load(f) + + # Check for required fields + if not all(key in qna_yaml for key in ['document_outline', 'domain', 'seed_examples']): + raise ValueError("qna.yaml file is missing document_outline, domain, or seed_examples fields") + + chunks_dict = read_chunks(path) + + datasets = [] + for filename in chunks_dict.keys(): + chunks = chunks_dict[filename] + chunk_ds = Dataset.from_dict( + { + "document": chunks, + "document_outline": [qna_yaml["document_outline"]] + * len(chunks), + "document_title": [filename] * len(chunks), # TODO: is this really a necessary field? + "domain": [qna_yaml["domain"]] * len(chunks), + } + ) + chunk_ds_with_icls = add_icls(qna_yaml, chunk_ds) + datasets.append(chunk_ds_with_icls) + + return safe_concatenate_datasets(datasets) + +def safe_concatenate_datasets(datasets: list[Dataset]) -> Dataset: + """ + Concatenate datasets safely, ignoring any datasets that are None or empty. + Args: + datasets (list[Dataset]): List of Dataset objects to concatenate. + Returns: + Dataset: Dataset object with concatenated datasets. + """ + filtered_datasets = [ds for ds in datasets if ds is not None and ds.num_rows > 0] + + if not filtered_datasets: + return None + + return concatenate_datasets(filtered_datasets) + +def get_token_count(text, tokenizer): + return len(tokenizer.tokenize(text)) + +def add_icls(qna_yaml: Dict[str, str], chunked_document: Dataset) -> Dataset: + """ + Add the in-context learning (ICL) examples from the qna.yaml to the dataset. + Args: + qna_yaml (Dict): object representing qna.yaml file. + chunked_document (Dataset): Dataset object. + Returns: + Dataset: Dataset object with ICL examples. + """ + # TODO: make the tokenizer configurable at some level + tokenizer = AutoTokenizer.from_pretrained("instructlab/granite-7b-lab") + icl = qna_yaml["seed_examples"] + chunked_document_all_icl = [] + for icl_ in icl: + chunked_document_all_icl.append( + chunked_document.map( + lambda x: { + "icl_document": icl_["context"], + "icl_query_1": icl_["questions_and_answers"][0]["question"], + "icl_response_1": icl_["questions_and_answers"][0]["answer"], + "icl_query_2": icl_["questions_and_answers"][1]["question"], + "icl_response_2": icl_["questions_and_answers"][1]["answer"], + "icl_query_3": icl_["questions_and_answers"][2]["question"], + "icl_response_3": icl_["questions_and_answers"][2]["answer"], + } + ) + ) + chunked_document_all_icl = safe_concatenate_datasets(chunked_document_all_icl) + chunked_document_all_icl = chunked_document_all_icl.map( + lambda x: { + "chunks": chunk_document( + [x["document"]], server_ctx_size=4096, chunk_word_count=1024 + ) + if get_token_count(x["document"], tokenizer) > 1024 + else [x["document"]] + } + ) + df = chunked_document_all_icl.to_pandas() + df_exploded = df.explode("chunks").reset_index(drop=True) + new_ds = Dataset.from_pandas(df_exploded) + new_ds = new_ds.remove_columns("document").rename_columns( + {"chunks": "document"} + ) + + # Only keep documents with more than 100 tokens + new_ds = new_ds.filter( + lambda x: get_token_count(x["document"], tokenizer) > 100 + ) + return new_ds From cc69a063a436e308d2454528d68eab60eaf39590 Mon Sep 17 00:00:00 2001 From: Ali Maredia Date: Mon, 5 May 2025 14:57:23 -0400 Subject: [PATCH 3/3] rename doc-preprocessing-to-sdg to instructlab-knowledge-pipeline Signed-off-by: Ali Maredia --- .../chunking.ipynb | 0 .../create-sdg-seed-dataset.ipynb | 8 ++++++++ .../docling-conversion.ipynb | 0 .../utils/create_seed_dataset.py | 0 4 files changed, 8 insertions(+) rename 
notebooks/{doc-preprocessing-to-sdg => instructlab-knowledge-pipeline}/chunking.ipynb (100%) rename notebooks/{doc-preprocessing-to-sdg => instructlab-knowledge-pipeline}/create-sdg-seed-dataset.ipynb (99%) rename notebooks/{doc-preprocessing-to-sdg => instructlab-knowledge-pipeline}/docling-conversion.ipynb (100%) rename notebooks/{doc-preprocessing-to-sdg => instructlab-knowledge-pipeline}/utils/create_seed_dataset.py (100%) diff --git a/notebooks/doc-preprocessing-to-sdg/chunking.ipynb b/notebooks/instructlab-knowledge-pipeline/chunking.ipynb similarity index 100% rename from notebooks/doc-preprocessing-to-sdg/chunking.ipynb rename to notebooks/instructlab-knowledge-pipeline/chunking.ipynb diff --git a/notebooks/doc-preprocessing-to-sdg/create-sdg-seed-dataset.ipynb b/notebooks/instructlab-knowledge-pipeline/create-sdg-seed-dataset.ipynb similarity index 99% rename from notebooks/doc-preprocessing-to-sdg/create-sdg-seed-dataset.ipynb rename to notebooks/instructlab-knowledge-pipeline/create-sdg-seed-dataset.ipynb index b8ed47c..1500d32 100644 --- a/notebooks/doc-preprocessing-to-sdg/create-sdg-seed-dataset.ipynb +++ b/notebooks/instructlab-knowledge-pipeline/create-sdg-seed-dataset.ipynb @@ -582,6 +582,14 @@ "source": [ "seed_data.to_json(f'{output_dir}/seed_data.jsonl', orient='records', lines=True)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5180bcd9-2434-4118-bf68-75eec2712cbf", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/notebooks/doc-preprocessing-to-sdg/docling-conversion.ipynb b/notebooks/instructlab-knowledge-pipeline/docling-conversion.ipynb similarity index 100% rename from notebooks/doc-preprocessing-to-sdg/docling-conversion.ipynb rename to notebooks/instructlab-knowledge-pipeline/docling-conversion.ipynb diff --git a/notebooks/doc-preprocessing-to-sdg/utils/create_seed_dataset.py b/notebooks/instructlab-knowledge-pipeline/utils/create_seed_dataset.py similarity index 100% rename from notebooks/doc-preprocessing-to-sdg/utils/create_seed_dataset.py rename to notebooks/instructlab-knowledge-pipeline/utils/create_seed_dataset.py
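For reference, a minimal sketch of how the seed dataset utility added in these patches can be driven outside of Jupyter. The `output/chunks` and `output` paths simply mirror the defaults used in the create-sdg-seed-dataset notebook and are assumptions, not requirements.

# Minimal sketch (assumed paths): reproduces what create-sdg-seed-dataset.ipynb does.
from utils.create_seed_dataset import get_seed_dataset

input_dir = "output/chunks"   # directory containing qna.yaml and {original-file-name}-{N}.txt chunks
output_dir = "output"         # where seed_data.jsonl will be written

# Build the seed dataset from the chunks plus qna.yaml, then write it as JSONL.
seed_data = get_seed_dataset(input_dir)
seed_data.to_json(f"{output_dir}/seed_data.jsonl", orient="records", lines=True)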