diff --git a/notebooks/doc-preprocessing-to-sdg/requirements.txt b/notebooks/doc-preprocessing-to-sdg/requirements.txt deleted file mode 100644 index 3e31635..0000000 --- a/notebooks/doc-preprocessing-to-sdg/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -docling -elyra -jupyterlab diff --git a/notebooks/doc-preprocessing-to-sdg/chunking.ipynb b/notebooks/instructlab-knowledge-pipeline/chunking.ipynb similarity index 99% rename from notebooks/doc-preprocessing-to-sdg/chunking.ipynb rename to notebooks/instructlab-knowledge-pipeline/chunking.ipynb index 3bb61d6..4229232 100644 --- a/notebooks/doc-preprocessing-to-sdg/chunking.ipynb +++ b/notebooks/instructlab-knowledge-pipeline/chunking.ipynb @@ -5,12 +5,20 @@ "id": "7b33678f-67d2-48a1-801f-302622e43e0f", "metadata": {}, "source": [ - "## Goal\n", + "## Chunking\n", "The goal of chunking for InstructLab SDG is to provide the teacher model small and logical pieces of the source document to generate data off of.\n", "\n", "In this notebook we are doing chunking with Docling[https://docling-project.github.io/docling/examples/hybrid_chunking/#hybrid-chunking].\n", "\n", - "First let's ensure docling is installed." + "The input to this notebook is a docling JSON file created by a docling conversion, or a directory of docling JSON files." ] }, + { + "cell_type": "markdown", + "id": "d9f268fd-35d2-4c7a-8cfa-47630de00837", + "metadata": {}, + "source": [ + "### Dependencies" ] }, { @@ -272,8 +280,7 @@ " c = dict(chunk=chunk, file=file.stem)\n", " all_chunks.append(c)\n", " except ConversionError as e:\n", - " print(f\"Skipping file {file}\")\n", - "# print(all_chunks)" + " print(f\"Skipping file {file}\")" ] }, { @@ -286,6 +293,16 @@ "To view the chunks, run through the following cell. As you can see the document is broken into small pieces with metadata about the chunk based on the document's format" ] }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ff88cf5c-1315-4eca-afcd-25706eaf7d6b", + "metadata": {}, + "outputs": [], + "source": [ + "# print(all_chunks)" ] }, { "cell_type": "markdown", "id": "84826055-a7f1-4334-a12b-bbc07a523199", "metadata": { "tags": [] }, "source": [ - "## Save the chunks to a text file each" + "## Save each chunk to a text file\n", + "\n", + "Each chunk is saved to an individual text file named `{docling-json-file-name}-{chunk #}.txt`. Chunks in this format are the expected input to the create-sdg-seed-dataset notebook." ] }, { diff --git a/notebooks/instructlab-knowledge-pipeline/create-sdg-seed-dataset.ipynb b/notebooks/instructlab-knowledge-pipeline/create-sdg-seed-dataset.ipynb new file mode 100644 index 0000000..1500d32 --- /dev/null +++ b/notebooks/instructlab-knowledge-pipeline/create-sdg-seed-dataset.ipynb @@ -0,0 +1,3516 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "442ab400-7c39-4bc3-a800-c153796d9891", + "metadata": { + "tags": [] + }, + "source": [ + "# Create Seed Dataset for SDG\n", + "\n", + "This notebook combines the contents of the qna.yaml with the chunks of the source document to create a seed dataset for the synthetic data generation process.\n", + "\n", + "To run this notebook you need a directory that contains one or more chunks named `{original-file-name}-{N}.txt` along with a `qna.yaml` file.\n", + "\n", + "This notebook outputs a `seed.jsonl` file in the `output_dir` that you set."
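For reference, the hybrid chunking step described in chunking.ipynb boils down to roughly the following. This is a minimal sketch, not the notebook's actual cells: the input path, output directory, and the default chunker settings are assumptions, and the route through DocumentConverter is inferred from the notebook's ConversionError handling.

from pathlib import Path

from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter

# Hypothetical paths: a docling JSON produced by the conversion step, and a directory for the chunk files
input_file = Path("output/docling/my-document.json")
chunks_dir = Path("output/chunks")
chunks_dir.mkdir(parents=True, exist_ok=True)

# Load the docling JSON back into a DoclingDocument and run the hybrid chunker over it
doc = DocumentConverter().convert(input_file).document
chunker = HybridChunker()

for i, chunk in enumerate(chunker.chunk(dl_doc=doc)):
    # One text file per chunk: {docling-json-file-name}-{chunk #}.txt
    (chunks_dir / f"{input_file.stem}-{i}.txt").write_text(chunk.text, encoding="utf-8")

HybridChunker is tokenization-aware, so the pieces it emits follow the document structure while staying small enough to fit a model's context window.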
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "f693c458-8f24-4574-9de5-3754cfcd97b1", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: datasets in ./venv/lib/python3.12/site-packages (3.5.0)\n", + "Requirement already satisfied: transformers in ./venv/lib/python3.12/site-packages (4.51.3)\n", + "Requirement already satisfied: filelock in ./venv/lib/python3.12/site-packages (from datasets) (3.18.0)\n", + "Requirement already satisfied: numpy>=1.17 in ./venv/lib/python3.12/site-packages (from datasets) (2.2.5)\n", + "Requirement already satisfied: pyarrow>=15.0.0 in ./venv/lib/python3.12/site-packages (from datasets) (19.0.1)\n", + "Requirement already satisfied: dill<0.3.9,>=0.3.0 in ./venv/lib/python3.12/site-packages (from datasets) (0.3.8)\n", + "Requirement already satisfied: pandas in ./venv/lib/python3.12/site-packages (from datasets) (2.2.3)\n", + "Requirement already satisfied: requests>=2.32.2 in ./venv/lib/python3.12/site-packages (from datasets) (2.32.3)\n", + "Requirement already satisfied: tqdm>=4.66.3 in ./venv/lib/python3.12/site-packages (from datasets) (4.67.1)\n", + "Requirement already satisfied: xxhash in ./venv/lib/python3.12/site-packages (from datasets) (3.5.0)\n", + "Requirement already satisfied: multiprocess<0.70.17 in ./venv/lib/python3.12/site-packages (from datasets) (0.70.16)\n", + "Requirement already satisfied: fsspec<=2024.12.0,>=2023.1.0 in ./venv/lib/python3.12/site-packages (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets) (2024.12.0)\n", + "Requirement already satisfied: aiohttp in ./venv/lib/python3.12/site-packages (from datasets) (3.11.18)\n", + "Requirement already satisfied: huggingface-hub>=0.24.0 in ./venv/lib/python3.12/site-packages (from datasets) (0.30.2)\n", + "Requirement already satisfied: packaging in ./venv/lib/python3.12/site-packages (from datasets) (24.2)\n", + "Requirement already satisfied: pyyaml>=5.1 in ./venv/lib/python3.12/site-packages (from datasets) (6.0.2)\n", + "Requirement already satisfied: regex!=2019.12.17 in ./venv/lib/python3.12/site-packages (from transformers) (2024.11.6)\n", + "Requirement already satisfied: tokenizers<0.22,>=0.21 in ./venv/lib/python3.12/site-packages (from transformers) (0.21.1)\n", + "Requirement already satisfied: safetensors>=0.4.3 in ./venv/lib/python3.12/site-packages (from transformers) (0.5.3)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in ./venv/lib/python3.12/site-packages (from aiohttp->datasets) (2.6.1)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in ./venv/lib/python3.12/site-packages (from aiohttp->datasets) (1.3.2)\n", + "Requirement already satisfied: attrs>=17.3.0 in ./venv/lib/python3.12/site-packages (from aiohttp->datasets) (25.3.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in ./venv/lib/python3.12/site-packages (from aiohttp->datasets) (1.6.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in ./venv/lib/python3.12/site-packages (from aiohttp->datasets) (6.4.3)\n", + "Requirement already satisfied: propcache>=0.2.0 in ./venv/lib/python3.12/site-packages (from aiohttp->datasets) (0.3.1)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in ./venv/lib/python3.12/site-packages (from aiohttp->datasets) (1.20.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in ./venv/lib/python3.12/site-packages (from huggingface-hub>=0.24.0->datasets) (4.13.2)\n", + "Requirement already satisfied: 
charset-normalizer<4,>=2 in ./venv/lib/python3.12/site-packages (from requests>=2.32.2->datasets) (3.4.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in ./venv/lib/python3.12/site-packages (from requests>=2.32.2->datasets) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in ./venv/lib/python3.12/site-packages (from requests>=2.32.2->datasets) (1.26.20)\n", + "Requirement already satisfied: certifi>=2017.4.17 in ./venv/lib/python3.12/site-packages (from requests>=2.32.2->datasets) (2025.1.31)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in ./venv/lib/python3.12/site-packages (from pandas->datasets) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in ./venv/lib/python3.12/site-packages (from pandas->datasets) (2025.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in ./venv/lib/python3.12/site-packages (from pandas->datasets) (2025.2)\n", + "Requirement already satisfied: six>=1.5 in ./venv/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.17.0)\n" + ] + } + ], + "source": [ + "!pip install datasets transformers" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "92ac4630-a4d6-4941-8df1-b5648a524d06", + "metadata": { + "papermill": { + "duration": 2.095642, + "end_time": "2025-04-10T14:51:20.049966", + "exception": false, + "start_time": "2025-04-10T14:51:17.954324", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from utils.create_seed_dataset import get_seed_dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7f32c2fc-5d3c-489c-b36c-88204709beb5", + "metadata": { + "papermill": { + "duration": 0.143124, + "end_time": "2025-04-10T14:51:20.197602", + "exception": false, + "start_time": "2025-04-10T14:51:20.054478", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "output_dir = 'output'\n", + "input_dir = 'output/chunks'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "774ac1cc-360d-4bd4-8b91-5fb96a127b64", + "metadata": { + "papermill": { + "duration": 0.11758, + "end_time": "2025-04-10T14:51:20.319940", + "exception": false, + "start_time": "2025-04-10T14:51:20.202360", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8fa1b08952b943b499a82afc4f8cecaa", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Map: 0%| | 0/15 [00:00 Dataset: + """ + Creates a seed dataset from a path + Args: + path (str): Path to directory of qna.yaml and chunks + Returns: + ds (Dataset): Transformers Dataset to be used to create a jsonl + of seed data for the knowledge generation pipeline in + SDG. 
+ """ + valid_path = is_dir_valid(path) + ds = create_dataset_from_dir(valid_path) + + return ds + +def is_dir_valid(path: str) -> Path: + """ + Returns whether or not a directory contains a qna.yaml and one or more .txt chunks + Args: + path (str): Path to directory of qna.yaml and chunks + Returns: + base_path (Path): pathlib.Path to a directory that can create a jsonl + of seed data + """ + base_path = Path(path) + if not base_path.is_dir(): + raise ValueError("Base path must be a directory") + + files = list(base_path.iterdir()) + has_qna = any(f.name == 'qna.yaml' for f in files) + has_txt = any(f.suffix == '.txt' for f in files) + if not has_qna or not has_txt: + raise ValueError("Directory does not contain a qna.yaml and chunks") + + return base_path + +def read_chunks(path: Path) -> Dict[str, str]: + """ + Returns a dictionary with all of the .txt chunks in a directory + The chunks may originate from one or more different files + Args: + path (Path): Path to directory of chunks + Returns: + chunks_dict (Dict[str,str]: Dictionary with key of the original file name + and a list of chunks as the value + """ + chunk_files = path.glob('*.txt') + + chunks_dict = {} + for file in chunk_files: + chunks = [] + match = re.match(r"^(.*?)[-_]\d+\.txt$", file.name) + if match: + orig_filename = match.group(1) + + with file.open('r', encoding='utf-8') as f: + chunk = f.read() + + if orig_filename not in chunks_dict: + chunks_dict[orig_filename] = [] + chunks_dict[orig_filename].append(chunk) + + else: + print(f"Ignoring .txt file {file}, file name is not the right format") + + return chunks_dict + +def create_dataset_from_dir(path: Path) -> Dataset: + """ + Process a directory with chunks and a qna.yaml return a dataset. + Args: + path (Path): Path to directory of chunks and qna.yaml. + Returns: + Dataset: Dataset object. + """ + + qna_yaml_path = path / "qna.yaml" + + with open(qna_yaml_path, 'r') as f: + qna_yaml = yaml.safe_load(f) + + # Check for required fields + if not all(key in qna_yaml for key in ['document_outline', 'domain', 'seed_examples']): + raise ValueError("qna.yaml file is missing document_outline, domain, or seed_examples fields") + + chunks_dict = read_chunks(path) + + datasets = [] + for filename in chunks_dict.keys(): + chunks = chunks_dict[filename] + chunk_ds = Dataset.from_dict( + { + "document": chunks, + "document_outline": [qna_yaml["document_outline"]] + * len(chunks), + "document_title": [filename] * len(chunks), # TODO: is this really a necessary field? + "domain": [qna_yaml["domain"]] * len(chunks), + } + ) + chunk_ds_with_icls = add_icls(qna_yaml, chunk_ds) + datasets.append(chunk_ds_with_icls) + + return safe_concatenate_datasets(datasets) + +def safe_concatenate_datasets(datasets: list[Dataset]) -> Dataset: + """ + Concatenate datasets safely, ignoring any datasets that are None or empty. + Args: + datasets (list[Dataset]): List of Dataset objects to concatenate. + Returns: + Dataset: Dataset object with concatenated datasets. + """ + filtered_datasets = [ds for ds in datasets if ds is not None and ds.num_rows > 0] + + if not filtered_datasets: + return None + + return concatenate_datasets(filtered_datasets) + +def get_token_count(text, tokenizer): + return len(tokenizer.tokenize(text)) + +def add_icls(qna_yaml: Dict[str, str], chunked_document: Dataset) -> Dataset: + """ + Add the ICLS label to the dataset. + Args: + qna_yaml (Dict): object representing qna.yaml file. + dataset (Dataset): Dataset object. + Returns: + Dataset: Dataset object with ICLS label. 
+ """ + # TODO: make the tokenizer configurable at some level + tokenizer = AutoTokenizer.from_pretrained("instructlab/granite-7b-lab") + icl = qna_yaml["seed_examples"] + chunked_document_all_icl = [] + for icl_ in icl: + chunked_document_all_icl.append( + chunked_document.map( + lambda x: { + "icl_document": icl_["context"], + "icl_query_1": icl_["questions_and_answers"][0]["question"], + "icl_response_1": icl_["questions_and_answers"][0]["answer"], + "icl_query_2": icl_["questions_and_answers"][1]["question"], + "icl_response_2": icl_["questions_and_answers"][1]["answer"], + "icl_query_3": icl_["questions_and_answers"][2]["question"], + "icl_response_3": icl_["questions_and_answers"][2]["answer"], + } + ) + ) + chunked_document_all_icl = safe_concatenate_datasets(chunked_document_all_icl) + chunked_document_all_icl = chunked_document_all_icl.map( + lambda x: { + "chunks": chunk_document( + [x["document"]], server_ctx_size=4096, chunk_word_count=1024 + ) + if get_token_count(x["document"], tokenizer) > 1024 + else [x["document"]] + } + ) + df = chunked_document_all_icl.to_pandas() + df_exploded = df.explode("chunks").reset_index(drop=True) + new_ds = Dataset.from_pandas(df_exploded) + new_ds = new_ds.remove_columns("document").rename_columns( + {"chunks": "document"} + ) + + # Only keep document greater than 100 tokens + new_ds = new_ds.filter( + lambda x: get_token_count(x["document"], tokenizer) > 100 + ) + return new_ds