From e3b80d4834b640c3ed39ecfd7c3291f52268e382 Mon Sep 17 00:00:00 2001 From: Nikhil Palaskar Date: Fri, 28 Jun 2024 19:04:25 -0400 Subject: [PATCH 01/10] Remove the iterBlock and use openai's 'n' parameter instead --- src/instructlab/sdg/default_flows.py | 69 +++++++++++++--------------- src/instructlab/sdg/iterblock.py | 29 ------------ src/instructlab/sdg/pipeline.py | 6 --- 3 files changed, 32 insertions(+), 72 deletions(-) delete mode 100644 src/instructlab/sdg/iterblock.py diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index 582afd29..177cdb15 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -7,7 +7,6 @@ # Local from .filterblock import FilterByValueBlock -from .iterblock import IterBlock from .llmblock import LLMBlock from .utilblocks import CombineColumnsBlock @@ -47,12 +46,10 @@ class _SimpleFlow(Flow): def get_flow(self) -> list: return [ { - "block_type": IterBlock, - "block_config": { - "block_name": "", # must be set by subclass - "num_iters": self.num_iters, - "block_type": LLMBlock, - "block_kwargs": { + "block_type": LLMBlock, + "block_name": "", # must be set by subclass + "num_iters": self.num_iters, + "block_kwargs": { "block_name": "", # must be set by subclass "config_path": "", # must be set by subclass "client": self.client, @@ -63,13 +60,13 @@ def get_flow(self) -> list: "num_procs": 8, "batched": self.batched, }, - }, - "gen_kwargs": { - "max_tokens": 2048, - "temperature": 0.7, - }, - "drop_duplicates": ["output"], }, + "gen_kwargs": { + "max_tokens": 2048, + "temperature": 0.7, + "n": 1 + }, + "drop_duplicates": ["output"], } ] @@ -382,30 +379,28 @@ class SynthGroundedSkillsFlow(Flow): def get_flow(self) -> list: return [ { - "block_type": IterBlock, - "block_config": { - "block_name": "context_iter", - "num_iters": 10, - "block_type": LLMBlock, - "block_kwargs": { - "block_name": "gen_contexts", - "config_path": os.path.join( - self.sdg_base, - "configs/skills/contexts.yaml", - ), - "client": self.client, - "model_id": self.model_id, - "model_prompt": _get_model_prompt(self.model_family), - "output_cols": ["context"], - "batch_kwargs": { - "num_procs": 8, - "batched": self.batched, - }, - }, - "gen_kwargs": { - "temperature": 0.7, - "max_tokens": 2048, - }, + "block_type": LLMBlock, + "block_name": "context_iter", + "block_kwargs": { + "block_name": "gen_contexts", + "config_path": os.path.join( + self.sdg_base, + "configs/skills/contexts.yaml", + ), + "client": self.client, + "model_id": self.model_id, + "model_prompt": _get_model_prompt(self.model_family), + "output_cols": ["context"], + "batch_kwargs": { + "num_procs": 8, + "batched": self.batched, + } + }, + "gen_kwargs": { + "num_samples": 30, + "temperature": 0.7, + "max_tokens": 2048, + "n": 10 }, }, { diff --git a/src/instructlab/sdg/iterblock.py b/src/instructlab/sdg/iterblock.py deleted file mode 100644 index 21a20470..00000000 --- a/src/instructlab/sdg/iterblock.py +++ /dev/null @@ -1,29 +0,0 @@ -# Third Party -from datasets import Dataset - -# Local -from .block import Block -from .logger_config import setup_logger - -logger = setup_logger(__name__) - - -class IterBlock(Block): - def __init__(self, block_name, num_iters, block_type, block_kwargs, **kwargs): - super().__init__(block_name) - self.num_iters = num_iters - self.block = block_type(**block_kwargs) - self.gen_kwargs = kwargs.get("gen_kwargs", {}) - self.gen_kwargs = kwargs.get("gen_kwargs", {}) - - def generate(self, samples, **gen_kwargs) -> Dataset: - 
generated_samples = [] - num_iters = self.num_iters - - for _ in range(num_iters): - batch_generated = self.block.generate( - samples, **{**self.gen_kwargs, **gen_kwargs} - ) - generated_samples.extend(batch_generated) - - return Dataset.from_list(generated_samples) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index fc93f78d..982a6ecb 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -39,12 +39,6 @@ def generate(self, dataset) -> Dataset: drop_duplicates_cols = block_prop.get("drop_duplicates", False) block = block_type(**block_config) - if block_type == IterBlock: - block_kwargs = block_config.pop("block_kwargs") - block = block_type(**block_config, block_kwargs=block_kwargs) - else: - block = block_type(**block_config) - logger.info("Running block: %s", block_config["block_name"]) logger.info(dataset) From 9a9cb4ea786999728075e0a15b2bcdb672682e22 Mon Sep 17 00:00:00 2001 From: Nikhil Palaskar Date: Fri, 28 Jun 2024 20:01:10 -0400 Subject: [PATCH 02/10] some debug --- scripts/test_grounded_skills.py | 2 +- src/instructlab/sdg/default_flows.py | 30 ++++++++++++---------------- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py index 338edb6c..b4edcd59 100644 --- a/scripts/test_grounded_skills.py +++ b/scripts/test_grounded_skills.py @@ -97,7 +97,7 @@ ds = Dataset.from_list(samples) -skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model, 10).get_flow() +skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model).get_flow() skills_pipe = Pipeline(skills_flow) sdg = SDG([skills_pipe]) diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index 177cdb15..55b73932 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -29,11 +29,10 @@ def _get_model_prompt(model_family): class Flow(ABC): - def __init__(self, client, model_family, model_id, num_iters, batched=True) -> None: + def __init__(self, client, model_family, model_id, batched=True) -> None: self.client = client self.model_family = model_family self.model_id = model_id - self.num_iters = num_iters self.batched = batched self.sdg_base = resources.files(__package__) @@ -47,19 +46,17 @@ def get_flow(self) -> list: return [ { "block_type": LLMBlock, - "block_name": "", # must be set by subclass - "num_iters": self.num_iters, - "block_kwargs": { - "block_name": "", # must be set by subclass - "config_path": "", # must be set by subclass - "client": self.client, - "model_id": self.model_id, - "model_prompt": _get_model_prompt(self.model_family), - "output_cols": ["output"], - "batch_kwargs": { - "num_procs": 8, - "batched": self.batched, - }, + "block_config": { + "block_name": "", # must be set by subclass + "config_path": "", # must be set by subclass + "client": self.client, + "model_id": self.model_id, + "model_prompt": _get_model_prompt(self.model_family), + "output_cols": ["output"], + "batch_kwargs": { + "num_procs": 8, + "batched": self.batched, + }, }, "gen_kwargs": { "max_tokens": 2048, @@ -380,8 +377,7 @@ def get_flow(self) -> list: return [ { "block_type": LLMBlock, - "block_name": "context_iter", - "block_kwargs": { + "block_config": { "block_name": "gen_contexts", "config_path": os.path.join( self.sdg_base, From 777e05270efbcfd5a8d523b4b34343220fc3f845 Mon Sep 17 00:00:00 2001 From: Nikhil Palaskar Date: Sun, 30 Jun 2024 23:35:03 -0400 Subject: [PATCH 03/10] fix zipping of samples 
and outputs --- scripts/test_freeform_skills.py | 2 +- scripts/test_knowledge.py | 4 ++-- src/instructlab/sdg/llmblock.py | 5 ++++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py index a8612c09..9b5ce810 100644 --- a/scripts/test_freeform_skills.py +++ b/scripts/test_freeform_skills.py @@ -49,7 +49,7 @@ ds = Dataset.from_list(samples) -skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model, 1).get_flow() +skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model).get_flow() skills_pipe = Pipeline(skills_flow) sdg = SDG([skills_pipe]) diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py index aeedcf59..75bd7783 100644 --- a/scripts/test_knowledge.py +++ b/scripts/test_knowledge.py @@ -38,8 +38,8 @@ ds = Dataset.from_list(samples) -mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model, 1).get_flow() -knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model, 1).get_flow() +mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model).get_flow() +knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model).get_flow() knowledge_pipe = Pipeline(knowledge_flow) mmlu_pipe = Pipeline(mmlu_flow) diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index 7952609a..ce333071 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -123,8 +123,11 @@ def generate(self, samples, **gen_kwargs) -> Dataset: outputs = [self._generate([sample], **gen_kwargs)[0] for sample in samples] logger.debug("Generated outputs: {}".format(outputs)) + num_parallel_samples = gen_kwargs.get("n", 1) + n_samples = [item for item in samples for i in range(num_parallel_samples)] + new_data = [] - for sample, output in zip(samples, outputs): + for sample, output in zip(n_samples, outputs): parsed_outputs = self._parse(output) # pylint: disable=consider-using-generator max_length = max([len(value) for value in parsed_outputs.values()]) From 9ca5578061bb2af77e8cb0571344019fef216154 Mon Sep 17 00:00:00 2001 From: Nikhil Palaskar Date: Mon, 1 Jul 2024 14:13:51 -0400 Subject: [PATCH 04/10] some refactoring --- src/instructlab/sdg/llmblock.py | 6 ++++-- src/instructlab/sdg/pipeline.py | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index ce333071..338b8d2b 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -124,10 +124,12 @@ def generate(self, samples, **gen_kwargs) -> Dataset: logger.debug("Generated outputs: {}".format(outputs)) num_parallel_samples = gen_kwargs.get("n", 1) - n_samples = [item for item in samples for i in range(num_parallel_samples)] + extended_samples = [] + for item in samples: + extended_samples.extend([item] * num_parallel_samples) new_data = [] - for sample, output in zip(n_samples, outputs): + for sample, output in zip(extended_samples, outputs): parsed_outputs = self._parse(output) # pylint: disable=consider-using-generator max_length = max([len(value) for value in parsed_outputs.values()]) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index 982a6ecb..bc570a83 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -3,7 +3,6 @@ from datasets import Dataset # Local -from .iterblock import IterBlock from .logger_config import setup_logger logger = setup_logger(__name__) From b5a4862ad1b558ef2770406e0514291c71f4b4fc Mon Sep 17 00:00:00 2001 From: 
Nikhil Palaskar Date: Tue, 2 Jul 2024 15:44:54 -0400 Subject: [PATCH 05/10] fix the num_samples location --- src/instructlab/sdg/default_flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index 55b73932..ae1f4e92 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -388,12 +388,12 @@ def get_flow(self) -> list: "model_prompt": _get_model_prompt(self.model_family), "output_cols": ["context"], "batch_kwargs": { + "num_samples": 30, "num_procs": 8, "batched": self.batched, } }, "gen_kwargs": { - "num_samples": 30, "temperature": 0.7, "max_tokens": 2048, "n": 10 From e0866fcf9329e44f9ad1f652612226e13db9443b Mon Sep 17 00:00:00 2001 From: Nikhil Palaskar Date: Tue, 2 Jul 2024 17:40:29 -0400 Subject: [PATCH 06/10] update generate_data API calls --- scripts/test_freeform_skills.py | 2 +- scripts/test_grounded_skills.py | 2 +- scripts/test_knowledge.py | 4 ++-- src/instructlab/sdg/default_flows.py | 13 +++++++------ src/instructlab/sdg/generate_data.py | 8 ++++---- 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py index 9b5ce810..9b1f443a 100644 --- a/scripts/test_freeform_skills.py +++ b/scripts/test_freeform_skills.py @@ -49,7 +49,7 @@ ds = Dataset.from_list(samples) -skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model).get_flow() +skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model, 30).get_flow() skills_pipe = Pipeline(skills_flow) sdg = SDG([skills_pipe]) diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py index b4edcd59..abbce46f 100644 --- a/scripts/test_grounded_skills.py +++ b/scripts/test_grounded_skills.py @@ -97,7 +97,7 @@ ds = Dataset.from_list(samples) -skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model).get_flow() +skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model, 30).get_flow() skills_pipe = Pipeline(skills_flow) sdg = SDG([skills_pipe]) diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py index 75bd7783..aa7bfbcd 100644 --- a/scripts/test_knowledge.py +++ b/scripts/test_knowledge.py @@ -38,8 +38,8 @@ ds = Dataset.from_list(samples) -mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model).get_flow() -knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model).get_flow() +mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model, 30).get_flow() +knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model, 30).get_flow() knowledge_pipe = Pipeline(knowledge_flow) mmlu_pipe = Pipeline(mmlu_flow) diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index ae1f4e92..2bd199ec 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -29,10 +29,11 @@ def _get_model_prompt(model_family): class Flow(ABC): - def __init__(self, client, model_family, model_id, batched=True) -> None: + def __init__(self, client, model_family, model_id, num_instructions_to_generate, batched=True) -> None: self.client = client self.model_family = model_family self.model_id = model_id + self.num_instructions_to_generate = num_instructions_to_generate self.batched = batched self.sdg_base = resources.files(__package__) @@ -61,7 +62,7 @@ def get_flow(self) -> list: "gen_kwargs": { "max_tokens": 2048, "temperature": 0.7, - "n": 1 + "n": self.num_instructions_to_generate }, "drop_duplicates": 
["output"], } @@ -281,7 +282,7 @@ def get_flow(self) -> list: "output_cols": ["question"], "batch_kwargs": { "num_procs": 8, - "num_samples": 30, + "num_samples": self.num_instructions_to_generate, "batched": self.batched, }, }, @@ -388,7 +389,6 @@ def get_flow(self) -> list: "model_prompt": _get_model_prompt(self.model_family), "output_cols": ["context"], "batch_kwargs": { - "num_samples": 30, "num_procs": 8, "batched": self.batched, } @@ -396,8 +396,9 @@ def get_flow(self) -> list: "gen_kwargs": { "temperature": 0.7, "max_tokens": 2048, - "n": 10 + "n": self.num_instructions_to_generate }, + "drop_duplicates": ["context"], }, { "block_type": LLMBlock, @@ -412,6 +413,7 @@ def get_flow(self) -> list: "model_prompt": _get_model_prompt(self.model_family), "output_cols": ["question"], "batch_kwargs": { + "num_samples": 3, "num_procs": 8, "batched": self.batched, }, @@ -433,7 +435,6 @@ def get_flow(self) -> list: "batch_kwargs": { "num_procs": 8, "batched": self.batched, - "num_samples": 10, }, }, }, diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 2d9b932a..e252c142 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -124,7 +124,7 @@ def _gen_test_data( outfile.write("\n") -def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched): +def _sdg_init(pipeline, client, model_family, model_name, num_instructions_to_generate, batched): knowledge_flow_types = [] freeform_skill_flow_types = [] grounded_skill_flow_types = [] @@ -144,7 +144,7 @@ def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched): [ Pipeline( flow_type( - client, model_family, model_name, num_iters, batched + client, model_family, model_name, num_instructions_to_generate, batched ).get_flow() ) for flow_type in knowledge_flow_types @@ -154,7 +154,7 @@ def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched): [ Pipeline( flow_type( - client, model_family, model_name, num_iters, batched + client, model_family, model_name, num_instructions_to_generate, batched ).get_flow() ) for flow_type in freeform_skill_flow_types @@ -164,7 +164,7 @@ def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched): [ Pipeline( flow_type( - client, model_family, model_name, num_iters, batched + client, model_family, model_name, num_instructions_to_generate, batched ).get_flow() ) for flow_type in grounded_skill_flow_types From 85069cf3ee77c5a077a4957ddafee936b6a37a75 Mon Sep 17 00:00:00 2001 From: Nikhil Palaskar Date: Tue, 2 Jul 2024 17:46:01 -0400 Subject: [PATCH 07/10] change back the defaults in test scripts --- scripts/test_freeform_skills.py | 2 +- scripts/test_grounded_skills.py | 2 +- scripts/test_knowledge.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py index 9b1f443a..a8612c09 100644 --- a/scripts/test_freeform_skills.py +++ b/scripts/test_freeform_skills.py @@ -49,7 +49,7 @@ ds = Dataset.from_list(samples) -skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model, 30).get_flow() +skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model, 1).get_flow() skills_pipe = Pipeline(skills_flow) sdg = SDG([skills_pipe]) diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py index abbce46f..338edb6c 100644 --- a/scripts/test_grounded_skills.py +++ b/scripts/test_grounded_skills.py @@ -97,7 +97,7 @@ ds = Dataset.from_list(samples) -skills_flow = 
SynthGroundedSkillsFlow(client, "mixtral", teacher_model, 30).get_flow() +skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model, 10).get_flow() skills_pipe = Pipeline(skills_flow) sdg = SDG([skills_pipe]) diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py index aa7bfbcd..aeedcf59 100644 --- a/scripts/test_knowledge.py +++ b/scripts/test_knowledge.py @@ -38,8 +38,8 @@ ds = Dataset.from_list(samples) -mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model, 30).get_flow() -knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model, 30).get_flow() +mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model, 1).get_flow() +knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model, 1).get_flow() knowledge_pipe = Pipeline(knowledge_flow) mmlu_pipe = Pipeline(mmlu_flow) From ed8d95ba83887f7d2bdf2ba91946b05ef6059efb Mon Sep 17 00:00:00 2001 From: Nikhil Palaskar Date: Tue, 2 Jul 2024 18:24:33 -0400 Subject: [PATCH 08/10] fix SimpleFlows --- src/instructlab/sdg/default_flows.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index 2bd199ec..11cf7b8e 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -72,10 +72,9 @@ def get_flow(self) -> list: class SimpleKnowledgeFlow(_SimpleFlow): def get_flow(self) -> list: flow = super().get_flow() - flow[0]["block_config"]["block_kwargs"]["config_path"] = os.path.join( + flow[0]["block_config"]["config_path"] = os.path.join( self.sdg_base, "configs/knowledge/simple_generate_qa.yaml" ) - flow[0]["block_config"]["block_kwargs"]["block_name"] = "gen_knowledge" flow[0]["block_config"]["block_name"] = "gen_knowledge" return flow @@ -83,10 +82,9 @@ def get_flow(self) -> list: class SimpleFreeformSkillFlow(_SimpleFlow): def get_flow(self) -> list: flow = super().get_flow() - flow[0]["block_config"]["block_kwargs"]["config_path"] = os.path.join( + flow[0]["block_config"]["config_path"] = os.path.join( self.sdg_base, "configs/skills/simple_generate_qa_freeform.yaml" ) - flow[0]["block_config"]["block_kwargs"]["block_name"] = "gen_skill_freeform" flow[0]["block_config"]["block_name"] = "gen_skill_freeform" return flow @@ -94,10 +92,9 @@ def get_flow(self) -> list: class SimpleGroundedSkillFlow(_SimpleFlow): def get_flow(self) -> list: flow = super().get_flow() - flow[0]["block_config"]["block_kwargs"]["config_path"] = os.path.join( + flow[0]["block_config"]["config_path"] = os.path.join( self.sdg_base, "configs/skills/simple_generate_qa_grounded.yaml" ) - flow[0]["block_config"]["block_kwargs"]["block_name"] = "gen_skill_grounded" flow[0]["block_config"]["block_name"] = "gen_skill_grounded" return flow From 6ed1246b975d73816f6cb0c09b96a6a15dbaf4a2 Mon Sep 17 00:00:00 2001 From: njhill Date: Wed, 3 Jul 2024 13:49:43 -0700 Subject: [PATCH 09/10] Properly support batched/non-batched with vllm/llama.cpp and other streamlining --- src/instructlab/sdg/default_flows.py | 59 +--------------- src/instructlab/sdg/generate_data.py | 14 ++-- src/instructlab/sdg/llmblock.py | 100 +++++++++++++++++---------- 3 files changed, 68 insertions(+), 105 deletions(-) diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py index 11cf7b8e..b8e9c6c0 100644 --- a/src/instructlab/sdg/default_flows.py +++ b/src/instructlab/sdg/default_flows.py @@ -29,12 +29,11 @@ def _get_model_prompt(model_family): class Flow(ABC): - def __init__(self, client, model_family, 
model_id, num_instructions_to_generate, batched=True) -> None: + def __init__(self, client, model_family, model_id, num_instructions_to_generate) -> None: self.client = client self.model_family = model_family self.model_id = model_id self.num_instructions_to_generate = num_instructions_to_generate - self.batched = batched self.sdg_base = resources.files(__package__) @abstractmethod @@ -54,10 +53,6 @@ def get_flow(self) -> list: "model_id": self.model_id, "model_prompt": _get_model_prompt(self.model_family), "output_cols": ["output"], - "batch_kwargs": { - "num_procs": 8, - "batched": self.batched, - }, }, "gen_kwargs": { "max_tokens": 2048, @@ -114,10 +109,6 @@ def get_flow(self) -> list: "model_id": self.model_id, "model_prompt": _get_model_prompt(self.model_family), "output_cols": ["mmlubench_question", "mmlubench_answer"], - "batch_kwargs": { - "num_procs": 8, - "batched": self.batched, - }, }, "gen_kwargs": { "temperature": 0, @@ -143,10 +134,6 @@ def get_flow(self) -> list: "model_id": self.model_id, "model_prompt": _get_model_prompt(self.model_family), "output_cols": ["question", "response"], - "batch_kwargs": { - "num_procs": 8, - "batched": self.batched, - }, "parser_kwargs": { "parser_name": "custom", "parsing_pattern": r"\[(?:Question|QUESTION)\]\s*(.*?)\s*\[(?:Answer|ANSWER)\]\s*(.*?)\s*(?=\[(?:Question|QUESTION)\]|$)", @@ -169,10 +156,6 @@ def get_flow(self) -> list: "model_id": self.model_id, "model_prompt": _get_model_prompt(self.model_family), "output_cols": ["explanation", "judgment"], - "batch_kwargs": { - "num_procs": 8, - "batched": self.batched, - }, }, "gen_kwargs": { "max_tokens": 2048, @@ -202,10 +185,6 @@ def get_flow(self) -> list: "model_id": self.model_id, "model_prompt": _get_model_prompt(self.model_family), "output_cols": ["feedback", "score"], - "batch_kwargs": { - "num_procs": 8, - "batched": self.batched, - }, }, "gen_kwargs": { "max_tokens": 2048, @@ -236,10 +215,6 @@ def get_flow(self) -> list: "model_id": self.model_id, "model_prompt": _get_model_prompt(self.model_family), "output_cols": ["explanation", "rating"], - "batch_kwargs": { - "num_procs": 8, - "batched": self.batched, - }, }, "gen_kwargs": { "max_tokens": 2048, @@ -278,9 +253,7 @@ def get_flow(self) -> list: "model_prompt": _get_model_prompt(self.model_family), "output_cols": ["question"], "batch_kwargs": { - "num_procs": 8, "num_samples": self.num_instructions_to_generate, - "batched": self.batched, }, }, "drop_duplicates": ["question"], @@ -297,10 +270,6 @@ def get_flow(self) -> list: "model_id": self.model_id, "model_prompt": _get_model_prompt(self.model_family), "output_cols": ["evaluation", "score"], - "batch_kwargs": { - "num_procs": 8, - "batched": self.batched, - }, }, }, { @@ -329,10 +298,6 @@ def get_flow(self) -> list: "model_id": self.model_id, "model_prompt": _get_model_prompt(self.model_family), "output_cols": ["response"], - "batch_kwargs": { - "num_procs": 8, - "batched": self.batched, - }, }, }, { @@ -347,10 +312,6 @@ def get_flow(self) -> list: "model_id": self.model_id, "model_prompt": _get_model_prompt(self.model_family), "output_cols": ["evaluation", "score"], - "batch_kwargs": { - "num_procs": 8, - "batched": self.batched, - }, }, }, { @@ -385,10 +346,6 @@ def get_flow(self) -> list: "model_id": self.model_id, "model_prompt": _get_model_prompt(self.model_family), "output_cols": ["context"], - "batch_kwargs": { - "num_procs": 8, - "batched": self.batched, - } }, "gen_kwargs": { "temperature": 0.7, @@ -411,8 +368,6 @@ def get_flow(self) -> list: "output_cols": ["question"], 
"batch_kwargs": { "num_samples": 3, - "num_procs": 8, - "batched": self.batched, }, }, "drop_duplicates": ["question"], @@ -429,10 +384,6 @@ def get_flow(self) -> list: "model_id": self.model_id, "model_prompt": _get_model_prompt(self.model_family), "output_cols": ["evaluation", "score"], - "batch_kwargs": { - "num_procs": 8, - "batched": self.batched, - }, }, }, { @@ -461,10 +412,6 @@ def get_flow(self) -> list: "model_id": self.model_id, "model_prompt": _get_model_prompt(self.model_family), "output_cols": ["response"], - "batch_kwargs": { - "num_procs": 8, - "batched": self.batched, - }, }, }, { @@ -479,10 +426,6 @@ def get_flow(self) -> list: "model_id": self.model_id, "model_prompt": _get_model_prompt(self.model_family), "output_cols": ["evaluation", "score"], - "batch_kwargs": { - "num_procs": 8, - "batched": self.batched, - }, }, }, { diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index e252c142..dcc1d5ee 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -124,7 +124,7 @@ def _gen_test_data( outfile.write("\n") -def _sdg_init(pipeline, client, model_family, model_name, num_instructions_to_generate, batched): +def _sdg_init(pipeline, client, model_family, model_name, num_instructions_to_generate): knowledge_flow_types = [] freeform_skill_flow_types = [] grounded_skill_flow_types = [] @@ -144,7 +144,7 @@ def _sdg_init(pipeline, client, model_family, model_name, num_instructions_to_ge [ Pipeline( flow_type( - client, model_family, model_name, num_instructions_to_generate, batched + client, model_family, model_name, num_instructions_to_generate ).get_flow() ) for flow_type in knowledge_flow_types @@ -154,7 +154,7 @@ def _sdg_init(pipeline, client, model_family, model_name, num_instructions_to_ge [ Pipeline( flow_type( - client, model_family, model_name, num_instructions_to_generate, batched + client, model_family, model_name, num_instructions_to_generate ).get_flow() ) for flow_type in freeform_skill_flow_types @@ -164,7 +164,7 @@ def _sdg_init(pipeline, client, model_family, model_name, num_instructions_to_ge [ Pipeline( flow_type( - client, model_family, model_name, num_instructions_to_generate, batched + client, model_family, model_name, num_instructions_to_generate ).get_flow() ) for flow_type in grounded_skill_flow_types @@ -242,17 +242,12 @@ def generate_data( else: model_family = MODEL_FAMILY_MERLINITE - # TODO -- llama-cpp doesn't support batching, we need to get a hint from the CLI - # about whether we can turn this on (whether vllm is used or not) - batched = False - sdg_knowledge, sdg_freeform_skill, sdg_grounded_skill = _sdg_init( pipeline, client, model_family, model_name, num_instructions_to_generate, - batched, ) if console_output: @@ -267,7 +262,6 @@ def generate_data( if not samples: raise utils.GenerateException("Error: No samples found in leaf node.") - sdg = None if samples[0].get("document"): sdg = sdg_knowledge elif samples[0].get("context"): diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index 338b8d2b..ad690415 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -3,6 +3,7 @@ from typing import Any, Dict import re +import openai # Third Party from datasets import Dataset @@ -45,8 +46,13 @@ def __init__( "model": self.model, "temperature": 0, "max_tokens": 12000, + #"seed": 12345, TBD } + # Whether the LLM server supports a list of input prompts + # and supports the n parameter to generate n outputs per input + 
self.server_supports_batched = server_supports_batched(client, model_id) + def _parse(self, generated_string) -> dict: matches = {} @@ -84,19 +90,33 @@ def _parse(self, generated_string) -> dict: return matches + def _format_prompt(self, sample: Dict) -> str: + return self.prompt_template.format(**sample).strip() + def _generate(self, samples, **gen_kwargs) -> list: prompts = [ - self.model_prompt.format( - prompt=self.prompt_template.format(**sample).strip() - ) + self.model_prompt.format(prompt=self._format_prompt(sample)) for sample in samples ] - response = self.client.completions.create( - prompt=prompts, **{**self.defaults, **gen_kwargs} - ) - return [choice.text.strip() for choice in response.choices] + generate_args = {**self.defaults, **gen_kwargs} + + if self.server_supports_batched: + response = self.client.completions.create( + prompt=prompts, **generate_args + ) + return [choice.text.strip() for choice in response.choices] + + n = gen_kwargs.get("n", 1) + results = [] + for prompt in prompts: + for _ in range(n): + response = self.client.completions.create( + prompt=prompt, **generate_args + ) + results.append(response.choices[0].text.strip()) + return results - def generate(self, samples, **gen_kwargs) -> Dataset: + def generate(self, samples: Dataset, **gen_kwargs) -> Dataset: """ Generate the output from the block. This method should first validate the input data, then generate the output, and finally parse the generated output before returning it. @@ -104,7 +124,6 @@ def generate(self, samples, **gen_kwargs) -> Dataset: :return: The parsed output after generation. """ num_samples = self.batch_params.get("num_samples", None) - batched = self.batch_params.get("batched", False) logger.debug("Generating outputs for {} samples".format(len(samples))) if (num_samples is not None) and ("num_samples" not in samples.column_names): @@ -113,29 +132,32 @@ def generate(self, samples, **gen_kwargs) -> Dataset: # validate each sample for sample in samples: if not self._validate(self.prompt_template, sample): - return None + logger.warn("Sample failed validation") #TODO add details + #TODO remove sample from samples + + if len(samples) == 0: + return Dataset.from_list([]) # generate the output - outputs = [] - if batched: - outputs = self._generate(samples, **gen_kwargs) - else: - outputs = [self._generate([sample], **gen_kwargs)[0] for sample in samples] - logger.debug("Generated outputs: {}".format(outputs)) + + outputs = self._generate(samples, **gen_kwargs) + logger.debug("Generated outputs: %s", outputs) num_parallel_samples = gen_kwargs.get("n", 1) extended_samples = [] for item in samples: extended_samples.extend([item] * num_parallel_samples) + print(f"num outputs is {len(outputs)}") new_data = [] for sample, output in zip(extended_samples, outputs): parsed_outputs = self._parse(output) - # pylint: disable=consider-using-generator + max_length = max([len(value) for value in parsed_outputs.values()]) for values in zip(*(lst[:max_length] for lst in parsed_outputs.values())): new_data.append({**sample, **dict(zip(parsed_outputs.keys(), values))}) + print(f"num output after parse is {len(new_data)}") return Dataset.from_list(new_data) @@ -172,29 +194,33 @@ def __init__( **self._load_config(config) ) - def _generate(self, samples, **gen_kwargs) -> str: + def _format_prompt(self, sample: Dict) -> str: if isinstance(self.prompt_template, dict): - prompts = [ - self.model_prompt.format( - prompt=self.prompt_template[sample[self.selector_column_name]] - .format(**sample) - .strip() - ) - 
for sample in samples - ] - else: - prompts = [ - self.model_prompt.format( - prompt=self.prompt_template.format(**sample).strip() - ) - for sample in samples - ] - response = self.client.completions.create( - prompt=prompts, **{**self.defaults, **gen_kwargs} - ) - return [choice.text.strip() for choice in response.choices] + return (self.prompt_template[sample[self.selector_column_name]] + .format(**sample).strip()) + + return self.prompt_template.format(**sample).strip() def validate(self, prompt_template: str, input_dict: Dict[str, Any]) -> bool: if isinstance(prompt_template, dict): prompt_template = prompt_template[input_dict[self.selector_column_name]] return super()._validate(prompt_template, input_dict) + + +def server_supports_batched(client, model_id: str) -> bool: + supported = getattr(client, "server_supports_batched", None) + if supported is not None: + return supported + try: + # Make a test call to the server to determine whether it supports + # multiple input prompts per request and also the n parameter + response = client.completions.create( + model=model_id, prompt=["test1", "test2"], max_tokens=1, n=3 + ) + # Number outputs should be 2 * 3 = 6 + supported = len(response.choices) == 6 + except openai.InternalServerError: + supported = False + client.server_supports_batched = supported + logger.info(f"LLM server supports batched inputs: {client.server_supports_batched}") + return supported From 53c14f9231b5483940d2edc75883643969ca9bc0 Mon Sep 17 00:00:00 2001 From: njhill Date: Wed, 3 Jul 2024 16:10:50 -0700 Subject: [PATCH 10/10] Address review comments --- src/instructlab/sdg/llmblock.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index ad690415..6fc588c4 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -132,7 +132,7 @@ def generate(self, samples: Dataset, **gen_kwargs) -> Dataset: # validate each sample for sample in samples: if not self._validate(self.prompt_template, sample): - logger.warn("Sample failed validation") #TODO add details + logger.warning("Sample failed validation") #TODO add details #TODO remove sample from samples if len(samples) == 0: @@ -145,19 +145,20 @@ def generate(self, samples: Dataset, **gen_kwargs) -> Dataset: num_parallel_samples = gen_kwargs.get("n", 1) extended_samples = [] + # Duplicate each input sample n times, where n is the number + # of output sequences generated per input, so that we can + # pair up the inputs and outputs. for item in samples: extended_samples.extend([item] * num_parallel_samples) - print(f"num outputs is {len(outputs)}") new_data = [] for sample, output in zip(extended_samples, outputs): parsed_outputs = self._parse(output) - max_length = max([len(value) for value in parsed_outputs.values()]) + max_length = max(len(value) for value in parsed_outputs.values()) for values in zip(*(lst[:max_length] for lst in parsed_outputs.values())): new_data.append({**sample, **dict(zip(parsed_outputs.keys(), values))}) - print(f"num output after parse is {len(new_data)}") return Dataset.from_list(new_data)
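
The change that threads through this series is the switch from looping a block num_iters times (the removed IterBlock) to asking the inference server for several completions per prompt via the OpenAI-style 'n' parameter. Below is a minimal sketch of that request pattern, assuming an OpenAI-compatible endpoint; the base URL, model id, and prompt are placeholders rather than values taken from this repository.

# Sketch only: one completions call that returns several outputs per prompt,
# which is what lets LLMBlock drop the per-iteration loop.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # placeholder endpoint

response = client.completions.create(
    model="teacher-model",          # placeholder model id
    prompt="Generate a question about the given context.",
    max_tokens=2048,
    temperature=0.7,
    n=10,                           # request 10 completions of this one prompt
)

# One choice comes back per requested completion; LLMBlock.generate() then
# repeats each input sample n times so inputs and outputs can be zipped.
outputs = [choice.text.strip() for choice in response.choices]
assert len(outputs) == 10

Not every backend honors list prompts or 'n' (vllm does, while llama.cpp's server generally does not), which is why PATCH 09/10 adds the server_supports_batched() probe: it sends a small batched test request and falls back to one request per prompt when the server rejects it or returns the wrong number of choices.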