From c5d10999aa32d220c1fec529f810e5fd99102384 Mon Sep 17 00:00:00 2001 From: "Abhi.B" Date: Wed, 26 Jun 2024 18:15:34 +0000 Subject: [PATCH 1/9] adding knowledge blocks Signed-off-by: Abhi.B --- src/instructlab/sdg/llmblock.py | 48 +++++++++++++++++++++++++++++++ src/instructlab/sdg/utilblocks.py | 13 +++++---- 2 files changed, 56 insertions(+), 5 deletions(-) diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index ad429b75..bba5403c 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -107,3 +107,51 @@ def generate(self, samples, **gen_kwargs) -> Dataset: new_data.append({**sample, **dict(zip(parsed_outputs.keys(), values))}) return Dataset.from_list(new_data) + + +class ConditionalLLMBlock(LLMBlock): + def __init__(self, block_name, config_paths, client, model_id, output_cols, selector_column_name, parser_name, model_prompt="{prompt}", **batch_kwargs) -> None: + super().__init__(block_name, config_paths[0][0], client, model_id, output_cols, model_prompt=model_prompt, **batch_kwargs) + self.selector_column_name = selector_column_name + self.prompt_template = {} + self.parser_name = parser_name + if len(config_paths) == 1 and config_paths[0][1] == 'All': + self.prompt_template = self.prompt_struct.format(**self.block_config) + else: + for (config, config_key) in config_paths: + self.prompt_template[config_key] = self.prompt_struct.format(**self._load_config(config)) + + def _parse(self, generated_string): + if self.parser_name == 'default': + return matches + elif self.parser_name == 'multi-line-logical-section': + return {self.output_cols[0]: self.extract_multiline_logical_section(generated_string)} + + def extract_multiline_logical_section(self, text): + """ + Extracts multi-line points from the provided text into a list, removing the point numbers. + + Args: + text (str): The input text containing multi-line points. + + Returns: + list: A list of multi-line points without the point numbers. 
+ """ + pattern = re.compile(r'## Logical Section \d+: (.*?)(?=## Logical Section \d+:|$)', re.DOTALL) + sections = pattern.findall(text) + + return sections + + def _generate(self, samples, **gen_kwargs) -> str: + if isinstance(self.prompt_template, dict): + prompts = [self.model_prompt.format(prompt=self.prompt_template[sample[self.selector_column_name]].format(**sample).strip()) for sample in samples] + else: + prompts = [self.model_prompt.format(prompt=self.prompt_template.format(**sample).strip()) for sample in samples] + response = self.client.completions.create(prompt=prompts, **{**self.defaults, **gen_kwargs}) + return [choice.text.strip() for choice in response.choices] + + + def _validate(self, prompt_template: str, input_dict: Dict[str, Any]) -> bool: + if isinstance(prompt_template, dict): + prompt_template = prompt_template[input_dict[self.selector_column_name]] + return super()._validate(prompt_template, input_dict) \ No newline at end of file diff --git a/src/instructlab/sdg/utilblocks.py b/src/instructlab/sdg/utilblocks.py index 5f3c0407..157c7948 100644 --- a/src/instructlab/sdg/utilblocks.py +++ b/src/instructlab/sdg/utilblocks.py @@ -9,15 +9,19 @@ logger = setup_logger(__name__) + class SamplePopulatorBlock(Block): - def __init__(self, config_paths, column_name, **batch_kwargs) -> None: - super().__init__(block_name=self.__class__.__name__) + def __init__(self, config_paths, column_name, post_fix="", **batch_kwargs) -> None: self.configs = {} for config in config_paths: + if post_fix: + config_name = config.replace('.yaml', f'_{post_fix}.yaml') + else: + config_name = config config_key = config.split("/")[-1].split(".")[0] - self.configs[config_key] = self._load_config(config) + self.configs[config_key] = self._load_config(config_name) self.column_name = column_name - self.num_procs = batch_kwargs.get("num_procs", 8) + self.num_procs = batch_kwargs.get('num_procs', 8) def _generate(self, sample) -> dict: sample = {**sample, **self.configs[sample[self.column_name]]} @@ -27,7 +31,6 @@ def generate(self, samples) -> Dataset: samples = samples.map(self._generate, num_proc=self.num_procs) return samples - class SelectorBlock(Block): def __init__(self, choice_map, choice_col, output_col, **batch_kwargs) -> None: super().__init__(block_name=self.__class__.__name__) From 1cb224a3aaaabfcf477ca1e2aad01d5eeee3c0be Mon Sep 17 00:00:00 2001 From: "Abhi.B" Date: Wed, 26 Jun 2024 18:55:21 +0000 Subject: [PATCH 2/9] :ambulance: parse func bug fix Signed-off-by: Abhi.B --- src/instructlab/sdg/llmblock.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index bba5403c..d7832ad2 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -8,6 +8,7 @@ # Local from .block import Block from .logger_config import setup_logger +from typing import Any, Dict, Union logger = setup_logger(__name__) @@ -56,9 +57,8 @@ def _parse(self, generated_string) -> dict: pattern = re.escape(start_tag) + r"(.*?)" + re.escape(end_tag) all_matches = re.findall(pattern, generated_string, re.DOTALL) matches[output_col] = ( - [match.strip() for match in all_matches] if all_matches else None + [match.strip() for match in all_matches] if all_matches else [] ) - return matches def _generate(self, samples, **gen_kwargs) -> list: @@ -123,7 +123,7 @@ def __init__(self, block_name, config_paths, client, model_id, output_cols, sele def _parse(self, generated_string): if self.parser_name == 'default': - return 
matches + return super()._parse(generated_string) elif self.parser_name == 'multi-line-logical-section': return {self.output_cols[0]: self.extract_multiline_logical_section(generated_string)} From 18642d69c910a626a520e2646d093f95288a7990 Mon Sep 17 00:00:00 2001 From: abhi1092 Date: Wed, 26 Jun 2024 19:23:32 +0000 Subject: [PATCH 3/9] updating knowledge generation template Signed-off-by: abhi1092 --- .../knowledge/generate_questions_responses.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/instructlab/sdg/configs/knowledge/generate_questions_responses.yaml b/src/instructlab/sdg/configs/knowledge/generate_questions_responses.yaml index b424f517..d3b4b741 100644 --- a/src/instructlab/sdg/configs/knowledge/generate_questions_responses.yaml +++ b/src/instructlab/sdg/configs/knowledge/generate_questions_responses.yaml @@ -48,24 +48,24 @@ examples: | For this {domain} domain here are some sample questions: [Start of Question] - {question_1} + {icl_query_1} [End of Question] [Start of Response] - {response_1} + {icl_response_1} [End of Response] [Start of Question] - {question_2} + {icl_query_2} [End of Question] [Start of Response] - {response_2} + {icl_response_2} [End of Response] [Start of Question] - {question_3} + {icl_query_3} [End of Question] [Start of Response] - {response_3} + {icl_response_3} [End of Response] Here is the document: From cea9e167966cbeda11edd446690521ce40f762c7 Mon Sep 17 00:00:00 2001 From: Oindrilla Chatterjee Date: Wed, 26 Jun 2024 15:25:58 -0400 Subject: [PATCH 4/9] lint fixes Signed-off-by: Oindrilla Chatterjee --- src/instructlab/sdg/llmblock.py | 70 ++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 15 deletions(-) diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index d7832ad2..07753505 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # Standard +from typing import Any, Dict, Union import re # Third Party @@ -8,7 +9,6 @@ # Local from .block import Block from .logger_config import setup_logger -from typing import Any, Dict, Union logger = setup_logger(__name__) @@ -110,22 +110,47 @@ def generate(self, samples, **gen_kwargs) -> Dataset: class ConditionalLLMBlock(LLMBlock): - def __init__(self, block_name, config_paths, client, model_id, output_cols, selector_column_name, parser_name, model_prompt="{prompt}", **batch_kwargs) -> None: - super().__init__(block_name, config_paths[0][0], client, model_id, output_cols, model_prompt=model_prompt, **batch_kwargs) + def __init__( + self, + block_name, + config_paths, + client, + model_id, + output_cols, + selector_column_name, + parser_name, + model_prompt="{prompt}", + **batch_kwargs, + ) -> None: + super().__init__( + block_name, + config_paths[0][0], + client, + model_id, + output_cols, + model_prompt=model_prompt, + **batch_kwargs, + ) self.selector_column_name = selector_column_name self.prompt_template = {} self.parser_name = parser_name - if len(config_paths) == 1 and config_paths[0][1] == 'All': + if len(config_paths) == 1 and config_paths[0][1] == "All": self.prompt_template = self.prompt_struct.format(**self.block_config) else: - for (config, config_key) in config_paths: - self.prompt_template[config_key] = self.prompt_struct.format(**self._load_config(config)) + for config, config_key in config_paths: + self.prompt_template[config_key] = self.prompt_struct.format( + **self._load_config(config) + ) def _parse(self, generated_string): - if 
self.parser_name == 'default': + if self.parser_name == "default": return super()._parse(generated_string) - elif self.parser_name == 'multi-line-logical-section': - return {self.output_cols[0]: self.extract_multiline_logical_section(generated_string)} + elif self.parser_name == "multi-line-logical-section": + return { + self.output_cols[0]: self.extract_multiline_logical_section( + generated_string + ) + } def extract_multiline_logical_section(self, text): """ @@ -137,21 +162,36 @@ def extract_multiline_logical_section(self, text): Returns: list: A list of multi-line points without the point numbers. """ - pattern = re.compile(r'## Logical Section \d+: (.*?)(?=## Logical Section \d+:|$)', re.DOTALL) + pattern = re.compile( + r"## Logical Section \d+: (.*?)(?=## Logical Section \d+:|$)", re.DOTALL + ) sections = pattern.findall(text) return sections def _generate(self, samples, **gen_kwargs) -> str: if isinstance(self.prompt_template, dict): - prompts = [self.model_prompt.format(prompt=self.prompt_template[sample[self.selector_column_name]].format(**sample).strip()) for sample in samples] + prompts = [ + self.model_prompt.format( + prompt=self.prompt_template[sample[self.selector_column_name]] + .format(**sample) + .strip() + ) + for sample in samples + ] else: - prompts = [self.model_prompt.format(prompt=self.prompt_template.format(**sample).strip()) for sample in samples] - response = self.client.completions.create(prompt=prompts, **{**self.defaults, **gen_kwargs}) + prompts = [ + self.model_prompt.format( + prompt=self.prompt_template.format(**sample).strip() + ) + for sample in samples + ] + response = self.client.completions.create( + prompt=prompts, **{**self.defaults, **gen_kwargs} + ) return [choice.text.strip() for choice in response.choices] - def _validate(self, prompt_template: str, input_dict: Dict[str, Any]) -> bool: if isinstance(prompt_template, dict): prompt_template = prompt_template[input_dict[self.selector_column_name]] - return super()._validate(prompt_template, input_dict) \ No newline at end of file + return super()._validate(prompt_template, input_dict) From 1c90263e7a22fa2797d87c98f4474499e4dbbc9f Mon Sep 17 00:00:00 2001 From: abhi1092 Date: Wed, 26 Jun 2024 19:37:24 +0000 Subject: [PATCH 5/9] fixing col remove bug Signed-off-by: abhi1092 --- src/instructlab/sdg/pipeline.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py index 0de65d1b..fc93f78d 100644 --- a/src/instructlab/sdg/pipeline.py +++ b/src/instructlab/sdg/pipeline.py @@ -34,7 +34,7 @@ def generate(self, dataset) -> Dataset: for block_prop in self.chained_blocks: block_type = block_prop["block_type"] block_config = block_prop["block_config"] - drop_columns = block_prop.get("drop_columns", None) + drop_columns = block_prop.get("drop_columns", []) gen_kwargs = block_prop.get("gen_kwargs", {}) drop_duplicates_cols = block_prop.get("drop_duplicates", False) block = block_type(**block_config) @@ -50,8 +50,9 @@ def generate(self, dataset) -> Dataset: dataset = block.generate(dataset, **gen_kwargs) + drop_columns_in_ds = [e for e in drop_columns if e in dataset.column_names] if drop_columns: - dataset = dataset.remove_columns(drop_columns) + dataset = dataset.remove_columns(drop_columns_in_ds) if drop_duplicates_cols: dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols) From 87a95fd7af7f64d0bf953ff7a053e12b5faec44f Mon Sep 17 00:00:00 2001 From: abhi1092 Date: Wed, 26 Jun 2024 19:38:25 +0000 Subject: [PATCH 
6/9] renaming the test example structure with new prompt variables --- scripts/test_knowledge.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py index e800d3b2..692077ff 100644 --- a/scripts/test_knowledge.py +++ b/scripts/test_knowledge.py @@ -24,16 +24,15 @@ samples = [ { - "question_1": "what is the location of the tubal tonsils?", - "response_1": "The location of the tubal tonsils is the roof of the pharynx.", - "question_2": "How long does the adenoid grow?", + "icl_query_1": "what is the location of the tubal tonsils?", + "icl_response_1": "The location of the tubal tonsils is the roof of the pharynx.", + "icl_query_2": "How long does the adenoid grow?", "task_description": "Teaching about human anatomy, specifically tonsils", - "response_2": "The adenoid grows until the age of 5, starts to shrink at the age of 7 and becomes small in adulthood.", - "question_3": "What is the immune systems first line of defense against ingested or inhaled foreign pathogens?", - "response_3": "The tonsils are the immune systems first line of defense.", + "icl_response_2": "The adenoid grows until the age of 5, starts to shrink at the age of 7 and becomes small in adulthood.", + "icl_query_3": "What is the immune systems first line of defense against ingested or inhaled foreign pathogens?", + "icl_response_3": "The tonsils are the immune systems first line of defense.", "document": "The **tonsils** are a set of lymphoid organs facing into the aerodigestive tract, which is known as Waldeyer's tonsillar ring and consists of the adenoid tonsil or pharyngeal tonsil, two tubal tonsils, two palatine tonsils, and the lingual tonsils. These organs play an important role in the immune system. When used unqualified, the term most commonly refers specifically to the palatine tonsils, which are two lymphoid organs situated at either side of the back of the human throat. The palatine tonsils and the adenoid tonsil are organs consisting of lymphoepithelial tissue located near the oropharynx and nasopharynx parts of the throat", - "domain": "textbook", - } + "domain": "textbook", } ] ds = Dataset.from_list(samples) From faaff0ec4da6b001c94b7e2eb8880175ac710732 Mon Sep 17 00:00:00 2001 From: Oindrilla Chatterjee Date: Wed, 26 Jun 2024 16:48:05 -0400 Subject: [PATCH 7/9] =?UTF-8?q?=F0=9F=9A=A8lint=20fixes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Oindrilla Chatterjee --- scripts/test_knowledge.py | 3 ++- src/instructlab/sdg/utilblocks.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py index 692077ff..d777c8c3 100644 --- a/scripts/test_knowledge.py +++ b/scripts/test_knowledge.py @@ -32,7 +32,8 @@ "icl_query_3": "What is the immune systems first line of defense against ingested or inhaled foreign pathogens?", "icl_response_3": "The tonsils are the immune systems first line of defense.", "document": "The **tonsils** are a set of lymphoid organs facing into the aerodigestive tract, which is known as Waldeyer's tonsillar ring and consists of the adenoid tonsil or pharyngeal tonsil, two tubal tonsils, two palatine tonsils, and the lingual tonsils. These organs play an important role in the immune system. When used unqualified, the term most commonly refers specifically to the palatine tonsils, which are two lymphoid organs situated at either side of the back of the human throat. 
The palatine tonsils and the adenoid tonsil are organs consisting of lymphoepithelial tissue located near the oropharynx and nasopharynx parts of the throat", - "domain": "textbook", } + "domain": "textbook", + } ] ds = Dataset.from_list(samples) diff --git a/src/instructlab/sdg/utilblocks.py b/src/instructlab/sdg/utilblocks.py index 157c7948..78c9e2a5 100644 --- a/src/instructlab/sdg/utilblocks.py +++ b/src/instructlab/sdg/utilblocks.py @@ -9,19 +9,18 @@ logger = setup_logger(__name__) - class SamplePopulatorBlock(Block): def __init__(self, config_paths, column_name, post_fix="", **batch_kwargs) -> None: self.configs = {} for config in config_paths: if post_fix: - config_name = config.replace('.yaml', f'_{post_fix}.yaml') + config_name = config.replace(".yaml", f"_{post_fix}.yaml") else: config_name = config config_key = config.split("/")[-1].split(".")[0] self.configs[config_key] = self._load_config(config_name) self.column_name = column_name - self.num_procs = batch_kwargs.get('num_procs', 8) + self.num_procs = batch_kwargs.get("num_procs", 8) def _generate(self, sample) -> dict: sample = {**sample, **self.configs[sample[self.column_name]]} @@ -31,6 +30,7 @@ def generate(self, samples) -> Dataset: samples = samples.map(self._generate, num_proc=self.num_procs) return samples + class SelectorBlock(Block): def __init__(self, choice_map, choice_col, output_col, **batch_kwargs) -> None: super().__init__(block_name=self.__class__.__name__) From bddb6f6620a4168778b82db23a23e739f80bc7a6 Mon Sep 17 00:00:00 2001 From: Oindrilla Chatterjee Date: Wed, 26 Jun 2024 16:57:16 -0400 Subject: [PATCH 8/9] =?UTF-8?q?=F0=9F=93=9Dupdates=20to=20the=20template?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Oindrilla Chatterjee --- .../sdg/configs/knowledge/generate_questions_responses.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/instructlab/sdg/configs/knowledge/generate_questions_responses.yaml b/src/instructlab/sdg/configs/knowledge/generate_questions_responses.yaml index d3b4b741..4d4a49ef 100644 --- a/src/instructlab/sdg/configs/knowledge/generate_questions_responses.yaml +++ b/src/instructlab/sdg/configs/knowledge/generate_questions_responses.yaml @@ -68,11 +68,13 @@ examples: | {icl_response_3} [End of Response] +generation: | + Now generate the question and answer pairs, remember to follow the principles mentioned above and use the same format as the examples. Remember to use the same style and format as the example above. + Here is the document: {document} -generation: | - Now generate the question and answer pairs, remember to follow the principles mentioned above and use the same format as the examples. Remember to use the same style and format as the example above. Return each question between [Start of Question] and [End of Question] tags and answer between [Start of Response] and [End of Response] tags. + Return each question between [Start of Question] and [End of Question] tags and answer between [Start of Response] and [End of Response] tags. 
start_tags: ["[Start of Question]", "[Start of Response]"] end_tags: ["[End of Question]", "[End of Response]"] From a8ba7dfee6f5a0e882059946bf3cbe53ff5bd0e7 Mon Sep 17 00:00:00 2001 From: Oindrilla Chatterjee Date: Wed, 26 Jun 2024 17:57:44 -0400 Subject: [PATCH 9/9] =?UTF-8?q?=F0=9F=9A=A8other=20linting=20issues?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Oindrilla Chatterjee --- src/instructlab/sdg/llmblock.py | 8 ++++---- src/instructlab/sdg/utilblocks.py | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py index 07753505..45a20343 100644 --- a/src/instructlab/sdg/llmblock.py +++ b/src/instructlab/sdg/llmblock.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # Standard -from typing import Any, Dict, Union +from typing import Any, Dict import re # Third Party @@ -86,7 +86,7 @@ def generate(self, samples, **gen_kwargs) -> Dataset: if (num_samples is not None) and ("num_samples" not in samples.column_names): samples = samples.add_column("num_samples", [num_samples] * len(samples)) - # validate the each sample + # validate each sample for sample in samples: if not self._validate(self.prompt_template, sample): return None @@ -145,7 +145,7 @@ def __init__( def _parse(self, generated_string): if self.parser_name == "default": return super()._parse(generated_string) - elif self.parser_name == "multi-line-logical-section": + if self.parser_name == "multi-line-logical-section": return { self.output_cols[0]: self.extract_multiline_logical_section( generated_string @@ -191,7 +191,7 @@ def _generate(self, samples, **gen_kwargs) -> str: ) return [choice.text.strip() for choice in response.choices] - def _validate(self, prompt_template: str, input_dict: Dict[str, Any]) -> bool: + def _validate(self, prompt_template: str, input_dict: Dict[str, Any], extra_arg=None) -> bool: if isinstance(prompt_template, dict): prompt_template = prompt_template[input_dict[self.selector_column_name]] return super()._validate(prompt_template, input_dict) diff --git a/src/instructlab/sdg/utilblocks.py b/src/instructlab/sdg/utilblocks.py index 78c9e2a5..e72c735b 100644 --- a/src/instructlab/sdg/utilblocks.py +++ b/src/instructlab/sdg/utilblocks.py @@ -11,6 +11,7 @@ class SamplePopulatorBlock(Block): def __init__(self, config_paths, column_name, post_fix="", **batch_kwargs) -> None: + super().__init__(block_name=self.__class__.__name__) # Call the base class's __init__ self.configs = {} for config in config_paths: if post_fix:
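
For context on how the pieces in this series fit together, here is a minimal, hypothetical usage sketch of the new ConditionalLLMBlock. It is a sketch under assumptions, not code taken from these patches: the endpoint URL, model id, output column names, selector column name, generation kwargs, and the abbreviated sample dict are illustrative placeholders; only the config path, the constructor parameters, and the icl_* field names come from the series itself.

    # Hypothetical sketch: instantiate the ConditionalLLMBlock from PATCH 1/9 against an
    # OpenAI-compatible completions endpoint. Values marked "placeholder" are assumptions.
    from datasets import Dataset
    from openai import OpenAI

    from instructlab.sdg.llmblock import ConditionalLLMBlock

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # placeholder endpoint

    block = ConditionalLLMBlock(
        block_name="gen_knowledge",
        # (config_path, selector_value) pairs; a single ("...", "All") entry applies
        # one prompt template to every sample, so the selector column is not consulted.
        config_paths=[
            ("src/instructlab/sdg/configs/knowledge/generate_questions_responses.yaml", "All")
        ],
        client=client,
        model_id="mixtral",                     # placeholder model id
        output_cols=["question", "response"],   # assumed output columns
        selector_column_name="document_type",   # placeholder; unused with the "All" key
        parser_name="default",
    )

    # Abbreviated version of the sample dict from scripts/test_knowledge.py (PATCH 6/9).
    samples = [
        {
            "domain": "textbook",
            "task_description": "Teaching about human anatomy, specifically tonsils",
            "document": "The tonsils are a set of lymphoid organs facing into the aerodigestive tract...",
            "icl_query_1": "what is the location of the tubal tonsils?",
            "icl_response_1": "The location of the tubal tonsils is the roof of the pharynx.",
            "icl_query_2": "How long does the adenoid grow?",
            "icl_response_2": "The adenoid grows until the age of 5 and starts to shrink at the age of 7.",
            "icl_query_3": "What is the immune systems first line of defense?",
            "icl_response_3": "The tonsils are the immune systems first line of defense.",
        }
    ]
    ds = Dataset.from_list(samples)
    generated = block.generate(ds, max_tokens=2048)  # gen_kwargs are forwarded to client.completions.create

When several (config, key) pairs are supplied instead of a single "All" entry, the block formats each sample with the template whose key matches sample[selector_column_name], which is how this series routes different document types to different prompt configs.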