Fix ruff formatting check

Signed-off-by: Russell Bryant <[email protected]>

Ignore new sdg lib files in mypy checks

Signed-off-by: Russell Bryant <[email protected]>

Remove unused python imports

Signed-off-by: Russell Bryant <[email protected]>

Remove duplicate definition of SynthGroundedSkillsFlow

Signed-off-by: Russell Bryant <[email protected]>

Add missing calls to parent class init methods

Signed-off-by: Russell Bryant <[email protected]>

Ignore the last couple of lint warnings

Signed-off-by: Russell Bryant <[email protected]>
instructlab · Jun 26, 2024 · 5fc070f
1 parent 9063aa7
commit 5fc070f
Show file tree

Hide file tree

Showing 9 changed files with 19 additions and 127 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -96,6 +96,9 @@ disable_error_code = ["import-not-found", "import-untyped"]
 exclude = [
     "^src/instructlab/sdg/generate_data\\.py$",
     "^src/instructlab/sdg/utils\\.py$",
+    "^src/instructlab/sdg/default_flows\\.py$",
+    "^src/instructlab/sdg/llmblock\\.py$",
+    "^src/instructlab/sdg/utilblocks\\.py$",
 ]
 # honor excludes by not following there through imports
 follow_imports = "silent"
diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py
@@ -7,7 +7,6 @@
 from src.instructlab.sdg.default_flows import SynthSkillsFlow
 from src.instructlab.sdg.pipeline import Pipeline
 
-
 # for vLLM endpoints, the api_key remains "EMPTY"
 openai_api_key = "EMPTY"
 openai_api_base = "Add model endpoint here"

diff --git a/src/instructlab/sdg/block.py b/src/instructlab/sdg/block.py
@@ -1,11 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # Standard
-from abc import ABC, abstractmethod
+from abc import ABC
 from collections import ChainMap
 from typing import Any, Dict, Union
 
 # Third Party
-from datasets import Dataset
 import yaml
 
 # Local

diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py
@@ -7,8 +7,8 @@
 
 # Local
 from .filterblock import FilterByValueBlock
-from .llmblock import LLMBlock
 from .iterblock import IterBlock
+from .llmblock import LLMBlock
 
 
 class Flow(ABC):
@@ -389,121 +389,3 @@ def get_flow(self) -> list:
                 },
             },
         ]
-
-
-class SynthGroundedSkillsFlow(Flow):
-    def get_flow(self) -> list:
-        return [
-            {
-                'block_type': IterBlock,
-                'block_config': {
-                    'block_name': 'context_iter',
-                    'num_iters': 10,
-                    'block_type': LLMBlock,
-                    'block_kwargs': {
-                        'block_name': 'gen_contexts',
-                        'config_path': 'src/instructlab/sdg/configs/skills/contexts.yaml',
-                        'client': self.client,
-                        'model_id': self.model_id,
-                        'model_prompt': '<s> [INST] {prompt} [/INST]',
-                        'output_cols': ['context'],
-                        'batch_kwargs': {
-                            'num_procs': 8,
-                            'batched': True,
-                        },
-                    },
-                    'gen_kwargs': {
-                        'temperature': 0.7,
-                        'max_tokens': 2048,
-                    },
-                },
-            },
-            {
-                'block_type': LLMBlock,
-                'block_config': {
-                    'block_name': 'gen_grounded_questions',
-                    'config_path': 'src/instructlab/sdg/configs/skills/grounded_questions.yaml',
-                    'client': self.client,
-                    'model_id': self.model_id,
-                    'model_prompt': '<s> [INST] {prompt} [/INST]',
-                    'output_cols': ['question'],
-                    'batch_kwargs': {
-                        'num_procs': 8,
-                        'batched': True,
-                    },
-                },
-                'drop_duplicates': ['question'],
-            },
-            {
-                'block_type': LLMBlock,
-                'block_config': {
-                    'block_name': 'eval_grounded_questions',
-                    'config_path': 'src/instructlab/sdg/configs/skills/evaluate_grounded_questions.yaml',
-                    'client': self.client,
-                    'model_id': self.model_id,
-                    'model_prompt': '<s> [INST] {prompt} [/INST]',
-                    'output_cols': ['evaluation', 'score'],
-                    'batch_kwargs': {
-                        'num_procs': 8,
-                        'batched': True,
-                    },
-                },
-            },
-            {
-                'block_type': FilterByValueBlock,
-                'block_config': {
-                    'block_name': 'filter_grounded_questions',
-                    'filter_column': 'score',
-                    'filter_value': 1,
-                    'operation': operator.eq,
-                    'convert_dtype': int,
-                    'batch_kwargs': {
-                        'num_procs': 8,
-                    },
-                },
-                'drop_columns': ['evaluation', 'score', 'num_samples']
-            },
-            {
-                'block_type': LLMBlock,
-                'block_config': {
-                    'block_name': 'gen_grounded_responses',
-                    'config_path': 'src/instructlab/sdg/configs/skills/grounded_responses.yaml',
-                    'client': self.client,
-                    'model_id': self.model_id,
-                    'model_prompt': '<s> [INST] {prompt} [/INST]',
-                    'output_cols': ['answer'],
-                    'batch_kwargs': {
-                        'num_procs': 8,
-                        'batched': True,
-                    },
-                },
-            },
-            {
-                'block_type': LLMBlock,
-                'block_config': {
-                    'block_name': 'evaluate_grounded_qa_pair',
-                    'config_path': 'src/instructlab/sdg/configs/skills/evaluate_grounded_pair.yaml',
-                    'client': self.client,
-                    'model_id': self.model_id,
-                    'model_prompt': '<s> [INST] {prompt} [/INST]',
-                    'output_cols': ['evaluation', 'score'],
-                    'batch_kwargs': {
-                        'num_procs': 8,
-                        'batched': True,
-                    },
-                },
-            },
-            {
-                'block_type': FilterByValueBlock,
-                'block_config': {
-                    'block_name': 'filter_grounded_qa_pair',
-                    'filter_column': 'score',
-                    'filter_value': 2,
-                    'operation': operator.ge,
-                    'convert_dtype': int,
-                    'batch_kwargs': {
-                        'num_procs': 8,
-                    },
-                },
-            }
-        ]
diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py
@@ -13,6 +13,7 @@ class FilterByValueBlock(Block):
     def __init__(
         self, filter_column, filter_value, operation, convert_dtype=None, **batch_kwargs
     ) -> None:
+        super().__init__(block_name=self.__class__.__name__)
         self.value = filter_value
         self.column_name = filter_column
         self.operation = operation

diff --git a/src/instructlab/sdg/iterblock.py b/src/instructlab/sdg/iterblock.py
@@ -1,16 +1,19 @@
-import re
+# Third Party
+from datasets import Dataset
+
+# Local
 from .block import Block
 from .logger_config import setup_logger
-from datasets import Dataset
 
 logger = setup_logger(__name__)
 
+
 class IterBlock(Block):
     def __init__(self, block_name, num_iters, block_type, block_kwargs, **kwargs):
         super().__init__(block_name)
         self.num_iters = num_iters
         self.block = block_type(**block_kwargs)
-        self.gen_kwargs = kwargs.get('gen_kwargs', {})
+        self.gen_kwargs = kwargs.get("gen_kwargs", {})
         self.gen_kwargs = kwargs.get("gen_kwargs", {})
 
     def generate(self, samples, **gen_kwargs) -> Dataset:

diff --git a/src/instructlab/sdg/llmblock.py b/src/instructlab/sdg/llmblock.py
@@ -13,6 +13,7 @@
 
 
 class LLMBlock(Block):
+    # pylint: disable=too-many-instance-attributes
     def __init__(
         self,
         block_name,
@@ -100,6 +101,7 @@ def generate(self, samples, **gen_kwargs) -> Dataset:
         new_data = []
         for sample, output in zip(samples, outputs):
             parsed_outputs = self._parse(output)
+            # pylint: disable=consider-using-generator
             max_length = max([len(value) for value in parsed_outputs.values()])
             for values in zip(*(lst[:max_length] for lst in parsed_outputs.values())):
                 new_data.append({**sample, **dict(zip(parsed_outputs.keys(), values))})

diff --git a/src/instructlab/sdg/pipeline.py b/src/instructlab/sdg/pipeline.py
@@ -3,8 +3,8 @@
 from datasets import Dataset
 
 # Local
-from .logger_config import setup_logger
 from .iterblock import IterBlock
+from .logger_config import setup_logger
 
 logger = setup_logger(__name__)
 
@@ -40,7 +40,7 @@ def generate(self, dataset) -> Dataset:
             block = block_type(**block_config)
 
             if block_type == IterBlock:
-                block_kwargs = block_config.pop('block_kwargs')
+                block_kwargs = block_config.pop("block_kwargs")
                 block = block_type(**block_config, block_kwargs=block_kwargs)
             else:
                 block = block_type(**block_config)

diff --git a/src/instructlab/sdg/utilblocks.py b/src/instructlab/sdg/utilblocks.py
@@ -11,6 +11,7 @@
 
 class SamplePopulatorBlock(Block):
     def __init__(self, config_paths, column_name, **batch_kwargs) -> None:
+        super().__init__(block_name=self.__class__.__name__)
         self.configs = {}
         for config in config_paths:
             config_key = config.split("/")[-1].split(".")[0]
@@ -29,6 +30,7 @@ def generate(self, samples) -> Dataset:
 
 class SelectorBlock(Block):
     def __init__(self, choice_map, choice_col, output_col, **batch_kwargs) -> None:
+        super().__init__(block_name=self.__class__.__name__)
         self.choice_map = choice_map
         self.choice_col = choice_col
         self.output_col = output_col
@@ -45,6 +47,7 @@ def generate(self, samples: Dataset) -> Dataset:
 
 class CombineColumnsBlock(Block):
     def __init__(self, columns, output_col, separator="\n\n", **batch_kwargs) -> None:
+        super().__init__(block_name=self.__class__.__name__)
         self.columns = columns
         self.output_col = output_col
         self.separator = separator