Skip to content

Commit

Permalink
Fix ruff formatting check
Browse files Browse the repository at this point in the history
Signed-off-by: Russell Bryant <[email protected]>

Ignore new sdg lib files in mypy checks

Signed-off-by: Russell Bryant <[email protected]>

Remove unused python imports

Signed-off-by: Russell Bryant <[email protected]>

Remove duplicate definition of SynthGroundedSkillsFlow

Signed-off-by: Russell Bryant <[email protected]>

Add missing calls to parent class init methods

Signed-off-by: Russell Bryant <[email protected]>

Ignore the last couple of lint warnings

Signed-off-by: Russell Bryant <[email protected]>
  • Loading branch information
russellb authored and aakankshaduggal committed Jun 26, 2024
1 parent 9063aa7 commit 5fc070f
Show file tree
Hide file tree
Showing 9 changed files with 19 additions and 127 deletions.
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ disable_error_code = ["import-not-found", "import-untyped"]
exclude = [
"^src/instructlab/sdg/generate_data\\.py$",
"^src/instructlab/sdg/utils\\.py$",
"^src/instructlab/sdg/default_flows\\.py$",
"^src/instructlab/sdg/llmblock\\.py$",
"^src/instructlab/sdg/utilblocks\\.py$",
]
# honor excludes by not following them through imports
follow_imports = "silent"
1 change: 0 additions & 1 deletion scripts/test_freeform_skills.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from src.instructlab.sdg.default_flows import SynthSkillsFlow
from src.instructlab.sdg.pipeline import Pipeline


# for vLLM endpoints, the api_key remains "EMPTY"
openai_api_key = "EMPTY"
openai_api_base = "Add model endpoint here"
Expand Down
3 changes: 1 addition & 2 deletions src/instructlab/sdg/block.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# Standard
from abc import ABC, abstractmethod
from abc import ABC
from collections import ChainMap
from typing import Any, Dict, Union

# Third Party
from datasets import Dataset
import yaml

# Local
Expand Down
120 changes: 1 addition & 119 deletions src/instructlab/sdg/default_flows.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@

# Local
from .filterblock import FilterByValueBlock
from .llmblock import LLMBlock
from .iterblock import IterBlock
from .llmblock import LLMBlock


class Flow(ABC):
Expand Down Expand Up @@ -389,121 +389,3 @@ def get_flow(self) -> list:
},
},
]


class SynthGroundedSkillsFlow(Flow):
    """Block sequence for synthesizing grounded-skills data.

    The steps, in order: generate contexts (an ``IterBlock`` wrapping an
    ``LLMBlock``, configured for 10 iterations), generate grounded questions,
    evaluate the questions and keep those whose score equals 1, generate
    responses, then evaluate each question/answer pair and keep those whose
    score is at least 2.

    NOTE(review): ``self.client`` and ``self.model_id`` are presumably set by
    ``Flow.__init__`` -- confirm against the base class.
    """

    def get_flow(self) -> list:
        """Return the ordered list of block specifications for this flow.

        Each entry is a dict holding ``block_type`` (the block class) and
        ``block_config`` (the keyword arguments for that class). Some entries
        also carry ``drop_duplicates`` or ``drop_columns``; presumably the
        pipeline applies those after the block runs -- confirm against the
        consumer. Every call builds new dicts; nothing is shared between
        entries.
        """

        def llm_settings(block_name, config_path, output_cols):
            # Settings common to every LLMBlock in this flow; only the name,
            # prompt config file and output columns differ between steps.
            return {
                "block_name": block_name,
                "config_path": config_path,
                "client": self.client,
                "model_id": self.model_id,
                "model_prompt": "<s> [INST] {prompt} [/INST]",
                "output_cols": output_cols,
                "batch_kwargs": {
                    "num_procs": 8,
                    "batched": True,
                },
            }

        def score_filter_settings(block_name, filter_value, operation):
            # Settings for a FilterByValueBlock comparing the integer-converted
            # 'score' column against filter_value using the given operator.
            return {
                "block_name": block_name,
                "filter_column": "score",
                "filter_value": filter_value,
                "operation": operation,
                "convert_dtype": int,
                "batch_kwargs": {
                    "num_procs": 8,
                },
            }

        context_generation = {
            "block_type": IterBlock,
            "block_config": {
                "block_name": "context_iter",
                "num_iters": 10,
                "block_type": LLMBlock,
                "block_kwargs": llm_settings(
                    "gen_contexts",
                    "src/instructlab/sdg/configs/skills/contexts.yaml",
                    ["context"],
                ),
                "gen_kwargs": {
                    "temperature": 0.7,
                    "max_tokens": 2048,
                },
            },
        }

        question_generation = {
            "block_type": LLMBlock,
            "block_config": llm_settings(
                "gen_grounded_questions",
                "src/instructlab/sdg/configs/skills/grounded_questions.yaml",
                ["question"],
            ),
            "drop_duplicates": ["question"],
        }

        question_evaluation = {
            "block_type": LLMBlock,
            "block_config": llm_settings(
                "eval_grounded_questions",
                "src/instructlab/sdg/configs/skills/evaluate_grounded_questions.yaml",
                ["evaluation", "score"],
            ),
        }

        question_filter = {
            "block_type": FilterByValueBlock,
            "block_config": score_filter_settings(
                "filter_grounded_questions", 1, operator.eq
            ),
            "drop_columns": ["evaluation", "score", "num_samples"],
        }

        response_generation = {
            "block_type": LLMBlock,
            "block_config": llm_settings(
                "gen_grounded_responses",
                "src/instructlab/sdg/configs/skills/grounded_responses.yaml",
                ["answer"],
            ),
        }

        pair_evaluation = {
            "block_type": LLMBlock,
            "block_config": llm_settings(
                "evaluate_grounded_qa_pair",
                "src/instructlab/sdg/configs/skills/evaluate_grounded_pair.yaml",
                ["evaluation", "score"],
            ),
        }

        pair_filter = {
            "block_type": FilterByValueBlock,
            "block_config": score_filter_settings(
                "filter_grounded_qa_pair", 2, operator.ge
            ),
        }

        return [
            context_generation,
            question_generation,
            question_evaluation,
            question_filter,
            response_generation,
            pair_evaluation,
            pair_filter,
        ]
1 change: 1 addition & 0 deletions src/instructlab/sdg/filterblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class FilterByValueBlock(Block):
def __init__(
self, filter_column, filter_value, operation, convert_dtype=None, **batch_kwargs
) -> None:
super().__init__(block_name=self.__class__.__name__)
self.value = filter_value
self.column_name = filter_column
self.operation = operation
Expand Down
9 changes: 6 additions & 3 deletions src/instructlab/sdg/iterblock.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
import re
# Third Party
from datasets import Dataset

# Local
from .block import Block
from .logger_config import setup_logger
from datasets import Dataset

logger = setup_logger(__name__)


class IterBlock(Block):
def __init__(self, block_name, num_iters, block_type, block_kwargs, **kwargs):
super().__init__(block_name)
self.num_iters = num_iters
self.block = block_type(**block_kwargs)
self.gen_kwargs = kwargs.get('gen_kwargs', {})
self.gen_kwargs = kwargs.get("gen_kwargs", {})
self.gen_kwargs = kwargs.get("gen_kwargs", {})

def generate(self, samples, **gen_kwargs) -> Dataset:
Expand Down
2 changes: 2 additions & 0 deletions src/instructlab/sdg/llmblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@


class LLMBlock(Block):
# pylint: disable=too-many-instance-attributes
def __init__(
self,
block_name,
Expand Down Expand Up @@ -100,6 +101,7 @@ def generate(self, samples, **gen_kwargs) -> Dataset:
new_data = []
for sample, output in zip(samples, outputs):
parsed_outputs = self._parse(output)
# pylint: disable=consider-using-generator
max_length = max([len(value) for value in parsed_outputs.values()])
for values in zip(*(lst[:max_length] for lst in parsed_outputs.values())):
new_data.append({**sample, **dict(zip(parsed_outputs.keys(), values))})
Expand Down
4 changes: 2 additions & 2 deletions src/instructlab/sdg/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from datasets import Dataset

# Local
from .logger_config import setup_logger
from .iterblock import IterBlock
from .logger_config import setup_logger

logger = setup_logger(__name__)

Expand Down Expand Up @@ -40,7 +40,7 @@ def generate(self, dataset) -> Dataset:
block = block_type(**block_config)

if block_type == IterBlock:
block_kwargs = block_config.pop('block_kwargs')
block_kwargs = block_config.pop("block_kwargs")
block = block_type(**block_config, block_kwargs=block_kwargs)
else:
block = block_type(**block_config)
Expand Down
3 changes: 3 additions & 0 deletions src/instructlab/sdg/utilblocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

class SamplePopulatorBlock(Block):
def __init__(self, config_paths, column_name, **batch_kwargs) -> None:
super().__init__(block_name=self.__class__.__name__)
self.configs = {}
for config in config_paths:
config_key = config.split("/")[-1].split(".")[0]
Expand All @@ -29,6 +30,7 @@ def generate(self, samples) -> Dataset:

class SelectorBlock(Block):
def __init__(self, choice_map, choice_col, output_col, **batch_kwargs) -> None:
super().__init__(block_name=self.__class__.__name__)
self.choice_map = choice_map
self.choice_col = choice_col
self.output_col = output_col
Expand All @@ -45,6 +47,7 @@ def generate(self, samples: Dataset) -> Dataset:

class CombineColumnsBlock(Block):
def __init__(self, columns, output_col, separator="\n\n", **batch_kwargs) -> None:
super().__init__(block_name=self.__class__.__name__)
self.columns = columns
self.output_col = output_col
self.separator = separator
Expand Down

0 comments on commit 5fc070f

Please sign in to comment.