Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add model_prompt config param for LLMBlock #141

Merged
merged 4 commits into from
Jul 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions docs/pipeline_config.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Pipeline Configuration

Built-in pipeline configurations can be found in [`src/instructlab/sdg/pipelines/`](../src/instructlab/sdg/pipelines/).

## Pipeline Configuration Schema

A schema for validating pipeline configuration can be found in [`src/instructlab/sdg/pipelines/schema/v1.json`](../src/instructlab/sdg/pipelines/schema/v1.json).
Copy link
Contributor

@derekhiggins derekhiggins Jul 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit:
double "/"'s in path name

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oops


## Version History

| Version | Description |
| --- | --- |
| 1.0 | Initial version |
24 changes: 18 additions & 6 deletions src/instructlab/sdg/llmblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def __init__(
block_name,
config_path,
output_cols,
model_prompt=None,
parser_kwargs={},
batch_kwargs={},
) -> None:
Expand All @@ -69,7 +70,7 @@ def __init__(
"""{system}\n{introduction}\n{principles}\n{examples}\n{generation}"""
)
self.prompt_template = self.prompt_struct.format(**self.block_config)
self.model_prompt = _get_model_prompt(self.ctx.model_family)
self.model_prompt = model_prompt
self.output_cols = output_cols
self.batch_params = batch_kwargs
self.parser_name = parser_kwargs.get("parser_name", None)
Expand Down Expand Up @@ -124,8 +125,20 @@ def _parse(self, generated_string) -> dict:

return matches

def _format_prompt(self, sample: Dict) -> str:
    """Render the prompt template for *sample* and wrap it in a model prompt.

    Three cases for self.model_prompt:
      * None         -- nothing specified; look up a wrapper by model family
      * non-empty    -- the pipeline supplied a custom model prompt
      * empty string -- the pipeline explicitly wants no model prompt
    """
    rendered = self.prompt_template.format(**sample).strip()

    if self.model_prompt is None:
        # No explicit choice from the pipeline: fall back to the family default.
        wrapper = _get_model_prompt(self.ctx.model_family)
    else:
        # An empty string deliberately disables wrapping; normalize it to None.
        wrapper = self.model_prompt or None

    if wrapper is None:
        return rendered
    return wrapper.format(prompt=rendered)

def _gen_kwargs(self, **gen_kwargs):
gen_kwargs = {**self.defaults, **gen_kwargs}
Expand All @@ -136,10 +149,7 @@ def _gen_kwargs(self, **gen_kwargs):
return gen_kwargs

def _generate(self, samples, **gen_kwargs) -> list:
prompts = [
self.model_prompt.format(prompt=self._format_prompt(sample))
for sample in samples
]
prompts = [self._format_prompt(sample) for sample in samples]
generate_args = self._gen_kwargs(**gen_kwargs)

if self.server_supports_batched:
Expand Down Expand Up @@ -221,6 +231,7 @@ def __init__(
config_paths,
output_cols,
selector_column_name,
model_prompt=None,
parser_kwargs={},
batch_kwargs={},
) -> None:
Expand All @@ -230,6 +241,7 @@ def __init__(
block_name,
config_paths[0][0],
output_cols,
model_prompt=model_prompt,
parser_kwargs=parser_kwargs,
batch_kwargs=batch_kwargs,
)
Expand Down
6 changes: 6 additions & 0 deletions src/instructlab/sdg/pipelines/schema/v1.json
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,9 @@
"type": "string"
}
},
"model_prompt": {
"type": "string"
},
"parser_kwargs": {
"type": "object",
"properties": {
Expand Down Expand Up @@ -171,6 +174,9 @@
"type": "string"
}
},
"model_prompt": {
"type": "string"
},
"selector_column_name": {
"type": "string"
},
Expand Down
86 changes: 86 additions & 0 deletions tests/test_llmblock.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# Standard
from unittest.mock import MagicMock, patch
import unittest

# Third Party
from datasets import Dataset, Features, Value

# First Party
from src.instructlab.sdg.llmblock import LLMBlock


class TestLLMBlockModelPrompt(unittest.TestCase):
    """Tests for LLMBlock._format_prompt's handling of model_prompt.

    Covers the three documented cases: empty string (no model prompt),
    None (wrapper chosen from the model family), and a custom wrapper string.
    """

    def setUp(self):
        # Minimal stand-ins for the pipeline context/pipeline objects; only
        # model_family and model_id are read by the code under test.
        self.mock_ctx = MagicMock()
        self.mock_ctx.model_family = "mixtral"
        self.mock_ctx.model_id = "test_model"
        self.mock_pipe = MagicMock()
        self.config_return_value = {
            "system": "{fruit}",
            "introduction": "introduction",
            "principles": "principles",
            "examples": "examples",
            "generation": "generation",
        }
        self.dataset = Dataset.from_dict(
            {"fruit": ["apple", "pear", "mango"]},
            features=Features({"fruit": Value("string")}),
        )

    @patch("src.instructlab.sdg.block.Block._load_config")
    def test_model_prompt_empty_string(self, mock_load_config):
        mock_load_config.return_value = self.config_return_value
        # Ensure that if an empty model_prompt is specified, no model prompt is used.
        block = LLMBlock(
            ctx=self.mock_ctx,
            pipe=self.mock_pipe,
            block_name="test_block",
            config_path="",
            output_cols=[],
            model_prompt="",
        )
        prompt = block._format_prompt(self.dataset[0])
        self.assertEqual(
            prompt,
            "apple\nintroduction\nprinciples\nexamples\ngeneration",
            "no model prompt should be used when explicitly set to an empty string",
        )

    @patch("src.instructlab.sdg.block.Block._load_config")
    def test_model_prompt_none(self, mock_load_config):
        mock_load_config.return_value = self.config_return_value
        # Ensure that if a custom model_prompt is not specified, it defaults to
        # one based on the model family (i.e. mixtral).
        block = LLMBlock(
            ctx=self.mock_ctx,
            pipe=self.mock_pipe,
            block_name="test_block",
            config_path="",
            output_cols=[],
            model_prompt=None,  # Or simply omit model_prompt as it defaults to None
        )
        prompt = block._format_prompt(self.dataset[1])
        self.assertEqual(
            prompt,
            "<s> [INST] pear\nintroduction\nprinciples\nexamples\ngeneration [/INST]",
            "model_prompt based on model_family should be used when set to None",
        )

    # BUG FIX: this method was previously also named test_model_prompt_none,
    # which redefined (shadowed) the test above so it was never collected/run.
    @patch("src.instructlab.sdg.block.Block._load_config")
    def test_model_prompt_custom(self, mock_load_config):
        mock_load_config.return_value = self.config_return_value
        # Ensure that if a custom model_prompt is specified, it is used correctly.
        block = LLMBlock(
            ctx=self.mock_ctx,
            pipe=self.mock_pipe,
            block_name="test_block",
            config_path="",
            output_cols=[],
            model_prompt="FOO {prompt} BAR",
        )
        prompt = block._format_prompt(self.dataset[1])
        self.assertEqual(
            prompt,
            "FOO pear\nintroduction\nprinciples\nexamples\ngeneration BAR",
            "custom model_prompt should be used when set to a non-empty string",
        )