
Commit

Basic Update
didiforgithub committed Dec 4, 2024
1 parent e2cdcfb commit 7ec14de
Showing 10 changed files with 650 additions and 3 deletions.
6 changes: 5 additions & 1 deletion examples/aflow/optimize.py
@@ -43,6 +43,7 @@ def __init__(self, dataset: str, question_type: str, operators: List[str]):
dataset="MBPP",
question_type="code",
operators=["Custom", "CustomCodeGenerate", "ScEnsemble", "Test"],
# operators=["Custom", "CustomCodeGenerate"]
),
"HumanEval": ExperimentConfig(
dataset="HumanEval",
@@ -70,7 +71,9 @@ def parse_args():
)
parser.add_argument("--initial_round", type=int, default=1, help="Initial round")
parser.add_argument("--max_rounds", type=int, default=20, help="Max iteration rounds")
parser.add_argument("--check_convergence", type=bool, default=True, help="Whether to enable early stop")
parser.add_argument(
"--check_convergence", type=lambda x: x.lower() == "true", default=True, help="Whether to enable early stop"
)
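    # Note: the previous type=bool could never disable the flag from the CLI, because argparse
    # passes the raw string and bool("False") is True (any non-empty string is truthy).
    # The lambda above maps "true"/"True" to True and everything else to False.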
parser.add_argument("--validation_rounds", type=int, default=5, help="Validation rounds")
parser.add_argument(
"--if_first_optimize",
@@ -87,6 +90,7 @@ def parse_args():
download(["datasets", "initial_rounds"], if_first_download=args.if_first_optimize)
config = EXPERIMENT_CONFIGS[args.dataset]

four_o_llm_config = ModelsConfig.default().get("gpt-4o")
mini_llm_config = ModelsConfig.default().get("gpt-4o-mini")
claude_llm_config = ModelsConfig.default().get("claude-3-5-sonnet-20240620")

143 changes: 143 additions & 0 deletions io_humaneval.py
@@ -0,0 +1,143 @@
from typing import List

from pydantic import BaseModel, Field

from metagpt.configs.models_config import ModelsConfig
from metagpt.ext.aflow.benchmark.humaneval import HumanEvalBenchmark
from metagpt.ext.aflow.scripts.operator import Operator
from metagpt.ext.aflow.scripts.workflow import Workflow
from metagpt.llm import LLM

HUMANEVAL_PROMPT_IO = """
{question}\nGenerate an answer to this question, without any additional test cases.
"""

WITHOUT_PROMPT = """
{question}
"""

SC_ENSEMBLE_PROMPT = """
Given the question described as follows: {question}
Several solutions have been generated to address the given question. They are as follows:
{solutions}
Carefully evaluate these solutions and identify the answer that appears most frequently across them. This consistency in answers is crucial for determining the most reliable solution.
In the "thought" field, provide a detailed explanation of your thought process. In the "solution_letter" field, output only the single letter ID (A, B, C, etc.) corresponding to the most consistent solution. Do not include any additional text or explanation in the "solution_letter" field.
"""


class GenerateOp(BaseModel):
    solution: str = Field(default="", description="Python solution for this question.")


class ScEnsembleOp(BaseModel):
thought: str = Field(default="", description="The thought of the most consistent solution.")
    solution_letter: str = Field(default="", description="The letter of the most consistent solution.")


class Generate(Operator):
def __init__(self, llm: LLM, name: str = "Generate"):
super().__init__(llm, name)

    async def __call__(self, question: str, entry_point: str) -> dict:
prompt = WITHOUT_PROMPT.format(question=question)
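        # "code_fill" mode is assumed to make the node return code for the given entry point;
        # function_name tells it which function signature the solution must implement.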
response = await self._fill_node(GenerateOp, prompt, mode="code_fill", function_name=entry_point)
return response


class JSONGenerate(Operator):
def __init__(self, llm: LLM, name: str = "JSONGenerate"):
super().__init__(llm, name)

    async def __call__(self, question: str, entry_point: str) -> dict:
prompt = WITHOUT_PROMPT.format(question=question)
response = await self._fill_node(GenerateOp, mode="single_fill", prompt=prompt)
return response


class ScEnsemble(Operator):
def __init__(self, llm: LLM, name: str = "ScEnsemble"):
super().__init__(llm, name)

async def __call__(self, solutions: List[str], problem: str):
answer_mapping = {}
solution_text = ""
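        # Label each candidate with a letter (A, B, C, ...); answer_mapping maps the letter the
        # model votes for back to the index of the corresponding solution.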
for index, solution in enumerate(solutions):
answer_mapping[chr(65 + index)] = index
solution_text += f"{chr(65 + index)}: \n{str(solution)}\n\n\n"

prompt = SC_ENSEMBLE_PROMPT.format(question=problem, solutions=solution_text)
response = await self._fill_node(ScEnsembleOp, prompt, mode="xml_fill")

answer = response.get("solution_letter", "")
answer = answer.strip().upper()

return {"response": solutions[answer_mapping[answer]]}


class IOSolveGraph(Workflow):
def __init__(self, name: str, llm_config, dataset: str):
super().__init__(name, llm_config, dataset)
self.generate = Generate(self.llm)

async def __call__(self, question, entry_point):
solution = await self.generate(question, entry_point)
return solution["solution"], self.llm.cost_manager.total_cost


class JSONSolveGraph(Workflow):
def __init__(self, name: str, llm_config, dataset: str):
super().__init__(name, llm_config, dataset)
        self.generate = JSONGenerate(self.llm)  # JSON-mode generator, matching the class name

async def __call__(self, question, entry_point):
solution = await self.generate(question, entry_point)
return solution["solution"], self.llm.cost_manager.total_cost


class SelfConsistencyGraph(Workflow):
def __init__(self, name: str, llm_config, dataset: str):
super().__init__(name, llm_config, dataset)
self.generate = Generate(llm=self.llm)
self.sc_ensemble = ScEnsemble(llm=self.llm)

async def __call__(self, problem, function_name):
solutions = []
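        # Self-consistency: sample five candidate solutions for the same problem, then let
        # ScEnsemble pick the one that is most consistent across the samples.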
for i in range(5):
solution = await self.generate(problem, function_name)
solutions.append(solution["solution"])
solution = await self.sc_ensemble(solutions, problem)
return solution["response"], self.llm.cost_manager.total_cost


if __name__ == "__main__":

async def main():
# llm_config = ModelsConfig.default().get("llama-3.2-90b-vision-instruct")
llm_config = ModelsConfig.default().get("meta-llama/Meta-Llama-3.1-70B-Instruct")
# llm_config = ModelsConfig.default().get("gpt-4o-mini")
# llm_config = ModelsConfig.default().get("gpt-4o")
# llm_config = ModelsConfig.default().get("deepseek-chat")
# llm_config = ModelsConfig.default().get("claude-3-5-sonnet-20240620")
# graph = IOSolveGraph(name="IO", llm_config=llm_config, dataset="HumanEval")
# graph = JSONSolveGraph(name="JSON", llm_config=llm_config, dataset="HumanEval")
graph = SelfConsistencyGraph(name="SelfConsistency", llm_config=llm_config, dataset="HumanEval")
benchmark = HumanEvalBenchmark(
name="HumanEval",
file_path="/Users/trl/Github_project/MetaGPT-MathAI/metagpt/ext/aflow/data/humaneval_test.jsonl",
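            # NOTE: machine-specific absolute path; point this at your local humaneval_test.jsonl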
log_path="",
)
avg_score = await benchmark.baseline_evaluation(graph, max_concurrent_tasks=5)
return avg_score

import asyncio

asyncio.run(main())


# llama + markdown + prompt 0.75573

# gpt-4o-mini + markdown + prompt 0.871
# gpt-4o-mini + json + prompt 0.81679
# gpt-4o-mini
10 changes: 9 additions & 1 deletion metagpt/ext/aflow/benchmark/benchmark.py
@@ -84,7 +84,7 @@ def calculate_score(self, expected_output: Any, prediction: Any) -> Tuple[float,
def get_result_columns(self) -> List[str]:
pass

async def evaluate_all_problems(self, data: List[dict], graph: Callable, max_concurrent_tasks: int = 50):
async def evaluate_all_problems(self, data: List[dict], graph: Callable, max_concurrent_tasks: int = 25):
semaphore = asyncio.Semaphore(max_concurrent_tasks)

async def sem_evaluate(problem):
@@ -94,6 +94,14 @@ async def sem_evaluate(problem):
tasks = [sem_evaluate(problem) for problem in data]
return await tqdm_asyncio.gather(*tasks, desc=f"Evaluating {self.name} problems", total=len(data))

async def baseline_evaluation(self, graph: Callable, max_concurrent_tasks: int = 50):
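        # Run the graph over the full dataset (no va_list subsetting) and report only the mean score.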
data = await self.load_data()
results = await self.evaluate_all_problems(data, graph, max_concurrent_tasks)
columns = self.get_result_columns()
df = pd.DataFrame(results, columns=columns)
avg_score = df["score"].mean()
        logger.info(f"Average score on {self.name} dataset: {avg_score:.5f}")
        return avg_score

async def run_evaluation(self, graph: Callable, va_list: List[int], max_concurrent_tasks: int = 50):
data = await self.load_data(va_list)
results = await self.evaluate_all_problems(data, graph, max_concurrent_tasks)
2 changes: 1 addition & 1 deletion metagpt/ext/aflow/scripts/operator.py
@@ -44,8 +44,8 @@

class Operator:
def __init__(self, llm: LLM, name: str):
self.name = name
self.llm = llm
self.name = name

def __call__(self, *args, **kwargs):
raise NotImplementedError
49 changes: 49 additions & 0 deletions metagpt/ext/eflow/experience/consistency_humaneval.py
@@ -0,0 +1,49 @@
import asyncio

from metagpt.ext.aflow.benchmark.humaneval import HumanEvalBenchmark
from metagpt.ext.eflow.src.abstract import Workflow
from metagpt.ext.eflow.src.operators import CodeGenerate, Custom, ScEnsemble

llm_name_list = ["claude-3-5-sonnet-20240620", "gpt-4o-mini", "gpt-4o", "deepseek-chat"]


class MultiLLMWorkflow(Workflow):
def __init__(self, name: str, llm_names: list, dataset: str):
super().__init__(name, llm_names, dataset)
self.custom = Custom(self.llm_dict["gpt-4o-mini"])
self.code_generate = CodeGenerate(self.llm_dict["gpt-4o-mini"])
self.sc_ensemble = ScEnsemble(self.llm_dict["claude-3-5-sonnet-20240620"])

async def __call__(self, problem, function_name):
solutions = []
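        # Fan the same problem out to four different LLMs concurrently, then let ScEnsemble
        # (driven by claude-3-5-sonnet) vote for the most consistent of the four solutions.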
tasks = [
self.code_generate(problem, function_name, self.llm_dict["claude-3-5-sonnet-20240620"]),
self.code_generate(problem, function_name, self.llm_dict["gpt-4o-mini"]),
self.code_generate(problem, function_name, self.llm_dict["gpt-4o"]),
self.code_generate(problem, function_name, self.llm_dict["deepseek-chat"]),
]
claude_solution, four_o_mini_solution, four_o_solution, deepseek_solution = await asyncio.gather(*tasks)

solutions.append(claude_solution)
solutions.append(four_o_mini_solution)
solutions.append(four_o_solution)
solutions.append(deepseek_solution)

solution = await self.sc_ensemble(solutions, problem)

return solution["response"], self.get_cost()


if __name__ == "__main__":

async def main():
        graph = MultiLLMWorkflow(name="SelfConsistency", llm_names=llm_name_list, dataset="HumanEval")
benchmark = HumanEvalBenchmark(
name="HumanEval", file_path="metagpt/ext/aflow/data/humaneval_validate.jsonl", log_path=""
)
avg_score = await benchmark.baseline_evaluation(graph, max_concurrent_tasks=5)
return avg_score

import asyncio

asyncio.run(main())
74 changes: 74 additions & 0 deletions metagpt/ext/eflow/src/abstract.py
@@ -0,0 +1,74 @@
from typing import Dict, List

from pydantic import BaseModel, Field

from metagpt.actions.action_node import ActionNode
from metagpt.configs.models_config import ModelsConfig
from metagpt.ext.aflow.scripts.evaluator import DatasetType
from metagpt.llm import LLM
from metagpt.provider.llm_provider_registry import create_llm_instance
from metagpt.utils.cost_manager import CostManager


class Schema(BaseModel):
def __init__(self, attributes: List[Dict]):
self._create_schemas(attributes)
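        # NOTE: mutating model_fields on an instance does not rebuild the pydantic model;
        # this dynamic-schema helper appears to be a work-in-progress placeholder.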

def _create_schemas(self, attributes: List[Dict]):
for attribute in attributes:
self.model_fields[attribute["name"]] = Field(
default="", description=attribute["description"], type=attribute["type"]
)


class Operator:
def __init__(self, model: LLM, name: str):
self.default_model = model
self.name = name

def __call__(self, *args, **kwargs):
raise NotImplementedError

    async def _fill_node(self, op_schema, prompt, format=None, model: LLM = None, **extra_kwargs):
        # Resolve the model before building fill_kwargs; otherwise a None llm would be handed to ActionNode.fill.
        if model is None:
            model = self.default_model
        fill_kwargs = {"context": prompt, "llm": model}
if format:
fill_kwargs["mode"] = format

fill_kwargs.update(extra_kwargs)
node = await ActionNode.from_pydantic(op_schema).fill(**fill_kwargs)
return node.instruct_content.model_dump()


class Workflow:
def __init__(
self,
name: str,
llm_names: list,
dataset: DatasetType,
) -> None:
self.name = name
self.dataset = dataset
self.llm_dict = self.create_llms(llm_names)

def get_cost(self):
        # Accumulate the cost across all LLMs
total_cost = 0
for llm in self.llm_dict.values():
total_cost += llm.cost_manager.total_cost
return total_cost

def create_llms(self, llm_names: list):
llm_dict = {}
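        # Give each LLM its own CostManager so per-model spend stays separate; get_cost() sums them up.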
for llm_name in llm_names:
llm_config = ModelsConfig.default().get(llm_name)
llm_dict[llm_name] = create_llm_instance(llm_config)
llm_dict[llm_name].cost_manager = CostManager()
return llm_dict

async def __call__(self, problem: str):
"""
Implementation of the workflow
"""
raise NotImplementedError("This method should be implemented by the subclass")
67 changes: 67 additions & 0 deletions metagpt/ext/eflow/src/operators.py
@@ -0,0 +1,67 @@
from typing import List

from pydantic import BaseModel, Field

from metagpt.ext.eflow.src.abstract import Operator
from metagpt.ext.eflow.src.prompt import GENERATE_PROMPT, SC_ENSEMBLE_PROMPT
from metagpt.llm import LLM

# Schema List[{name: str, type: str, description: str}]
# It looks like Schema needs a dedicated class to make generation easier
# The abstraction here is too messy
# Multiple LLMs should come into play at call time, not when the Workflow is created
# Should we relax the constraints at the Node level, or at the Operator level?


class GenerateOp(BaseModel):
response: str = Field(default="", description="Your solution for this problem")


class Custom(Operator):
def __init__(self, model: LLM):
super().__init__(model, "Custom")
self.schema = [{"name": "response", "type": "str", "description": "Your solution for this problem"}]

async def __call__(self, input: str, prompt: str, op_schema: BaseModel = None, model: LLM = None, **extra_kwargs):
if op_schema is None:
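            # Assumption: _fill_node expects a pydantic model class; self.schema here is only a plain
            # description list, so callers will usually want to pass an explicit op_schema.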
op_schema = self.schema
context = prompt + input
response = await self._fill_node(op_schema=op_schema, prompt=context, model=model, **extra_kwargs)
return response


class CodeGenerate(Operator):
def __init__(self, model: LLM):
super().__init__(model, "CodeGenerate")
self.schema = [{"name": "response", "type": "str", "description": "Your solution for this problem"}]

async def __call__(self, problem: str, function_name: str, model: LLM = None):
prompt = GENERATE_PROMPT.format(problem=problem)
response = await self._fill_node(
op_schema=GenerateOp, prompt=prompt, format="xml_fill", model=model, function_name=function_name
)
return response


class ScEnsemble(Operator):
def __init__(self, model: LLM):
super().__init__(model, "ScEnsemble")
self.schema = [
{"name": "thought", "type": "str", "description": "The thought of the most consistent solution."},
{"name": "solution_letter", "type": "str", "description": "The letter of most consistent solution."},
]

async def __call__(self, solutions: List[str], problem: str, model: LLM = None):
answer_mapping = {}
solution_text = ""
for index, solution in enumerate(solutions):
answer_mapping[chr(65 + index)] = index
solution_text += f"{chr(65 + index)}: \n{str(solution)}\n\n\n"

prompt = SC_ENSEMBLE_PROMPT.format(question=problem, solutions=solution_text)
response = await self._fill_node(op_schema=self.schema, prompt=prompt, format="xml_fill", model=model)

answer = response.get("solution_letter", "")
answer = answer.strip().upper()

return {"response": solutions[answer_mapping[answer]]}