Update MoA & Visual
didiforgithub committed Dec 9, 2024
1 parent 6faee49 commit b4b63a5
Showing 13 changed files with 836 additions and 40 deletions.
5 changes: 5 additions & 0 deletions metagpt/actions/action_node.py
@@ -25,6 +25,8 @@
from metagpt.utils.common import OutputParser, general_after_log
from metagpt.utils.human_interaction import HumanInteraction
from metagpt.utils.sanitize import sanitize
from metagpt.utils.common import encode_image
from pathlib import Path


class ReviewMode(Enum):
@@ -628,6 +630,9 @@ async def fill(
if self.schema:
schema = self.schema

if images:
images = [encode_image(Path(image)) for image in images]

if mode == FillMode.CODE_FILL.value:
result = await self.code_fill(context, function_name, timeout)
self.instruct_content = self.create_class()(**result)
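
For context on this change: fill() now converts any local image paths into base64 strings before the node is filled. A minimal sketch of what an encode_image helper such as the one imported from metagpt.utils.common plausibly does; the repository's actual implementation may differ (for example, it may also attach a data-URL prefix):

import base64
from pathlib import Path

def encode_image(image_path: Path) -> str:
    # Read the image bytes and return them as a base64 string that can be
    # attached to a multimodal LLM request.
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")

# Mirrors the new fill() logic:
# images = [encode_image(Path(p)) for p in ["plot_a.png", "plot_b.png"]]
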
7 changes: 3 additions & 4 deletions metagpt/ext/aflow/benchmark/benchmark.py
@@ -98,11 +98,10 @@ async def baseline_evaluation(self, graph: Callable, max_concurrent_tasks: int =
data = await self.load_data()
results = await self.evaluate_all_problems(data, graph, max_concurrent_tasks)
columns = self.get_result_columns()
- df = pd.DataFrame(results, columns=columns)
- avg_score = df["score"].mean()
- total_cost = df["cost"].max()
- logger.info(f"Average score on {self.name} dataset: {avg_score:.5f}")
+ average_score, average_cost, total_cost = self.save_results_to_csv(results, columns)
+ logger.info(f"Average score on {self.name} dataset: {average_score:.5f}")
logger.info(f"Total Cost: {total_cost:.5f}")
+ return average_score, average_cost, total_cost

async def run_evaluation(self, graph: Callable, va_list: List[int], max_concurrent_tasks: int = 50):
data = await self.load_data(va_list)
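
The refactor above delegates aggregation and persistence to save_results_to_csv, which now also reports an average cost. That method is not part of this diff; a hedged sketch of what such a helper might look like, assuming it keeps the pandas aggregation from the removed lines (the function signature, output path, and cost semantics are assumptions, not the repository's code):

import pandas as pd

def save_results_to_csv(results, columns, output_path="results.csv"):
    # Hypothetical helper: build a DataFrame from per-problem result tuples,
    # aggregate, persist, and return the summary statistics.
    df = pd.DataFrame(results, columns=columns)
    average_score = df["score"].mean()
    average_cost = df["cost"].mean()
    total_cost = df["cost"].max()  # the removed code used .max(), suggesting "cost" is cumulative
    df.to_csv(output_path, index=False)
    return average_score, average_cost, total_cost
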
2 changes: 1 addition & 1 deletion metagpt/ext/aflow/benchmark/humaneval.py
@@ -105,7 +105,7 @@ def check_solution(self, solution, test, entry_point)
@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), retry=retry_if_exception_type(Exception), reraise=True)
async def _generate_output(self, graph, prompt, entry_point):
# Generate output with a timeout of 60 seconds
- return await asyncio.wait_for(graph(prompt, entry_point), timeout=60)
+ return await asyncio.wait_for(graph(prompt, entry_point), timeout=1000)

async def evaluate_problem(self, data: dict, graph: Callable) -> Tuple[str, str, str, float, float]:
input_text = data["prompt"]
94 changes: 94 additions & 0 deletions metagpt/ext/aflow/benchmark/matplotbench.py
@@ -0,0 +1,94 @@
import base64
from pathlib import Path
from typing import Any, Callable, List, Tuple

from metagpt.ext.aflow.benchmark.benchmark import BaseBenchmark
from metagpt.ext.eflow.src.abstract import Operator
from metagpt.llm import LLM
from metagpt.logs import logger
from metagpt.provider.llm_provider_registry import create_llm_instance

class VisualizationCompare(Operator):
    def __init__(self, model: LLM):
        super().__init__(model, "VisualizationCompare")
        self.schema = [
            {"name": "thought", "type": "str", "description": "The reasoning behind the score"},
            {"name": "score", "type": "str", "description": "The score of the generated plot, from 0 to 100"},
        ]

    async def __call__(self, query: str, images: List[str]):
prompt = """
You are an excellent judge at evaluating visualization plots between a model-generated plot and the ground truth.
You will be giving scores on how well it matches the ground truth plot.
The generated plot will be given to you as the first figure. If the first figure is blank, that means the code failed to generate a figure.
Another plot will be given to you as the second figure, which is the desired outcome of the user query, meaning it is the ground truth for you to reference.
Please compare the two figures head to head and rate them. Suppose the second figure has a score of 100, rate the first figure on a scale from 0 to 100.
Scoring should be carried out regarding the plot correctness: Compare closely between the generated plot and the ground truth, the more resemblance the generated plot has compared to the ground truth, the higher the score.
The score should be proportionate to the resemblance between the two plots.
In some rare occurrences, see if the data points are generated randomly according to the query, if so, the generated plot may not perfectly match the ground truth, but it is correct nonetheless.
Only rate the first figure, the second figure is only for reference.
If the first figure is blank, that means the code failed to generate a figure. Give a score of 0 on the Plot correctness.
After scoring from the above aspect, please give a final score.
The user query is {query}
"""
prompt = prompt.format(query=query)
response = await self._fill_node(op_schema=self.schema, prompt=prompt, format="xml_fill", images=images)
return response["score"]


class MatPlotBench(BaseBenchmark):
def __init__(self, name: str, file_path: str, log_path: str, llm_config):
super().__init__(name, file_path, log_path)
self.eval_llm = create_llm_instance(llm_config)

    def encode_image(self, image_path: str) -> str:
        """Encode an image file as a base64 string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    async def evaluate_problem(self, data: dict, graph: Callable) -> Tuple[str, str, str, float, float]:
        """Evaluate a single visualization problem.

        Args:
            data: dictionary containing the test data
            graph: callable that generates the plot

        Returns:
            Tuple of (input text, generated image path, ground-truth image path, score, cost)
        """
        query = data["simple_instruction"]
        test_id = data["test_id"]
        # Resolve the ground-truth path before the try block so it is available in the except branch
        ground_truth = f"./benchmark_data/ground_truth/example_{test_id}.png"

        try:
            # Generate the plot
            generated_image, cost = await graph(query)

            # If the generated image does not exist, return a score of 0
            if not Path(generated_image).exists():
                logger.warning(f"Generated image does not exist: {generated_image}")
                return query, generated_image, ground_truth, 0.0, cost

            # Evaluate plot quality against the ground truth
            score = await self._evaluate_plot(query, ground_truth, generated_image)

            return query, generated_image, ground_truth, score, cost

        except Exception as e:
            logger.error(f"Error evaluating plot: {e}")
            return query, "", ground_truth, 0.0, 0.0

    async def _evaluate_plot(self, query: str, ground_truth: str, generated_image: str) -> float:
        """Score plot quality with the vision-capable evaluation LLM (e.g., GPT-4V)."""
        visual_compare = VisualizationCompare(self.eval_llm)
        # Pass the original query plus [generated, ground truth] images, matching VisualizationCompare's signature
        score = await visual_compare(query, [generated_image, ground_truth])
        return float(score)

    def calculate_score(self, expected_output: Any, prediction: Any) -> Tuple[float, Any]:
        """Scoring is already done in evaluate_problem; this only returns placeholder values."""
        return 0.0, prediction

    def get_result_columns(self) -> List[str]:
        """Column names for the results CSV."""
        return ["query", "generated_image", "ground_truth", "score", "cost"]
1 change: 1 addition & 0 deletions metagpt/ext/aflow/scripts/operator.py
@@ -136,6 +136,7 @@ def run_code(code):
"PyQt5",
"wx",
"pyglet",
"matplotlib",
]

# Check for prohibited imports
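
This change adds "matplotlib" to the list of imports that run_code rejects, keeping GUI and plotting libraries out of the sandboxed executor. A minimal sketch of how such a prohibited-import guard is commonly implemented; the surrounding run_code logic is not shown in this diff, so this is an assumption rather than the repository's code:

import re

# Tail of the list shown in the diff; the full list in operator.py contains more entries.
PROHIBITED_IMPORTS = ["PyQt5", "wx", "pyglet", "matplotlib"]

def has_prohibited_import(code: str) -> bool:
    # Reject code that imports a blocked module via "import X" or "from X import ...".
    for module in PROHIBITED_IMPORTS:
        pattern = rf"^\s*(import\s+{module}\b|from\s+{module}\b)"
        if re.search(pattern, code, flags=re.MULTILINE):
            return True
    return False

# has_prohibited_import("import matplotlib.pyplot as plt") -> True
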
98 changes: 98 additions & 0 deletions metagpt/ext/eflow/experience/moa_aflow.py
@@ -0,0 +1,98 @@
from metagpt.ext.eflow.src.abstract import Workflow
from metagpt.ext.eflow.src.operators import Custom, Test, MOAGenerate, MOATest

llm_name_list = ["claude-3-5-sonnet-20240620", "gpt-4o-mini", "gpt-4o", "deepseek-chat"]

IMPROVE_CODE_PROMPT = """
The previous solution failed some test cases. Please analyze the problem carefully and provide an improved solution that addresses all edge cases and requirements. Ensure your code is efficient and follows best practices.
"""

class MoaAflowWorkflow(Workflow):
def __init__(
self,
name: str,
llm_names: list,
dataset: str,
) -> None:
super().__init__(name, llm_names, dataset)
self.custom = Custom(self.llm_dict["gpt-4o-mini"])
self.test = Test(self.llm_dict["gpt-4o"])
self.moa_generate = MOAGenerate(self.llm_dict["gpt-4o"])

async def __call__(self, problem: str, entry_point: str):
"""
Implementation of the MOA workflow
"""
solution = await self.moa_generate(problem, entry_point, models=self.llm_dict.values())
test_result = await self.test(problem=problem, solution=solution['solution'], entry_point=entry_point)

if test_result['result']:
return test_result['solution'], self.get_cost()
else:
# If the test fails, try to generate a new solution with MOA
problem = problem + "\n" + IMPROVE_CODE_PROMPT
new_solution = await self.moa_generate(problem, entry_point, models=self.llm_dict.values())
return new_solution['solution'], self.get_cost()


class MoaAflowTestWorkflow(Workflow):
def __init__(
self,
name: str,
llm_names: list,
dataset: str,
) -> None:
super().__init__(name, llm_names, dataset)
self.custom = Custom(self.llm_dict["gpt-4o-mini"])
self.test = Test(self.llm_dict["gpt-4o"])
self.moa_generate = MOAGenerate(self.llm_dict["gpt-4o"])
self.moa_test = MOATest(self.llm_dict["gpt-4o"])

async def __call__(self, problem: str, entry_point: str):
"""
Implementation of the MOA workflow
"""
solution = await self.moa_generate(problem, entry_point, models=self.llm_dict.values())
test_result = await self.moa_test(problem=problem, solution=solution['solution'], entry_point=entry_point, models=self.llm_dict.values())

if test_result['result']:
return test_result['solution'], self.get_cost()
else:
# If the test fails, try to generate a new solution with MOA
problem = problem + "\n" + IMPROVE_CODE_PROMPT
new_solution = await self.moa_generate(problem, entry_point, models=self.llm_dict.values())
return new_solution['solution'], self.get_cost()

if __name__ == "__main__":
import asyncio
from metagpt.ext.aflow.benchmark.humaneval import HumanEvalBenchmark

async def main():
graph = MoaAflowWorkflow(name="Moa", llm_names=llm_name_list, dataset="HumanEval")
benchmark = HumanEvalBenchmark(
name="HumanEval",
file_path="metagpt/ext/aflow/data/humaneval_incremental.jsonl",
log_path=""
)
avg_score, avg_cost, total_cost = await benchmark.baseline_evaluation(graph, max_concurrent_tasks=5)
return avg_score, avg_cost, total_cost

async def single_task():
graph = MoaAflowWorkflow(name="Moa", llm_names=llm_name_list, dataset="HumanEval")
task = "\ndef sort_array(arr):\n \"\"\"\n In this Kata, you have to sort an array of non-negative integers according to\n number of ones in their binary representation in ascending order.\n For similar number of ones, sort based on decimal value.\n\n It must be implemented like this:\n >>> sort_array([1, 5, 2, 3, 4]) == [1, 2, 3, 4, 5]\n >>> sort_array([-2, -3, -4, -5, -6]) == [-6, -5, -4, -3, -2]\n >>> sort_array([1, 0, 2, 3, 4]) [0, 1, 2, 3, 4]\n \"\"\"\n"
function_name = "sort_array"
solution, cost = await graph(task, function_name)
print(solution)
print(cost)

async def single_task_test():
graph = MoaAflowTestWorkflow(name="MoaTest", llm_names=llm_name_list, dataset="HumanEval")
task = "\ndef sort_array(arr):\n \"\"\"\n In this Kata, you have to sort an array of non-negative integers according to\n number of ones in their binary representation in ascending order.\n For similar number of ones, sort based on decimal value.\n\n It must be implemented like this:\n >>> sort_array([1, 5, 2, 3, 4]) == [1, 2, 3, 4, 5]\n >>> sort_array([-2, -3, -4, -5, -6]) == [-6, -5, -4, -3, -2]\n >>> sort_array([1, 0, 2, 3, 4]) [0, 1, 2, 3, 4]\n \"\"\"\n"
function_name = "sort_array"
solution, cost = await graph(task, function_name)
print(solution)
print(cost)

asyncio.run(single_task_test())
# score, cost, total_cost = asyncio.run(main())
# print(f"Moa: {score}, {cost}, {total_cost}")
66 changes: 66 additions & 0 deletions metagpt/ext/eflow/experience/moa_sc_humaneval.py
@@ -0,0 +1,66 @@
import asyncio
from metagpt.ext.aflow.benchmark.humaneval import HumanEvalBenchmark
from metagpt.ext.eflow.src.abstract import Workflow
from metagpt.ext.eflow.src.operators import CodeGenerate, Custom, ScEnsemble, MOAGenerate

llm_name_list = ["claude-3-5-sonnet-20240620", "gpt-4o-mini", "gpt-4o", "deepseek-chat"]

class MutliLLMWorkflow(Workflow):
def __init__(self, name: str, llm_names: list, dataset: str):
super().__init__(name, llm_names, dataset)
self.custom = Custom(self.llm_dict["gpt-4o-mini"])
self.code_generate = CodeGenerate(self.llm_dict["gpt-4o-mini"])
self.sc_ensemble = ScEnsemble(self.llm_dict["gpt-4o"])
self.moa_generate = MOAGenerate(self.llm_dict["gpt-4o"])

async def __call__(self, problem, function_name):
solutions = []
for i in range(3):
solution = await self.moa_generate(problem, function_name, models=self.llm_dict.values())
solutions.append(solution["solution"])

solution = await self.sc_ensemble(solutions, problem)
return solution["response"], self.get_cost()


class MoaWorkflow(Workflow):
def __init__(self, name: str, llm_names: list, dataset: str):
super().__init__(name, llm_names, dataset)
self.moa_generate = MOAGenerate(self.llm_dict["gpt-4o"])

async def __call__(self, problem, function_name):
solution = await self.moa_generate(problem, function_name, models=self.llm_dict.values())
return solution["solution"], self.get_cost()


if __name__ == "__main__":

async def main():
graph = MutliLLMWorkflow(name="SelfConsistency", llm_names=llm_name_list, dataset="HumanEval")
benchmark = HumanEvalBenchmark(
name="HumanEval", file_path="metagpt/ext/aflow/data/humaneval_incremental.jsonl", log_path=""
)
avg_score, avg_cost, total_cost = await benchmark.baseline_evaluation(graph, max_concurrent_tasks=5)
return avg_score, avg_cost, total_cost

async def single_task():
graph = MoaWorkflow(name="Moa", llm_names=llm_name_list, dataset="HumanEval")
# graph = MutliLLMWorkflow(name="SelfConsistency", llm_names=llm_name_list, dataset="HumanEval")
task = "\n\ndef sum_to_n(n: int):\n \"\"\"sum_to_n is a function that sums numbers from 1 to n.\n >>> sum_to_n(30)\n 465\n >>> sum_to_n(100)\n 5050\n >>> sum_to_n(5)\n 15\n >>> sum_to_n(10)\n 55\n >>> sum_to_n(1)\n 1\n \"\"\"\n"
function_name = "sum_to_n"
solution, cost = await graph(task, function_name)
print(solution)
print(cost)

async def moa_workflow():
graph = MoaWorkflow(name="Moa", llm_names=llm_name_list, dataset="HumanEval")
benchmark = HumanEvalBenchmark(
name="HumanEval", file_path="metagpt/ext/aflow/data/humaneval_incremental.jsonl", log_path=""
)
avg_score, avg_cost, total_cost = await benchmark.baseline_evaluation(graph, max_concurrent_tasks=5)
return avg_score, avg_cost, total_cost

sc_moa_score, sc_moa_cost, sc_moa_total_cost = asyncio.run(main())
moa_score, moa_cost, moa_total_cost = asyncio.run(moa_workflow())
print(f"SelfConsistency: {sc_moa_score}, {sc_moa_cost}, {sc_moa_total_cost}")
print(f"Moa: {moa_score}, {moa_cost}, {moa_total_cost}")