forked from geekan/MetaGPT
Commit b4b63a5 (1 parent: 6faee49)
Showing 13 changed files with 836 additions and 40 deletions.
@@ -0,0 +1,94 @@
import base64
import re
from pathlib import Path
from typing import Any, Callable, List, Tuple

from metagpt.ext.aflow.benchmark.benchmark import BaseBenchmark
from metagpt.ext.eflow.src.abstract import Operator
from metagpt.llm import LLM
from metagpt.logs import logger
from metagpt.provider.llm_provider_registry import create_llm_instance

class VisualizationCompare(Operator):
    def __init__(self, model: LLM):
        super().__init__(model, "VisualizationCompare")
        self.schema = [
            {"name": "thought", "type": "str", "description": "The thought process behind the score"},
            {"name": "score", "type": "str", "description": "The score of the generated plot, from 0-100"},
        ]

    async def __call__(self, query: str, images: List[str]):
        prompt = """
        You are an excellent judge at evaluating visualization plots between a model-generated plot and the ground truth.
        You will give a score on how well the generated plot matches the ground truth plot.
        The generated plot will be given to you as the first figure. If the first figure is blank, that means the code failed to generate a figure.
        Another plot will be given to you as the second figure, which is the desired outcome of the user query, meaning it is the ground truth for you to reference.
        Please compare the two figures head to head and rate them. Suppose the second figure has a score of 100; rate the first figure on a scale from 0 to 100.
        Scoring should be carried out with regard to plot correctness: compare the generated plot closely against the ground truth; the more the generated plot resembles the ground truth, the higher the score.
        The score should be proportionate to the resemblance between the two plots.
        In some rare occurrences, check whether the data points are generated randomly according to the query; if so, the generated plot may not perfectly match the ground truth but is correct nonetheless.
        Only rate the first figure; the second figure is only for reference.
        If the first figure is blank, meaning the code failed to generate a figure, give a score of 0 for plot correctness.
        After scoring the above aspect, please give a final score.
        The user query is {query}
        """
        prompt = prompt.format(query=query)
        response = await self._fill_node(op_schema=self.schema, prompt=prompt, format="xml_fill", images=images)
        return response["score"]

class MatPlotBench(BaseBenchmark):
    def __init__(self, name: str, file_path: str, log_path: str, llm_config):
        super().__init__(name, file_path, log_path)
        self.eval_llm = create_llm_instance(llm_config)

    def encode_image(self, image_path: str) -> str:
        """Encode an image file as a base64 string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    async def evaluate_problem(self, data: dict, graph: Callable) -> Tuple[str, str, str, float, float]:
        """Evaluate a single visualization problem.

        Args:
            data: dictionary containing the test data
            graph: callable that generates the plot

        Returns:
            Tuple of (input text, generated image path, expected image path, score, cost)
        """
        query = data["simple_instruction"]
        test_id = data["test_id"]
        # Resolve the ground-truth path up front so it is available even if plot generation fails.
        ground_truth = f"./benchmark_data/ground_truth/example_{test_id}.png"

        try:
            # Generate the plot
            generated_image, cost = await graph(query)

            # If the generated image does not exist, return a score of 0
            if not Path(generated_image).exists():
                logger.warning(f"Generated image does not exist: {generated_image}")
                return query, generated_image, ground_truth, 0.0, cost

            # Evaluate plot quality
            score = await self._evaluate_plot(query, ground_truth, generated_image)

            return query, generated_image, ground_truth, score, cost

        except Exception as e:
            logger.error(f"Error evaluating plot: {e}")
            return query, "", ground_truth, 0.0, 0.0

    async def _evaluate_plot(self, query: str, ground_truth: str, generated_image: str) -> float:
        """Evaluate plot quality with a vision-capable LLM (e.g. GPT-4V)."""
        visual_compare = VisualizationCompare(self.eval_llm)
        # The generated plot must come first and the ground truth second,
        # matching the figure order the VisualizationCompare prompt expects.
        score = await visual_compare(query, [generated_image, ground_truth])
        # The operator's schema returns the score as a string, so cast it to float.
        return float(score)

    def calculate_score(self, expected_output: Any, prediction: Any) -> Tuple[float, Any]:
        """Scoring is already done in evaluate_problem; return a placeholder here."""
        return 0.0, prediction

    def get_result_columns(self) -> List[str]:
        """Column names for the results CSV."""
        return ["query", "generated_image", "ground_truth", "score", "cost"]
@@ -136,6 +136,7 @@ def run_code(code):
        "PyQt5",
        "wx",
        "pyglet",
        "matplotlib",
    ]

    # Check for prohibited imports
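
For context, a check like the "# Check for prohibited imports" step above is typically a plain scan of the submitted code for import statements that reference the blocked packages. The sketch below is a minimal illustration under that assumption; the helper name contains_prohibited_import and the regex are hypothetical, not taken from this commit.

import re
from typing import List

# Mirrors the prohibited list shown in the hunk above (hypothetical standalone helper).
PROHIBITED_PACKAGES: List[str] = ["PyQt5", "wx", "pyglet", "matplotlib"]

def contains_prohibited_import(code: str) -> bool:
    """Return True if the code imports any blocked package."""
    for pkg in PROHIBITED_PACKAGES:
        # Match "import pkg", "import pkg.sub", or "from pkg import ..." at the start of a line.
        pattern = rf"^\s*(import\s+{re.escape(pkg)}(\.|\s|$)|from\s+{re.escape(pkg)}(\.|\s))"
        if re.search(pattern, code, flags=re.MULTILINE):
            return True
    return False

# Example: this snippet would be flagged because it imports matplotlib.
sample = "import matplotlib.pyplot as plt\nplt.plot([1, 2, 3])\n"
assert contains_prohibited_import(sample)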
@@ -0,0 +1,98 @@
from metagpt.ext.eflow.src.abstract import Workflow
from metagpt.ext.eflow.src.operators import Custom, Test, MOAGenerate, MOATest

llm_name_list = ["claude-3-5-sonnet-20240620", "gpt-4o-mini", "gpt-4o", "deepseek-chat"]

IMPROVE_CODE_PROMPT = """
The previous solution failed some test cases. Please analyze the problem carefully and provide an improved solution that addresses all edge cases and requirements. Ensure your code is efficient and follows best practices.
"""


class MoaAflowWorkflow(Workflow):
    def __init__(
        self,
        name: str,
        llm_names: list,
        dataset: str,
    ) -> None:
        super().__init__(name, llm_names, dataset)
        self.custom = Custom(self.llm_dict["gpt-4o-mini"])
        self.test = Test(self.llm_dict["gpt-4o"])
        self.moa_generate = MOAGenerate(self.llm_dict["gpt-4o"])

    async def __call__(self, problem: str, entry_point: str):
        """Implementation of the MOA workflow."""
        solution = await self.moa_generate(problem, entry_point, models=self.llm_dict.values())
        test_result = await self.test(problem=problem, solution=solution["solution"], entry_point=entry_point)

        if test_result["result"]:
            return test_result["solution"], self.get_cost()
        else:
            # If the test fails, try to generate a new solution with MOA
            problem = problem + "\n" + IMPROVE_CODE_PROMPT
            new_solution = await self.moa_generate(problem, entry_point, models=self.llm_dict.values())
            return new_solution["solution"], self.get_cost()

class MoaAflowTestWorkflow(Workflow):
    def __init__(
        self,
        name: str,
        llm_names: list,
        dataset: str,
    ) -> None:
        super().__init__(name, llm_names, dataset)
        self.custom = Custom(self.llm_dict["gpt-4o-mini"])
        self.test = Test(self.llm_dict["gpt-4o"])
        self.moa_generate = MOAGenerate(self.llm_dict["gpt-4o"])
        self.moa_test = MOATest(self.llm_dict["gpt-4o"])

    async def __call__(self, problem: str, entry_point: str):
        """Implementation of the MOA workflow."""
        solution = await self.moa_generate(problem, entry_point, models=self.llm_dict.values())
        test_result = await self.moa_test(
            problem=problem, solution=solution["solution"], entry_point=entry_point, models=self.llm_dict.values()
        )

        if test_result["result"]:
            return test_result["solution"], self.get_cost()
        else:
            # If the test fails, try to generate a new solution with MOA
            problem = problem + "\n" + IMPROVE_CODE_PROMPT
            new_solution = await self.moa_generate(problem, entry_point, models=self.llm_dict.values())
            return new_solution["solution"], self.get_cost()


if __name__ == "__main__":
    import asyncio
    from metagpt.ext.aflow.benchmark.humaneval import HumanEvalBenchmark

    async def main():
        graph = MoaAflowWorkflow(name="Moa", llm_names=llm_name_list, dataset="HumanEval")
        benchmark = HumanEvalBenchmark(
            name="HumanEval",
            file_path="metagpt/ext/aflow/data/humaneval_incremental.jsonl",
            log_path="",
        )
        avg_score, avg_cost, total_cost = await benchmark.baseline_evaluation(graph, max_concurrent_tasks=5)
        return avg_score, avg_cost, total_cost

    async def single_task():
        graph = MoaAflowWorkflow(name="Moa", llm_names=llm_name_list, dataset="HumanEval")
        task = "\ndef sort_array(arr):\n \"\"\"\n In this Kata, you have to sort an array of non-negative integers according to\n number of ones in their binary representation in ascending order.\n For similar number of ones, sort based on decimal value.\n\n It must be implemented like this:\n >>> sort_array([1, 5, 2, 3, 4]) == [1, 2, 3, 4, 5]\n >>> sort_array([-2, -3, -4, -5, -6]) == [-6, -5, -4, -3, -2]\n >>> sort_array([1, 0, 2, 3, 4]) [0, 1, 2, 3, 4]\n \"\"\"\n"
        function_name = "sort_array"
        solution, cost = await graph(task, function_name)
        print(solution)
        print(cost)

    async def single_task_test():
        graph = MoaAflowTestWorkflow(name="MoaTest", llm_names=llm_name_list, dataset="HumanEval")
        task = "\ndef sort_array(arr):\n \"\"\"\n In this Kata, you have to sort an array of non-negative integers according to\n number of ones in their binary representation in ascending order.\n For similar number of ones, sort based on decimal value.\n\n It must be implemented like this:\n >>> sort_array([1, 5, 2, 3, 4]) == [1, 2, 3, 4, 5]\n >>> sort_array([-2, -3, -4, -5, -6]) == [-6, -5, -4, -3, -2]\n >>> sort_array([1, 0, 2, 3, 4]) [0, 1, 2, 3, 4]\n \"\"\"\n"
        function_name = "sort_array"
        solution, cost = await graph(task, function_name)
        print(solution)
        print(cost)

    asyncio.run(single_task_test())
    # score, cost, total_cost = asyncio.run(main())
    # print(f"Moa: {score}, {cost}, {total_cost}")
@@ -0,0 +1,66 @@
import asyncio

from metagpt.ext.aflow.benchmark.humaneval import HumanEvalBenchmark
from metagpt.ext.eflow.src.abstract import Workflow
from metagpt.ext.eflow.src.operators import CodeGenerate, Custom, ScEnsemble, MOAGenerate

llm_name_list = ["claude-3-5-sonnet-20240620", "gpt-4o-mini", "gpt-4o", "deepseek-chat"]


class MultiLLMWorkflow(Workflow):
    def __init__(self, name: str, llm_names: list, dataset: str):
        super().__init__(name, llm_names, dataset)
        self.custom = Custom(self.llm_dict["gpt-4o-mini"])
        self.code_generate = CodeGenerate(self.llm_dict["gpt-4o-mini"])
        self.sc_ensemble = ScEnsemble(self.llm_dict["gpt-4o"])
        self.moa_generate = MOAGenerate(self.llm_dict["gpt-4o"])

    async def __call__(self, problem, function_name):
        solutions = []
        for _ in range(3):
            solution = await self.moa_generate(problem, function_name, models=self.llm_dict.values())
            solutions.append(solution["solution"])

        solution = await self.sc_ensemble(solutions, problem)
        return solution["response"], self.get_cost()


class MoaWorkflow(Workflow):
    def __init__(self, name: str, llm_names: list, dataset: str):
        super().__init__(name, llm_names, dataset)
        self.moa_generate = MOAGenerate(self.llm_dict["gpt-4o"])

    async def __call__(self, problem, function_name):
        solution = await self.moa_generate(problem, function_name, models=self.llm_dict.values())
        return solution["solution"], self.get_cost()


if __name__ == "__main__":

    async def main():
        graph = MultiLLMWorkflow(name="SelfConsistency", llm_names=llm_name_list, dataset="HumanEval")
        benchmark = HumanEvalBenchmark(
            name="HumanEval", file_path="metagpt/ext/aflow/data/humaneval_incremental.jsonl", log_path=""
        )
        avg_score, avg_cost, total_cost = await benchmark.baseline_evaluation(graph, max_concurrent_tasks=5)
        return avg_score, avg_cost, total_cost

    async def single_task():
        graph = MoaWorkflow(name="Moa", llm_names=llm_name_list, dataset="HumanEval")
        # graph = MultiLLMWorkflow(name="SelfConsistency", llm_names=llm_name_list, dataset="HumanEval")
        task = "\n\ndef sum_to_n(n: int):\n \"\"\"sum_to_n is a function that sums numbers from 1 to n.\n >>> sum_to_n(30)\n 465\n >>> sum_to_n(100)\n 5050\n >>> sum_to_n(5)\n 15\n >>> sum_to_n(10)\n 55\n >>> sum_to_n(1)\n 1\n \"\"\"\n"
        function_name = "sum_to_n"
        solution, cost = await graph(task, function_name)
        print(solution)
        print(cost)

    async def moa_workflow():
        graph = MoaWorkflow(name="Moa", llm_names=llm_name_list, dataset="HumanEval")
        benchmark = HumanEvalBenchmark(
            name="HumanEval", file_path="metagpt/ext/aflow/data/humaneval_incremental.jsonl", log_path=""
        )
        avg_score, avg_cost, total_cost = await benchmark.baseline_evaluation(graph, max_concurrent_tasks=5)
        return avg_score, avg_cost, total_cost

    sc_moa_score, sc_moa_cost, sc_moa_total_cost = asyncio.run(main())
    moa_score, moa_cost, moa_total_cost = asyncio.run(moa_workflow())
    print(f"SelfConsistency: {sc_moa_score}, {sc_moa_cost}, {sc_moa_total_cost}")
    print(f"Moa: {moa_score}, {moa_cost}, {moa_total_cost}")