Update MoA & Visual
didiforgithub committed Dec 9, 2024
1 parent 6faee49 commit b4b63a5
Showing 13 changed files with 836 additions and 40 deletions.
5 changes: 5 additions & 0 deletions metagpt/actions/action_node.py
@@ -25,6 +25,8 @@
from metagpt.utils.common import OutputParser, general_after_log
from metagpt.utils.human_interaction import HumanInteraction
from metagpt.utils.sanitize import sanitize
from metagpt.utils.common import encode_image
from pathlib import Path


class ReviewMode(Enum):
@@ -628,6 +630,9 @@ async def fill(
if self.schema:
schema = self.schema

if images:
images = [encode_image(Path(image)) for image in images]

if mode == FillMode.CODE_FILL.value:
result = await self.code_fill(context, function_name, timeout)
self.instruct_content = self.create_class()(**result)
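
For context on this change: fill() now converts any local image paths into base64 strings before the node is filled. A minimal sketch of what an encode_image helper such as the one imported from metagpt.utils.common plausibly does; the repository's actual implementation may differ (for example, it may also attach a data-URL prefix):

import base64
from pathlib import Path

def encode_image(image_path: Path) -> str:
    # Read the image bytes and return them as a base64 string that can be
    # attached to a multimodal LLM request.
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")

# Mirrors the new fill() logic:
# images = [encode_image(Path(p)) for p in ["plot_a.png", "plot_b.png"]]
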
7 changes: 3 additions & 4 deletions metagpt/ext/aflow/benchmark/benchmark.py
@@ -98,11 +98,10 @@ async def baseline_evaluation(self, graph: Callable, max_concurrent_tasks: int =
data = await self.load_data()
results = await self.evaluate_all_problems(data, graph, max_concurrent_tasks)
columns = self.get_result_columns()
- df = pd.DataFrame(results, columns=columns)
- avg_score = df["score"].mean()
- total_cost = df["cost"].max()
- logger.info(f"Average score on {self.name} dataset: {avg_score:.5f}")
+ average_score, average_cost, total_cost = self.save_results_to_csv(results, columns)
+ logger.info(f"Average score on {self.name} dataset: {average_score:.5f}")
logger.info(f"Total Cost: {total_cost:.5f}")
+ return average_score, average_cost, total_cost

async def run_evaluation(self, graph: Callable, va_list: List[int], max_concurrent_tasks: int = 50):
data = await self.load_data(va_list)
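
The refactor above delegates aggregation and persistence to save_results_to_csv, which now also reports an average cost. That method is not part of this diff; a hedged sketch of what such a helper might look like, assuming it keeps the pandas aggregation from the removed lines (the function signature, output path, and cost semantics are assumptions, not the repository's code):

import pandas as pd

def save_results_to_csv(results, columns, output_path="results.csv"):
    # Hypothetical helper: build a DataFrame from per-problem result tuples,
    # aggregate, persist, and return the summary statistics.
    df = pd.DataFrame(results, columns=columns)
    average_score = df["score"].mean()
    average_cost = df["cost"].mean()
    total_cost = df["cost"].max()  # the removed code used .max(), suggesting "cost" is cumulative
    df.to_csv(output_path, index=False)
    return average_score, average_cost, total_cost
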
2 changes: 1 addition & 1 deletion metagpt/ext/aflow/benchmark/humaneval.py
@@ -105,7 +105,7 @@ def check_solution(self, solution, test, entry_point)
@retry(stop=stop_after_attempt(5), wait=wait_fixed(1), retry=retry_if_exception_type(Exception), reraise=True)
async def _generate_output(self, graph, prompt, entry_point):
# Generate output with a timeout of 60 seconds
- return await asyncio.wait_for(graph(prompt, entry_point), timeout=60)
+ return await asyncio.wait_for(graph(prompt, entry_point), timeout=1000)

async def evaluate_problem(self, data: dict, graph: Callable) -> Tuple[str, str, str, float, float]:
input_text = data["prompt"]
94 changes: 94 additions & 0 deletions metagpt/ext/aflow/benchmark/matplotbench.py
@@ -0,0 +1,94 @@
import base64
from pathlib import Path
from typing import Any, Callable, List, Tuple

from metagpt.ext.aflow.benchmark.benchmark import BaseBenchmark
from metagpt.ext.eflow.src.abstract import Operator
from metagpt.llm import LLM
from metagpt.logs import logger
from metagpt.provider.llm_provider_registry import create_llm_instance

class VisualizationCompare(Operator):
    def __init__(self, model: LLM):
        super().__init__(model, "VisualizationCompare")
        self.schema = [
            {"name": "thought", "type": "str", "description": "The reasoning behind the score"},
            {"name": "score", "type": "str", "description": "The score of the generated plot, from 0 to 100"},
        ]

    async def __call__(self, query: str, images: List[str]):
prompt = """
You are an excellent judge at evaluating visualization plots between a model-generated plot and the ground truth.
You will be giving scores on how well it matches the ground truth plot.
The generated plot will be given to you as the first figure. If the first figure is blank, that means the code failed to generate a figure.
Another plot will be given to you as the second figure, which is the desired outcome of the user query, meaning it is the ground truth for you to reference.
Please compare the two figures head to head and rate them. Suppose the second figure has a score of 100, rate the first figure on a scale from 0 to 100.
Scoring should be carried out regarding the plot correctness: Compare closely between the generated plot and the ground truth, the more resemblance the generated plot has compared to the ground truth, the higher the score.
The score should be proportionate to the resemblance between the two plots.
In some rare occurrences, see if the data points are generated randomly according to the query, if so, the generated plot may not perfectly match the ground truth, but it is correct nonetheless.
Only rate the first figure, the second figure is only for reference.
If the first figure is blank, that means the code failed to generate a figure. Give a score of 0 on the Plot correctness.
After scoring from the above aspect, please give a final score.
The user query is {query}
"""
prompt = prompt.format(query=query)
response = await self._fill_node(op_schema=self.schema, prompt=prompt, format="xml_fill", images=images)
return response["score"]


class MatPlotBench(BaseBenchmark):
def __init__(self, name: str, file_path: str, log_path: str, llm_config):
super().__init__(name, file_path, log_path)
self.eval_llm = create_llm_instance(llm_config)

    def encode_image(self, image_path: str) -> str:
        """Encode an image file as a base64 string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    async def evaluate_problem(self, data: dict, graph: Callable) -> Tuple[str, str, str, float, float]:
        """Evaluate a single visualization problem.

        Args:
            data: dictionary containing the test data
            graph: callable that generates the plot

        Returns:
            Tuple of (input text, generated image path, ground-truth image path, score, cost)
        """
        query = data["simple_instruction"]
        test_id = data["test_id"]
        # Resolve the ground-truth path before the try block so it is available in the except branch
        ground_truth = f"./benchmark_data/ground_truth/example_{test_id}.png"

        try:
            # Generate the plot
            generated_image, cost = await graph(query)

            # If the generated image does not exist, return a score of 0
            if not Path(generated_image).exists():
                logger.warning(f"Generated image does not exist: {generated_image}")
                return query, generated_image, ground_truth, 0.0, cost

            # Evaluate plot quality against the ground truth
            score = await self._evaluate_plot(query, ground_truth, generated_image)

            return query, generated_image, ground_truth, score, cost

        except Exception as e:
            logger.error(f"Error evaluating plot: {e}")
            return query, "", ground_truth, 0.0, 0.0

    async def _evaluate_plot(self, query: str, ground_truth: str, generated_image: str) -> float:
        """Score plot quality with the vision-capable evaluation LLM (e.g., GPT-4V)."""
        visual_compare = VisualizationCompare(self.eval_llm)
        # Pass the original query plus [generated, ground truth] images, matching VisualizationCompare's signature
        score = await visual_compare(query, [generated_image, ground_truth])
        return float(score)

    def calculate_score(self, expected_output: Any, prediction: Any) -> Tuple[float, Any]:
        """Scoring is already done in evaluate_problem; this only returns placeholder values."""
        return 0.0, prediction

    def get_result_columns(self) -> List[str]:
        """Column names for the results CSV."""
        return ["query", "generated_image", "ground_truth", "score", "cost"]
1 change: 1 addition & 0 deletions metagpt/ext/aflow/scripts/operator.py
@@ -136,6 +136,7 @@ def run_code(code):
"PyQt5",
"wx",
"pyglet",
"matplotlib",
]

# Check for prohibited imports
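
This change adds "matplotlib" to the list of imports that run_code rejects, keeping GUI and plotting libraries out of the sandboxed executor. A minimal sketch of how such a prohibited-import guard is commonly implemented; the surrounding run_code logic is not shown in this diff, so this is an assumption rather than the repository's code:

import re

# Tail of the list shown in the diff; the full list in operator.py contains more entries.
PROHIBITED_IMPORTS = ["PyQt5", "wx", "pyglet", "matplotlib"]

def has_prohibited_import(code: str) -> bool:
    # Reject code that imports a blocked module via "import X" or "from X import ...".
    for module in PROHIBITED_IMPORTS:
        pattern = rf"^\s*(import\s+{module}\b|from\s+{module}\b)"
        if re.search(pattern, code, flags=re.MULTILINE):
            return True
    return False

# has_prohibited_import("import matplotlib.pyplot as plt") -> True
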
98 changes: 98 additions & 0 deletions metagpt/ext/eflow/experience/moa_aflow.py
@@ -0,0 +1,98 @@
from metagpt.ext.eflow.src.abstract import Workflow
from metagpt.ext.eflow.src.operators import Custom, Test, MOAGenerate, MOATest

llm_name_list = ["claude-3-5-sonnet-20240620", "gpt-4o-mini", "gpt-4o", "deepseek-chat"]

IMPROVE_CODE_PROMPT = """
The previous solution failed some test cases. Please analyze the problem carefully and provide an improved solution that addresses all edge cases and requirements. Ensure your code is efficient and follows best practices.
"""

class MoaAflowWorkflow(Workflow):
def __init__(
self,
name: str,
llm_names: list,
dataset: str,
) -> None:
super().__init__(name, llm_names, dataset)
self.custom = Custom(self.llm_dict["gpt-4o-mini"])
self.test = Test(self.llm_dict["gpt-4o"])
self.moa_generate = MOAGenerate(self.llm_dict["gpt-4o"])

async def __call__(self, problem: str, entry_point: str):
"""
Implementation of the MOA workflow
"""
solution = await self.moa_generate(problem, entry_point, models=self.llm_dict.values())
test_result = await self.test(problem=problem, solution=solution['solution'], entry_point=entry_point)

if test_result['result']:
return test_result['solution'], self.get_cost()
else:
# If the test fails, try to generate a new solution with MOA
problem = problem + "\n" + IMPROVE_CODE_PROMPT
new_solution = await self.moa_generate(problem, entry_point, models=self.llm_dict.values())
return new_solution['solution'], self.get_cost()


class MoaAflowTestWorkflow(Workflow):
def __init__(
self,
name: str,
llm_names: list,
dataset: str,
) -> None:
super().__init__(name, llm_names, dataset)
self.custom = Custom(self.llm_dict["gpt-4o-mini"])
self.test = Test(self.llm_dict["gpt-4o"])
self.moa_generate = MOAGenerate(self.llm_dict["gpt-4o"])
self.moa_test = MOATest(self.llm_dict["gpt-4o"])

async def __call__(self, problem: str, entry_point: str):
"""
Implementation of the MOA workflow
"""
solution = await self.moa_generate(problem, entry_point, models=self.llm_dict.values())
test_result = await self.moa_test(problem=problem, solution=solution['solution'], entry_point=entry_point, models=self.llm_dict.values())

if test_result['result']:
return test_result['solution'], self.get_cost()
else:
# If the test fails, try to generate a new solution with MOA
problem = problem + "\n" + IMPROVE_CODE_PROMPT
new_solution = await self.moa_generate(problem, entry_point, models=self.llm_dict.values())
return new_solution['solution'], self.get_cost()

if __name__ == "__main__":
import asyncio
from metagpt.ext.aflow.benchmark.humaneval import HumanEvalBenchmark

async def main():
graph = MoaAflowWorkflow(name="Moa", llm_names=llm_name_list, dataset="HumanEval")
benchmark = HumanEvalBenchmark(
name="HumanEval",
file_path="metagpt/ext/aflow/data/humaneval_incremental.jsonl",
log_path=""
)
avg_score, avg_cost, total_cost = await benchmark.baseline_evaluation(graph, max_concurrent_tasks=5)
return avg_score, avg_cost, total_cost

async def single_task():
graph = MoaAflowWorkflow(name="Moa", llm_names=llm_name_list, dataset="HumanEval")
task = "\ndef sort_array(arr):\n \"\"\"\n In this Kata, you have to sort an array of non-negative integers according to\n number of ones in their binary representation in ascending order.\n For similar number of ones, sort based on decimal value.\n\n It must be implemented like this:\n >>> sort_array([1, 5, 2, 3, 4]) == [1, 2, 3, 4, 5]\n >>> sort_array([-2, -3, -4, -5, -6]) == [-6, -5, -4, -3, -2]\n >>> sort_array([1, 0, 2, 3, 4]) [0, 1, 2, 3, 4]\n \"\"\"\n"
function_name = "sort_array"
solution, cost = await graph(task, function_name)
print(solution)
print(cost)

async def single_task_test():
graph = MoaAflowTestWorkflow(name="MoaTest", llm_names=llm_name_list, dataset="HumanEval")
task = "\ndef sort_array(arr):\n \"\"\"\n In this Kata, you have to sort an array of non-negative integers according to\n number of ones in their binary representation in ascending order.\n For similar number of ones, sort based on decimal value.\n\n It must be implemented like this:\n >>> sort_array([1, 5, 2, 3, 4]) == [1, 2, 3, 4, 5]\n >>> sort_array([-2, -3, -4, -5, -6]) == [-6, -5, -4, -3, -2]\n >>> sort_array([1, 0, 2, 3, 4]) [0, 1, 2, 3, 4]\n \"\"\"\n"
function_name = "sort_array"
solution, cost = await graph(task, function_name)
print(solution)
print(cost)

asyncio.run(single_task_test())
# score, cost, total_cost = asyncio.run(main())
# print(f"Moa: {score}, {cost}, {total_cost}")
66 changes: 66 additions & 0 deletions metagpt/ext/eflow/experience/moa_sc_humaneval.py
@@ -0,0 +1,66 @@
import asyncio
from metagpt.ext.aflow.benchmark.humaneval import HumanEvalBenchmark
from metagpt.ext.eflow.src.abstract import Workflow
from metagpt.ext.eflow.src.operators import CodeGenerate, Custom, ScEnsemble, MOAGenerate

llm_name_list = ["claude-3-5-sonnet-20240620", "gpt-4o-mini", "gpt-4o", "deepseek-chat"]

class MutliLLMWorkflow(Workflow):
def __init__(self, name: str, llm_names: list, dataset: str):
super().__init__(name, llm_names, dataset)
self.custom = Custom(self.llm_dict["gpt-4o-mini"])
self.code_generate = CodeGenerate(self.llm_dict["gpt-4o-mini"])
self.sc_ensemble = ScEnsemble(self.llm_dict["gpt-4o"])
self.moa_generate = MOAGenerate(self.llm_dict["gpt-4o"])

async def __call__(self, problem, function_name):
solutions = []
for i in range(3):
solution = await self.moa_generate(problem, function_name, models=self.llm_dict.values())
solutions.append(solution["solution"])

solution = await self.sc_ensemble(solutions, problem)
return solution["response"], self.get_cost()


class MoaWorkflow(Workflow):
def __init__(self, name: str, llm_names: list, dataset: str):
super().__init__(name, llm_names, dataset)
self.moa_generate = MOAGenerate(self.llm_dict["gpt-4o"])

async def __call__(self, problem, function_name):
solution = await self.moa_generate(problem, function_name, models=self.llm_dict.values())
return solution["solution"], self.get_cost()


if __name__ == "__main__":

async def main():
graph = MutliLLMWorkflow(name="SelfConsistency", llm_names=llm_name_list, dataset="HumanEval")
benchmark = HumanEvalBenchmark(
name="HumanEval", file_path="metagpt/ext/aflow/data/humaneval_incremental.jsonl", log_path=""
)
avg_score, avg_cost, total_cost = await benchmark.baseline_evaluation(graph, max_concurrent_tasks=5)
return avg_score, avg_cost, total_cost

async def single_task():
graph = MoaWorkflow(name="Moa", llm_names=llm_name_list, dataset="HumanEval")
# graph = MutliLLMWorkflow(name="SelfConsistency", llm_names=llm_name_list, dataset="HumanEval")
task = "\n\ndef sum_to_n(n: int):\n \"\"\"sum_to_n is a function that sums numbers from 1 to n.\n >>> sum_to_n(30)\n 465\n >>> sum_to_n(100)\n 5050\n >>> sum_to_n(5)\n 15\n >>> sum_to_n(10)\n 55\n >>> sum_to_n(1)\n 1\n \"\"\"\n"
function_name = "sum_to_n"
solution, cost = await graph(task, function_name)
print(solution)
print(cost)

async def moa_workflow():
graph = MoaWorkflow(name="Moa", llm_names=llm_name_list, dataset="HumanEval")
benchmark = HumanEvalBenchmark(
name="HumanEval", file_path="metagpt/ext/aflow/data/humaneval_incremental.jsonl", log_path=""
)
avg_score, avg_cost, total_cost = await benchmark.baseline_evaluation(graph, max_concurrent_tasks=5)
return avg_score, avg_cost, total_cost

sc_moa_score, sc_moa_cost, sc_moa_total_cost = asyncio.run(main())
moa_score, moa_cost, moa_total_cost = asyncio.run(moa_workflow())
print(f"SelfConsistency: {sc_moa_score}, {sc_moa_cost}, {sc_moa_total_cost}")
print(f"Moa: {moa_score}, {moa_cost}, {moa_total_cost}")