
Commit

Basic Update
didiforgithub committed Dec 4, 2024
1 parent e2cdcfb commit 7ec14de
Showing 10 changed files with 650 additions and 3 deletions.
6 changes: 5 additions & 1 deletion examples/aflow/optimize.py
@@ -43,6 +43,7 @@ def __init__(self, dataset: str, question_type: str, operators: List[str]):
dataset="MBPP",
question_type="code",
operators=["Custom", "CustomCodeGenerate", "ScEnsemble", "Test"],
# operators=["Custom", "CustomCodeGenerate"]
),
"HumanEval": ExperimentConfig(
dataset="HumanEval",
@@ -70,7 +71,9 @@ def parse_args():
)
parser.add_argument("--initial_round", type=int, default=1, help="Initial round")
parser.add_argument("--max_rounds", type=int, default=20, help="Max iteration rounds")
parser.add_argument("--check_convergence", type=bool, default=True, help="Whether to enable early stop")
parser.add_argument(
"--check_convergence", type=lambda x: x.lower() == "true", default=True, help="Whether to enable early stop"
)
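    # Note: the previous type=bool could never disable the flag from the CLI, because argparse
    # passes the raw string and bool("False") is True (any non-empty string is truthy).
    # The lambda above maps "true"/"True" to True and everything else to False.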
parser.add_argument("--validation_rounds", type=int, default=5, help="Validation rounds")
parser.add_argument(
"--if_first_optimize",
@@ -87,6 +90,7 @@ def parse_args():
download(["datasets", "initial_rounds"], if_first_download=args.if_first_optimize)
config = EXPERIMENT_CONFIGS[args.dataset]

four_o_llm_config = ModelsConfig.default().get("gpt-4o")
mini_llm_config = ModelsConfig.default().get("gpt-4o-mini")
claude_llm_config = ModelsConfig.default().get("claude-3-5-sonnet-20240620")

143 changes: 143 additions & 0 deletions io_humaneval.py
@@ -0,0 +1,143 @@
from typing import List

from pydantic import BaseModel, Field

from metagpt.configs.models_config import ModelsConfig
from metagpt.ext.aflow.benchmark.humaneval import HumanEvalBenchmark
from metagpt.ext.aflow.scripts.operator import Operator
from metagpt.ext.aflow.scripts.workflow import Workflow
from metagpt.llm import LLM

HUMANEVAL_PROMPT_IO = """
{question}\nGenerate an answer to this question, without any additional test cases.
"""

WITHOUT_PROMPT = """
{question}
"""

SC_ENSEMBLE_PROMPT = """
Given the question described as follows: {question}
Several solutions have been generated to address the given question. They are as follows:
{solutions}
Carefully evaluate these solutions and identify the answer that appears most frequently across them. This consistency in answers is crucial for determining the most reliable solution.
In the "thought" field, provide a detailed explanation of your thought process. In the "solution_letter" field, output only the single letter ID (A, B, C, etc.) corresponding to the most consistent solution. Do not include any additional text or explanation in the "solution_letter" field.
"""


class GenerateOp(BaseModel):
    solution: str = Field(default="", description="Python solution for this question.")


class ScEnsembleOp(BaseModel):
thought: str = Field(default="", description="The thought of the most consistent solution.")
    solution_letter: str = Field(default="", description="The letter of the most consistent solution.")


class Generate(Operator):
def __init__(self, llm: LLM, name: str = "Generate"):
super().__init__(llm, name)

    async def __call__(self, question: str, entry_point: str) -> dict:
prompt = WITHOUT_PROMPT.format(question=question)
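        # "code_fill" mode is assumed to make the node return code for the given entry point;
        # function_name tells it which function signature the solution must implement.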
response = await self._fill_node(GenerateOp, prompt, mode="code_fill", function_name=entry_point)
return response


class JSONGenerate(Operator):
def __init__(self, llm: LLM, name: str = "JSONGenerate"):
super().__init__(llm, name)

    async def __call__(self, question: str, entry_point: str) -> dict:
prompt = WITHOUT_PROMPT.format(question=question)
response = await self._fill_node(GenerateOp, mode="single_fill", prompt=prompt)
return response


class ScEnsemble(Operator):
def __init__(self, llm: LLM, name: str = "ScEnsemble"):
super().__init__(llm, name)

async def __call__(self, solutions: List[str], problem: str):
answer_mapping = {}
solution_text = ""
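        # Label each candidate with a letter (A, B, C, ...); answer_mapping maps the letter the
        # model votes for back to the index of the corresponding solution.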
for index, solution in enumerate(solutions):
answer_mapping[chr(65 + index)] = index
solution_text += f"{chr(65 + index)}: \n{str(solution)}\n\n\n"

prompt = SC_ENSEMBLE_PROMPT.format(question=problem, solutions=solution_text)
response = await self._fill_node(ScEnsembleOp, prompt, mode="xml_fill")

answer = response.get("solution_letter", "")
answer = answer.strip().upper()

return {"response": solutions[answer_mapping[answer]]}


class IOSolveGraph(Workflow):
def __init__(self, name: str, llm_config, dataset: str):
super().__init__(name, llm_config, dataset)
self.generate = Generate(self.llm)

async def __call__(self, question, entry_point):
solution = await self.generate(question, entry_point)
return solution["solution"], self.llm.cost_manager.total_cost


class JSONSolveGraph(Workflow):
def __init__(self, name: str, llm_config, dataset: str):
super().__init__(name, llm_config, dataset)
        self.generate = JSONGenerate(self.llm)  # JSON-mode generator, matching the class name

async def __call__(self, question, entry_point):
solution = await self.generate(question, entry_point)
return solution["solution"], self.llm.cost_manager.total_cost


class SelfConsistencyGraph(Workflow):
def __init__(self, name: str, llm_config, dataset: str):
super().__init__(name, llm_config, dataset)
self.generate = Generate(llm=self.llm)
self.sc_ensemble = ScEnsemble(llm=self.llm)

async def __call__(self, problem, function_name):
solutions = []
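        # Self-consistency: sample five candidate solutions for the same problem, then let
        # ScEnsemble pick the one that is most consistent across the samples.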
for i in range(5):
solution = await self.generate(problem, function_name)
solutions.append(solution["solution"])
solution = await self.sc_ensemble(solutions, problem)
return solution["response"], self.llm.cost_manager.total_cost


if __name__ == "__main__":

async def main():
# llm_config = ModelsConfig.default().get("llama-3.2-90b-vision-instruct")
llm_config = ModelsConfig.default().get("meta-llama/Meta-Llama-3.1-70B-Instruct")
# llm_config = ModelsConfig.default().get("gpt-4o-mini")
# llm_config = ModelsConfig.default().get("gpt-4o")
# llm_config = ModelsConfig.default().get("deepseek-chat")
# llm_config = ModelsConfig.default().get("claude-3-5-sonnet-20240620")
# graph = IOSolveGraph(name="IO", llm_config=llm_config, dataset="HumanEval")
# graph = JSONSolveGraph(name="JSON", llm_config=llm_config, dataset="HumanEval")
graph = SelfConsistencyGraph(name="SelfConsistency", llm_config=llm_config, dataset="HumanEval")
benchmark = HumanEvalBenchmark(
name="HumanEval",
file_path="/Users/trl/Github_project/MetaGPT-MathAI/metagpt/ext/aflow/data/humaneval_test.jsonl",
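            # NOTE: machine-specific absolute path; point this at your local humaneval_test.jsonl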
log_path="",
)
avg_score = await benchmark.baseline_evaluation(graph, max_concurrent_tasks=5)
return avg_score

import asyncio

asyncio.run(main())


# llama + markdown + prompt 0.75573

# gpt-4o-mini + markdown + prompt 0.871
# gpt-4o-mini + json + prompt 0.81679
# gpt-4o-mini
10 changes: 9 additions & 1 deletion metagpt/ext/aflow/benchmark/benchmark.py
@@ -84,7 +84,7 @@ def calculate_score(self, expected_output: Any, prediction: Any) -> Tuple[float,
def get_result_columns(self) -> List[str]:
pass

async def evaluate_all_problems(self, data: List[dict], graph: Callable, max_concurrent_tasks: int = 50):
async def evaluate_all_problems(self, data: List[dict], graph: Callable, max_concurrent_tasks: int = 25):
semaphore = asyncio.Semaphore(max_concurrent_tasks)

async def sem_evaluate(problem):
@@ -94,6 +94,14 @@ async def sem_evaluate(problem):
tasks = [sem_evaluate(problem) for problem in data]
return await tqdm_asyncio.gather(*tasks, desc=f"Evaluating {self.name} problems", total=len(data))

async def baseline_evaluation(self, graph: Callable, max_concurrent_tasks: int = 50):
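        # Run the graph over the full dataset (no va_list subsetting) and report only the mean score.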
data = await self.load_data()
results = await self.evaluate_all_problems(data, graph, max_concurrent_tasks)
columns = self.get_result_columns()
df = pd.DataFrame(results, columns=columns)
avg_score = df["score"].mean()
        logger.info(f"Average score on {self.name} dataset: {avg_score:.5f}")
        return avg_score

async def run_evaluation(self, graph: Callable, va_list: List[int], max_concurrent_tasks: int = 50):
data = await self.load_data(va_list)
results = await self.evaluate_all_problems(data, graph, max_concurrent_tasks)
2 changes: 1 addition & 1 deletion metagpt/ext/aflow/scripts/operator.py
@@ -44,8 +44,8 @@

class Operator:
def __init__(self, llm: LLM, name: str):
self.name = name
self.llm = llm
self.name = name

def __call__(self, *args, **kwargs):
raise NotImplementedError
49 changes: 49 additions & 0 deletions metagpt/ext/eflow/experience/consistency_humaneval.py
@@ -0,0 +1,49 @@
import asyncio

from metagpt.ext.aflow.benchmark.humaneval import HumanEvalBenchmark
from metagpt.ext.eflow.src.abstract import Workflow
from metagpt.ext.eflow.src.operators import CodeGenerate, Custom, ScEnsemble

llm_name_list = ["claude-3-5-sonnet-20240620", "gpt-4o-mini", "gpt-4o", "deepseek-chat"]


class MultiLLMWorkflow(Workflow):
def __init__(self, name: str, llm_names: list, dataset: str):
super().__init__(name, llm_names, dataset)
self.custom = Custom(self.llm_dict["gpt-4o-mini"])
self.code_generate = CodeGenerate(self.llm_dict["gpt-4o-mini"])
self.sc_ensemble = ScEnsemble(self.llm_dict["claude-3-5-sonnet-20240620"])

async def __call__(self, problem, function_name):
solutions = []
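        # Fan the same problem out to four different LLMs concurrently, then let ScEnsemble
        # (driven by claude-3-5-sonnet) vote for the most consistent of the four solutions.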
tasks = [
self.code_generate(problem, function_name, self.llm_dict["claude-3-5-sonnet-20240620"]),
self.code_generate(problem, function_name, self.llm_dict["gpt-4o-mini"]),
self.code_generate(problem, function_name, self.llm_dict["gpt-4o"]),
self.code_generate(problem, function_name, self.llm_dict["deepseek-chat"]),
]
claude_solution, four_o_mini_solution, four_o_solution, deepseek_solution = await asyncio.gather(*tasks)

solutions.append(claude_solution)
solutions.append(four_o_mini_solution)
solutions.append(four_o_solution)
solutions.append(deepseek_solution)

solution = await self.sc_ensemble(solutions, problem)

return solution["response"], self.get_cost()


if __name__ == "__main__":

async def main():
        graph = MultiLLMWorkflow(name="SelfConsistency", llm_names=llm_name_list, dataset="HumanEval")
benchmark = HumanEvalBenchmark(
name="HumanEval", file_path="metagpt/ext/aflow/data/humaneval_validate.jsonl", log_path=""
)
avg_score = await benchmark.baseline_evaluation(graph, max_concurrent_tasks=5)
return avg_score

import asyncio

asyncio.run(main())
74 changes: 74 additions & 0 deletions metagpt/ext/eflow/src/abstract.py
@@ -0,0 +1,74 @@
from typing import Dict, List

from pydantic import BaseModel, Field

from metagpt.actions.action_node import ActionNode
from metagpt.configs.models_config import ModelsConfig
from metagpt.ext.aflow.scripts.evaluator import DatasetType
from metagpt.llm import LLM
from metagpt.provider.llm_provider_registry import create_llm_instance
from metagpt.utils.cost_manager import CostManager


class Schema(BaseModel):
def __init__(self, attributes: List[Dict]):
self._create_schemas(attributes)
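        # NOTE: mutating model_fields on an instance does not rebuild the pydantic model;
        # this dynamic-schema helper appears to be a work-in-progress placeholder.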

def _create_schemas(self, attributes: List[Dict]):
for attribute in attributes:
self.model_fields[attribute["name"]] = Field(
default="", description=attribute["description"], type=attribute["type"]
)


class Operator:
def __init__(self, model: LLM, name: str):
self.default_model = model
self.name = name

def __call__(self, *args, **kwargs):
raise NotImplementedError

    async def _fill_node(self, op_schema, prompt, format=None, model: LLM = None, **extra_kwargs):
        # Resolve the model before building fill_kwargs; otherwise a None llm would be handed to ActionNode.fill.
        if model is None:
            model = self.default_model
        fill_kwargs = {"context": prompt, "llm": model}
if format:
fill_kwargs["mode"] = format

fill_kwargs.update(extra_kwargs)
node = await ActionNode.from_pydantic(op_schema).fill(**fill_kwargs)
return node.instruct_content.model_dump()


class Workflow:
def __init__(
self,
name: str,
llm_names: list,
dataset: DatasetType,
) -> None:
self.name = name
self.dataset = dataset
self.llm_dict = self.create_llms(llm_names)

def get_cost(self):
        # Accumulate the cost across all LLMs
total_cost = 0
for llm in self.llm_dict.values():
total_cost += llm.cost_manager.total_cost
return total_cost

def create_llms(self, llm_names: list):
llm_dict = {}
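        # Give each LLM its own CostManager so per-model spend stays separate; get_cost() sums them up.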
for llm_name in llm_names:
llm_config = ModelsConfig.default().get(llm_name)
llm_dict[llm_name] = create_llm_instance(llm_config)
llm_dict[llm_name].cost_manager = CostManager()
return llm_dict

async def __call__(self, problem: str):
"""
Implementation of the workflow
"""
raise NotImplementedError("This method should be implemented by the subclass")
67 changes: 67 additions & 0 deletions metagpt/ext/eflow/src/operators.py
@@ -0,0 +1,67 @@
from typing import List

from pydantic import BaseModel, Field

from metagpt.ext.eflow.src.abstract import Operator
from metagpt.ext.eflow.src.prompt import GENERATE_PROMPT, SC_ENSEMBLE_PROMPT
from metagpt.llm import LLM

# Schema List[{name: str, type: str, description: str}]
# It looks like Schema needs a dedicated class to make generation easier
# The abstraction here is too messy
# Multiple LLMs should come into play at call time, not when the Workflow is created
# Should we relax the constraints at the Node level, or at the Operator level?


class GenerateOp(BaseModel):
response: str = Field(default="", description="Your solution for this problem")


class Custom(Operator):
def __init__(self, model: LLM):
super().__init__(model, "Custom")
self.schema = [{"name": "response", "type": "str", "description": "Your solution for this problem"}]

async def __call__(self, input: str, prompt: str, op_schema: BaseModel = None, model: LLM = None, **extra_kwargs):
if op_schema is None:
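            # Assumption: _fill_node expects a pydantic model class; self.schema here is only a plain
            # description list, so callers will usually want to pass an explicit op_schema.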
op_schema = self.schema
context = prompt + input
response = await self._fill_node(op_schema=op_schema, prompt=context, model=model, **extra_kwargs)
return response


class CodeGenerate(Operator):
def __init__(self, model: LLM):
super().__init__(model, "CodeGenerate")
self.schema = [{"name": "response", "type": "str", "description": "Your solution for this problem"}]

async def __call__(self, problem: str, function_name: str, model: LLM = None):
prompt = GENERATE_PROMPT.format(problem=problem)
response = await self._fill_node(
op_schema=GenerateOp, prompt=prompt, format="xml_fill", model=model, function_name=function_name
)
return response


class ScEnsemble(Operator):
def __init__(self, model: LLM):
super().__init__(model, "ScEnsemble")
self.schema = [
{"name": "thought", "type": "str", "description": "The thought of the most consistent solution."},
{"name": "solution_letter", "type": "str", "description": "The letter of most consistent solution."},
]

async def __call__(self, solutions: List[str], problem: str, model: LLM = None):
answer_mapping = {}
solution_text = ""
for index, solution in enumerate(solutions):
answer_mapping[chr(65 + index)] = index
solution_text += f"{chr(65 + index)}: \n{str(solution)}\n\n\n"

prompt = SC_ENSEMBLE_PROMPT.format(question=problem, solutions=solution_text)
response = await self._fill_node(op_schema=self.schema, prompt=prompt, format="xml_fill", model=model)

answer = response.get("solution_letter", "")
answer = answer.strip().upper()

return {"response": solutions[answer_mapping[answer]]}