# make_qa.py — generated from Marker-Inc-Korea/AutoRAG-template
import click
import os
import pandas as pd
from autorag.data.qa.filter.dontknow import dontknow_filter_rule_based
from autorag.data.qa.generation_gt.llama_index_gen_gt import (
make_basic_gen_gt,
make_concise_gen_gt,
)
from autorag.data.qa.query.llama_gen_query import factoid_query_gen
from autorag.data.qa.sample import random_single_hop
from autorag.data.qa.schema import Raw, Corpus
from dotenv import load_dotenv
from llama_index.llms.openai import OpenAI
root_dir = os.path.dirname(os.path.realpath(__file__))
@click.command()
@click.option("--corpus_path", type=click.Path(exists=True), help="Path to the corpus. Must be parquet file.",
              required=True, default=os.path.join(root_dir, "chunked_corpus", "3.parquet"))
@click.option("--raw_path", type=click.Path(exists=True), help="Path to the raw data. Must be parquet file.",
              required=True, default=os.path.join(root_dir, "parsed_raw", "5.parquet"))
@click.option("--qa_size", type=int, help="Number of QA pairs to generate.", default=4)
@click.option("--output_path", type=click.Path(), help="Path to save the generated QA pairs. Must be parquet file.",
              required=True,
              default=os.path.join(root_dir, "data", "generated_qa.parquet"))
@click.option("--corpus_output_path", type=click.Path(),
              default=os.path.join(root_dir, "data", "generated_corpus.parquet"))
def main(corpus_path, raw_path, qa_size, output_path, corpus_output_path):
    """Generate a QA evaluation dataset from a parsed and chunked corpus.

    Samples ``qa_size`` single-hop passages from the corpus, generates a
    factoid query plus basic and concise ground-truth answers for each with
    an OpenAI LLM, filters out "don't know" answers, and writes the QA set
    and its corpus to the given parquet paths.

    Raises:
        ValueError: if any of the four paths does not end in ``.parquet``.
    """
    load_dotenv()  # pick up OPENAI_API_KEY etc. from a local .env file
    # Fail fast on non-parquet paths before any LLM work is done.
    for path in [corpus_path, raw_path, output_path, corpus_output_path]:
        if not path.endswith(".parquet"):
            raise ValueError(f"Path {path} must be a parquet file.")
    # Ensure the output directories exist so to_parquet does not fail at the
    # very end of the (expensive) pipeline — the defaults point at
    # <root>/data/, which may not exist on a fresh checkout.
    for out_path in (output_path, corpus_output_path):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
    llm = OpenAI(model="gpt-4o-mini-2024-07-18")
    initial_raw = Raw(pd.read_parquet(raw_path, engine="pyarrow"))
    initial_corpus = Corpus(pd.read_parquet(corpus_path, engine="pyarrow"), initial_raw)
    qa = initial_corpus.sample(random_single_hop, n=qa_size).map(
        lambda df: df.reset_index(drop=True),
    ).make_retrieval_gt_contents().batch_apply(
        factoid_query_gen,  # query generation
        llm=llm,
        lang="en",
    ).batch_apply(
        make_basic_gen_gt,  # answer generation (basic)
        llm=llm,
        lang="en",
    ).batch_apply(
        make_concise_gen_gt,  # answer generation (concise)
        llm=llm,
        lang="en",
    ).filter(
        dontknow_filter_rule_based,  # filter unanswerable questions
        lang="en",
    )
    qa.to_parquet(output_path, corpus_output_path)
# Script entry point: delegate to the click command when run directly.
if __name__ == "__main__":
    main()