-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathconfig_dummy_data_vanilla_mistral.yaml
93 lines (82 loc) · 4.07 KB
/
config_dummy_data_vanilla_mistral.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# Global path settings.
# NOTE(review): presumably the relative paths used by the stages are resolved
# against project_base_dir — confirm against the pipeline entry point.
project_base_dir: './'
# Every stage's output path in this file lives under this directory.
result_base_dir: './data_process_pipeline/results'
# Stage 0: culture relevance classifier.
# Its output_file is the input_file of stage 1.
0_culture_relevance_classifier:
  input_file: data_process_pipeline/dummy_data/comments.csv
  output_file: data_process_pipeline/results/0_culture_relevance_classifier/output.csv
  classifier_path: SALT-NLP/CultureBank-Relevance-Classifier
  model_name: distilbert-base-uncased
  device: cpu
  # NOTE(review): presumably the column of input_file that holds the comment
  # text — confirm against the classifier code.
  field_name_with_comments: comment_content
  batch_size: 512
# Stage 1: LLM-based knowledge extractor.
# Reads the output of stage 0; its output_file feeds stages 2, 3 and 4.
1_knowledge_extractor:
  input_file: data_process_pipeline/results/0_culture_relevance_classifier/output.csv
  output_raw: data_process_pipeline/results/1_knowledge_extractor/output_raw.csv
  output_file: data_process_pipeline/results/1_knowledge_extractor/output.csv
  pattern: plain
  model: mistralai/Mistral-7B-Instruct-v0.2
  # NOTE(review): -1 presumably means "use everything / no partitioning" —
  # confirm against the extractor code.
  num_samples: -1
  partition: -1
  sanity_check: false
# Stage 2: negation converter.
# Stage 3 in this file reads stage 1's output directly, so nothing later in
# this config consumes this stage's output_file.
2_negation_converter:
  input_file: data_process_pipeline/results/1_knowledge_extractor/output.csv
  output_file: data_process_pipeline/results/2_negation_converter/output.csv
# Stage 3: clustering component.
3_clustering_component:
  input_file: data_process_pipeline/results/1_knowledge_extractor/output.csv  # skip the negation converter for now
  output_file: data_process_pipeline/results/3_clustering_component/output.csv
  output_score_file: data_process_pipeline/results/3_clustering_component/score.json
  output_filtered_file: data_process_pipeline/results/3_clustering_component/output_filtered.csv
  cultural_group_threshold: 0.3  # this will be overridden by the parameters in CultureBank/data_process_pipeline/main.py
  sent_threshold: 0.4  # this will be overridden by the parameters in CultureBank/data_process_pipeline/main.py
  min_cluster_size: 1  # any cluster whose size < this number will be removed
  with_other_desc: true  # this will be overridden by the parameters in CultureBank/data_process_pipeline/main.py
  # If you have a file with annotated clusters, put its path here to get an
  # evaluation score. `none` is a plain string here, not a YAML null.
  annotated_file: none
  sbert:
    model: all-MiniLM-L6-v2
# Stage 4: cluster summarizer.
4_cluster_summarizer:
  # NOTE(review): this path hard-codes `otherdesc=False`, while
  # 3_clustering_component sets `with_other_desc: true` (which that section says
  # is overridden by main.py). Confirm the directory name matches what stage 3
  # actually writes for the parameters in use.
  input_file: data_process_pipeline/results/3_clustering_component/group=0.3_sent=0.4_otherdesc=False/output_filtered.csv
  output_file: data_process_pipeline/results/4_cluster_summarizer/output.csv
  # Same file as the output_file of 1_knowledge_extractor.
  original_before_cluster_file: data_process_pipeline/results/1_knowledge_extractor/output.csv
  pattern: plain
  model: mistralai/Mistral-7B-Instruct-v0.2
  # NOTE(review): -1 presumably disables the corresponding limit/filter —
  # confirm against the summarizer code.
  num_samples: -1
  partition: -1
  sanity_check: false
  filter_threshold: -1
# Stage 5: topic normalization.
5_topic_normalizer:
  input_file: data_process_pipeline/results/4_cluster_summarizer/output.csv
  output_file: data_process_pipeline/results/5_topic_normalizer/output.csv
  output_score_file: data_process_pipeline/results/5_topic_normalizer/output_score.json
  # The input path of 6_agreement_calculator contains `group=0.3`; keep the two
  # in sync if this value changes.
  cultural_group_threshold: 0.3
  # Sentence-embedding model settings.
  sbert:
    model: all-MiniLM-L6-v2
  # OpenAI request settings.
  openai:
    model: gpt-3.5-turbo-1106
    temperature: 0.5
    max_tokens: 20
    top_p: 0.3
    seed: 1234
# Stage 6: agreement calculator.
6_agreement_calculator:
  # The `group=0.3` path segment matches cultural_group_threshold (0.3) in
  # 5_topic_normalizer.
  # NOTE(review): presumably stage 5 writes into a threshold-named
  # subdirectory — confirm, and update this path if the threshold changes.
  input_file: data_process_pipeline/results/5_topic_normalizer/group=0.3/output.csv
  output_file: data_process_pipeline/results/6_agreement_calculator/output.csv
# Stage 7: content moderation.
7_content_moderation:
  input_file: data_process_pipeline/results/6_agreement_calculator/output.csv
  output_file: data_process_pipeline/results/7_content_moderation/output.csv
  output_file_for_manual_annotation: data_process_pipeline/results/7_content_moderation/output_for_annotation.csv
  hard_api_threshold: 0.2  # >= 0.2 will be removed without any annotation
  soft_api_threshold: 0.1  # [0.1, 0.2) will go through a manual check
  # Despite the `_dir` suffix, this value points at a single .txt file.
  keyword_list_dir: data_process_pipeline/dummy_data/list_of_words.txt
  # NOTE(review): 0_culture_relevance_classifier uses `cpu`; this stage names a
  # CUDA device — confirm a GPU is available or switch to `cpu`.
  device: cuda:0
  controversial_field_name_to_annotate: controversial_or_PII
  model_dir: SALT-NLP/CultureBank-Controversial-Classifier
# Stage 8: final formatter.
8_final_formatter:
  input_file: data_process_pipeline/results/7_content_moderation/output.csv
  output_file: data_process_pipeline/results/8_final_formatter/output.csv
  # NOTE(review): no stage in this file writes human_annotation.csv; presumably
  # it is the manually annotated version of stage 7's
  # output_file_for_manual_annotation — confirm.
  controversial_annotation_file: data_process_pipeline/results/7_content_moderation/human_annotation.csv