-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathapp.py
168 lines (133 loc) · 6.53 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# Description
# This is a Gradio app that lets users enter data line by line (input1).
# Using OpenAI ChatGPT,
# it suggests labels for the data (output1, which is also input2)
# and labels the data (NER) with the given labels (output2).
import openai
import json
from typing import List, Tuple, Union
import gradio as gr
import colorsys
import pandas as pd
import re
import io
def query_prompt(openai_key: str, prompt: str, model: str = "gpt-3.5-turbo", max_tokens: int = 1024, temperature: float = 0.5) -> str:
    """Send a single-turn prompt to the OpenAI chat completion API.

    Args:
        openai_key: API key; assigned to the module-level ``openai.api_key``.
        prompt: Text sent as the only ``user`` message of the conversation.
        model: Name of the chat model to query.
        max_tokens: Forwarded to the API as the generation token limit.
        temperature: Forwarded to the API as the sampling temperature.

    Returns:
        The message content of the first returned choice, with leading and
        trailing whitespace stripped.
    """
    # NOTE: the key is set on the openai module itself, so it is shared by
    # every call made from this process.
    openai.api_key = openai_key
    # The ``model`` argument is honoured as given; it used to be overwritten
    # by a hard-coded value, which made the parameter a no-op.
    completions = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        n=1,
        stop=None,
        temperature=temperature,
    )
    return completions.choices[0].message.content.strip()
def predict_labels(openai_key: str, data: str, explain_data: str = "", max_tokens: int = 1024, temperature: float = 0.5) -> str:
    """Ask the chat model to suggest NER labels for the given items.

    Args:
        openai_key: API key forwarded to ``query_prompt``.
        data: Items to label, one per line.
        explain_data: Optional hint about the data, embedded in the prompt.
        max_tokens: Forwarded to ``query_prompt``.
        temperature: Forwarded to ``query_prompt``.

    Returns:
        The suggested labels, one per line, without bullet markers,
        parenthesised remarks, surrounding whitespace or blank lines.
    """
    # Present every input line as a bullet so the model sees a clear list.
    data_list_string = "\n".join(f"- {item}" for item in data.splitlines())
    prompt = (
        "What is good labels for NER in terms of given items but in general? Please match the given output format.\n"
        "Your output format should be like this (print only labels):\n"
        "- label 1\n"
        "- label 2\n"
        "- ...\n\n"
        f"Items list (hint: {explain_data}):\n{data_list_string}\n\n"
    )
    output = query_prompt(openai_key, prompt, max_tokens=max_tokens, temperature=temperature)

    lines = [line.strip() for line in output.splitlines()]
    # Drop any preamble before the first bulleted line. When the reply has no
    # bullets at all, keep every line instead of slicing from index -1.
    first_bullet = next((i for i, line in enumerate(lines) if line.startswith("-")), 0)

    labels = []
    for line in lines[first_bullet:]:
        # Only strip the bullet marker when it is actually there.
        if line.startswith("-"):
            line = line[1:]
        # Remove parenthesised remarks the model tends to append, then tidy
        # the whitespace they leave behind.
        line = re.sub(r"\([^)]*\)", "", line).strip()
        if line:
            labels.append(line)
    return "\n".join(labels)
def predict_ner(openai_key, data: str, entities: str, explain_data: str = "", max_tokens: int = 1024, temperature: float = 0.5):
    """Run NER on each item through the chat model and parse its reply.

    Args:
        openai_key: API key forwarded to ``query_prompt``.
        data: Items to tag, one per line; blank lines are ignored.
        entities: Entity names, one per line; blank lines are ignored.
        explain_data: Optional hint about the data, embedded in the prompt.
        max_tokens: Forwarded to ``query_prompt``.
        temperature: Forwarded to ``query_prompt``.

    Returns:
        A list with one entry per tagged item. Each entry is a list whose
        first element is the item text, followed by ``[text, entity]`` pairs
        in the order they appear in the model reply.
    """
    # Give every entity a numeric id, e.g. "B-LOC" -> "0: B-LOC". The numbered
    # list is built directly rather than via str.replace, which would also
    # rewrite labels that merely contain another label as a substring.
    entity_names = [line.strip() for line in entities.splitlines() if line.strip()]
    entities_dict = {str(i): name for i, name in enumerate(entity_names)}
    numbered_entities = "\n".join(f"{i}: {name}" for i, name in entities_dict.items())

    # Blank lines are removed from both the items and the reply so that the
    # n-th reply line lines up with the n-th item.
    items = [line for line in data.splitlines() if line.strip()]
    items_text = "\n".join(items)

    prompt = (
        f"Perform NER on the each item. Group the words if you need.\nFormat the output as (<word>|<entity id>) (<word>|<entity id>).\nExample (IPhone XS): (IPhone|101) (XS|107)\n\n"
        f"Entity id and name\n---\n{numbered_entities}\n\n"
        f"Items list (hint: {explain_data})\n---\n{items_text}\n\n"
    )
    print(prompt)
    output = query_prompt(openai_key, prompt, max_tokens=max_tokens, temperature=temperature)

    # Collect the content of every "(...)" group on each non-blank reply line.
    ner_results_by_items = [
        re.findall(r"\((.*?)\)", line)
        for line in output.splitlines()
        if line.strip()
    ]
    print(ner_results_by_items)

    parsed_ner_results_by_items = []
    # zip() stops at the shorter sequence, so surplus reply lines cannot
    # index past the end of the item list.
    for item, ner_result in zip(items, ner_results_by_items):
        parsed_ner_result = [item]
        for ner in ner_result:
            # Split on the LAST "|" so words that contain "|" stay intact.
            text, separator, entity_id = ner.rpartition("|")
            if not separator:
                # Not a "<word>|<id>" token (e.g. an echoed "(IPhone XS)").
                continue
            entity_id = entity_id.strip()
            # Unknown ids fall back to the raw value instead of raising.
            entity = entities_dict.get(entity_id, entity_id)
            parsed_ner_result.append([text.strip(), entity])
        parsed_ner_results_by_items.append(parsed_ner_result)
    return parsed_ner_results_by_items
def predict_ner_format_dataframe(openai_key: str, data: str, labels: str, explain_data: str = "", max_tokens: int = 1024, temperature: float = 0.5):
    """Run ``predict_ner`` and shape its result for the dataframe and file outputs.

    Args:
        openai_key: API key forwarded to ``predict_ner``.
        data: Items to tag, one per line.
        labels: Entity names, one per line.
        explain_data: Optional hint about the data.
        max_tokens: Forwarded to ``predict_ner``.
        temperature: Forwarded to ``predict_ner``.

    Returns:
        A ``(df, xlsx)`` tuple: ``df`` has a ``text`` column followed by
        ``label_0 .. label_k`` columns holding ``"text: label"`` strings, and
        ``xlsx`` is the path returned by ``df_to_excel`` for the same frame.
    """
    ner_data = predict_ner(openai_key, data, labels, explain_data, max_tokens, temperature)

    # Each row is [full_text, [text, label], [text, label], ...]. The pairs
    # become "text: label" strings; the leading full text is kept as is.
    rows = [
        [f"{cell[0]}: {cell[1]}" if isinstance(cell, list) else cell for cell in row]
        for row in ner_data
    ]

    # Rows may differ in length; pandas pads the shorter ones.
    df = pd.DataFrame(rows)
    if df.empty:
        # No rows came back: an empty frame has zero columns, so assigning
        # the ["text", ...] header to it would raise.
        df = pd.DataFrame(columns=["text"])
    else:
        df.columns = ["text", *[f"label_{i}" for i in range(len(df.columns) - 1)]]

    xlsx = df_to_excel(df)
    return df, xlsx
def df_to_excel(df):
    """Write ``df`` to a fresh ``.xlsx`` temporary file and return its path.

    Args:
        df: Object exposing ``to_excel(path, index=...)`` (a pandas DataFrame
            in this app).

    Returns:
        The path of the written file. Every call gets its own file, so
        concurrent requests do not overwrite each other's export. The file is
        not deleted here; cleanup is left to the caller or the OS.
    """
    import tempfile

    # Reserve a unique name, then close the handle before writing so the
    # path can be reopened by the Excel writer on every platform.
    with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as handle:
        path = handle.name
    df.to_excel(path, index=False)
    return path
# UI layout: the top row holds the inputs (left) and the suggested labels
# (right); the bottom row holds the NER table and the downloadable file.
with gr.Blocks() as demo:
    gr.Markdown("# OpenAI ChatGPT Weak Labeler")

    with gr.Row(variant="panel"):
        with gr.Column(variant="panel"):
            # The key is a secret, so mask it instead of echoing it on screen.
            openai_key = gr.Textbox(placeholder="Enter your OpenAI API key here...", label="OpenAI API Key", type="password")
            input_data = gr.Textbox(lines=5, placeholder="Enter your data here...", label="Data")
            btn_predict_labels = gr.Button("Predict labels")

            with gr.Accordion("Advanced Options", open=False):
                explain_data = gr.Textbox(placeholder="Explain your data", label="Explain Data")
                max_tokens = gr.Slider(1, 4096, 1024, label="Max Tokens")
                temperature = gr.Slider(0.0, 1.0, 0.5, label="Temperature")

        with gr.Column():
            # Output of "Predict labels"; also read as input by "Predict NER".
            output_labels = gr.Textbox(lines=5, label="Labels")
            btn_predict_ner = gr.Button("Predict NER", variant="primary")

    with gr.Row(variant="panel"):
        with gr.Column():
            output_ner = gr.Dataframe(label="NER Output")
            output_ner_file = gr.File()

    # Step 1: suggest labels for the entered data.
    btn_predict_labels.click(fn=predict_labels, inputs=[openai_key, input_data, explain_data, max_tokens, temperature], outputs=[output_labels])
    # Step 2: tag the data with the labels and fill the table and file outputs.
    btn_predict_ner.click(fn=predict_ner_format_dataframe, inputs=[openai_key, input_data, output_labels, explain_data, max_tokens, temperature], outputs=[output_ner, output_ner_file])

# break line in css
# NOTE(review): this rule targets elem_id "htext"; no component in the layout
# above sets that id, so it appears to have no effect at the moment.
demo.css = "#htext .no-cat {white-space: pre-wrap;}"
demo.launch()