diff --git a/elm/db_wiz.py b/elm/db_wiz.py index 0a5af9c2..f62832e5 100644 --- a/elm/db_wiz.py +++ b/elm/db_wiz.py @@ -3,15 +3,11 @@ ELM energy wizard """ import os -import copy -import numpy as np -import json import psycopg2 -from datetime import date, datetime import pandas as pd from elm.base import ApiBase -from elm.wizard import EnergyWizard + class DataBaseWizard(ApiBase): """Interface to ask OpenAI LLMs about energy research.""" @@ -21,7 +17,7 @@ class DataBaseWizard(ApiBase): "output based on user queries.") """High level model role, somewhat redundant to MODEL_INSTRUCTION""" - def __init__(self, connection_string, model=None, token_budget=3500, ref_col=None): + def __init__(self, connection_string, model=None, token_budget=3500): """ Parameters ---------- @@ -34,9 +30,6 @@ def __init__(self, connection_string, model=None, token_budget=3500, ref_col=Non Number of tokens that can be embedded in the prompt. Note that the default budget for GPT-3.5-Turbo is 4096, but you want to subtract some tokens to account for the response budget. - ref_col : None | str - Optional column label in the corpus that provides a reference text - string for each chunk of text. """ super().__init__(model) @@ -62,7 +55,8 @@ def get_sql_for(self, query): 'user question: "{}"\n\n' 'Return all columns from the database. ' 'All the tables are in the schema "loads"' - 'Please only return the SQL query with no commentary or preface.' + 'Please only return the SQL query with ' + 'no commentary or preface.' .format(self.database_describe, query)) out = super().chat(e_query, temperature=0) print(out) @@ -74,8 +68,8 @@ def run_sql(self, sql): response.""" query = sql print(query) - # Move Connection or cursor to init and test so that you aren't re-intializing - # it with each instance. + # Move Connection or cursor to init and test so that you aren't + # re-intializing it with each instance. with self.connection.cursor() as cursor: cursor.execute(query) data = cursor.fetchall() @@ -84,7 +78,7 @@ def run_sql(self, sql): return df def get_py_code(self, query, df): - """""" + """Get python code to respond to query""" e_query = ('Great it worked! I have made a dataframe from the output ' 'of the SQL query you gave me. ' 'Here is the dataframe head: \n{}\n' @@ -92,36 +86,29 @@ def get_py_code(self, query, df): 'Here is the dataframe description: \n{}\n' 'Here is the dataframe datatypes: \n{}\n' 'Now please write python code using matplotlib to plot ' - 'the data in the dataframe based on the original user query: "{}"' - .format(df.head(), df.tail(), df.describe(), df.dtypes, query)) + 'the data in the dataframe based on ' + 'the original user query: "{}"' + .format(df.head(), df.tail(), df.describe(), + df.dtypes, query)) out = super().chat(e_query, temperature=0) - ## get response from output + # get response from output # Need to fix full response full_response = out - #print(full_response) - ## get python code from response + # get python code from response full_response = full_response[full_response.find('```python')+9:] full_response = full_response[:full_response.find('```')] py = full_response return py def run_py_code(self, py, df): + """Run the python code with ``exec`` to plot the queried data""" try: exec(py) - return plt - except: + except Exception: print(py) - def chat(self, query, - debug=True, - stream=True, - temperature=0, - convo=False, - token_budget=None, - new_info_threshold=0.7, - print_references=False, - return_chat_obj=False): + def chat(self, query): """Answers a query by doing a semantic search of relevant text with embeddings and then sending engineered query to the LLM. @@ -129,48 +116,10 @@ def chat(self, query, ---------- query : str Question being asked of EnergyWizard - debug : bool - Flag to return extra diagnostics on the engineered question. - stream : bool - Flag to print subsequent chunks of the response in a streaming - fashion - temperature : float - GPT model temperature, a measure of response entropy from 0 to 1. 0 - is more reliable and nearly deterministic; 1 will give the model - more creative freedom and may not return as factual of results. - convo : bool - Flag to perform semantic search with full conversation history - (True) or just the single query (False). Call EnergyWizard.clear() - to reset the chat history. - token_budget : int - Option to override the class init token budget. - new_info_threshold : float - New text added to the engineered query must contain at least this - much new information. This helps prevent (for example) the table of - contents being added multiple times. - print_references : bool - Flag to print references if EnergyWizard is initialized with a - valid ref_col. - return_chat_obj : bool - Flag to only return the ChatCompletion from OpenAI API. - - Returns - ------- - response : str - GPT output / answer. - query : str - If debug is True, the engineered query asked of GPT will also be - returned here - references : list - If debug is True, the list of references (strs) used in the - engineered prompt is returned here """ self.query = query self.sql = self.get_sql_for(query) self.df = self.run_sql(self.sql) - self.py = self.get_py_code(query = query, df = self.df) - self.plt = self.run_py_code(self.py, self.df) - return self.plt - - + self.py = self.get_py_code(query=query, df=self.df) + self.run_py_code(self.py, self.df) diff --git a/elm/experts.py b/elm/experts.py index 1761f9b3..eca778f4 100644 --- a/elm/experts.py +++ b/elm/experts.py @@ -7,8 +7,6 @@ from glob import glob import pandas as pd import sys -import copy -import numpy as np from elm.base import ApiBase @@ -43,16 +41,17 @@ 'answer the question, say "I do not know."') EnergyWizard.MODEL_INSTRUCTION = EnergyWizard.MODEL_ROLE -DataBaseWizard.URL = (f'https://stratus-embeddings-south-central.openai.azure.com/' - f'openai/deployments/{model}/chat/' - f'completions?api-version={openai.api_version}') +DataBaseWizard.URL = ( + f'https://stratus-embeddings-south-central.openai.azure.com/' + f'openai/deployments/{model}/chat/' + f'completions?api-version={openai.api_version}') DataBaseWizard.HEADERS = {"Content-Type": "application/json", - "Authorization": f"Bearer {openai.api_key}", - "api-key": f"{openai.api_key}", - } + "Authorization": f"Bearer {openai.api_key}", + "api-key": f"{openai.api_key}"} st.set_option('deprecation.showPyplotGlobalUse', False) + @st.cache_data def get_corpus(): """Get the corpus of text data with embeddings.""" @@ -64,22 +63,21 @@ def get_corpus(): @st.cache_resource -def get_wizard(model = model): +def get_wizard(model=model): """Get the energy wizard object. - Parameters - ---------- - model : str - State which model to use for the energy wizard. - - Returns - ------- - response : str - GPT output / answer. - wizard : EnergyWizard - Returns the energy wizard object for use in chat responses. - """ - + Parameters + ---------- + model : str + State which model to use for the energy wizard. + + Returns + ------- + response : str + GPT output / answer. + wizard : EnergyWizard + Returns the energy wizard object for use in chat responses. + """ # Getting Corpus of data. If no corpus throw error for user. try: @@ -93,8 +91,9 @@ def get_wizard(model = model): wizard = EnergyWizard(corpus, ref_col='ref', model=model) return wizard + class MixtureOfExperts(ApiBase): - """Interface to ask OpenAI LLMs about energy + """Interface to ask OpenAI LLMs about energy research either from a database or report.""" """Parameters @@ -118,10 +117,9 @@ class MixtureOfExperts(ApiBase): "a database and creating a figure.") """High level model role, somewhat redundant to MODEL_INSTRUCTION""" - def __init__(self, connection_string, model=None, token_budget=3500, ref_col=None): - self.wizard_db = DataBaseWizard(model = model, connection_string = connection_string) - self.wizard_chat = get_wizard() - self.model = model + def __init__(self, db_wiz, txt_wiz, model=None): + self.wizard_db = db_wiz + self.wizard_chat = txt_wiz super().__init__(model) def chat(self, query, @@ -198,31 +196,30 @@ def chat(self, query, else: response_message = response["choices"][0]["message"]["content"] - message_placeholder = st.empty() full_response = "" if '1' in response_message: out = self.wizard_chat.chat(query, - debug=True, stream=True, token_budget=6000, - temperature=0.0, print_references=True, - convo=False, return_chat_obj=True) - + debug=True, stream=True, + token_budget=6000, temperature=0.0, + print_references=True, convo=False, + return_chat_obj=True) + for response in out[0]: full_response += response.choices[0].delta.content or "" message_placeholder.markdown(full_response + "▌") - - elif '2' in response_message: + elif '2' in response_message: out = self.wizard_db.chat(query, - debug=True, stream=True, token_budget=6000, - temperature=0.0, print_references=True, - convo=False, return_chat_obj=True) + debug=True, stream=True, + token_budget=6000, temperature=0.0, + print_references=True, convo=False, + return_chat_obj=True) - st.pyplot(fig = out, clear_figure = False) + st.pyplot(fig=out, clear_figure=False) - else: + else: response_message = 'Error cannot find data in report or database.' - - return full_response \ No newline at end of file + return full_response diff --git a/examples/db_wizard/retrieve_docs_general.py b/examples/db_wizard/retrieve_docs_general.py index 7aedeb96..e8f0615e 100644 --- a/examples/db_wizard/retrieve_docs_general.py +++ b/examples/db_wizard/retrieve_docs_general.py @@ -4,12 +4,10 @@ import logging import openai import time -from glob import glob from rex import init_logger from elm.pdf import PDFtoTXT from elm.embed import ChunkAndEmbed -from elm.osti import OstiList logger = logging.getLogger(__name__) @@ -51,15 +49,6 @@ os.makedirs(TXT_DIR, exist_ok=True) os.makedirs(EMBED_DIR, exist_ok=True) - #osti = OstiList(URL, n_pages=1) - #osti.download(PDF_DIR) - - #meta = osti.meta.copy() - #meta['osti_id'] = meta['osti_id'].astype(str) - #meta = meta.drop_duplicates(subset=['osti_id']) - #meta['fp'] = PDF_DIR + meta['fn'] - #meta.to_csv('./meta.csv', index=False) - '''missing = [] for i, row in meta.iterrows(): if not os.path.exists(row['fp']): @@ -68,8 +57,8 @@ fns = os.listdir(PDF_DIR) - for fn in fns: - if 'pdf' in fn: + for fn in fns: + if 'pdf' in fn: print(fn) fp = os.path.join(PDF_DIR, fn) txt_fp = os.path.join(TXT_DIR, fn.replace('.pdf', '.txt')) @@ -87,78 +76,25 @@ if pdf_obj.is_double_col(): text = pdf_obj.clean_poppler(layout=False) text = pdf_obj.clean_headers(char_thresh=0.6, page_thresh=0.8, - split_on='\n', - iheaders=[0, 1, 3, -3, -2, -1]) + split_on='\n', + iheaders=[0, 1, 3, -3, -2, -1]) with open(txt_fp, 'w') as f: f.write(text) logger.info(f'Saved: {txt_fp}') - if not os.path.exists(embed_fp): - #logger.info('Embedding {}/{}: "{}"' - # .format(i+1, len(meta), row['title'])) - #tag = f"Title: {row['title']}\nAuthors: {row['authors']}" - tag = f"Title: Fema \n Authors: FEMA" - obj = ChunkAndEmbed(text, tag=tag, tokens_per_chunk=500, overlap=1) + tag = "Title: Fema \n Authors: FEMA" + obj = ChunkAndEmbed(text, tag=tag, tokens_per_chunk=500, + overlap=1) embeddings = asyncio.run(obj.run_async(rate_limit=3e4)) if any(e is None for e in embeddings): raise RuntimeError('Embeddings are None!') else: df = pd.DataFrame({'text': obj.text_chunks.chunks, - 'embedding': embeddings, - 'osti_id': 1}) + 'embedding': embeddings, + 'osti_id': 1}) df.to_json(embed_fp, indent=2) logger.info('Saved: {}'.format(embed_fp)) time.sleep(5) - ''' - for i, row in meta.iterrows(): - fp = os.path.join(PDF_DIR, row['fn']) - txt_fp = os.path.join(TXT_DIR, row['fn'].replace('.pdf', '.txt')) - embed_fp = os.path.join(EMBED_DIR, row['fn'].replace('.pdf', '.json')) - - assert fp.endswith('.pdf') - assert os.path.exists(fp) - - if os.path.exists(txt_fp): - with open(txt_fp, 'r') as f: - text = f.read() - else: - pdf_obj = PDFtoTXT(fp) - text = pdf_obj.clean_poppler(layout=True) - if pdf_obj.is_double_col(): - text = pdf_obj.clean_poppler(layout=False) - text = pdf_obj.clean_headers(char_thresh=0.6, page_thresh=0.8, - split_on='\n', - iheaders=[0, 1, 3, -3, -2, -1]) - with open(txt_fp, 'w') as f: - f.write(text) - logger.info(f'Saved: {txt_fp}') - - if not os.path.exists(embed_fp): - logger.info('Embedding {}/{}: "{}"' - .format(i+1, len(meta), row['title'])) - tag = f"Title: {row['title']}\nAuthors: {row['authors']}" - obj = ChunkAndEmbed(text, tag=tag, tokens_per_chunk=500, overlap=1) - embeddings = asyncio.run(obj.run_async(rate_limit=3e4)) - if any(e is None for e in embeddings): - raise RuntimeError('Embeddings are None!') - else: - df = pd.DataFrame({'text': obj.text_chunks.chunks, - 'embedding': embeddings, - 'osti_id': row['osti_id']}) - df.to_json(embed_fp, indent=2) - logger.info('Saved: {}'.format(embed_fp)) - time.sleep(5) - - bad = [] - fps = glob(EMBED_DIR + '*.json') - for fp in fps: - data = pd.read_json(fp) - if data['embedding'].isna().any(): - bad.append(fp) - assert not any(bad), f'Bad output: {bad}' - ''' - - logger.info('Finished!') diff --git a/examples/db_wizard/run_db_wizard_app.py b/examples/db_wizard/run_db_wizard_app.py index 7741424e..83caa876 100644 --- a/examples/db_wizard/run_db_wizard_app.py +++ b/examples/db_wizard/run_db_wizard_app.py @@ -1,45 +1,42 @@ import streamlit as st import os import openai -from glob import glob -import pandas as pd -import sys -#from elm import EnergyWizard from elm.db_wiz import DataBaseWizard - model = 'gpt-4' -conn_string = 'postgresql://la100_admin:laa5SSf6KOC6k9xl@gds-cluster-1.cluster-ccklrxkcenui.us-west-2.rds.amazonaws.com:5432/la100-stage' +conn_string = ('postgresql://la100_admin:laa5SSf6KOC6k9xl' + '@gds-cluster-1.cluster-ccklrxkcenui' + '.us-west-2.rds.amazonaws.com:5432/la100-stage') -openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT") -openai.api_key = os.getenv("AZURE_OPENAI_KEY") +openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT") +openai.api_key = os.getenv("AZURE_OPENAI_KEY") openai.api_type = 'azure' -openai.api_version = '2023-03-15-preview' +openai.api_version = '2023-03-15-preview' -DataBaseWizard.URL = (f'https://stratus-embeddings-south-central.openai.azure.com/' - f'openai/deployments/{model}/chat/' - f'completions?api-version={openai.api_version}') +DataBaseWizard.URL = ( + f'https://stratus-embeddings-south-central.openai.azure.com/' + f'openai/deployments/{model}/chat/' + f'completions?api-version={openai.api_version}') DataBaseWizard.HEADERS = {"Content-Type": "application/json", - "Authorization": f"Bearer {openai.api_key}", - "api-key": f"{openai.api_key}", - } + "Authorization": f"Bearer {openai.api_key}", + "api-key": f"{openai.api_key}"} st.set_option('deprecation.showPyplotGlobalUse', False) if __name__ == '__main__': - wizard = DataBaseWizard(model = model, connection_string = conn_string) + wizard = DataBaseWizard(model=model, connection_string=conn_string) opening_message = '''Hello! \n I am the Database Wizard. I - Have access to a single database. You can ask me questions - about the data and ask me to produce visualizations of the data. - Here are some examples of what you can ask me: - \n - Plot a time series of the winter residential - heating load for the moderate scenario + Have access to a single database. You can ask me questions + about the data and ask me to produce visualizations of the data. + Here are some examples of what you can ask me: + \n - Plot a time series of the winter residential + heating load for the moderate scenario in model year 2030 for geography 1. - \n - Plot a time series of the winter - residential heating load for the moderate scenario + \n - Plot a time series of the winter + residential heating load for the moderate scenario in model year 2030 for the first five load centers. ''' @@ -57,7 +54,7 @@ # Clearing Wizard wizard.clear() - wizard = DataBaseWizard(model = model, connection_string = conn_string) + wizard = DataBaseWizard(model=model, connection_string=conn_string) for message in st.session_state.messages: with st.chat_message(message["role"]): @@ -78,13 +75,4 @@ temperature=0.0, print_references=True, convo=False, return_chat_obj=True) - st.pyplot(fig = out, clear_figure = False) - #for response in out[0]: - # full_response += response.choices[0].delta.content or "" - # message_placeholder.markdown(full_response + "▌") - - #message_placeholder.markdown(full_response) - - #st.session_state.messages.append({"role": "assistant", - # "content": full_response}) - \ No newline at end of file + st.pyplot(fig=out, clear_figure=False) diff --git a/examples/db_wizard/run_experts_app.py b/examples/db_wizard/run_experts_app.py index b2b0eeb0..d914080c 100644 --- a/examples/db_wizard/run_experts_app.py +++ b/examples/db_wizard/run_experts_app.py @@ -1,11 +1,4 @@ import streamlit as st -import os -import openai -from glob import glob -import pandas as pd -import sys - -#from elm import EnergyWizard from elm.experts import MixtureOfExperts model = 'gpt-4' @@ -13,14 +6,20 @@ conn_string = '' if __name__ == '__main__': - wizard = MixtureOfExperts(model = model, connection_string = conn_string) - - msg = """Multi-Modal Wizard Demonstration!\nI am a multi-modal AI demonstration. I have access to NREL technical reports regarding the LA100 study and access to several LA100 databases. If you ask me a question, I will attempt to answer it using the reports or the database. Below are some examples of queries that have been shown to work. - \n - Describe chapter 2 of the LA100 report. - \n - What are key findings of the LA100 report? + wizard = MixtureOfExperts(model=model, connection_string=conn_string) + + msg = ("""Multi-Modal Wizard Demonstration!\nI am a multi-modal AI + demonstration. I have access to NREL technical reports regarding the + LA100 study and access to several LA100 databases. If you ask me a + question, I will attempt to answer it using the reports or the + database. Below are some examples of queries that have been shown to + work. + \n - Describe chapter 2 of the LA100 report. + \n - What are key findings of the LA100 report? \n - What enduse consumes the most electricity? - \n - During the year 2020 which geographic regions consumed the most electricity? - """ + \n - During the year 2020 which geographic regions consumed the + most electricity? + """) st.title(msg) @@ -41,21 +40,12 @@ message_placeholder = st.empty() full_response = "" - out = wizard.chat(query = prompt, + out = wizard.chat(query=prompt, debug=True, stream=True, token_budget=6000, temperature=0.0, print_references=True, convo=False, return_chat_obj=True) - #references = out[-1] - - #for response in out[0]: - # full_response += response.choices[0].delta.content or "" - # message_placeholder.markdown(full_response + "▌") message_placeholder.markdown(full_response) st.session_state.messages.append({"role": "assistant", "content": full_response}) - - - -