-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp_EDA.py
234 lines (187 loc) · 11.1 KB
/
app_EDA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
#Import required libraries
import os
from api_key import apiKey, api_Key
import streamlit as st
import pandas as pd
from langchain.llms import OpenAI, GooglePalm, HuggingFaceHub
from langchain.agents import create_pandas_dataframe_agent
from dotenv import load_dotenv, find_dotenv
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, SimpleSequentialChain, SequentialChain
from langchain.agents.agent_toolkits import create_python_agent
from langchain.tools.python.tool import PythonREPLTool
from langchain.agents.agent_types import AgentType
from langchain.utilities import WikipediaAPIWrapper
# API keys (Google PaLM and Hugging Face Hub), taken from the local api_key module.
# LangChain's GooglePalm wrapper reads the key from the upper-case GOOGLE_API_KEY
# variable; environment names are case-sensitive on Linux/macOS, so the
# mixed-case name alone is not picked up there.
os.environ['GOOGLE_API_KEY'] = apiKey
# Kept so anything that still reads the original mixed-case name keeps working.
os.environ['Google_API_KEY'] = apiKey
os.environ['HUGGINGFACEHUB_API_TOKEN'] = api_Key
# Load any further settings from a .env file found by find_dotenv().
load_dotenv(find_dotenv())
def header(url):
    """Render *url* as a highlighted banner paragraph via ``st.markdown``.

    Args:
        url: The content to display. Despite the name, callers in this file
            pass LLM answers and DataFrames; the value is converted with
            ``str()`` before rendering.

    Returns:
        None. The element is written to the page as a side effect, so callers
        should not forward the return value to ``st.write``.
    """
    import html  # local import: only this function needs it

    # The text is placed inside raw HTML (unsafe_allow_html=True), so escape it:
    # otherwise "<", ">" or "&" in model output or CSV data would be parsed as
    # markup and garble (or inject into) the page.
    safe_text = html.escape(str(url))
    st.markdown(
        f'<p style="background-color:#06006C;color:white;font-size:20px;border-radius:2%; padding:2%">{safe_text}</p>',
        unsafe_allow_html=True,
    )
# Page title shown at the top of the app
st.title('Dataset Analyzer for Lazy Fellows')
# Welcome message, written as bold Markdown
st.write("**Welcome..., My name is Timmy and I am here to mess your with your datasets.**")
# Explanation sidebar: styling, project blurb, API notice and credits.
# NOTE(review): indentation was lost in this copy of the file; the calls below
# presumably form the body of `with st.sidebar:` — confirm against the original.
with st.sidebar:
# Inject a <style> rule targeting [data-testid=stSidebar] (background colour and text colour)
st.markdown( """ <style> [data-testid=stSidebar] { background-color:
#06006C; color: white; } </style> """, unsafe_allow_html=True,)
st.write('*Your CSV \'s Adventure Begins with Timmy.*')
# Back-story caption; raw HTML is allowed so the inline colour styles apply
st.caption('''**<p style ='color:white'>Timmy was born when an intern (long story short) was too lazy to analyze a dataset his mentor had given.
His laziness can be shown through Timmy's performance. He watched only half of the Youtube tutorial to create Timmy.
And now he has upload Timmy to Github to get a job. <p style ='color:#85D17B'>If you are a recruiter please....I can do better</p></p>**
''',unsafe_allow_html=True)
# Notice about the Google Palm API and using your own API key
st.caption("<p style ='color:white'>Timmy is usnig <b>Google Palm API</b> for this purpose. If you ever feel like forking or cloning this project (First of all: WHY ?), Please use your own API key</p>", unsafe_allow_html=True)
st.divider()
# Centered credits line
st.caption("<p style ='text-align:center'> made by Joel D Joy (not the intern mentioned) </p>",unsafe_allow_html=True )
# Make sure the click-tracking dict exists in session state before it is read;
# button 1 starts out as "not clicked".
if 'clicked' not in st.session_state:
    st.session_state['clicked'] = {1: False}
# Callback that updates the click flag stored in session state
def clicked(button):
    """Mark *button* as clicked in ``st.session_state.clicked``.

    Args:
        button: Key of the button inside the ``clicked`` dict.
    """
    click_flags = st.session_state.clicked
    click_flags[button] = True
# Start button: its on_click callback calls clicked(1), which sets
# st.session_state.clicked[1] to True.
st.button("Let's get started", on_click = clicked, args=[1])
# NOTE(review): indentation was lost in this copy; the rest of the script
# presumably nests under this `if` and the `user_csv is not None` check below —
# confirm against the original file.
if st.session_state.clicked[1]:
user_csv = st.file_uploader("Upload your file here", type="csv")
if user_csv is not None:
# Rewind the uploaded file before handing it to pandas
user_csv.seek(0)
df = pd.read_csv(user_csv, low_memory=False)
# LLM used by the agents and chains defined below (temperature 0)
llm = GooglePalm(temperature = 0)
# llm1 = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.1", model_kwargs={"temperature":0.5, "max_length":512,'trust_remote_code':True})
# Sidebar helper
@st.cache_data
def steps_eda():
    """Ask the LLM for two data-science jokes and return its reply.

    The result is cached with ``st.cache_data``.
    """
    return llm('Write 2 Data Science Jokes')
# Pandas agent: built from the LLM and the uploaded DataFrame; the analysis
# functions below send their natural-language questions to it via `.run(...)`.
pandas_agent = create_pandas_dataframe_agent(llm, df, verbose = True)
#Functions main
@st.cache_data
def _general_eda_report(frame, _agent):
    """Render the general EDA report (overview, cleaning, summarisation).

    Args:
        frame: DataFrame under analysis. It is a regular argument so that
            Streamlit includes it in the cache key and a newly uploaded
            dataset produces a fresh report.
        _agent: Agent whose ``run`` method answers the questions. The leading
            underscore tells Streamlit not to hash it.
    """
    # header() writes its element itself and returns None, so its result is
    # not forwarded to st.write (that would render a stray "None").

    # Data Overview
    st.write("**Data Overview**")
    st.write("The first rows of your dataset look like this:")
    header(frame.head())
    # Data Cleaning
    st.write("**Data Cleaning**")
    header(_agent.run("What are the meaning of the columns?"))
    header(_agent.run("How many missing values does this dataframe have? Start the answer with 'There are'"))
    header(_agent.run("Are there any duplicate values and if so where?"))
    # Data Summarization
    st.write("**Data Summarisation**")
    header(frame.describe())
    header(_agent.run("Calculate correlations between numerical variables to identify potential relationships."))
    header(_agent.run("Identify outliers in the data that may be erroneous or that may have a significant impact on the analysis."))
    header(_agent.run("What new features would be interesting to create?."))


def function_agent():
    """Show the general EDA report for the currently uploaded dataset.

    Reads ``df`` and ``pandas_agent`` from the enclosing scope and delegates
    to a cached helper keyed on the DataFrame. The previous zero-argument
    cached version had nothing in its cache key, so it kept replaying the
    first report even after a different file was uploaded.
    """
    _general_eda_report(df, pandas_agent)
@st.cache_data
def _variable_report(frame, _agent, variable):
    """Render the chart and agent answers for one variable.

    Args:
        frame: DataFrame under analysis (part of the cache key).
        _agent: Agent whose ``run`` method answers the questions; the leading
            underscore tells Streamlit not to hash it.
        variable: Column name typed by the user (part of the cache key).
    """
    st.line_chart(frame, y=[variable])
    questions = (
        f"Give me a summary of the statistics of {variable}",
        f"Check for normality or specific distribution shapes of {variable}",
        f"Assess the presence of outliers of {variable}",
        f"Analyse trends, seasonality, and cyclic patterns of {variable}",
        f"Determine the extent of missing values of {variable}",
    )
    # Ask and display one question at a time, in the order listed above.
    for question in questions:
        st.write(_agent.run(question))


def function_question_variable():
    """Show the analysis of the variable the user is currently interested in.

    Reads ``df``, ``pandas_agent`` and ``user_question_variable`` from the
    enclosing scope and delegates to a cached helper keyed on the DataFrame
    and the variable name. The previous zero-argument cached version had an
    empty cache key, so changing the variable (or the dataset) replayed the
    first answer.
    """
    _variable_report(df, pandas_agent, user_question_variable)
@st.cache_data
def _dataframe_answer(frame, _agent, question):
    """Ask the agent one free-form question and write the answer.

    Args:
        frame: DataFrame under analysis. Not used in the body; it is passed
            only so the cache entry is tied to the current dataset.
        _agent: Agent whose ``run`` method answers the question; the leading
            underscore tells Streamlit not to hash it.
        question: The user's question (part of the cache key).
    """
    st.write(_agent.run(question))


def function_question_dataframe():
    """Answer the user's current free-form question about the dataframe.

    Reads ``df``, ``pandas_agent`` and ``user_question_dataframe`` from the
    enclosing scope and delegates to a cached helper keyed on the DataFrame
    and the question. The previous zero-argument cached version had an empty
    cache key, so a new question replayed the first answer.
    """
    _dataframe_answer(df, pandas_agent, user_question_dataframe)
@st.cache_resource
def wiki(prompt):
    """Run *prompt* through ``WikipediaAPIWrapper`` and return the result.

    Args:
        prompt: Text to look up.

    Returns:
        Whatever ``WikipediaAPIWrapper().run`` returns for the prompt.
    """
    wikipedia = WikipediaAPIWrapper()
    return wikipedia.run(prompt)
@st.cache_data
def prompt_templates():
    """Build the two prompt templates used by the sequential chain.

    Returns:
        A ``(data_problem_template, model_selection_template)`` tuple: the
        first takes ``business_problem``; the second takes ``data_problem``
        and ``wikipedia_research``.
    """
    problem_text = 'Convert the following business problem into a data science problem: {business_problem}.'
    selection_text = 'Give a list of machine learning algorithms that are suitable to solve this problem: {data_problem}, while using this wikipedia research: {wikipedia_research}.'
    return (
        PromptTemplate(input_variables=['business_problem'], template=problem_text),
        PromptTemplate(input_variables=['data_problem', 'wikipedia_research'], template=selection_text),
    )
# NOTE(review): the st.cache_data decorator was commented out on this function
# in the original; the reason is not visible here.
def chains():
    """Assemble the two-step chain: business problem -> data problem -> models.

    Returns:
        A ``SequentialChain`` that takes ``business_problem`` and
        ``wikipedia_research`` and outputs ``data_problem`` and
        ``model_selection``.
    """
    problem_prompt, selection_prompt = prompt_templates()
    steps = [
        LLMChain(llm=llm, prompt=problem_prompt, verbose=True, output_key='data_problem'),
        LLMChain(llm=llm, prompt=selection_prompt, verbose=True, output_key='model_selection'),
    ]
    return SequentialChain(
        chains=steps,
        input_variables=['business_problem', 'wikipedia_research'],
        output_variables=['data_problem', 'model_selection'],
        verbose=True,
    )
@st.cache_data
def chains_output(prompt, wiki_research):
    """Run the sequential chain and return its two outputs.

    Args:
        prompt: The business problem entered by the user.
        wiki_research: Wikipedia research text passed to the chain.

    Returns:
        A ``(data_problem, model_selection)`` tuple taken from the chain output.
    """
    chain_inputs = {'business_problem': prompt, 'wikipedia_research': wiki_research}
    result = chains()(chain_inputs)
    return result["data_problem"], result["model_selection"]
@st.cache_data
def list_to_selectbox(my_model_selection_input):
    """Turn the model-selection text into a list of select-box options.

    Each non-blank line is reduced to the text after its last ':' and then
    after its last '.', stripped of whitespace; empty results are dropped.

    Args:
        my_model_selection_input: Multi-line string listing algorithms.

    Returns:
        A list starting with ``"Select Algorithm"`` followed by the parsed names.
    """
    options = ["Select Algorithm"]
    for raw_line in my_model_selection_input.split('\n'):
        if not raw_line.strip():
            continue
        after_colon = raw_line.split(':')[-1]
        name = after_colon.split('.')[-1].strip()
        if name:
            options.append(name)
    return options
@st.cache_resource
def python_agent():
    """Create the zero-shot ReAct agent equipped with a Python REPL tool.

    Returns:
        The agent executor returned by ``create_python_agent``.
    """
    # NOTE(review): PythonREPLTool presumably executes the code the model
    # writes in this process — confirm this is acceptable where the app runs.
    repl_tool = PythonREPLTool()
    agent_options = {
        "llm": llm,
        "tool": repl_tool,
        "verbose": True,
        "agent_type": AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        "handle_parsing_errors": True,
    }
    return create_python_agent(**agent_options)
@st.cache_data
def python_solution(my_data_problem, selected_algorithm, user_csv):
    """Ask the Python agent to write and run a solution script.

    Args:
        my_data_problem: Data-science formulation of the business problem.
        selected_algorithm: Algorithm chosen in the select box.
        user_csv: The uploaded CSV object; it is interpolated into the prompt
            text as-is.

    Returns:
        The agent's final answer.
    """
    # NOTE(review): {user_csv} inserts the string form of the upload object,
    # presumably not the data or a file path — verify the agent can use it.
    instruction = f"Write a Python script to solve this: {my_data_problem}, using this algorithm: {selected_algorithm}, using this as your dataset: {user_csv}..Do not use Google Colab. Assume You are using Local machine which does not have google colab. Solve and give me the final answer"
    agent = python_agent()
    return agent.run(instruction)
# Main page flow: general EDA, per-variable study, free-form questions, then
# the business-problem -> algorithm -> solution steps.
# NOTE(review): indentation was lost in this copy; the nesting of the
# `with`/`if` bodies below cannot be read off the text — confirm against the
# original file before reformatting.
st.header('Exploratory data analysis')
st.subheader('General information about the dataset')
# Sidebar expander showing the LLM-generated jokes
with st.sidebar:
with st.expander('Data Science Jokes : Lame'):
st.write(steps_eda())
# General report on the uploaded dataset
function_agent()
st.subheader('Variable of study')
user_question_variable = st.text_input('What variable are you interested in')
# Only analyse once the user has typed a variable name
if user_question_variable is not None and user_question_variable !="":
function_question_variable()
st.subheader('Further study')
if user_question_variable:
user_question_dataframe = st.text_input( "Is there anything else you would like to know about your dataframe?")
# Any answer other than empty/"no"/"No" is sent to the agent as a question
if user_question_dataframe is not None and user_question_dataframe not in ("","no","No"):
function_question_dataframe()
if user_question_dataframe in ("no", "No"):
st.write("")
# Any non-empty answer (including "no") opens the data-science-problem section
if user_question_dataframe:
st.divider()
st.header("Data Science Problem")
st.write("Now that we have a solid grasp of the data at hand and a clear understanding of the variable we intend to investigate, it's important that we reframe our business problem into a data science problem.")
prompt = st.text_area('What is the business problem you would like to solve?')
if prompt:
wiki_research = wiki(prompt)
# chains_output is decorated with st.cache_data and both calls pass the same arguments
my_data_problem = chains_output(prompt, wiki_research)[0]
my_model_selection = chains_output(prompt, wiki_research)[1]
st.write(my_data_problem)
st.write(my_model_selection)
# Offer the parsed algorithm names in a select box; the first entry is the "Select Algorithm" placeholder
formatted_list = list_to_selectbox(my_model_selection)
selected_algorithm = st.selectbox("Select Machine Learning Algorithm", formatted_list)
# Generate a solution only after a real algorithm (not the placeholder) is chosen
if selected_algorithm is not None and selected_algorithm != "Select Algorithm":
st.subheader("Solution")
solution = python_solution(my_data_problem, selected_algorithm, user_csv)
st.write(solution)