-
Notifications
You must be signed in to change notification settings - Fork 0
/
experiments.py
109 lines (88 loc) · 3.38 KB
/
experiments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import arxiv
import pandas as pd
import streamlit as st
import plotly.express as px
import inspect
import textwrap
def intro():
    """Render the landing section: a greeting heading and a one-line blurb."""
    st.write("# Hello! 👋")

    # Kept as an explicit string so the markdown payload is visible at a glance.
    blurb = (
        "\n"
        "This is a sample app to explore arXiv pre-prints of AI papers for Climate action.\n"
    )
    st.markdown(blurb)
def search_arxiv(query: str, max_results: int, download: bool) -> pd.DataFrame:
    """Query arXiv and return the matching papers' metadata as a DataFrame.

    Results are requested sorted by relevance, in descending order.

    Args:
        query: arXiv search expression, e.g.
            ``"all:artificial intelligence AND all:climate change"``.
        max_results: Upper bound on the number of results requested,
            e.g. ``1000``.
        download: When truthy, additionally try to save each paper's PDF
            under ``./data/``. A failed download is reported on stdout and
            does not interrupt the search.

    Returns:
        One row per paper with the columns ``entry_id``, ``updated``,
        ``published``, ``title``, ``authors``, ``summary`` (newlines replaced
        by spaces), ``comment``, ``journal_ref``, ``doi``,
        ``primary_category``, ``categories``, ``links`` and ``pdf_url``.
        The columns are present even when the search yields no results.
    """
    columns = (
        "entry_id",
        "updated",
        "published",
        "title",
        "authors",
        "summary",
        "comment",
        "journal_ref",
        "doi",
        "primary_category",
        "categories",
        "links",
        "pdf_url",
    )

    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance,
        sort_order=arxiv.SortOrder.Descending,
    )

    arxiv_papers = []
    for result in search.results():
        record = {name: getattr(result, name) for name in columns}
        # Abstracts are hard-wrapped; flatten them to a single line.
        record["summary"] = result.summary.replace("\n", " ")
        arxiv_papers.append(record)

        if not download:
            continue

        # NOTE(review): presumably ./data/ has to exist beforehand - confirm.
        try:
            result.download_pdf(dirpath="./data/")
        except Exception as exc:
            # Best effort: one missing PDF must not abort the whole run, but
            # Ctrl-C / SystemExit still propagate (unlike a bare ``except``).
            print(f"-----> paper not available: {exc}\n")
        else:
            print("paper downloaded\n")

    # Passing ``columns`` keeps the schema stable for an empty result set.
    return pd.DataFrame.from_records(arxiv_papers, columns=list(columns))
def showcase_search_arxiv():
    """Render the data-collection section with an optional code listing.

    Shows the section header and a checkbox; when the checkbox is ticked,
    the source of ``search_arxiv`` (minus its first line) is displayed.
    """
    st.subheader("0. Data collection")

    if not st.checkbox("Show data collection code"):
        return

    lines = inspect.getsourcelines(search_arxiv)[0]
    # Skip the first source line and dedent the remainder for display.
    listing = "".join(lines[1:])
    st.code(textwrap.dedent(listing))
# NOTE(review): recent Streamlit releases deprecate ``st.cache`` in favour of
# ``st.cache_data`` - confirm the pinned Streamlit version before switching.
@st.cache
def read_data(data_filepath) -> pd.DataFrame:
    """Read the CSV at ``data_filepath`` and drop the ``"Unnamed: 0"`` column.

    Args:
        data_filepath: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        The parsed frame with every column except ``"Unnamed: 0"``
        (presumably a leftover index column from an earlier export).
    """
    frame = pd.read_csv(data_filepath)
    wanted = frame.columns != "Unnamed: 0"
    return frame.loc[:, wanted]
def explore_data(data_filepath: str):
    """Render the data-exploration page for the CSV at ``data_filepath``.

    The page has four sections: the raw table (behind a checkbox), a
    histogram, a scatter plot, and an optional listing of this function's
    own source. Both plots are coloured by the ``primary_category`` column.
    """

    def show_histogram_plot(frame: pd.DataFrame):
        # Histogram of one user-selected column.
        st.subheader("2. Histogram")
        chosen = st.selectbox("Which feature?", frame.columns[:])
        st.plotly_chart(
            px.histogram(
                frame,
                x=chosen,
                color="primary_category",
                marginal="rug",
            )
        )

    def show_scatter_plot(frame: pd.DataFrame):
        # Scatter plot of two user-selected columns.
        st.subheader("3. Scatter plot")
        x_axis = st.selectbox("Which feature on x?", frame.columns)
        y_axis = st.selectbox("Which feature on y?", frame.columns)
        st.plotly_chart(
            px.scatter(
                frame,
                x=x_axis,
                y=y_axis,
                color="primary_category",
            )
        )

    st.subheader("1. Source data")
    papers = read_data(data_filepath)
    if st.checkbox("Show collected data"):
        st.write(papers)

    show_histogram_plot(papers)
    show_scatter_plot(papers)

    st.subheader("4. Code")
    if not st.checkbox("Show data exploration code"):
        return

    # Display this function's source, skipping its first line.
    own_lines = inspect.getsourcelines(explore_data)[0]
    st.code(textwrap.dedent("".join(own_lines[1:])))