This repository has been archived by the owner on May 30, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain_crawl.py
64 lines (51 loc) · 2.54 KB
/
main_crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# https://arxiv.org/list/cs.PF/recent
# https://arxiv.org/list/quant-ph/recent
import os
import datetime
import time
import sqlite3
import dotenv
import openai
import crawl_arxiv
# Populate os.environ from a .env file (python-dotenv) before any variable is read below.
dotenv.load_dotenv()
# Configure the OpenAI client at import time; a missing OPENAI_API_KEY raises KeyError here.
openai.api_key = os.environ["OPENAI_API_KEY"]
# sqlite3 PaperParseQueue table
def insert_only_user():
    """Insert the single configured user into the ``User`` table.

    The database path is read from ``SQLITE3_DB_PATH``; the username and the
    plain-text password come from ``ONLY_USER_NAME`` and ``ONLY_USER_PASS``.
    Only the werkzeug hash of the password is stored.

    Raises:
        KeyError: if one of the environment variables above is not set.
        sqlite3.Error: if the INSERT fails (e.g. the table does not exist).
    """
    # Imported lazily so the crawler loop does not need werkzeug installed.
    from werkzeug.security import generate_password_hash

    sql_conn = sqlite3.connect(os.environ['SQLITE3_DB_PATH'])
    try:
        # NOTE(review): the email literal looks like a redacted placeholder -- confirm.
        user_row = (
            os.environ['ONLY_USER_NAME'],
            '[email protected]',
            generate_password_hash(os.environ['ONLY_USER_PASS']),
        )
        sql_conn.execute(
            'INSERT INTO User (username,email,password_hash) VALUES (?,?,?)',
            user_row,
        )
        # sqlite3 opens an implicit transaction for INSERT; without an explicit
        # commit the new row is discarded when the connection goes away.
        sql_conn.commit()
    finally:
        sql_conn.close()
def _take_queued_arxiv_ids(sql_conn):
    """Drain ``paper_parse_queue`` and return the arxiv IDs still to be crawled.

    Reads every ``arxivID`` currently in ``paper_parse_queue``, deletes exactly
    those rows, commits, and returns the IDs that have no row in ``paper`` yet.
    Duplicate IDs are collapsed and the order in which rows were fetched is kept.
    """
    queued = [row[0] for row in sql_conn.execute('SELECT arxivID FROM paper_parse_queue')]
    # Delete only the IDs that were just read, so that a row another writer
    # enqueues between the SELECT and the DELETE is not dropped unprocessed.
    sql_conn.executemany(
        'DELETE FROM paper_parse_queue WHERE arxivID = ?',
        [(arxiv_id,) for arxiv_id in queued],
    )
    sql_conn.commit()

    pending = []
    for arxiv_id in dict.fromkeys(queued):  # order-preserving de-duplication
        already_stored = sql_conn.execute(
            'SELECT arxivID FROM paper WHERE arxivID = ?', (arxiv_id,)
        ).fetchone()
        if already_stored is None:
            pending.append(arxiv_id)
    return pending


def main():
    """Run the crawl loop until the process is interrupted.

    Once per day every URL in ``recent_url_list`` is crawled; every 10 seconds
    ``paper_parse_queue`` is drained and each queued paper that is not yet in
    ``paper`` is crawled, throttled by ``CRAWL_ONE_DAY_LIMIT``.

    Raises:
        KeyError: if ``CRAWL_ONE_DAY_LIMIT`` or ``SQLITE3_DB_PATH`` is not set.
        ValueError: if ``CRAWL_ONE_DAY_LIMIT`` is not an integer.
    """
    # TODO make it a crontab (run every day)
    # crawl_arxiv_recent_paper()
    # _update_existing_arxiv_data()
    # crawl_arxiv.database.init_vector_database()
    recent_url_list = ['https://arxiv.org/list/quant-ph/recent']
    one_day = datetime.timedelta(days=1)
    num_paper_limit_one_day = int(os.environ['CRAWL_ONE_DAY_LIMIT'])
    arxivID_time_list = []  # (arxivID, crawl time) pairs from the last 24 hours
    last_query_time = None
    sql_conn = sqlite3.connect(os.environ['SQLITE3_DB_PATH'])
    try:
        while True:
            now = datetime.datetime.now()
            if last_query_time is None or now - last_query_time >= one_day:
                last_query_time = now
                for url in recent_url_list:
                    crawl_arxiv.crawl.crawl_arxiv_recent_paper(url)
            time.sleep(10)  # query every 10 seconds

            # NOTE(review): IDs are removed from the queue before they are
            # crawled, so a crash during crawling loses them -- confirm intended.
            for arxivID in _take_queued_arxiv_ids(sql_conn):
                now = datetime.datetime.now()
                arxivID_time_list = [x for x in arxivID_time_list if now - x[1] < one_day]
                # NOTE(review): `>` lets limit+1 papers through per 24h window;
                # confirm whether `>=` was intended.
                if len(arxivID_time_list) > num_paper_limit_one_day:
                    print(f'Limit reached: {len(arxivID_time_list)}')
                    # Sleep until the oldest recorded crawl is a full day old.
                    oldest_age = (now - arxivID_time_list[0][1]).total_seconds()
                    time.sleep(max(1, 60*60*24 - oldest_age))
                crawl_arxiv.crawl.crawl_one_arxiv_paper(arxivID, tag_commit_sqlite3=True)
                arxivID_time_list.append((arxivID, datetime.datetime.now()))
    finally:
        # The loop only ends through an exception (e.g. KeyboardInterrupt);
        # make sure the connection is released on the way out.
        sql_conn.close()


if __name__ == '__main__':
    main()