-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgetting problem articles for latexml.py
101 lines (88 loc) · 3.32 KB
/
getting problem articles for latexml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:light
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.3.2
# kernelspec:
# display_name: Python 3
# language: python
# name: python3
# ---
# +
import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker
import sys
import os
sys.path.insert(0,'arxiv.py/')
import arxiv
import databases.create_db_define_models as cre
import re
from collections import defaultdict
from process import Xtraction
def fun1(name):
    r"""
    Extract the arXiv id from the path of an ``.html`` file.

    Takes a string such as ``./error/1703/1703.01344.html\n`` and returns
    ``1703.01344``.  The line terminator after ``.html`` is optional, so the
    last line of a listing that has no final newline (or a line ending in
    ``\r\n``) is accepted as well.

    Raises ValueError when `name` does not contain ``/NNNN.NNNNN.html``
    followed by a line end.
    """
    res = re.match(r'.*/([0-9]{4}\.[0-9]{5})\.html(?:\r?\n|$)', name)
    if res is None:
        # re.match returns None on failure; report the offending input
        # instead of failing later with an AttributeError on None.
        raise ValueError('no arXiv id found in {!r}'.format(name))
    return res.group(1)
def fun2(name):
    """
    Extract the arXiv id that follows a ``/`` in `name`.

    Takes a string such as ``http://arxiv.org/abs/1703.01333v2`` and returns
    ``1703.01333``.  The pattern is only anchored at the start of the string,
    so whatever comes after the id (version suffix, file extension, newline)
    is ignored.

    Raises ValueError when `name` contains no ``/NNNN.NNNNN`` sequence.
    """
    res = re.match(r'.*/([0-9]{4}\.[0-9]{5})', name)
    if res is None:
        # re.match returns None on failure; report the offending input
        # instead of failing later with an AttributeError on None.
        raise ValueError('no arXiv id found in {!r}'.format(name))
    return res.group(1)
# Quick check of fun2 on an abs-page URL; the expression evaluates to '1703.01333'
fun2('http://arxiv.org/abs/1703.01333v2')
# -
# Load the listing of files found in the arxxmllib dataset of processed
# articles.  The listing was produced beforehand with:
#   find . -name '1703.013*' > ~/find_start_1703.013.txt
with open('../find_start_1703.013.txt', 'r') as listing:
    # One entry per line of the listing, line terminators included
    art_lst = list(listing)
# Connect to the metadata database and look up every article whose id
# contains the string 1703.013
database = 'sqlite:////mnt/databases/arxiv2.db'
eng = sa.create_engine(database, echo=False)
# Open one connection up front so an unreachable database fails here, then
# close it right away instead of leaving an unused connection dangling.
eng.connect().close()
SMaker = sessionmaker(bind=eng)
sess = SMaker()
# Alternative filter on the subject tag, kept for reference:
#sess.query(cre.Article.id).filter(cre.Article.tags.startswith("[{'term': 'math.DG'")).all()
q_lst = sess.query(cre.Article.id).filter(cre.Article.id.like("%1703.013%")).all()

# Reduce both sources to bare ids (e.g. 1703.01333) so they can be compared
arxm_set = {fun2(a) for a in art_lst}
# Each query row is a 1-tuple holding the article id
all_set = {fun2(a[0]) for a in q_lst}
# Ids known to the database but absent from the listing of processed files
prob_set = all_set.difference(arxm_set)
# Group the database ids of the problem articles under the filename of the
# manifest tar-file row that each article's tarfile_id points to
id_dict = defaultdict(list)
for art_num in prob_set:
    like_pattern = "%{}%".format(art_num)
    article_q = sess.query(cre.Article.tarfile_id, cre.Article.id)
    # Only the first matching article row is used
    tarfile_id, article_id = article_q.filter(cre.Article.id.like(like_pattern)).all()[0]
    manifest_q = sess.query(cre.ManifestTarFile.filename)
    tar_rows = manifest_q.filter(cre.ManifestTarFile.id == int(tarfile_id)).all()
    # tar_rows[0][0] is the filename column of the first manifest row
    id_dict[tar_rows[0][0]].append(article_id)
    print(tar_rows)
# Extract the files: run one Xtraction per tar file that appears as a key of
# id_dict
for tar in id_dict:
    x = Xtraction('/mnt/arXiv_src/' + tar, db='sqlite:////mnt/databases/arxiv2.db')
    # The article_name regex is a raw string: in a normal literal '\.' is an
    # invalid escape sequence and triggers a warning on current Pythons.
    # NOTE(review): '../rm_me_problem_file' is presumably the output
    # directory -- confirm against process.Xtraction.extract_tar.
    x.extract_tar('../rm_me_problem_file', article_name=r'^1703/1703\.013.*')

# Notebook display of the resulting mapping
id_dict
# ### This is the problem id dict:
# defaultdict(list,
# {'src/arXiv_src_1703_003.tar': ['http://arxiv.org/abs/1703.01327v2',
# 'http://arxiv.org/abs/1703.01306v2',
# 'http://arxiv.org/abs/1703.01331v1',
# 'http://arxiv.org/abs/1703.01314v1',
# 'http://arxiv.org/abs/1703.01305v2'],
# 'src/arXiv_src_1703_004.tar': ['http://arxiv.org/abs/1703.01399v1',
# 'http://arxiv.org/abs/1703.01375v2',
# 'http://arxiv.org/abs/1703.01396v2',
# 'http://arxiv.org/abs/1703.01373v1',
# 'http://arxiv.org/abs/1703.01379v3',
# 'http://arxiv.org/abs/1703.01356v4',
# 'http://arxiv.org/abs/1703.01357v3',
# 'http://arxiv.org/abs/1703.01397v1',
# 'http://arxiv.org/abs/1703.01395v1',
# 'http://arxiv.org/abs/1703.01376v4',
# 'http://arxiv.org/abs/1703.01352v1',
# 'http://arxiv.org/abs/1703.01354v2',
# 'http://arxiv.org/abs/1703.01378v1']})