-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathnpcs.py
executable file
·150 lines (133 loc) · 5.55 KB
/
npcs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# -*- coding: UTF-8 -*-
#!/usr/bin/python
#program to harvest NONLINEAR PHENOMENA IN COMPLEX SYSTEMS
# FS 2013-01-28
import codecs
import os
import re
import string
import subprocess
import sys
import unicodedata

import ejlmod2
from removehtmlgesocks import akzenteabstreifen
from removehtmlgesocks import removehtmlgesocks
# Directories used by the harvester. All but tmpdir live below the same
# AFS account, so they are built from one shared prefix.
_library_home = '/afs/desy.de/user/l/library'
ejdir = _library_home + '/dok/ejl'
lproc = _library_home + '/proc'
# downloaded HTML pages are cached here
tmpdir = '/tmp'
# the resulting XML file is written here
xmldir = _library_home + '/inspire/ejl'
def tfstrip(x):
    """Return *x* without its leading and trailing whitespace.

    Small named wrapper around ``x.strip()`` so it can be handed to
    ``map``.
    """
    stripped = x.strip()
    return stripped
# Journal-specific constants.
publisher = 'Education and Upbringing'
jnl = 'npcs'
issn = '1817-2458'
jnlname = 'Nonlin.Phenom.Complex Syst.'
urltrunk = "http://www.j-npcs.org/abstracts"

# Volume and issue come from the command line: npcs.py <volume> <issue>.
# Stop with a usage hint instead of a bare IndexError when they are missing.
if len(sys.argv) < 3:
    sys.exit('usage: %s <volume> <issue>' % sys.argv[0])
vol = sys.argv[1]
issue = sys.argv[2]
# The year is derived from the volume number (volume 1 -> 1998).
year = str(int(vol) + 1997)
# Base name shared by the cached downloads and the XML output file.
jnlfilename = jnl + vol + '.' + issue
# Fetch the table of contents of the requested issue (unless a copy is
# already cached in tmpdir) and read it into one long string.
print("get table of content of %s%s.%s ..." % (jnlname, vol, issue))
tocfilname = tmpdir + "/" + jnlfilename + ".toc"
if not os.path.isfile(tocfilname):
    # lynx is started with an argument list, i.e. without a shell, so no
    # part of the URL can be interpreted as shell syntax.
    with open(tocfilname, 'wb') as tocout:
        subprocess.call(
            ['lynx', '-source',
             "%s/vol%sno%s.html" % (urltrunk, year, issue)],
            stdout=tocout)
print("read table of contents...")
articleIDs = []
typecode = 'P'
note = ''
# recnr numbers the cached article pages; recs collects the records.
recnr = 1
recs = []
# Strip every line and join them with blanks so that the regular
# expressions applied later can match across the original line breaks.
# The file is closed again as soon as it has been read.
with open(tocfilname, 'r') as tocfil:
    lines = ' '.join(map(tfstrip, tocfil.readlines()))
# Walk through the table of contents, which is split at its <HR> tags.
# For every segment that links to an abstract page, that page is fetched
# (or taken from the cache in tmpdir), parsed into a record dictionary and
# appended to recs.
#
# pages is carried over from one segment to the next; it starts out empty
# so that an entry without page information cannot raise a NameError.
pages = ''
for line in re.split('<HR>', lines):
    if re.search(r'pp\. *\d', line):
        pages = re.sub(r'.*pp\. *([\d\- ]+).*', r'\1', line)
    if not re.search('HREF.*Abstract', line):
        continue

    # --- get the abstract page -------------------------------------------
    url = re.sub('.*HREF=.*?(vol.*?html).*Abstract.*', r'\1', line)
    artfilname = "%s/%s.%s" % (tmpdir, jnlfilename, recnr)
    if not os.path.isfile(artfilname):
        print("lynx -source \"%s/%s\" > %s\n" % (urltrunk, url, artfilname))
        # url was scraped from a remote page, so it must not pass through
        # a shell: call lynx with an argument list and redirect its output
        # ourselves.
        with open(artfilname, 'wb') as artout:
            subprocess.call(['lynx', '-source', "%s/%s" % (urltrunk, url)],
                            stdout=artout)
    with open(artfilname, 'r') as artfil:
        alines = ' '.join(map(tfstrip, artfil.readlines()))

    # --- initialise the record -------------------------------------------
    rec = {}
    rec['auts'] = []
    rec['aff'] = []
    rec['vol'] = vol
    rec['issue'] = issue
    rec['typ'] = ''
    rec['jnl'] = jnlname
    rec['note'] = []
    rec['refs'] = []
    rec['tc'] = "P"
    rec['year'] = year
    # The captured page string may carry surrounding blanks; strip them and
    # split at the first hyphen only, so the unpacking always gets two parts.
    pagerange = pages.strip()
    if re.search(r'\-', pagerange):
        (rec['p1'], rec['p2']) = re.split(r' *\- *', pagerange, 1)
    else:
        rec['p1'] = rec['p2'] = pagerange

    # --- parse the abstract page -----------------------------------------
    # Everything before the first <HR> of the page is skipped.
    for aline in re.split('<HR>', alines)[1:]:
        # keywords: extract them, then cut the paragraph out so that it is
        # not picked up by the checks further down
        if re.search('<P><I>Key words:', aline):
            keywords = akzenteabstreifen(
                re.sub(r'.*<P><I>Key words: *<\/I> *(.*?)<\/P>.*', r'\1',
                       aline.strip()))
            rec['keyw'] = re.split(' *, *', keywords)
            aline = re.sub(r'<P><I>Key words: *<\/I>.*?<\/P>', '', aline)
        # title
        if re.search('<B>', aline):
            rec['tit'] = re.sub(r'.*<B>(.*?)<\/B>.*', r'\1', aline).strip()
        # PDF link: either a cgi link or one below "online"
        if re.search('HREF.* PDF', aline):
            if re.search('.*HREF=.*?(cgi.*pdf).>.*', aline):
                link = re.sub('.*HREF=.*?(cgi.*pdf).>.*', r'\1', aline)
            else:
                link = re.sub('.*HREF=.*?(online.*pdf).>.*', r'\1', aline)
            rec['pdf'] = 'http://www.j-npcs.org/' + link
            rec['filename'] = rec['pdf']
            aline = re.sub('<P>Full text.*', '', aline)
        # abstract: only looked for as long as no authors have been found
        if re.search('<P>', aline) and not rec['auts']:
            rec['abs'] = re.sub('.*<[pP]>(.*?)</[pP]>.*', r'\1',
                                aline.strip())
        # authors: "First Last" is turned into "Last, First"
        if re.search('<I>', aline) and not rec['auts']:
            authors = re.sub(r'.*<I> *(.*?) *<\/I>.*', r'\1', aline.strip())
            authors = akzenteabstreifen(re.sub(',? and ', ', ', authors))
            for aut in re.split(' *, *', authors):
                rec['auts'].append(re.sub('^(.*) (.*?)$', r'\2, \1', aut))

    # --- clean up and store the record -----------------------------------
    rec['tit'] = removehtmlgesocks(
        akzenteabstreifen(re.sub(r' *<\/B> *', '', rec['tit'])))
    rec['tit'] = re.sub('<br>', '', rec['tit'])
    rec['tit'] = re.sub(r'\. *$', '', rec['tit'])
    rec['tit'] = re.sub('<img.*?>', '???', rec['tit'])
    print(rec)
    abstract = rec.get('abs', '')
    if len(abstract) > 5:
        # Images are replaced by a placeholder first; doing this after
        # the generic tag removal below would never find any <img> tag.
        abstract = re.sub('<img.*?>', '???', abstract)
        abstract = re.sub(r'<sub>(.*?)<\/sub>', r'_\1', abstract)
        abstract = re.sub(r'<sup>(.*?)<\/sup>', r'^\1', abstract)
        # second pass: picks up a <sub> that was nested in another one
        abstract = re.sub(r'<sub>(.*?)<\/sub>', r'_\1', abstract)
        abstract = re.sub(r'<em>(.*?)<\/em>', r'\1', abstract)
        abstract = re.sub(r'<\/?[a-z].*?>', '', abstract)
        rec['abs'] = akzenteabstreifen(abstract)
    else:
        # NOTE(review): a missing abstract and one of at most 5 characters
        # are both flagged here - confirm that this is the intended
        # condition for the note.
        rec['note'].append('record information might be incomplete')
    recs.append(rec)
    recnr += 1
# Pass the collected records to ejlmod2.writeXML together with an output
# file in xmldir that is wrapped by codecs.EncodedFile(..., 'utf8').
xmlf = os.path.join(xmldir, jnlfilename + '.xml')
xmlfile = codecs.EncodedFile(codecs.open(xmlf, mode='wb'), 'utf8')
try:
    ejlmod2.writeXML(recs, xmlfile, publisher)
finally:
    # close the file even if writing fails, so the handle is not leaked
    xmlfile.close()
# retrieval: append the name of the XML file to the retfiles list unless
# it is already in there
retfiles_path = "/afs/desy.de/user/l/library/proc/retinspire/retfiles"
with open(retfiles_path, "r") as retfiles:
    retfiles_text = retfiles.read()
line = jnlfilename + '.xml' + "\n"
# Whole lines are compared: a plain substring test would also match a
# longer file name that merely ends with this one.
if (jnlfilename + '.xml') not in retfiles_text.split("\n"):
    with open(retfiles_path, "a") as retfiles:
        retfiles.write(line)