AsyncUva.py
# -*- coding: utf-8 -*-
# Asynchronous crawler for the UVa Online Judge "Browse Problems" tree,
# built on the callback-style AsyncHTTPClient of pre-coroutine Tornado
# (Python 2).
__metaclass__ = type
import functools
from pyquery import PyQuery as pyq
from spider import AcmSpider
from tornado import ioloop
from tornado import httpclient

request_count = 0  # fetches currently in flight
count = 0          # listing pages parsed so far
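# Completion is tracked with a simple shared counter: every fetch() call
# bumps request_count, every response handler decrements it, and a watchdog
# callback stops the IOLoop once both the URL queue and the counter reach
# zero.  genData() below uses the same idea, comparing the pages-parsed
# counter `count` against len(self.pages) instead.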
class AsyncUva(AcmSpider):
    def __init__(self, url, name):
        super(AsyncUva, self).__init__(url, name)
        self.ojids = []  # problem ids seen so far, used to skip duplicates
    def getPages(self):
        """Walk the 'Browse Problems' folder tree and collect leaf page URLs."""
        dirs = []  # queue of URLs still to fetch
        http_client = httpclient.AsyncHTTPClient()
        dirs.append(self.url)

        def _isFinished():
            # Watchdog: stop the IOLoop once the queue is drained and no
            # fetch is pending; otherwise issue queued fetches and re-poll.
            print len(dirs), request_count
            if not dirs and request_count == 0:
                print "stop"
                io_loop.stop()
                return
            else:
                _asyncGet()
                io_loop.add_callback(_isFinished)

        def _parse(response):
            global request_count
            if response.error:
                # Failed: issue a retry and count it; the decrement at the
                # bottom then retires the request that just came back.
                print "Error", response.error
                http_client.fetch(response.request, _parse)
                request_count += 1
                print "add", request_count
                print "try again"
            else:
                curdir = response.effective_url
                doc = pyq(response.body)
                if doc("div:contains('Browse Problems')+div+table img").attr('alt') == "FOLDER":
                    # Folder page: queue every link it contains.
                    print "[folder]", curdir
                    links = doc("div:contains('Browse Problems')+div+table a")
                    for a in links:
                        dirs.append(self.root + '/' + pyq(a).attr('href'))
                else:
                    # Leaf page listing problems: remember the URL, cache the DOM.
                    print "[file]", curdir
                    self.pages.append(curdir)
                    self.cache[curdir] = doc
            print "-", request_count
            request_count -= 1  # this response has been handled

        def _asyncGet():
            # Drain the queue, issuing one asynchronous fetch per URL.
            global request_count
            while dirs:
                cur = dirs.pop()
                http_client.fetch(cur, _parse)
                print "add", request_count
                request_count += 1

        io_loop = ioloop.IOLoop.instance()
        io_loop.add_callback(_isFinished)
        io_loop.start()
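    # Backpressure note: Tornado's AsyncHTTPClient caps in-flight requests
    # (max_clients, which defaults to 10 in the Tornado versions this code
    # targets) and queues the rest, so handing the whole dirs queue to
    # fetch() in one burst does not open hundreds of sockets at once.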
    def parsePage(self, io_loop, http_client, response, *args):
        """Extract the problem rows from one listing page (fetched or cached)."""
        global count
        if count > len(self.pages):
            print "stop"
            io_loop.stop()
            return
        # Cached calls pass response=None; only live responses carry errors.
        if response is not None and response.error:
            print "Error, try again", response.error
            http_client.fetch(response.request,
                              functools.partial(self.parsePage, io_loop, http_client))
            return
        print count
        count += 1
        if args:
            doc = args[0]  # pre-parsed DOM handed in from the cache
        else:
            doc = pyq(response.body)
        trs = doc("div:contains('Browse Problems')+div+table tr[class!='sectiontableheader']")
        for tr in trs:
            problem = {}
            # The second cell reads "<ojid>&nbsp;-&nbsp;<title>".
            problem['ojid'] = pyq(tr).find('td').eq(1).text().split(u'\xa0')[0].encode('utf8')
            if problem['ojid'] in self.ojids:
                continue
            self.ojids.append(problem['ojid'])
            print "id:", problem['ojid']
            problem['title'] = pyq(tr).find('td').eq(1).text().split(u'\xa0')[2]
            problem['submitted'] = str(pyq(tr).find('td').eq(2).text())
            # The site only publishes an acceptance percentage, so the absolute
            # accepted count is reconstructed from it and the submission count.
            ratio = pyq(tr).find('td').eq(3).find('div div').text()
            try:
                ratio = float(ratio.rstrip('%')) * 0.01
                problem['accepted'] = str(int(int(problem['submitted']) * ratio))
            except ValueError:
                problem['accepted'] = str(0)
            self.problems.append(problem)
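    # Worked example of the reconstruction above: a row showing 1200
    # submissions with a "25.00%" bar yields int(1200 * 0.25) == 300 accepted.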
    def period(self, io_loop):
        # Poll until every page has been parsed, then stop the loop.
        print "period", count, len(self.pages)
        if count == len(self.pages):
            io_loop.stop()
            return
        io_loop.add_callback(functools.partial(self.period, io_loop))
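    # Re-scheduling with add_callback busy-polls the IOLoop.  A gentler
    # variant (a sketch, assuming IOLoop.add_timeout, which Tornado of this
    # era provides) would be:
    #     io_loop.add_timeout(time.time() + 0.5,
    #                         functools.partial(self.period, io_loop))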
    def genData(self):
        """Parse every collected page and write the problem table out as CSV."""
        http_client = httpclient.AsyncHTTPClient()
        print "have %d pages" % len(self.pages)
        io_loop = ioloop.IOLoop.instance()
        for page in self.pages:
            if self.cache.get(page):
                print "cache", page
                self.parsePage(None, None, None, self.cache[page])
            else:
                print "fetch", page
                http_client.fetch(page, functools.partial(self.parsePage, io_loop, http_client))
        io_loop.add_callback(functools.partial(self.period, io_loop))
        io_loop.start()
        self.problems.sort(lambda p1, p2: int(p1['ojid']) - int(p2['ojid']))
        # Assemble the CSV in memory, header row first, then write it in one go.
        lines = [','.join([self.TARGET['id'], self.TARGET['ojid'],
                           self.TARGET['title'], self.TARGET['accepted'],
                           self.TARGET['submitted']])]
        for i, p in enumerate(self.problems):
            lines.append(','.join([str(i + 1), p['ojid'],
                                   "'" + p['title'].encode('utf8') + "'",
                                   p['accepted'], p['submitted']]))
        f = open(self.name, 'w')
        f.write('\n'.join(lines) + '\n')
        f.close()
if __name__ == "__main__":
    uva = AsyncUva("http://uva.onlinejudge.org/index.php?option=com_onlinejudge&Itemid=8", "a.csv")
    uva.getPages()
    print uva.pages
    # To test a single category instead of the full crawl, skip getPages()
    # and seed one page by hand:
    # uva.pages.append("http://uva.onlinejudge.org/index.php?option=com_onlinejudge&Itemid=8&category=5")
    uva.genData()
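# Expected output: a.csv with a header row taken from the TARGET mapping
# defined on AcmSpider (not shown here), followed by one
# "index,ojid,'title',accepted,submitted" row per problem, sorted by ojid.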