-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathdouban.py
76 lines (76 loc) · 2.61 KB
/
douban.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# coding: UTF-8
import urllib
import urllib2
import re
from pyquery import PyQuery as pq
from lxml import etree
import json
import sys
import string
# Python 2 idiom: reload the sys module, then set the interpreter's default
# string encoding to UTF-8 for the rest of the script.
# NOTE(review): presumably the reload is needed because sys.setdefaultencoding
# is not exposed on the initially imported sys module - confirm against the
# Python 2 documentation before touching the order of these two lines.
reload(sys)
sys.setdefaultencoding("utf-8")
# urllib helper used to submit HTTP requests
def open(aurl, post='', Referer=''):
    """Send an HTTP request to ``aurl`` and return the urllib2 response.

    aurl    -- URL to request.
    post    -- form data handed to urllib.urlencode and attached as the
               request body; the default '' means no body is attached.
    Referer -- value for the Referer header; the default '' means the
               header is not added.

    The response object is returned unread; the caller is responsible for
    reading it.
    """
    # To route requests through a local proxy, enable the lines below:
    #proxy = 'http://127.0.0.1:8088'
    #opener = urllib2.build_opener( urllib2.ProxyHandler({'http':proxy}) )
    #urllib2.install_opener(opener)
    body = urllib.urlencode(post) if post != '' else None
    request = urllib2.Request(url=aurl, data=body)
    if Referer != '':
        request.add_header('Referer', Referer)
    return urllib2.urlopen(request)
def timedeal(t):
    """Convert a duration in minutes into an ``HH:MM`` string.

    t -- the number of minutes, as a decimal string (surrounding whitespace
         is tolerated; an int is accepted as well).

    Returns the hours and the remaining minutes, each zero-padded to two
    digits, e.g. "125" -> "02:05".
    Raises ValueError if ``t`` cannot be parsed as an integer.
    """
    # int() stands in for the deprecated string.atoi; divmod performs floor
    # division, so the hour count is integral whatever `/` means on the
    # running interpreter.
    hours, minutes = divmod(int(t), 60)
    # Minutes are zero-padded as well: the former "%2d" padded with a space
    # and produced values such as "02: 5".
    return "%02d:%02d" % (hours, minutes)
# Program entry point
if __name__ == '__main__':
    # Usage: douban.py <movie name>
    # Looks the name up on movie.douban.com, scrapes the first search result
    # and prints a single JSON object to stdout: the scraped fields plus
    # "status": "ok", or only {"status": "error"} when a required step fails.
    try:
        moviename = sys.argv[1].decode('utf-8')
        url = "http://movie.douban.com/subject_search?search_text=" + urllib.quote(moviename.encode("utf8"))

        # Search page: take the title and detail-page link of the first hit.
        d = pq(open(url).read())
        item = d(".item").eq(0)
        title = item(".nbg").attr('title')
        href = item(".nbg").attr('href')

        # Detail page: flatten the #info block to plain text by stripping
        # tags, removing spaces and collapsing doubled newlines.
        d = pq(open(href).read())
        info = d('#info').html()
        info = re.sub('<[^>]+>', '', info).strip()
        info = info.replace(" ", "")
        info = info.replace("\n\n", "\n")

        # Synopsis: if the first span carries the "expand all" marker, use
        # the text of the first .hidden element instead.
        indent = d('#link-report')
        intro = indent("span").eq(0).text()
        if u"... (展开全部)" in intro:
            intro = indent(".hidden").eq(0).text()

        # Running time is best-effort: any failure yields an empty string.
        # `except Exception` (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        try:
            runtime = timedeal(re.findall(u"(?<=片长:).*?(?=分钟)", info, re.DOTALL)[0])
        except Exception:
            runtime = ''

        # The genre line is required: a missing match raises IndexError and
        # is reported through the outer handler as status "error".
        genres = re.findall(u"(?<=类型:).*?(?=\n)", info, re.DOTALL)[0].split("/")

        # Poster listing page. rstrip("/") avoids a doubled slash in the URL
        # when href already ends with "/".
        d = pq(open(href.rstrip("/") + "/photos?type=R").read())
        poster = d('.poster-col4')
        posterurl = poster('li').eq(0)('div')('a').attr('href')
        # Poster ID is best-effort as well: fall back to an empty string.
        try:
            posterurl = re.findall(r"(?<=photos/photo/).*?(?=/)", posterurl, re.DOTALL)[0]
        except Exception:
            posterurl = ''
        # Only the photo ID is emitted; the disabled line below shows one way
        # to expand it into an image URL.
        #posterurl = "http://img5.douban.com/view/photo/raw/public/"+posterurl+".jpg"

        ele = {"title": title, "info": info, "intro": intro, "posterurl": posterurl, "time": runtime, "type": genres}
        ele.update({"status": "ok"})
        # Single-argument print with parentheses behaves the same as the
        # Python 2 print statement.
        print(json.dumps(ele, ensure_ascii=False, indent=2))
    except Exception:
        # Any failure above (missing argument, network error, unexpected page
        # layout) is reported to the consumer as a bare error status.
        print(json.dumps({"status": "error"}, ensure_ascii=False, indent=2))