-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathKonachanPic.py
141 lines (120 loc) · 3.87 KB
/
KonachanPic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#-*- coding:utf-8 -*-
import sys,os
import time
import urllib2
import threading
import random
#import htmlcontent
from sgmllib import SGMLParser
# Base listing URL; the page number is appended per request.
url = "http://konachan.com/post?page="
# Page range and resume position; overwritten by init() from user input.
startpage = endpage = startnum = 1
# Destination directory, e.g. "d:/downloadpic/"; set by init().
filepath = ""
# Free download slots; download() decrements/increments under `lock`.
threadnum = 2
# Spawner throttle: cleared when no slot is free, set when one is released.
event = threading.Event()
# Guards all reads/writes of `threadnum`.
lock = threading.Lock()
class PageParser(SGMLParser):
    """Collect image links from a konachan.com post-list page.

    Records every href beginning with 'h' (absolute links) that appears
    inside the <ul id="post-list-posts"> element fed via ``feed()``.
    """
    def __init__(self):
        # SGMLParser is an old-style class in Python 2: call its
        # __init__ directly (no super()).
        SGMLParser.__init__(self)
        # Instance state. The original declared these as CLASS attributes,
        # so the collected-URL list was shared by every instance and
        # survived re-instantiation — a latent bug.
        self.data = []      # collected image URLs for the current page(s)
        self.ulswi = False  # True while inside the target <ul>
    def start_ul(self, attrs):
        # Enter collecting mode when the post-list <ul> opens.
        for k, v in attrs:
            if k == 'id' and v == 'post-list-posts':
                self.ulswi = True
    def end_ul(self):
        # Leave collecting mode when any <ul> closes.
        self.ulswi = False
    def start_a(self, attrs):
        # Only record absolute links ('h...') found inside the post list.
        for k, v in attrs:
            if k == 'href' and self.ulswi and v[0] == 'h':
                self.data.append(v)
    def getData(self):
        """Return the (live, shared) list of collected URLs.

        Callers clear it in place with ``del result[:]`` between pages.
        """
        return self.data
def getUrl(url):
headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT6.1; en-US; rv:1.9.1.6) Firefox/3.5.6'}
#headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.2; WOW64) Maxthon/4.3.1.2000 Chrome/30.0.1599.101 Safari/537.36'}
req = urllib2.Request(url, headers=headers)
content = urllib2.urlopen(req).read()
print "正在解析:%s" % url
type = sys.getfilesystemencoding()
return content.decode("UTF-8").encode(type)
def desc():
print """
图片下载软件简要使用说明
爬取konachan.com上的图片
输入页数范围(例:15 20)回车,即可下载15页到20页的所有图片
紧接着输入开始下载的位置(例:5),即可从15页的第5张图片开始下载
输入盘符(例如:d),即可将图片下载到d:/downloadpic/文件夹中
"""
def download(url,path):
global threadnum
if lock.acquire():
if threadnum <= 1:
event.clear()
else:
event.set()
threadnum = threadnum - 1
lock.release()
time.sleep(random.randint(8,15))
filename = os.path.basename(url)
print " downloading......"
socket = urllib2.urlopen(url)
data = socket.read()
path = path + filename
with open(path,"wb") as jpg:
jpg.write(data)
socket.close()
if lock.acquire():
threadnum = threadnum + 1
lock.release()
event.set()
def page_download(low,up):
up = up + 1
global filepath
for pagenum in range(low,up):
print "正在下载第 %d 页" % pagenum
dataurl = url + str(pagenum)
htmlcontent = getUrl(dataurl)
parser.feed(htmlcontent)
DataSet = parser.getData()
datacount = len(DataSet)
print "共 %d 张图片" % datacount
if pagenum == startpage:
for i in range(startnum,datacount):
downthread = threading.Thread(target=download,args=(DataSet[i-1],filepath))
downthread.start()
event.wait()
#print "正在下载第 %d 张图片" % i
#download(DataSet[i],filepath)
else:
for i in range(1,datacount):
print "正在下载第 %d 张图片" % i
downthread = threading.Thread(target=download,args=(DataSet[i-1],filepath))
downthread.start()
event.wait()
#print "正在下载第 %d 张图片" % i
#download(DataSet[i],filepath)
print "第 %d 页下载完毕!" % pagenum
del DataSet[:]
def init():
    """Prompt for the page range, resume picture and target drive.

    Populates the module globals startpage/endpage/startnum/filepath and
    creates the download directory if missing.
    """
    global startpage
    global endpage
    global startnum
    global filepath
    # split() with no argument tolerates repeated/odd whitespace; the
    # original split(' ') raised on inputs like "15  20".
    las, nex = raw_input("请输入页数范围:").split()
    startpage = int(las)
    endpage = int(nex)
    startnumstr = raw_input("从第几张图片开始下载?")
    startnum = int(startnumstr)
    filepath = raw_input("将图片下载到哪个盘?")
    filepath = filepath + r":/downloadpic/"
    if not os.path.exists(filepath):
        os.mkdir(filepath)
if __name__=="__main__":
    # Single module-level parser instance, reused by page_download()
    # for every page (its URL list is cleared in place between pages).
    parser=PageParser()
    desc()
    init()
    page_download(startpage,endpage)
    print "本次下载任务圆满结束!!"