-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsdn_proj.py
executable file
·119 lines (97 loc) · 3.84 KB
/
sdn_proj.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/python
from itertools import izip
from nltk import clean_html
from os import path, makedirs
from sys import argv
from time import mktime, strptime
import datetime
import BeautifulSoup
import Queue
import re
import threading
import urllib2
# Shared work queue of forum-thread ids; filled by main(), drained by Scraper workers.
queue = Queue.Queue()
# Directory where one output file per school is written.
output_dirname = "./data/output/"
# Precomputed date strings used to resolve the forum's relative
# "Today"/"Yesterday" timestamps into its MM-DD-YYYY format.
today = datetime.datetime.today()
yesterday = today - datetime.timedelta(1)
today = today.strftime('%m-%d-%Y')
yesterday = yesterday.strftime('%m-%d-%Y')
class Scraper(threading.Thread):
"""Threaded SDN forum scraper"""
def __init__(self, queue):
threading.Thread.__init__(self)
self.queue = queue
def extract(self, forum_id):
BASE_URL = "http://forums.studentdoctor.net/showthread.php?t=" + forum_id + "&page="
url = urllib2.urlopen(BASE_URL + "1")
soup = BeautifulSoup.BeautifulSoup(url.read())
self.school = soup.find("title").next.extract()
self.school = self.school.split(" ")
self.school = " ".join(self.school[2:self.school.index("|")]).replace(" Application Thread", "")
self.school = self.school.replace("&", "and")
self.school = self.school.replace("/", "-")
try:
page_count = int(soup.find("td", {"class":"vbmenu_control", "style":"font-weight:normal"}).next.extract().split(" ")[-1])
except AttributeError:
page_count = 1
print "Started " + self.school
self.users = []
self.statuses = []
self.timestamps = []
for i in xrange(1, page_count + 1):
url = urllib2.urlopen(BASE_URL + str(i))
soup = BeautifulSoup.BeautifulSoup(url.read())
self.users += [item.next.extract() for item in soup.findAll("a", {"title": "You must be a registered member to view member names."})]
ts = soup.findAll("td", {"class" : "thead"})
ts = [item for item in ts if "<!-- status icon and date -->" in str(item)]
ts = [clean_html(i.extract().renderContents()).replace(",", "") for i in ts]
self.timestamps += ts
for item in soup.findAll("table", {"id" : re.compile("post[0-9]*")}):
try:
self.statuses.append(item.find("a" , {"href": re.compile("/memberlist.php")}).next.extract())
except AttributeError:
self.statuses.append("Unknown")
self.epochtime = []
for item in self.timestamps:
if "Yesterday" in item:
item.replace("Yesterday", yesterday)
elif "Today" in item:
item.replace("Today", today)
try:
self.epochtime.append(str(mktime(strptime(item, '%m-%d-%Y %I:%M %p'))))
except ValueError:
self.epochtime.append("NA")
def export(self):
outfile = open(output_dirname + self.school, "w")
if len(self.users) != len(self.timestamps):
print self.school + "has a timestamp parsing problem"
for item in izip(self.users, self.statuses, self.timestamps, self.epochtime):
outfile.write(",".join(list(item) + [self.school]))
outfile.write("\n")
outfile.close()
def run(self):
while True:
self.users = []
self.statuses = []
index = self.queue.get()
self.extract(index)
self.export()
self.queue.task_done()
def main():
try:
input_file = open(argv[1])
except IOError:
print "Can't open " + argv[1]
if not path.exists(output_dirname):
print "Creating output directory: %s" % output_dirname
makedirs(output_dirname)
num_threads = 10
for i in xrange(num_threads):
t = Scraper(queue)
t.setDaemon(True)
t.start()
for line in input_file:
queue.put(line.strip())
queue.join()
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()