-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreddit.py
95 lines (70 loc) · 1.94 KB
/
reddit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import sys,urllib2, re
from bs4 import BeautifulSoup
# Maximum number of failed urlopen attempts getPage makes for a single
# subreddit before it gives up and returns None.
TRIES_LIMIT = 50
class SubredditInfo:
    """Plain record describing one subreddit.

    Instances start out empty; the caller fills the attributes in after
    construction.
    """

    def __init__(self):
        # Subscriber count.
        self.subscribers = 0
        # Names of the subreddits linked from this one (fresh list per instance).
        self.children = []
        # Name of the subreddit itself.
        self.name = ""
def getSubredditInfo(subredditName):
    """Fetch a subreddit page and summarise its sidebar.

    Prints a progress line with the upper-cased subreddit name, downloads
    the page through getPage, and builds a SubredditInfo whose ``children``
    and ``subscribers`` come from the page's sidebar.

    Returns the populated SubredditInfo, or None when getPage returned None.
    """
    print("- " + subredditName.upper())

    html = getPage(subredditName)
    if html is None:
        # Nothing was downloaded, so there is nothing to parse.
        return None

    sidebar = getSidebar(BeautifulSoup(html))

    result = SubredditInfo()
    result.name = subredditName
    result.children = getSubredditsFromSidebar(sidebar)
    result.subscribers = getSubscriberCount(sidebar)
    return result
def getSidebarSubreddits(subredditName):
    """Return the subreddit names linked from a subreddit's sidebar.

    Downloads the page with getPage, parses it with BeautifulSoup, and
    extracts the linked subreddit names from the sidebar node.

    Returns a list of lower-cased subreddit names; the list is empty when
    the page could not be fetched (getPage returned None).
    """
    # The previous implementation called getSoupFromSubredditName, which is
    # not defined anywhere in this module, so every call raised NameError.
    page = getPage(subredditName)
    if page is None:
        return []
    side = getSidebar(BeautifulSoup(page))
    return getSubredditsFromSidebar(side)
def getSubredditsFromSidebar(side):
    """Collect the subreddit names linked from a sidebar node.

    ``side`` must provide ``find_all('a')`` returning objects with a
    ``get('href')`` method (as BeautifulSoup tags do).

    Only hrefs that contain a ``/r/`` segment are considered. The text after
    the last ``/r/`` (minus any trailing slashes) is kept when
    isValidSubredditName accepts it.

    Returns a list of lower-cased names in document order; duplicates are
    not removed.
    """
    subreddits = []
    for link in side.find_all('a'):
        address = link.get('href')
        # Anchors without an href, and links that do not point into /r/ at
        # all (e.g. a bare relative "wiki"), are not subreddit links.
        if address is None or '/r/' not in address:
            continue
        # "/r/name/" is a common way to write the link; drop the trailing
        # slash so it is not rejected by the name check.
        name = address.split('/r/')[-1].rstrip('/')
        if isValidSubredditName(name):
            subreddits.append(str(name).lower())
    return subreddits
def getSubscriberCount(side):
    """Read the subscriber count out of a sidebar node.

    Looks up a ``span`` of class ``subscribers`` and, inside it, a ``span``
    of class ``number``, then parses that text as an integer after removing
    thousands separators (commas).

    Returns the parsed count, or 1 (after printing a notice) when the
    elements are missing or the text is not an integer.
    """
    try:
        container = side.find('span', 'subscribers')
        rawCount = container.find('span', 'number').text
    except AttributeError:
        # One of the lookups returned None (or an object without the
        # attribute), i.e. the expected markup is not there.
        print("couldn't find subscribers, skipping")
        return 1

    digits = rawCount.replace(",", "")
    try:
        return int(digits)
    except ValueError:
        print("Couldn't find subscriber count, skipping")
        return 1
def getSidebar(soup):
    """Return the second child of the document's <body>.

    NOTE(review): presumably that child is the sidebar container in the
    reddit markup this was written against -- confirm against a live page.
    """
    bodyChildren = soup.body.contents
    return bodyChildren[1]
def getPage(subredditName):
    """Download the front page of a subreddit.

    Requests ``http://www.reddit.com/r/<subredditName>`` and retries
    immediately on failure, printing a dot per failed attempt, until
    TRIES_LIMIT attempts have failed.

    Returns the response body as read from the connection, or None (after
    printing "skipping") when every attempt failed.
    """
    url = 'http://www.reddit.com/r/' + subredditName
    sys.stdout.write("Trying to connect...")
    tries = 0
    while True:
        try:
            page = urllib2.urlopen(url)
            break
        except urllib2.URLError:
            # URLError is the base class of HTTPError, so this covers both
            # HTTP error statuses and connection-level failures (DNS,
            # refused connection), which previously escaped and crashed.
            sys.stdout.write(".")
            tries += 1
            if tries >= TRIES_LIMIT:
                print("skipping")
                return None
    print("success")
    try:
        return page.read()
    finally:
        # Release the underlying socket even if read() raises.
        page.close()
def isValidSubredditName(name):
    """Return True when ``name`` looks like a bare subreddit name.

    A name is accepted when it is non-empty and consists solely of word
    characters (letters, digits, underscore) and hyphens. Anything with a
    slash, query string, whitespace or a trailing newline is rejected.
    """
    # Raw string so the backslash reaches the regex engine untouched, and
    # \Z rather than $ because $ would also match before a trailing newline.
    return re.match(r'[\w-]+\Z', name) is not None
#print getSubredditInfo("programming").children