crawl_initial.py
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from collections import defaultdict
try:
    from HTMLParser import HTMLParser  # Python 2
except ImportError:
    from html.parser import HTMLParser  # Python 3
from wordcloud import WordCloud
import operator
import re
import time
import praw
# Login to reddit.
import login
r = login.login()
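# login is a separate module that is not included in this file. A minimal sketch of what
# it is assumed to provide (PRAW 3 style, with placeholder credentials) might look like:
#
#   # login.py (hypothetical)
#   import praw
#   def login():
#       r = praw.Reddit(user_agent="crawl_initial word counter")
#       r.login("USERNAME", "PASSWORD")
#       return r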

# Characters to avoid when matching words.
avoid = ["[", "]"]
# List to hold links found in comments.
links = []
# A cache of submission ids that have already been scanned.
cache = []
# Dictionary of word counts. defaultdict(int) saves us the "is this key present?" check.
dictcount = defaultdict(int)
# Log to write to.
#log = open("./log")
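
# strip_tags() below relies on an MLStripper helper that is not defined in this file.
# The sketch below is the standard HTMLParser-based tag stripper it appears to assume;
# it simply collects the text nodes of an HTML fragment and drops the markup.
class MLStripper(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return "".join(self.fed)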

def strip_tags(html):
    # Unescape HTML entities, then strip any remaining markup.
    parser = HTMLParser()
    html = parser.unescape(html)
    s = MLStripper()
    s.feed(html)
    return s.get_data()

def sortdict(d):
    # Return the (word, count) pairs sorted by count, highest first.
    sorted_d = sorted(d.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_d

def bad(word):
    # A word is "bad" if it contains anything other than lowercase letters and spaces.
    if bool(a.search(word)):
        return True
    return False

# l matches http links, s matches anything wrapped in square brackets,
# a matches any character that is not a lowercase letter or a space.
l = re.compile(r"http.*?[^ \)]*")
s = re.compile(r"\[.*?\]\]*")
a = re.compile(r"[^a-z ]+")
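
# For example, given the (hypothetical) comment "check [this](http://example.com/x) out",
# l.sub removes "http://example.com/x", s.sub then removes "[this]", and the leftover
# "()" token is rejected by bad(), so only "check" and "out" get counted.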

def run_bot():
    # Count duplicate posts.
    duplicate = 0
    # Get the current hot posts from /r/all (only one per scan here).
    subreddit = r.get_subreddit("all")
    submissions = subreddit.get_hot(limit=1)
    # For each post, put the id in the cache and then scan it.
    for submission in submissions:
        print("Getting submission, submission id: " + submission.id + "...")
        if submission.id in cache:
            # Count duplicates; alternatively just parse it again here.
            # TODO: heat up when there are fewer duplicates each time?
            duplicate += 1
        else:
            # Now it has been read, send it to the cache.
            cache.append(submission.id)
            # Load all comments.
            submission.replace_more_comments(limit=None, threshold=0)
            # Flatten the comment tree; we don't care about its structure.
            flat = praw.helpers.flatten_tree(submission.comments)
            # For each comment in the flat tree.
            print("Submission got, scanning.")
            for comment in flat:
                #print(strip_tags(comment.body))
                comment = str(comment.body)
                # Grab the links before we strip everything out...
                if l.search(comment):
                    links.append(l.search(comment).group())
                # Remove links.
                comment = l.sub("", comment)
                comment = comment.lower()
                # Strip out anything between [].
                comment = s.sub("", comment)
                # Strip out anything else (anything other than words).
                #comment = a.sub("", comment)
                # Iterate over the words and count each one that survives the filter.
                for word in comment.split():
                    if not bad(word):
                        dictcount[word] += 1
    print("Duplicate posts this scan: " + str(duplicate))

runs = 1
while True:
    run_bot()  # Run the bot!
    # Build a string in which each word is repeated as many times as it was counted,
    # then render it as a word cloud.
    newstr = ""
    for x in dictcount:
        newstr = newstr + ((x + " ") * dictcount[x])
    wordcloud = WordCloud().generate(newstr)
    image = wordcloud.to_image()
    image.show()
    # Print the sorted counts and the number of scans so far.
    for whatever in sortdict(dictcount):
        print(whatever)
    print("Number of scans: " + str(runs))
    time.sleep(200)  # Not too often…
    runs += 1
#log.close()
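
# On a headless machine image.show() will not display anything; if saving to disk is
# preferred, the word cloud can also be written out with e.g. wordcloud.to_file("cloud.png").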