-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathget_reddit_data.py
135 lines (120 loc) · 4.67 KB
/
get_reddit_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import os
from pprint import pprint
import anytree
import tqdm
import yaml
from anytree import Node, NodeMixin, RenderTree
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
# Shared Chrome session, created as a side effect of importing this module.
# The three arguments request no sandbox, headless mode and no /dev/shm usage.
# NOTE(review): the flags use a single leading dash; Chrome documents them as
# `--no-sandbox` etc. -- confirm the single-dash form is honored.
options = webdriver.ChromeOptions()
options.add_argument("-no-sandbox")
options.add_argument("-headless")
options.add_argument("-disable-dev-shm-usage")
# `d` is the module-level driver used by the scraping code below for every
# page load. Nothing in this file quits it.
d = webdriver.Chrome("chromedriver", options=options)
class NodeCom(NodeMixin):
    """Tree node holding the scraped data of one Reddit post or comment.

    Args:
        name: Label for the node.
        attrs: The scraped attribute dict for the post/comment.
        parent: Node to link this one under, or ``None`` for a root.
        children: Optional iterable of child nodes; ignored when falsy.
    """

    def __init__(self, name, attrs=None, parent=None, children=None):
        super().__init__()
        self.name = name
        self.attrs = attrs  # The reddit data
        self.parent = parent
        # Leave the default (empty) children alone unless some were given.
        if not children:
            return
        self.children = children
def _scrape_posts(base_url, subreddits, sort, time_filter, pages):
    """Collect the attribute dict of every post on the requested listing pages.

    Uses the module-level driver ``d``. For each subreddit, up to ``pages``
    listing pages are scraped, following the "next-button" link between them;
    a subreddit ends early when no such link is found.
    NOTE(review): the selectors ("thing", "a.title", "next-button") presumably
    target the old Reddit layout -- confirm against the configured base URL.
    """
    posts = []
    for sub in subreddits:
        d.get(base_url + sub + "/" + sort + "/?t=" + time_filter)
        for i in range(pages):
            for wdpost in d.find_elements_by_class_name("thing"):
                html = wdpost.get_attribute("outerHTML")
                htpost = soup(html, "html.parser").div
                text = htpost.find("a", class_="title").text
                attrs = htpost.attrs
                attrs["data-score"] = int(attrs["data-score"])
                attrs.update({"text": text, "comments": None})
                posts.append(attrs)
            try:
                next_html = d.find_element_by_class_name("next-button").get_attribute("outerHTML")
            except NoSuchElementException:
                # No further listing page for this subreddit.
                break
            page = soup(next_html, "html.parser").a.attrs["href"]
            print(i+1, page)
            d.get(page)
    return posts


def _reply_pairs(root, max_depth=5):
    """Return (parent text, reply text) pairs for the tree under ``root``.

    Nodes are visited in the order ``RenderTree`` yields them. Replies nested
    deeper than ``max_depth`` levels below the root are not paired. Newlines
    in either text are replaced by spaces so each text fits on one line.
    """
    pairs = []
    for _pre, _fill, node in RenderTree(root):
        if node.is_root or node.depth > max_depth:
            continue
        parent_text = node.parent.attrs["text"].replace('\n', ' ')
        reply_text = node.attrs["text"].replace('\n', ' ')
        pairs.append((parent_text, reply_text))
    return pairs


def get_data(config_path):
    """Scrape posts and their comment trees and write parent/reply text pairs.

    Reads ``reddit_config.yml`` from ``config_path`` (keys: "subreddits",
    "https", "sort", "time", "no_of_pages"), scrapes the listed posts, loads
    each post's comment page to build a ``NodeCom`` tree, and writes one pair
    per line to ``data/train.from`` (parent text) and ``data/train.to``
    (reply text) under ``config_path``.

    Args:
        config_path: Directory containing ``reddit_config.yml``; the ``data``
            output directory is created inside it if missing.
    """
    with open(os.path.join(config_path, "reddit_config.yml")) as cf:
        # NOTE(review): FullLoader can build some Python objects from YAML
        # tags; prefer yaml.safe_load if this file may come from an untrusted
        # source.
        reddit_conf = yaml.load(cf, Loader=yaml.FullLoader)
    SUBREDDIT = reddit_conf["subreddits"]
    HTTPS = reddit_conf["https"]
    SORT = reddit_conf["sort"]
    TIME = reddit_conf["time"]
    PAGES = reddit_conf["no_of_pages"]

    posts = _scrape_posts(HTTPS, SUBREDDIT, SORT, TIME, PAGES)
    print("Total posts:", len(posts))
    print("Getting comments for the post extracted")
    for post in tqdm.tqdm(posts):
        d.get(HTTPS + post["data-permalink"])
        htlist = soup(d.page_source, "html.parser").select("div.nestedlisting")
        # Name the root after *this* post; the name is only a label, so a
        # missing attribute must not abort the run.
        root = NodeCom(post.get("data-fullname"), post)
        recursive(htlist, root)
        post["comments"] = root

    data = []
    for post in tqdm.tqdm(posts):
        data.extend(_reply_pairs(post["comments"]))

    out_dir = os.path.join(config_path, "data")
    # Create the output directory up front so a missing folder does not
    # discard the scraped results at the very end.
    os.makedirs(out_dir, exist_ok=True)
    # Explicit UTF-8: comment text routinely contains non-ASCII characters.
    with open(os.path.join(out_dir, "train.from"), "w", encoding="utf-8") as f1, \
            open(os.path.join(out_dir, "train.to"), "w", encoding="utf-8") as f2:
        for x, y in data:
            f1.write("{}\n".format(x))
            f2.write("{}\n".format(y))
# Recursively get comments & create tree by linking nodes
def recursive(soup, parent):
    """Link every comment found under a listing to ``parent`` as a subtree.

    Args:
        soup: Sequence of parsed listing elements (e.g. the result of a
            ``select`` call); only the first element is searched. An empty
            or ``None`` value ends the recursion.
        parent: Node that the comments found at this level are attached to.
    """
    # End recursion if it's empty
    if not soup:
        return
    # Direct children only: nested replies are reached by the recursive call.
    for htcom in soup[0].find_all("div", class_="comment", recursive=False):
        try:
            attrs = get_comment(htcom)
        except (AttributeError, KeyError, ValueError, TypeError):
            # Malformed comment markup: skip it (and therefore its replies).
            # Deliberately not a bare ``except`` so Ctrl-C and SystemExit
            # still stop the run.
            continue
        # Every comment node is named "a": deleted comments don't have a
        # data-fullname attribute to use as a name.
        node = NodeCom("a", attrs, parent)
        recursive(htcom.select("div.listing"), node)
def get_comment(htcom):
    """Extract the text and score of one parsed comment element.

    Args:
        htcom: Parsed comment element exposing ``find`` and ``attrs``.

    Returns:
        ``htcom.attrs`` itself, updated in place with two keys: "text" (the
        text of the ``div.usertext-body`` element) and "data-score" (the
        integer from the ``title`` of the ``span.unvoted`` element, or
        ``None`` when no such span exists, e.g. deleted or hidden scores).

    Raises:
        AttributeError: If the element has no ``div.usertext-body``.
        KeyError: If the score span has no ``title`` attribute.
        ValueError: If the score span's ``title`` is not an integer.
    """
    text = htcom.find("div", class_="usertext-body").text
    score_tag = htcom.find("span", class_="unvoted")
    # ``find`` yields an element or None, so an identity check is all that
    # is needed to detect a missing score.
    vote = None if score_tag is None else int(score_tag.attrs["title"])
    attrs = htcom.attrs  # Get everything else (in attribute)
    attrs.update({"text": text, "data-score": vote})
    return attrs
# Script entry point: run the scraper with the current working directory as
# config_path, i.e. the directory get_data reads its config from and writes
# its output under.
if __name__ == '__main__':
    get_data(".")