-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_random_post.py
91 lines (76 loc) · 3.1 KB
/
get_random_post.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from urllib.request import Request, urlopen
# Scrape a random submission from theyoungauthors.com and print its title,
# author, body text, italic author line and image URL.

RANDOM_POST_URL = 'https://theyoungauthors.com/random-submission/'

# Markers located in the raw page HTML.
CONTENT_DIV = '<div class="entry-content" itemprop="articleBody">'
TITLE_META = '<meta property="og:title" content='
AUTHOR_MARKER = '<i class="icon-user"></i> by <span itemprop="author">'
AUTHOR_LINK = 'rel="author">'

# Ordered (old, new) substitutions applied to the raw body markup.
_BODY_SUBSTITUTIONS = (
    ('<p class="font_9">', ""),
    ('<span style="text-decoration: underline;"><strong>', ""),
    ("</strong></span>", ""),
    ("</p>", ""),
    # Right single quotation mark, as a literal character or as an entity.
    ("\u2019", "'"),
    ("&#8217;", "'"),
    ("&rsquo;", "'"),
    ('<p class="font_8">', ""),
    ("<br />", "\n"),
    ("<br/>", "\n"),
    # Non-breaking spaces become line breaks (literal character or entity).
    ("\u00a0", "\n"),
    ("&nbsp;", "\n"),
    ("<p>", ""),
    ("<div id='jp-relatedposts' class='jp-relatedposts' >", ""),
    ('<h3 class="jp-relatedposts-headline"><em>Related</em></h3>', ""),
    ('<p style="text-align: justify;">', ""),
    ('<figure class="wp-block-image">', ""),
)

# The post text is cut at the first occurrence of each of these, in order.
_TRUNCATE_AT = ("<a href=", "<img loading=")


def fetch_random_post_html(url: str = RANDOM_POST_URL) -> str:
    """Download *url* and return the page decoded as UTF-8 text."""
    # A browser-like User-Agent is sent so the request is allowed for scraping.
    request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # The context manager closes the HTTP response once it has been read.
    with urlopen(request) as response:
        return response.read().decode("utf8")


def _attribute_value(text: str, prefix: str) -> str:
    """Return the text between *prefix* and the next double quote.

    *prefix* is expected to end with the attribute's opening quote, e.g.
    ``src="``. Returns "" when the prefix or the closing quote is missing.
    """
    start = text.find(prefix)
    if start == -1:
        return ""
    start += len(prefix)
    end = text.find('"', start)
    if end == -1:
        return ""
    return text[start:end]


def _extract_title(html: str) -> str:
    """Return the og:title meta content, or "" when the tag is absent."""
    start = html.find(TITLE_META)
    if start == -1:
        return ""
    start += len(TITLE_META)
    quote = html[start:start + 1]
    if quote in ('"', "'"):
        # Quoted value: take exactly what is between the quotes so that
        # slashes inside the title survive.
        end = html.find(quote, start + 1)
        return "" if end == -1 else html[start + 1:end]
    # Unquoted value: runs to the end of the tag, minus any self-closing "/".
    end = html.find(">", start)
    raw = html[start:] if end == -1 else html[start:end]
    return raw.strip().rstrip("/").strip()


def _extract_author(html: str) -> str:
    """Return the author link text that follows the author marker, or ""."""
    marker_at = html.find(AUTHOR_MARKER)
    if marker_at == -1:
        return ""
    link_at = html.find(AUTHOR_LINK, marker_at)
    if link_at == -1:
        return ""
    name_start = link_at + len(AUTHOR_LINK)
    name_end = html.find("</a>", name_start)
    if name_end == -1:
        return ""
    return html[name_start:name_end]


def _extract_body(html: str) -> str:
    """Return the markup between the content div and the first ``</div>``."""
    start = html.find(CONTENT_DIV)
    if start == -1:
        return ""
    start += len(CONTENT_DIV)
    end = html.find("</div>", start)
    # With no closing tag, keep everything up to the end of the page.
    return html[start:] if end == -1 else html[start:end]


def _clean_body(body: str) -> str:
    """Apply the markup substitutions and drop leading whitespace."""
    for old, new in _BODY_SUBSTITUTIONS:
        body = body.replace(old, new)
    return body.lstrip()


def _split_author_line(body: str) -> tuple:
    """Return ``(body, author_string)`` with the first <em> span pulled out.

    Only that one span is removed from the body; other occurrences of the
    same text are left alone. Any remaining <em>/</em> tags are stripped.
    """
    author_string = ""
    open_at = body.find("<em>")
    if open_at != -1:
        text_start = open_at + len("<em>")
        close_at = body.find("</em>", text_start)
        if close_at != -1:
            author_string = body[text_start:close_at]
            body = body[:open_at] + body[close_at + len("</em>"):]
    body = body.replace("<em>", "").replace("</em>", "")
    return body, author_string


def _extract_image_url(body: str) -> str:
    """Return the image URL found in the body markup, or ""."""
    if "<a href=" in body:
        # Linked images: use the first candidate listed in srcset.
        candidates = _attribute_value(body, 'srcset="').split()
        if candidates:
            return candidates[0].rstrip(",")
    # Otherwise (or when there is no srcset) fall back to the src attribute.
    return _attribute_value(body, 'src="')


def parse_post(html: str) -> dict:
    """Extract the post fields from a submission page.

    Returns a dict with the keys "title", "author", "post", "author_string"
    and "image_url". Every value is "" when the page has no entry-content
    div, or when the corresponding piece of markup is missing.
    """
    fields = {
        "title": "",
        "author": "",
        "post": "",
        "author_string": "",
        "image_url": "",
    }
    # Make sure there is an entry content div to retrieve content from.
    if CONTENT_DIV not in html:
        return fields

    fields["title"] = _extract_title(html)
    fields["author"] = _extract_author(html)

    body = _clean_body(_extract_body(html))
    body, fields["author_string"] = _split_author_line(body)
    # The image URL must be read before the link/image markup is cut off.
    fields["image_url"] = _extract_image_url(body)
    for marker in _TRUNCATE_AT:
        cut_at = body.find(marker)
        if cut_at != -1:
            body = body[:cut_at]
    fields["post"] = body
    return fields


def main() -> None:
    """Fetch a random submission and print its parsed fields."""
    fields = parse_post(fetch_random_post_html())
    print("Title: " + fields["title"])
    print("Author: " + fields["author"])
    print("Post: \n" + fields["post"])
    print("Author string: " + fields["author_string"])
    print("Image URL: " + fields["image_url"])


if __name__ == "__main__":
    main()