scraper.py
'''Codewars scraper
Finds all my Codewars solutions, saves them and organises them by language.
Appends a link to the problem at the beginning of each solution.'''
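# Each saved file begins with a two-line comment header written by parse_page,
# e.g. (hypothetical kata slug and timestamp):
#   # https://www.codewars.com/kata/multiply-two-numbers
#   # 2023-01-01T12:00:00.000Z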
import os
import string
import sys
from time import sleep

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Constants
SCROLL_PAUSE_TIME = 1
MAX_SCROLL_ATTEMPTS = 20
# Set up filetypes dictionary for each language
# Ensure the languages are lowercase
language_filetypes = {
    "c": ".c",
    "javascript": ".js",
    "python": ".py",
    "shell": ".sh",
    "c#": ".cs",
    "sql": ".sql",
    "bf": ".bf",
    "prolog": ".pl",
    "powershell": ".ps1",
    "coffeescript": ".coffee",
    "go": ".go",
    "swift": ".swift",
    "kotlin": ".kt",
}
# Set up comments dictionary for each language
language_comments = {
    "c": "//",
    "javascript": "//",
    "python": "#",
    "shell": "#",
    "c#": "//",
    "sql": "--",
    "bf": "",
    "prolog": "%",
    "powershell": "#",
    "coffeescript": "#",
    "go": "//",
    "swift": "//",
    "kotlin": "//",
}
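# Note: BF has no comment syntax, and characters such as "." and "-" in a URL
# are themselves BF commands, so the link header may need removing before a
# saved .bf file is run.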
# Create a translator which will remove all punctuation and replace spaces with underscores
# Retains any hyphens
translator = str.maketrans("", "", string.punctuation)
translator[ord(" ")] = "_"
translator[ord("-")] = "-"


def offline_scraper(filename):
    '''
    HTML page format:
    All solutions are contained in a div with class "items-list"
    Each individual solution is contained within a div with class "list-item-solutions"
    The difficulty is contained in a div of class "item-title", alongside the problem name and link
    The language is displayed in an h6 tag with content of the form "[Language]:" (e.g. "Python:")
    We'll need to remove the colon from the end of the language
    Finally, the solution itself is contained within a code block
    '''
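    # A minimal sketch of the assumed markup (tag and class names from the
    # docstring above and from parse_page; everything else is illustrative):
    #   <div class="items-list">
    #     <div class="list-item-solutions">
    #       <div class="item-title"><span>6 kyu</span> <a href="/kata/...">Kata name</a></div>
    #       <h6>Python:</h6>
    #       <code>def solution(): ...</code>
    #       <time-ago datetime="2023-01-01T12:00:00.000Z"></time-ago>
    #     </div>
    #   </div>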
    # Get the page
    try:
        with open(filename, "r") as f:
            soup = BeautifulSoup(f.read(), "html.parser")
    except FileNotFoundError:
        print(
            "File not found. Make sure you have downloaded the page, placed it in "
            "the same directory as this script, and entered the filename correctly."
        )
        sys.exit(1)
    parse_page(soup)


def online_scraper(github_signin, username, password):
    # Start the webdriver in headless mode
    # (assumes a Chrome/chromedriver setup; Selenium 4.6+ can fetch a driver automatically)
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    driver.get("https://www.codewars.com/users/sign_in")
    if github_signin:
        button = driver.find_element(
            By.CSS_SELECTOR, "button[data-action='auth#githubSignIn']")
        button.click()
        # This redirects to GitHub, so wait for it to load and fill in the login details
        wait = WebDriverWait(driver, 10)
        wait.until(EC.title_contains("Sign in to GitHub · GitHub"))
        # Fill in username and password
        username_form = driver.find_element(By.ID, "login_field")
        username_form.send_keys(username)
        password_form = driver.find_element(By.ID, "password")
        password_form.send_keys(password)
        # Press the login button
        submit_button = driver.find_element(By.CSS_SELECTOR,
                                            "input[type='submit']")
        submit_button.click()
    else:
        # Fill in username and password
        username_form = driver.find_element(By.ID, "user_email")
        username_form.send_keys(username)
        password_form = driver.find_element(By.ID, "user_password")
        password_form.send_keys(password)
        # Press the sign in button
        submit_button = driver.find_element(By.CSS_SELECTOR,
                                            "input[type='submit']")
        submit_button.click()
    # Wait for the page to load
    sleep(2)
    # Check whether we were redirected to https://github.com/sessions/two-factor/app
    # If so, we need to fill in the 2FA code
    redirect = "https://github.com/sessions/two-factor/app"
    if redirect in driver.current_url:
        auth = driver.find_element(By.ID, "app_totp")
        print("Please enter your 2FA code")
        _2fa = input("2FA code: ")
        auth.send_keys(_2fa)
        # It should auto-redirect without clicking a button
        sleep(2)
    # We should now be logged in and redirected to the dashboard,
    # but we want the solutions page instead
    driver.get(
        f"https://www.codewars.com/users/{username}/completed_solutions")
    # Scroll to the bottom of the page to load all solutions
    # (Ref: https://stackoverflow.com/questions/20986631/how-can-i-scroll-a-web-page-using-selenium-webdriver-in-python/27760083#27760083)
    last_height = driver.execute_script("return document.body.scrollHeight")
    scroll_attempts = 0
    while True:
        # Scroll down to the bottom
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        # Wait for the page to load
        sleep(SCROLL_PAUSE_TIME)
        print("Scrolling...")
        # Compare the new scroll height with the last one; only stop once the
        # height has stayed the same for MAX_SCROLL_ATTEMPTS consecutive checks
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            scroll_attempts += 1
            if scroll_attempts >= MAX_SCROLL_ATTEMPTS:
                break
        else:
            last_height = new_height
            scroll_attempts = 0
    # Create soup and parse the page
    soup = BeautifulSoup(driver.page_source, "html.parser")
    parse_page(soup)


def parse_page(page):
    # Find the div with class "items-list"
    main_content = page.find("div", class_="items-list")
    # Get the solutions
    solutions = main_content.find_all("div", class_="list-item-solutions")
    print(f"Found {len(solutions)} solutions to parse")
    problem_count = 0
    # Loop through all solutions
    for solution in solutions:
        problem_count += 1
        # Get the difficulty, problem name and link
        head_data = solution.find("div", class_="item-title")
        difficulty = head_data.find("span").text
        problem_name = head_data.find("a").text
        problem_link = head_data.find("a")["href"]
        # Fix for relative URLs in the live scraper
        if "codewars.com" not in problem_link:
            problem_link = "https://www.codewars.com" + problem_link
        # Languages are stored in h6 tags, solutions in code blocks,
        # and dates/times in time-ago tags
        # Loop through all the elements in "solution"
        language = None
        solution_code = None
        datetime = None
        solution_details = []
        for element in solution.descendants:
            if element.name == "h6":
                language = element.text[:-1]  # strip the trailing colon
            elif element.name == "code":
                solution_code = element.text
            elif element.name == "time-ago":
                # The datetime is the last attribute before a new solution,
                # so a complete (language, code, datetime) triple is ready here
                datetime = element["datetime"]
                solution_details.append((language, solution_code, datetime))
        # Reverse the order of the solutions so that the oldest is first
        solution_details.reverse()
        revision = 1
        current_language = None
        # Loop through all the solutions found
        for language, solution_code, datetime in solution_details:
            if language != current_language:
                revision = 1
                current_language = language
            else:
                revision += 1
            # Construct the filename
            filename = problem_name
            if revision > 1:
                filename += f"-v{revision}"
            filename += f"-{difficulty}"
            filename = filename.translate(translator)
            filename += language_filetypes[language.lower()]
            # Filepath: one directory per language, next to this script
            root_dir = os.path.dirname(os.path.abspath(__file__))
            directory = os.path.join(root_dir, language)
            filepath = os.path.join(directory, filename)
            # Comment header: problem link plus completion datetime
            comment = f"{language_comments[language.lower()]} {problem_link}"
            comment += f"\n{language_comments[language.lower()]} {datetime}"
            # Now write the file
            # Create the directory if necessary
            if not os.path.exists(directory):
                os.makedirs(directory)
            # Don't overwrite a file that already exists
            if not os.path.exists(filepath):
                with open(filepath, "w") as f:
                    f.write(comment + "\n")
                    f.write(solution_code)
    print(f"Scraped {problem_count} solutions")


def main():
    # Ask whether to use the online or offline scraper
    scraper_type = input("Use the (l)ive (online) or (o)ffline scraper? ")
    while scraper_type.lower() not in ["l", "o"]:
        scraper_type = input("Please enter l or o: ")
    if scraper_type.lower() == "l":
        github = input("Do you want to sign in with GitHub? (y/n) ")
        while github.lower() not in ["y", "n"]:
            github = input("Please enter y or n: ")
        github = github.lower() == "y"
        username = input("Enter your username: ")
        password = input("Enter your password: ")
        print(
            "Thanks! Now sit back and relax while I do the hard work...bleep bloop I am a computer"
        )
        online_scraper(github, username, password)
    else:
        # Run the offline scraper
        print("I need the HTML file you downloaded from Codewars")
        print("Put it in the same directory as this script")
        filename = input(
            "And tell me what the filename is, including the file extension (e.g. codewars.html): "
        )
        print(
            "Thanks! Now sit back and relax while I do the hard work...bleep bloop I am a computer"
        )
        offline_scraper(filename)
    print("All done! And nothing went wrong! Enjoy your solutions!")


if __name__ == "__main__":
    main()
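# Example session (offline mode; prompts come from main() above, the
# filename is illustrative):
#   $ python scraper.py
#   Use the (l)ive (online) or (o)ffline scraper? o
#   I need the HTML file you downloaded from Codewars
#   Put it in the same directory as this script
#   And tell me what the filename is, including the file extension (e.g. codewars.html): codewars.html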