'''The aim of this script is to scrape abstracts, author names and dates of publication from Springer.
Sarthak J. Shetty
04/08/2018'''
'''Adding the libraries to be used here.'''
'''Importing urllib.request to use urlopen'''
from urllib.request import build_opener, HTTPCookieProcessor
'''Importing urllib.error to handle errors in HTTP pinging.'''
import urllib.error
'''BeautifulSoup is used for souping.'''
from bs4 import BeautifulSoup as bs
'''Counter generates a dictionary from the abstract data, providing frequencies of occurrences'''
from collections import Counter
'''Importing the CSV library here to dump the dictionary for further analysis and error checking if required. Will edit it out later.'''
import csv
'''Importing numpy to generate a random integer for the delay_function (see below)'''
import numpy as np
'''This library is imported to introduce delays into the processor loop, to reduce instances of the remote server shutting the connection while scraping extraordinarily large datasets.'''
import time
'''Fragmenting code into different scripts. Some functions are to be used across the different sub-parts as well. Hence, shifted some of the functions to the new script.'''
from pyResearchInsights.common_functions import pre_processing, argument_formatter, keyword_url_generator, abstract_id_log_name_generator, status_logger
def url_reader(url, status_logger_name):
	'''Opens the supplied URL and returns the page for souping.
	Some links fail to open and raise urllib.error.HTTPError;
	this block tries to ping the page, and if it fails the calling loop ignores the entry and
	moves on to the next PII number.'''
try:
'''Using the urllib function, urlopen to extract the html_code of the given page'''
open_connection = build_opener(HTTPCookieProcessor())
html_code = open_connection.open(url)
		'''Returning the opened page so that it can be souped downstream, once per abstract'''
return html_code
	except (UnboundLocalError, urllib.error.HTTPError):
		'''If the page cannot be opened, the function falls through and implicitly returns None;
		the TypeError handler in abstract_crawler() skips such entries downstream.'''
		pass
def results_determiner(url, status_logger_name):
	'''This function determines the number of results that a particular keyword returns
	once it looks up the keyword on link.springer.com.
	The function fetches the first results page and logs the total number of results
	returned by a particular keyword, or combination of keywords.'''
first_page_to_scrape = url_reader(url, status_logger_name)
first_page_to_scrape_soup = page_souper(first_page_to_scrape, status_logger_name)
number_of_results = first_page_to_scrape_soup.find('h1', {'id':'number-of-search-results-and-search-terms'}).find('strong').text
results_determiner_status_key = "Total number of results obtained: "+number_of_results
status_logger(status_logger_name, results_determiner_status_key)
def url_generator(start_url, query_string, status_logger_name):
	'''This function is written to generate URLs for all result pages of a given topic.
	The search for URLs stops when the determiner variable does not return a positive value.'''
url_generator_start_status_key = start_url+" "+"start_url has been received"
status_logger(status_logger_name, url_generator_start_status_key)
	'''Initializing a list here in order to contain the URLs. Even if a URL does not return valid results,
	it is popped later on from the list.'''
urls_to_scrape=[]
counter = 0
total_url = start_url+str(counter)+"?facet-content-type=\"Article\"&query="+query_string+"&facet-language=\"En\""
initial_url_status_key = total_url+" "+"has been obtained"
status_logger(status_logger_name, initial_url_status_key)
urls_to_scrape.append(total_url)
test_soup = bs(url_reader(total_url, status_logger_name), 'html.parser')
	'''Here, we grab the page element that contains the number of pages to be scraped'''
determiner = test_soup.findAll('span', {'class':'number-of-pages'})[0].text
'''We generate the urls_to_scrape from the stripped down determiner element'''
urls_to_scrape = [(start_url+str(counter)+"?facet-content-type=\"Article\"&query="+query_string+"&facet-language=\"En\"") for counter in range(1, (int(determiner.replace(',', '')) + 1))]
url_generator_stop_status_key = determiner.replace(',', '') + " page URLs have been obtained"
status_logger(status_logger_name, url_generator_stop_status_key)
return urls_to_scrape
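'''A hedged illustration of the URLs built above (the start_url and query_string values are hypothetical;
in the package they come from keyword_url_generator()): with start_url "https://link.springer.com/search/page/"
and query_string "valuation+of+ecosystems", each entry of urls_to_scrape has the form
https://link.springer.com/search/page/1?facet-content-type="Article"&query=valuation+of+ecosystems&facet-language="En"
with the page counter running from 1 to the number-of-pages value scraped from the first page.'''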
def page_souper(page, status_logger_name):
	'''This function soups the webpage elements and provides the tags for searching.
	Note: the appropriate encoding has to be picked before souping'''
page_souper_start_status_key = "Souping page"
status_logger(status_logger_name, page_souper_start_status_key)
page_soup = bs(page, 'html.parser')
page_souper_stop_status_key = "Souped page"
status_logger(status_logger_name, page_souper_stop_status_key)
return page_soup
def abstract_word_extractor(abstract, abstract_title, abstract_year, permanent_word_sorter_list, trend_keywords, status_logger_name):
	'''This function creates the list that stores the text in the form of individual words
	against their year of appearance.'''
abstract_word_sorter_start_status_key = "Adding:"+" "+abstract_title+" "+"to the archival list"
status_logger(status_logger_name, abstract_word_sorter_start_status_key)
'''This line of code converts the entire abstract into lower case'''
abstract = abstract.lower()
'''Converting the abstract into a list of words'''
abstract_word_list = abstract.split()
	'''This line of code sorts the elements in the word list alphabetically. Working with dataframes is harder,
	hence we sidestep that by modifying the list instead.'''
abstract_word_list.sort()
'''If the word currently being looped in the abstract list matches the trend word being investigated for, the year it appears
is appended to the permanent word sorter list'''
for element in abstract_word_list:
if(element==trend_keywords[0]):
permanent_word_sorter_list.append(abstract_year[:4])
abstract_word_sorter_end_status_key = "Added:"+" "+abstract_title+" "+"to the archival list"
status_logger(status_logger_name, abstract_word_sorter_end_status_key)
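'''A hedged illustration: if trend_keywords[0] is "forest", abstract_year is "2018-05-04" and the lower-cased
abstract contains "forest" twice, then "2018" is appended to permanent_word_sorter_list twice; the list thus
holds one year entry per occurrence of the trend keyword.'''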
def abstract_year_list_post_processor(permanent_word_sorter_list, status_logger_name):
	'''This function builds a dictionary containing the frequency of occurrence of the trend keyword in specific years'''
abstract_year_list_post_processor_start_status_key = "Post processing of permanent word sorter list has commenced"
status_logger(status_logger_name, abstract_year_list_post_processor_start_status_key)
abstract_year_dictionary = Counter(permanent_word_sorter_list)
abstract_year_list_post_processor_end_status_key = "Post processing of permanent word sorter list has completed"
status_logger(status_logger_name, abstract_year_list_post_processor_end_status_key)
return abstract_year_dictionary
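'''A hedged illustration: Counter(["2018", "2018", "2019"]) yields Counter({"2018": 2, "2019": 1}),
i.e. the number of trend keyword occurrences recorded per publication year.'''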
def abstract_year_dictionary_dumper(abstract_word_dictionary, abstracts_log_name, status_logger_name):
	'''This function saves the abstract year dictionary to the disc for further inspection.
	The dictionary is dumped as rows of a CSV file.'''
permanent_word_sorter_list_start_status_key = "Dumping the entire dictionary to the disc"
status_logger(status_logger_name, permanent_word_sorter_list_start_status_key)
with open(abstracts_log_name+"_"+"DICTIONARY.csv", 'w') as dictionary_to_csv:
writer = csv.writer(dictionary_to_csv)
for key, value in abstract_word_dictionary.items():
year = key
writer.writerow([year, value])
permanent_word_sorter_list_end_status_key = "Dumped the entire dictionary to the disc"
status_logger(status_logger_name, permanent_word_sorter_list_end_status_key)
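'''A hedged illustration: for the Counter above, <abstracts_log_name>_DICTIONARY.csv would hold one
year,count row per key, e.g. "2018,2" and "2019,1".'''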
def abstract_page_scraper(abstract_url, abstract_input_tag_id, abstracts_log_name, permanent_word_sorter_list, site_url_index, status_logger_name):
'''This function is written to scrape the actual abstract of the specific paper,
that is being referenced within the list of abstracts'''
abstract_page_scraper_status_key="Abstract ID:"+" "+abstract_input_tag_id
status_logger(status_logger_name, abstract_page_scraper_status_key)
abstract_page_url = abstract_url+abstract_input_tag_id
abstract_page = url_reader(abstract_page_url, status_logger_name)
abstract_soup = page_souper(abstract_page, status_logger_name)
title = title_scraper(abstract_soup, status_logger_name)
abstract_date = abstract_date_scraper(title, abstract_soup, status_logger_name)
download_url = download_url_scraper(abstract_soup, status_logger_name)
'''Due to repeated attribute errors with respect to scraping the authors name, these failsafes had to be put in place.'''
try:
author = author_scraper(abstract_soup, status_logger_name)
except AttributeError:
author = "Author not available"
'''Due to repeated attribute errors with respect to scraping the abstract, these failsafes had to be put in place.'''
try:
abstract = abstract_scraper(abstract_soup)
# abstract_word_extractor(abstract, title, abstract_date, permanent_word_sorter_list, trend_keywords, status_logger_name)
except AttributeError:
abstract = "Abstract not available"
abstract_database_writer(abstract_page_url, title, author, download_url, abstract, abstracts_log_name, abstract_date, status_logger_name)
analytical_abstract_database_writer(title, author, abstract, abstracts_log_name, status_logger_name)
def abstract_crawler(abstract_url, abstract_id_log_name, abstracts_log_name, permanent_word_sorter_list, site_url_index, status_logger_name):
abstract_crawler_start_status_key = "Entered the Abstract Crawler"
status_logger(status_logger_name, abstract_crawler_start_status_key)
abstract_crawler_temp_index = site_url_index
	'''This function crawls the page and accesses each and every abstract'''
abstract_input_tag_ids = abstract_id_database_reader(abstract_id_log_name, abstract_crawler_temp_index, status_logger_name)
for abstract_input_tag_id in abstract_input_tag_ids:
try:
abstract_crawler_accept_status_key="Abstract Number:"+" "+str((abstract_input_tag_ids.index(abstract_input_tag_id)+1)+abstract_crawler_temp_index*20)
status_logger(status_logger_name, abstract_crawler_accept_status_key)
abstract_page_scraper(abstract_url, abstract_input_tag_id, abstracts_log_name, permanent_word_sorter_list, site_url_index, status_logger_name)
except TypeError:
abstract_crawler_reject_status_key="Abstract Number:"+" "+str(abstract_input_tag_ids.index(abstract_input_tag_id)+1)+" "+"could not be processed"
status_logger(status_logger_name, abstract_crawler_reject_status_key)
pass
abstract_crawler_end_status_key = "Exiting the Abstract Crawler"
status_logger(status_logger_name, abstract_crawler_end_status_key)
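'''A hedged note on the numbering above: the logged "Abstract Number" assumes 20 abstracts per results page,
so the third abstract ID on the page with site_url_index 2 is reported as 3 + 2*20 = 43.'''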
def analytical_abstract_database_writer(title, author, abstract, abstracts_log_name, status_logger_name):
	'''This function will generate a secondary abstract file that will contain only the abstract.
	The abstract file generated will be passed onto the Visualizer and Analyzer functions, as opposed to the complete
	abstract log file, which contains a lot of extraneous words in addition to the abstract text.'''
analytical_abstract_database_writer_start_status_key = "Writing"+" "+title+" "+"by"+" "+author+" "+"to analytical abstracts file"
status_logger(status_logger_name, analytical_abstract_database_writer_start_status_key)
analytical_abstracts_txt_log = open(abstracts_log_name+'_'+'ANALYTICAL'+'.txt', 'a')
analytical_abstracts_txt_log.write(abstract)
analytical_abstracts_txt_log.write('\n'+'\n')
analytical_abstracts_txt_log.close()
analytical_abstract_database_writer_stop_status_key = "Written"+" "+title+" "+"to disc"
status_logger(status_logger_name, analytical_abstract_database_writer_stop_status_key)
def abstract_database_writer(abstract_page_url, title, author, download_url, abstract, abstracts_log_name, abstract_date, status_logger_name):
	'''This function makes text files to contain the abstracts for future reference.
	It holds: 1) Title, 2) Author(s), 3) Date, 4) Abstract page and download URLs, 5) Abstract'''
abstract_database_writer_start_status_key = "Writing"+" "+title+" "+"by"+" "+author+" "+"to disc"
status_logger(status_logger_name, abstract_database_writer_start_status_key)
abstracts_csv_log = open(abstracts_log_name+'.csv', 'a')
abstracts_txt_log = open(abstracts_log_name+'.txt', 'a')
download_url_log = open(abstracts_log_name+'_URLs.txt', 'a')
abstracts_txt_log.write("Title:"+" "+title)
abstracts_txt_log.write('\n')
abstracts_txt_log.write("Author:"+" "+author)
abstracts_txt_log.write('\n')
abstracts_txt_log.write("Date:"+" "+abstract_date)
abstracts_txt_log.write('\n')
abstracts_txt_log.write("URL:"+" "+abstract_page_url)
abstracts_txt_log.write('\n')
abstracts_txt_log.write("Download URL:"+" "+download_url)
abstracts_txt_log.write('\n')
download_url_log.write(download_url)
download_url_log.write('\n')
abstracts_txt_log.write("Abstract:"+" "+abstract)
abstracts_csv_log.write(abstract)
abstracts_csv_log.write('\n')
abstracts_txt_log.write('\n'+'\n')
abstracts_txt_log.close()
abstracts_csv_log.close()
download_url_log.close()
abstract_database_writer_stop_status_key = "Written"+" "+title+" "+"to disc"
status_logger(status_logger_name, abstract_database_writer_stop_status_key)
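'''A hedged illustration of the record layout appended to <abstracts_log_name>.txt: the fields
Title:, Author:, Date:, URL:, Download URL: and Abstract: appear on separate lines, followed by a blank line;
<abstracts_log_name>.csv receives only the abstract text and <abstracts_log_name>_URLs.txt only the download URL.'''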
def abstract_id_database_reader(abstract_id_log_name, site_url_index, status_logger_name):
	'''This function has been explicitly written to access
	the abstract ID database that the given program generates.'''
abstract_id_reader_temp_index = site_url_index
abstract_id_database_reader_start_status_key = "Extracting Abstract IDs from disc"
status_logger(status_logger_name, abstract_id_database_reader_start_status_key)
lines_in_abstract_id_database=[line.rstrip('\n') for line in open(abstract_id_log_name+str(abstract_id_reader_temp_index+1)+'.txt')]
abstract_id_database_reader_stop_status_key = "Extracted Abstract IDs from disc"
status_logger(status_logger_name, abstract_id_database_reader_stop_status_key)
return lines_in_abstract_id_database
def abstract_id_database_writer(abstract_id_log_name, abstract_input_tag_id, site_url_index):
	'''This function writes the abstract IDs to a .txt file for easy access and documentation.'''
abstract_id_writer_temp_index = site_url_index
abstract_id_log = open((abstract_id_log_name+str(abstract_id_writer_temp_index+1)+'.txt'), 'a')
abstract_id_log.write(abstract_input_tag_id)
abstract_id_log.write('\n')
abstract_id_log.close()
def abstract_date_scraper(title, abstract_soup, status_logger_name):
'''This function scrapes the date associated with each of the abstracts.
This function will play a crucial role in the functionality that we are trying to build into our project.'''
date_scraper_entry_status_key = "Scraping date of the abstract titled:"+" "+title
status_logger(status_logger_name, date_scraper_entry_status_key)
try:
abstract_date = abstract_soup.find('time').get('datetime')
date_scraper_exit_status_key = title+" "+"was published on"+" "+abstract_date
except AttributeError:
abstract_date = "Date for abstract titled:"+" "+title+" "+"was not available"
date_scraper_exit_status_key = abstract_date
pass
status_logger(status_logger_name, date_scraper_exit_status_key)
return abstract_date
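'''A hedged illustration: a tag such as <time datetime="2018-08-04"> in the abstract soup yields
abstract_date "2018-08-04"; abstract_word_extractor() later keeps only its first four characters, the year.'''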
def abstract_scraper(abstract_soup):
	'''This function scrapes the abstract from the soup and returns it to the page scraper'''
abstract = str(abstract_soup.find('div', {'id':'Abs1-content'}).text.encode('utf-8'))[1:]
return abstract
def author_scraper(abstract_soup, status_logger_name):
'''This function scrapes the author of the text, for easy navigation and search'''
author_scraper_start_status_key = "Scraping the author name"
status_logger(status_logger_name, author_scraper_start_status_key)
	'''This class element's text attribute contains all the authors' names. It is collected via findAll() and then concatenated into a string for storage.'''
author = ''.join(str(author) for author in [authorElement.text for authorElement in abstract_soup.findAll('li', {'class':'c-author-list__item'})])
author_scraper_end_status_key = "Scraped the author's name:" + " "+str(author)
status_logger(status_logger_name, author_scraper_end_status_key)
return author
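'''A hedged note: because ''.join() adds no separator, two c-author-list__item entries with texts
"A. Author" and "B. Author" are stored as the single string "A. AuthorB. Author".'''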
def title_scraper(abstract_soup, status_logger_name):
'''This function scrapes the title of the text from the abstract'''
title_scraper_start_status_key = "Scraping the title of the abstract"
status_logger(status_logger_name, title_scraper_start_status_key)
'''Purpose of this block is to retrieve the title of the text even if an AttributeError arises'''
try:
title = str(abstract_soup.find('h1', {'class':'c-article-title'}).text.encode('utf-8'))[1:]
		'''In case an incorrectly classified asset is to be scraped (Journal/Chapter as opposed to Article), go through this block in an attempt to retrieve the title.'''
except AttributeError:
try:
title = str(abstract_soup.find('h1',{'class':'ChapterTitle'}).text.encode('utf-8'))[1:]
except AttributeError:
try:
title = (abstract_soup.find('span', {'class':'JournalTitle'}).text)
except AttributeError:
title = "Title not available"
title_scraper_end_status_key = "Scraped the title of the abstract"
status_logger(status_logger_name, title_scraper_end_status_key)
return title
def download_url_scraper(abstract_soup, status_logger_name):
'''This function scrapes the download URL of the fulltext'''
download_url_scraper_start_status_key = "Scraping the download link of the fulltext"
status_logger(status_logger_name, download_url_scraper_start_status_key)
'''Purpose of this block is to retrieve the download URL of the fulltext even if an AttributeError arises'''
try:
temp_download_url = str(abstract_soup.find('a', {'class':'c-pdf-download__link'}).get('href'))
if "/content/pdf/" in temp_download_url:
download_url = "https://link.springer.com" + temp_download_url
else:
download_url = "https://" + temp_download_url[2:]
except AttributeError:
download_url = "Download link not available"
download_url_scraper_end_status_key = "Scraped the download link of the fulltext"
status_logger(status_logger_name, download_url_scraper_end_status_key)
return download_url
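'''A hedged illustration (hypothetical hrefs): an href of "/content/pdf/10.1007/s00000-000-0000-0.pdf" becomes
"https://link.springer.com/content/pdf/10.1007/s00000-000-0000-0.pdf", while a protocol-relative href such as
"//link.springer.com/article/10.1007/s00000-000-0000-0" is returned as
"https://link.springer.com/article/10.1007/s00000-000-0000-0".'''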
def abstract_id_scraper(abstract_id_log_name, page_soup, site_url_index, status_logger_name):
	'''This function helps in obtaining the PII number of the abstract.
	This number is then coupled with the dynamic URL to produce the link to the abstract page.'''
abstract_id_scraper_start_status_key="Scraping IDs"
status_logger(status_logger_name, abstract_id_scraper_start_status_key)
	'''This statement collects all the input tags that have the abstract IDs in them'''
abstract_input_tags = page_soup.findAll('a', {'class':'title'})
for abstract_input_tag in abstract_input_tags:
abstract_input_tag_id=abstract_input_tag.get('href')
abstract_id_database_writer(abstract_id_log_name, abstract_input_tag_id, site_url_index)
abstract_id_scraper_stop_status_key="Scraped IDs"
status_logger(status_logger_name, abstract_id_scraper_stop_status_key)
def word_sorter_list_generator(status_logger_name):
word_sorter_list_generator_start_status_key = "Generating the permanent archival list"
status_logger(status_logger_name, word_sorter_list_generator_start_status_key)
	'''This function generates the list that holds the words and corresponding years of the
	abstract data before the actual recursion of scraping data from the website begins.'''
word_sorter_list = []
word_sorter_list_generator_exit_status_key = "Generated the permanent archival list"
status_logger(status_logger_name, word_sorter_list_generator_exit_status_key)
return word_sorter_list
def delay_function(status_logger_name):
	'''Since the Springer servers are constantly shutting down the remote connection, we introduce
	this function in the processor function in order to reduce the number of pings delivered to the remote.'''
delay_variable = np.random.randint(0, 20)
delay_function_start_status_key = "Delaying remote server ping:"+" "+str(delay_variable)+" "+"seconds"
status_logger(status_logger_name, delay_function_start_status_key)
	'''The sleep call delays the code by delay_variable seconds (a random value between 0 and 19)'''
time.sleep(delay_variable)
delay_function_end_status_key = "Delayed remote server ping:"+" "+str(delay_variable)+" "+"seconds"
status_logger(status_logger_name, delay_function_end_status_key)
def processor(abstract_url, urls_to_scrape, abstract_id_log_name, abstracts_log_name, status_logger_name, keywords_to_search):
	'''Multiple page-cycling function to scrape the multiple result pages returned from Springer.'''
'''This list will hold all the words mentioned in all the abstracts. It will be later passed on to the
visualizer code to generate the trends histogram.'''
permanent_word_sorter_list = word_sorter_list_generator(status_logger_name)
for site_url_index in range(0, len(urls_to_scrape)):
if(site_url_index==0):
results_determiner(urls_to_scrape[site_url_index], status_logger_name)
'''Collects the web-page from the url for souping'''
page_to_soup = url_reader(urls_to_scrape[site_url_index], status_logger_name)
'''Souping the page for collection of data and tags'''
page_soup = page_souper(page_to_soup, status_logger_name)
		'''Scraping the page to extract all the abstract IDs'''
abstract_id_scraper(abstract_id_log_name, page_soup, site_url_index, status_logger_name)
'''Actually obtaining the abstracts after combining ID with the abstract_url'''
abstract_crawler(abstract_url, abstract_id_log_name, abstracts_log_name, permanent_word_sorter_list, site_url_index, status_logger_name)
		'''Delaying after each page being scraped, rather than after each abstract'''
delay_function(status_logger_name)
'''This line of code processes and generates a dictionary from the abstract data'''
abstract_year_dictionary = abstract_year_list_post_processor(permanent_word_sorter_list, status_logger_name)
return abstract_year_dictionary
def scraper_main(keywords_to_search, abstracts_log_name, status_logger_name):
	'''This function ties together the rest of this script, so that it can be imported later by the main function'''
	'''Here, we utilize the keywords provided by the user to generate the URLs for scraping'''
start_url, abstract_url, query_string = keyword_url_generator(keywords_to_search)
'''Since we receive only the abstracts_log_name, we have to extract the abstract_id_log_name'''
abstract_id_log_name = abstract_id_log_name_generator(abstracts_log_name)
	if(type(keywords_to_search) == str):
		'''If the user ran the code using just the function from the library, then the keywords and trend words need to be in this format'''
		keywords_to_search = argument_formatter(keywords_to_search)
'''Provides the links for the URLs to be scraped by the scraper'''
urls_to_scrape = url_generator(start_url, query_string, status_logger_name)
'''Calling the processor() function here'''
abstract_year_dictionary = processor(abstract_url, urls_to_scrape, abstract_id_log_name, abstracts_log_name, status_logger_name, keywords_to_search)
'''This function dumps the entire dictionary onto the disc for further analysis and inference.'''
abstract_year_dictionary_dumper(abstract_year_dictionary, abstracts_log_name, status_logger_name)
return 0
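'''A minimal, hedged usage sketch: the keyword string and log file prefixes below are hypothetical placeholders;
in the package these names are normally produced by helpers such as pre_processing() (imported above) before
scraper_main() is called.'''
if __name__ == "__main__":
	'''Hypothetical example values; replace them with real keywords and log file prefixes.'''
	example_keywords = "valuation of ecosystem services"
	example_abstracts_log_name = "LOG_Example_Abstracts"
	example_status_logger_name = "LOG_Example_Status"
	scraper_main(example_keywords, example_abstracts_log_name, example_status_logger_name)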