# acquire_data.py
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from db import send_to_db
from scrape_more import scraping_way_back, scraping_precise, scraping_broad
from test import clean_up
from scrape_all_approach import scrape_current_day
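# send_to_db comes from this repo's own db module, which is not shown in this
# file. Purely as a hypothetical sketch of the assumed contract (a key plus a
# scraped payload; the real implementation and schema may differ), a minimal
# stand-in could look like:
def _sketch_send_to_db(key, payload):
    import json, sqlite3
    conn = sqlite3.connect('scraper.db')  # hypothetical local store
    conn.execute('CREATE TABLE IF NOT EXISTS snapshots (key TEXT, payload TEXT)')
    conn.execute('INSERT INTO snapshots VALUES (?, ?)', (key, json.dumps(payload)))
    conn.commit()
    conn.close()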
# 2015 - 2016
snapshots = [
    #'20150213020035',  # this snapshot uses a different page format
    #'20150315063000',  # same: different page format
    #'20150415033839',
    #'20150515144023',
    #'20150615082347',
    #'20150716072943',
    '20150924003236',
    '20151025034147'
]
# 2017 - 2019
simple_snapshots = [
    '20190325082923',
    '20180717123539',
    '20181104041242',
    '20170512121814',
    '20170613135845',
    '20170716073915',
    '20170816010622',
    #'20171220140301'
]
# 2020 - 2021 snapshots have no list here; run20_21 is passed its snapshot
# directly in the control hub at the bottom of this file.
def string_maker(snapshot):
    # Build the Wayback Machine URL for one snapshot timestamp.
    return 'https://web.archive.org/web/' + snapshot + '/https://www.sofascore.com/tennis/rankings/wta'
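# For example, string_maker('20150924003236') returns
# 'https://web.archive.org/web/20150924003236/https://www.sofascore.com/tennis/rankings/wta'.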
def run15_16(snapshot_list):
    # 2015 - 2016 pages use the older markup; grab one container by class
    # name, then clean the raw text before storing it.
    for counter, snapshot in enumerate(snapshot_list):
        url = string_maker(snapshot)
        print(f'{counter} made a string: {url}')
        scraped = scraping_broad(url, 'js-list-filter-items')
        print(f'scraped new snapshot {snapshot}')
        cleaned = clean_up(scraped)
        print(f'cleaned the data {cleaned}')
        send_to_db(snapshot, cleaned)
        print(f'{counter} sent it to the db')
    print('collected 2015 - 2016 snapshot data successfully')
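# The scraping helpers are imported from scrape_more and are not shown in
# this file. As an illustration only (an assumption about the helper, not the
# repo's actual implementation), scraping_broad presumably drives a headless
# browser and pulls the text of one container element by class name:
def _sketch_scraping_broad(url, class_name):
    opts = Options()
    opts.add_argument('--headless')  # no visible browser window
    driver = webdriver.Chrome(options=opts)
    try:
        driver.get(url)  # load the archived snapshot
        return driver.find_element(By.CLASS_NAME, class_name).text
    finally:
        driver.quit()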
def run17_19(snapshot_list):
    # 2017 - 2019 snapshots need no separate clean-up step; store the
    # scraped data directly.
    for counter, snapshot in enumerate(snapshot_list):
        url = string_maker(snapshot)
        print(f'{counter} made a string: {url}')
        scraped = scraping_way_back(url)
        print(f'{counter} scraped data successfully {scraped}')
        send_to_db(snapshot, scraped)
        print(f'{counter} sent it to the db {scraped}')
    print('collected 2017 - 2019 snapshot data successfully')
def run20_21(snapshot_list):
    # 2020 - 2021 pages are React-rendered, so scraping_precise needs the
    # exact class names of the virtualized list and its cells. The hashed
    # names (e.g. 'Content-sc-1o55eay-0.gYsVZh') are styled-components
    # classes tied to one build of the site, so they only match snapshots
    # that shipped that CSS.
    for counter, snapshot in enumerate(snapshot_list):
        url = string_maker(snapshot)
        print(f'{counter} made a string: {url}')
        scraped = scraping_precise(url,
                                   'ReactVirtualized__Grid.ReactVirtualized__List',
                                   'ReactVirtualized__Grid__innerScrollContainer',
                                   'Content-sc-1o55eay-0.gYsVZh',
                                   'Section-sc-1a7xrsb-0.hwkKwf',
                                   'Content-sc-1o55eay-0.gYsVZh',
                                   'Content-sc-1o55eay-0.gYsVZh')
        print(f'scraped new snapshot {snapshot}')
        send_to_db(snapshot, scraped)
        print(f'{counter} sent it to the db {scraped}')
    print('collected 2020 - 2021 snapshot data successfully')
def read_csv_to_db(name, path):
    # Load a CSV file as raw lines and store it under the given name.
    with open(path, 'r') as file:
        data = file.readlines()
    print(f'data: {data}')
    send_to_db(name, data)
    print('sent data to db')
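# read_csv_to_db stores raw lines, headers and trailing newlines included.
# If parsed rows were wanted instead, a standard-library alternative (a
# sketch, not what this repo does) would be:
def _sketch_read_csv_rows(path):
    import csv
    with open(path, newline='') as file:
        return list(csv.reader(file))  # each row as a list of column values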
def scrape_current():
    # Scrape today's rankings from the live site (no Wayback prefix).
    print('start current day scraping')
    scraped = scrape_current_day('https://www.sofascore.com/tennis/rankings/wta')
    print(f'current day scraped {scraped}')
    send_to_db('wtaMasterList', scraped)
    print('sent current day data to db')
'''
CONTROL HUB
Uncomment the call you want and run this file.
'''
#run15_16(snapshots)
#run17_19(simple_snapshots)
#run20_21(['20200318145554'])
#scrape_current()
#read_csv_to_db('population_data', '/home/a_mind/Downloads/population-and-demography.csv')
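# A slightly more conventional equivalent of the control hub would gate the
# chosen call behind a main guard, e.g.:
#
# if __name__ == '__main__':
#     scrape_current()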