-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
36 lines (27 loc) · 1.44 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# Packages
from pathlib import Path
from urllib.parse import urljoin

import pandas
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Obtain data URLs (needed as ONS urls are not always consistent) before cleaning.
# Each entry is a dataset landing page; the spreadsheet links are scraped from
# these pages rather than hard-coded.
url = ['https://www.ons.gov.uk/economy/grossdomesticproductgdp/datasets/gdpmonthlyestimateuktimeseriesdataset',
       'https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/earningsandworkinghours/datasets/averageweeklyearnings',
       'https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/employmentandemployeetypes/datasets/employmentbyindustryemp13']
urls = []  # every href found on the landing pages, in page order
for page_url in url:
    # A timeout stops the script hanging forever on an unresponsive server.
    reqs = requests.get(page_url, timeout=30)
    # Fail loudly on an HTTP error instead of silently scraping an error page.
    reqs.raise_for_status()
    soup = BeautifulSoup(reqs.text, 'html.parser')
    # href=True skips anchors that have no href attribute; link.get('href')
    # returns None for those, which would make the ".xls" filter below raise
    # TypeError.
    for link in soup.find_all('a', href=True):
        urls.append(link['href'])
# Keep only the links whose href contains ".xls" (this also matches ".xlsx").
match = [s for s in urls if ".xls" in s]
# Download and save xls/xlsx files
path = "/Users/joshuarawlings/PycharmProjects/Automation/"
for x in match:
    # urljoin resolves site-relative hrefs against the ONS host and leaves
    # already-absolute hrefs untouched, unlike plain string concatenation.
    file = requests.get(urljoin('https://www.ons.gov.uk', x), verify=True, timeout=60)
    # Do not save an HTML error page under a spreadsheet file name.
    file.raise_for_status()
    # The last "/"-separated segment of the href is used as the local file name.
    ext = (x.split("/")[-1:])
    # The context manager guarantees the handle is flushed and closed; Path
    # joining avoids the doubled "/" that path + '/' + name produced.
    with open(Path(path) / ext[0], 'wb') as out_file:
        out_file.write(file.content)
# TODO: Find a solution that doesn't download the files and instead just loads the latest data ready for processing.
# Process employment data
# NOTE(review): the workbook name is hard-coded; presumably it has to be
# updated by hand whenever a newer release is downloaded - confirm.
employment = "/Users/joshuarawlings/PycharmProjects/Automation/emp13aug2022.xls"
# Read the "People" sheet, taking row 6 (0-based) as the header row and the
# first column as the index.
emp = pd.read_excel(employment, sheet_name="People", header=6, index_col=0)
# Convert every column to numbers; cells that cannot be parsed become NaN.
numeric = emp.apply(pd.to_numeric, errors='coerce')
# Boolean mask selecting the columns whose label does not start with "Unnamed".
named_columns = ~emp.columns.str.contains('^Unnamed')
# Keep the selected columns, drop rows that contain any NaN, and round to
# whole numbers.
emp = numeric.loc[:, named_columns].dropna().round(decimals=0)