-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
120 lines (92 loc) · 4.05 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import numpy as np
from bs4 import BeautifulSoup
import requests
import pandas as pd
def extract_attrs(url, timeout=60):
    """Extracts the attributes of the restaurant rows from the HTML of the website.

    Every ``div`` element with the class ``restaurant-row`` becomes one row of
    the result. All of its HTML attributes except ``class`` become columns,
    and a leading ``data-`` is stripped from the column names.

    Args:
        url (str): The URL of the website.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        pandas.DataFrame: The extracted attributes. ``cena`` and ``doplacilo``
            are floats (a decimal comma is accepted), ``lat`` and ``lon`` are
            floats, ``posid`` is an int and ``lokal``, ``naslov``, ``city``
            and ``detailslink`` use the pandas string dtype.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        ValueError: If the page contains no ``restaurant-row`` elements.
    """
    prefix = 'data-'

    # Get the HTML from the website. Without a timeout a stalled connection
    # would block the scraper forever, and without the status check an error
    # page would be parsed as if it were the restaurant list.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all div elements with the class 'restaurant-row'
    divs = soup.find_all('div', class_=['restaurant-row'])
    if not divs:
        raise ValueError(f'No restaurant-row elements found at {url}')

    # Collect the attributes of each div, leaving out 'class'. A new dict is
    # built per element so the parsed tags themselves are not modified.
    data = [
        {name: value for name, value in div.attrs.items() if name != 'class'}
        for div in divs
    ]

    # Create a Pandas DataFrame from the data list
    df = pd.DataFrame(data)

    # Remove the prefix 'data-' from the column names (only at the start of
    # the name, never in the middle of it)
    df = df.rename(columns={
        col: col[len(prefix):] for col in df.columns if col.startswith(prefix)
    })

    # Convert the types of the columns
    for col in ('cena', 'doplacilo'):
        # Prices use a decimal comma on the website
        df[col] = df[col].str.replace(',', '.').astype(float)
    for col in ('lat', 'lon'):
        df[col] = df[col].astype(float)
    df['posid'] = df['posid'].astype(int)
    for col in ('lokal', 'naslov', 'city', 'detailslink'):
        df[col] = df[col].astype('string')
    return df
def load_data(url_new=None, url_old=None):
    """Loads the data from the website and from the archive.

    Args:
        url_new (str, optional): URL of the current restaurant list. Defaults
            to the live studentska-prehrana.si restaurant page.
        url_old (str, optional): URL of the older restaurant list to compare
            against. Defaults to a web.archive.org snapshot of the same page.

    Returns:
        tuple: The data from the website and from the archive as a tuple of
            pandas.DataFrames, in the order ``(df_new, df_old)``.
    """
    if url_new is None:
        url_new = 'https://www.studentska-prehrana.si/sl/restaurant'
    if url_old is None:
        url_old = 'https://web.archive.org/web/20220601102110/https://www.studentska-prehrana.si/sl/restaurant'

    df_new = extract_attrs(url_new)
    df_old = extract_attrs(url_old)
    return df_new, df_old
def merge_data(df_new, df_old):
    """Merges the data from the website and from the archive.

    The two frames are joined on ``posid`` with an outer join, so restaurants
    that appear in only one of them are kept. Descriptive columns take the
    new value and fall back to the old one when the new one is missing. The
    prices are kept for both points in time (``cena`` / ``cena_old`` and
    ``doplacilo`` / ``doplacilo_old``) together with their absolute and
    percentage differences.

    The input frames are not modified.

    Args:
        df_new (pandas.DataFrame): The data from the website.
        df_old (pandas.DataFrame): The data from the archive.

    Returns:
        pandas.DataFrame: The merged data.
    """
    # Columns where only one value is kept: the new one, or the old one if
    # the restaurant is missing from the new data.
    carried = ['lat', 'lon', 'lokal', 'naslov', 'city', 'detailslink', 'sort-group']

    # Work on copies so the manual corrections below do not leak into the
    # frames owned by the caller.
    df_new = df_new.copy()
    df_old = df_old.copy()

    # Fix the posid of two restaurants
    for old_posid, new_posid in ((2829, 3191), (2875, 3205)):
        df_old.loc[df_old['posid'] == old_posid, 'posid'] = new_posid

    # Fix wrong new price of some restaurants
    price_fixes = {
        3071: {'doplacilo': 0.0, 'cena': 3.5},
        2521: {'doplacilo': 3.4, 'cena': 6.9},
    }
    for posid, fixes in price_fixes.items():
        for col, value in fixes.items():
            df_new.loc[df_new['posid'] == posid, col] = value

    # Merge the data
    df = pd.merge(df_old, df_new, on=['posid'], how='outer', suffixes=('_old', '_new'))

    # Fill the NaN values with the old values and drop the suffixed columns
    for col in carried:
        df[col] = df[f'{col}_new'].fillna(df[f'{col}_old'])
    df = df.drop(
        [f'{col}{suffix}' for col in carried for suffix in ('_old', '_new')],
        axis=1,
    )

    # The new prices get the plain names, the old prices keep their suffix
    df = df.rename(columns={'cena_new': 'cena', 'doplacilo_new': 'doplacilo'})

    # Reorder the columns; the copy makes the result independent of the
    # merged frame before new columns are added to it
    df = df[['lokal', 'naslov', 'city', 'cena', 'cena_old', 'doplacilo', 'doplacilo_old', 'lat', 'lon', 'posid', 'detailslink', 'sort-group']].copy()

    # Calculate the differences.
    # NOTE: an old price of 0 makes the percentage inf (or NaN for 0 / 0).
    df['cena_diff'] = df['cena'] - df['cena_old']
    df['doplacilo_diff'] = df['doplacilo'] - df['doplacilo_old']
    df['cena_diff_percent'] = df['cena_diff'] / df['cena_old'] * 100
    df['doplacilo_diff_percent'] = df['doplacilo_diff'] / df['doplacilo_old'] * 100
    return df
if __name__ == '__main__':
    import os

    # Scrape the current and the archived list and combine them
    df_new, df_old = load_data()
    df = merge_data(df_new, df_old)
    print(df.head())
    print(f'{len(df)} restaurants')

    # The pandas writers do not create missing directories, so make sure
    # the output directory exists before saving
    os.makedirs('data', exist_ok=True)

    # saves the data to a json file
    df.to_json('data/restavracije.json', orient='records')
    # saves the data to a csv file
    df.to_csv('data/restavracije.csv', index=False)