-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathmain_houses.py
119 lines (102 loc) · 4.24 KB
/
main_houses.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import subprocess
import pandas as pd
import time
import tkinter as tk
import os
# -*- Load links dataframe -*-
LINKS_CSV = './additional_csv/links.csv'

# Lines starting with "link" are treated as header rows. Record the row number
# of each one so that every header after the first can be skipped on parsing.
# The file is opened in a ``with`` block so the handle is closed immediately.
with open(LINKS_CSV) as links_file:
    exclude = [i for i, line in enumerate(links_file) if line.startswith('link')]

if len(exclude) == 1:
    # Exactly one header row: the file can be parsed as-is.
    all_zones = pd.read_csv(LINKS_CSV)
else:
    # Keep the first header (if any) and drop the repeated ones.
    all_zones = pd.read_csv(LINKS_CSV, skiprows=exclude[1:])

# Normalize province names: strip the accents from lowercase vowels and map
# "ñ" to "n" in a single pass (same character mapping as before).
_ACCENT_TABLE = str.maketrans('áéíóúñ', 'aeioun')
all_zones['province'] = all_zones['province'].str.translate(_ACCENT_TABLE)

# Sorted, de-duplicated province names offered to the user for selection.
provinces = all_zones.province.sort_values().unique()
# -*- Select provinces with GUI
selected = []


def selection():
    """Record the rows chosen in the listbox, then close the window.

    Used as the button callback: the text of every currently selected row of
    the listbox ``l`` is appended to the module-level ``selected`` list, and
    the Tk ``root`` window is destroyed afterwards.
    """
    selected.extend(l.get(row) for row in l.curselection())
    root.destroy()
# Build the selection window: a multi-select listbox holding one row per
# province, with a confirmation button underneath.
root = tk.Tk()
l = tk.Listbox(root, selectmode=tk.MULTIPLE, height=25, width=50)
b = tk.Button(root, command=selection, text="Select province(s)")

# Rows are inserted at consecutive positions, in the order of ``provinces``.
for position, name in enumerate(provinces):
    l.insert(position, name)

l.pack()
b.pack()
root.geometry("500x500+500+100")

# Run the Tk event loop; ``selection`` destroys ``root`` when the button is
# pressed.
root.mainloop()
# -*- Extract houses from selected provinces
# Log files checked after each crawl (presumably written by the ``houses``
# spider when a request is denied -- TODO confirm against the spider code).
LINK_LOG = 'logLink.txt'
HOUSE_LOG = 'logHouse.txt'
# start-url-code to extract denied links
DENIED_HOUSES_START_URL = 'https://www.idealista.com/login'


def _banner(message, tail="\n"):
    """Print *message* framed by two rows of asterisks."""
    print("\n***********************")
    print(message)
    print("***********************" + tail)


def _crawl(start_url, province):
    """Run the ``houses`` spider on *start_url* with the province CSV as output.

    The command is passed as an argument list (no shell), so characters such
    as ``&``, ``;``, quotes or spaces inside a URL or a province name reach
    scrapy verbatim instead of being interpreted by the shell.  Note that a
    missing ``scrapy`` executable now raises ``FileNotFoundError`` instead of
    only printing a shell error.
    """
    output = f'./province_houses/houses_{province.replace(" ", "_")}.csv'
    subprocess.run(
        ['scrapy', 'crawl', 'houses', '-a', f'start_url={start_url}', '-o', output]
    )


def _retry_denied_links(province):
    """Re-crawl the links listed in ``LINK_LOG`` until no new log appears."""
    while True:
        # Only the file read is guarded: a missing log simply means there is
        # nothing to retry, and crawl errors are no longer silently hidden.
        try:
            with open(LINK_LOG) as log:
                # Blank lines would start a crawl with an empty start_url.
                denied_links = [line.strip() for line in log if line.strip()]
        except FileNotFoundError:
            return
        except (OSError, UnicodeError) as err:
            print(err)
            return

        # Delete the log so that a file found after the crawl is a new one.
        removal_failed = False
        try:
            os.remove(LINK_LOG)
        except OSError as err:
            print(err)
            removal_failed = True

        _banner(f"Extracting denied houses links from {province}", "\n\n")
        print("******** Waiting 3 minutes before starting")
        time.sleep(180)
        for link in denied_links:
            _crawl(link, province)

        # -*- Check if still are denied houses -*-
        # If the old log could not be deleted, its presence says nothing about
        # new denials; stop rather than re-crawl the same links forever.
        if removal_failed or not os.path.isfile(LINK_LOG):
            return
        print("******** STILL ARE DENIED LINKS! Waiting 2 extra minutes before reload")
        time.sleep(120)


def _retry_denied_houses(province):
    """Re-run the spider on the denied-houses start URL while ``HOUSE_LOG`` exists."""
    if not os.path.isfile(HOUSE_LOG):
        return
    while True:
        _banner(f"Extracting denied houses links from {province}", "\n\n")
        print("******** Waiting 3 minutes before starting")
        time.sleep(180)
        _crawl(DENIED_HOUSES_START_URL, province)

        # -*- Check if still are denied houses -*-
        if not os.path.isfile(HOUSE_LOG):
            return
        print("******** STILL ARE DENIED HOUSES! Waiting 2 extra minutes before reload")
        time.sleep(120)


for province in selected:
    filtered_zones = all_zones[all_zones['province'] == province]
    # Total for the province, computed from the rows already filtered above.
    num_link = filtered_zones['num_link'].sum()
    _banner(f"Crawler is ready to extract {num_link} houses from {province}")
    time.sleep(3)

    for zone in filtered_zones.itertuples():
        _banner(f"Extracting {zone.num_link} houses from a zone of {province}", "\n\n")
        time.sleep(3)
        _crawl(zone.link, province)
        print("******** ZONE HOUSE EXTRACTION FINISHED! Waiting 20 seconds before reload")
        time.sleep(20)

    # -*- Get denied links of selected province -*-
    _retry_denied_links(province)
    # -*- Get denied houses of selected province -*-
    _retry_denied_houses(province)