# crawler.py
# Source: a repository forked from project-sunbird/jugalbandi-api.
import requests
from bs4 import BeautifulSoup
import os
# Crawl state used by get_child_urls().

# Prefix that get_child_urls() puts in front of every site-relative path it
# follows; it starts empty and the script assigns the entered URL to it.
DOMAIN_END_POINT = ''

# Paths already appended to URLs.txt, so each one is written only once.
links_written = set()

# Paths that have already been followed, so no page is requested twice.
visited_links = set()
def _fetch_site_paths(page_url):
    """Return the site-relative hrefs found on *page_url*, in document order.

    Only hrefs that start with a single "/" are kept. The bare root "/" is
    skipped, and so are protocol-relative links ("//host/path"), which point
    at another host and must not be glued onto this site's prefix.

    A page that cannot be fetched is reported on stdout and yields an empty
    list, so one dead link does not abort the whole crawl.
    """
    try:
        # Bound the wait so an unresponsive server cannot stall the crawl.
        response = requests.get(page_url, timeout=30)
    except requests.RequestException as exc:
        print("Skipping " + page_url + " -- " + str(exc))
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
    return [
        href
        for href in hrefs
        if href
        and href.startswith("/")
        and not href.startswith("//")
        and href != "/"
    ]


def _record_paths(paths):
    """Append every path not yet in ``links_written`` to URLs.txt."""
    with open("URLs.txt", "a") as f:
        for path in paths:
            if path not in links_written:
                links_written.add(path)
                f.write(path + "\n")


def get_child_urls(url):
    """Crawl the site-relative links reachable from *url* into URLs.txt.

    Every path that has not been followed before is added to the module-level
    ``visited_links`` set, announced on stdout, and fetched in turn
    (depth-first, in document order). Once all links of a page have been
    followed, that page's paths are appended to URLs.txt, skipping any
    already present in ``links_written``.

    The traversal uses an explicit stack instead of recursion, so a long
    chain of pages cannot exhaust Python's recursion limit.

    Args:
        url: Address of the page to start from.

    Returns:
        None. Results are the side effects on ``visited_links``,
        ``links_written`` and URLs.txt.
    """
    # Child URLs are built as prefix + path. Dropping a trailing "/" from the
    # prefix avoids "https://host//path". If DOMAIN_END_POINT was never set,
    # fall back to the start URL, which is what the script assigns to it.
    prefix = (DOMAIN_END_POINT or url).rstrip("/")

    start_paths = _fetch_site_paths(url)
    # Each entry pairs a page's paths with an iterator over the ones still to
    # be followed; the top of the stack is the page currently being expanded.
    pending = [(start_paths, iter(start_paths))]
    while pending:
        page_paths, remaining = pending[-1]
        for child_link in remaining:
            if child_link in visited_links:
                continue
            child_url = prefix + child_link
            print(child_url + " -- " + child_link)
            visited_links.add(child_link)
            child_paths = _fetch_site_paths(child_url)
            pending.append((child_paths, iter(child_paths)))
            # Descend into the child first; this page resumes afterwards.
            break
        else:
            # All links of this page were followed: record it and go back up.
            pending.pop()
            _record_paths(page_paths)
def read_from_sources(base_url):
    """Scrape the text of every path listed in URLs.txt into one file.

    For each line of URLs.txt the page at ``base_url + line`` is downloaded,
    its <script> and <style> elements are removed, and the remaining text is
    appended to web_scrapped_data.txt as a single line.

    A page that cannot be fetched is reported on stdout and skipped, so the
    remaining pages are still scraped.

    Args:
        base_url: Prefix placed in front of every path read from URLs.txt.
            A trailing "/" is dropped so the joined URL has no "//".

    Raises:
        FileNotFoundError: If URLs.txt does not exist.
    """
    with open("URLs.txt", "r") as f:
        existing_urls = f.read().splitlines()

    site_prefix = base_url.rstrip("/")
    for url in existing_urls:
        page_url = site_prefix + url
        print("Read from URL:", page_url)

        try:
            # Bound the wait so an unresponsive server cannot stall the run.
            page = requests.get(page_url, timeout=30)
        except requests.RequestException as exc:
            print("Skipping " + page_url + " -- " + str(exc))
            continue

        soup = BeautifulSoup(page.content, "html.parser")

        # Scripts and styles are not page text; remove them before extraction.
        for tag in soup.find_all(['script', 'style']):
            tag.extract()

        text = soup.get_text()

        # Put the page on one line: strip each line and join with spaces.
        # Blank lines become empty strings, so they show up as extra spaces.
        output = ' '.join(line.strip() for line in text.splitlines())

        # Scraped pages may contain any Unicode text; write UTF-8 explicitly
        # instead of relying on the platform's default encoding.
        with open('web_scrapped_data.txt', 'a', encoding='utf-8') as f:
            f.write(output + "\n")
def clean_up():
    """Delete the output files of a previous run and report each outcome.

    URLs.txt and web_scrapped_data.txt are handled in that order; for each
    one a line is printed saying whether it was removed or was not there.
    """
    for stale_file in ("URLs.txt", "web_scrapped_data.txt"):
        # Check first so a missing file is reported rather than raising.
        if not os.path.exists(stale_file):
            print("File '" + stale_file + "' does not exist.")
            continue
        os.remove(stale_file)
        print("File '" + stale_file + "' removed successfully.")
# NOTE: these steps run whenever the module is executed or imported.

# Remove the output of any previous run so this crawl starts from scratch.
clean_up()

# Ask for the site to crawl. Surrounding whitespace is stripped because the
# value is used verbatim as the request URL and as the prefix for every path.
url = input("Enter website URL: ").strip()
if not url:
    # Stop with a clear message instead of failing later inside requests.
    raise SystemExit("No website URL entered.")

# get_child_urls() reads this module-level prefix when building child URLs.
DOMAIN_END_POINT = url

# First collect the site's paths into URLs.txt, then scrape each listed page.
get_child_urls(url)
read_from_sources(url)