asktheweb.py
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import os
import logging
from typing import Set, List
from aiolimiter import AsyncLimiter
import argparse

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class AsyncCrawler:
"""
Asynchronous web crawler that fetches and saves content from a specified domain.
Attributes:
start_url (str): The initial URL to start crawling from.
domain (str): The domain to restrict crawling to.
output_dir (str): Directory to save crawled content.
seen_urls (Set[str]): Set of URLs already processed.
rate_limiter (AsyncLimiter): Limits the rate of requests to the server.
"""
def __init__(self, start_url: str, output_dir: str, rate_limit: int = 5):
"""
Initialize the AsyncCrawler.
Args:
start_url (str): The URL to start crawling from.
output_dir (str): Directory to save crawled content.
rate_limit (int): Maximum number of requests per second.
"""
self.start_url = start_url
self.domain = urlparse(start_url).netloc
self.output_dir = output_dir
self.seen_urls: Set[str] = set()
self.rate_limiter = AsyncLimiter(rate_limit, 1) # 5 requests per second
async def crawl(self):
"""
Start the crawling process.
"""
os.makedirs(self.output_dir, exist_ok=True)
async with aiohttp.ClientSession() as session:
await self.process_url(session, self.start_url)
async def process_url(self, session: aiohttp.ClientSession, url: str):
"""
Process a single URL: fetch content, save text, and extract links.
Args:
session (aiohttp.ClientSession): The active client session.
url (str): The URL to process.
"""
if url in self.seen_urls:
return
self.seen_urls.add(url)
async with self.rate_limiter:
try:
async with session.get(url) as response:
if response.status != 200:
logger.warning(f"Failed to fetch {url}: HTTP {response.status}")
return
content_type = response.headers.get('Content-Type', '')
if not content_type.startswith('text/html'):
logger.info(f"Skipping non-HTML content at {url}")
return
text = await response.text()
except aiohttp.ClientError as e:
logger.error(f"Error fetching {url}: {e}")
return
soup = BeautifulSoup(text, 'html.parser')
self.save_text(url, soup.get_text())
tasks = []
for link in self.extract_links(soup, url):
if link not in self.seen_urls:
tasks.append(asyncio.create_task(self.process_url(session, link)))
await asyncio.gather(*tasks)
    def extract_links(self, soup: BeautifulSoup, base_url: str) -> List[str]:
        """
        Extract all links from a BeautifulSoup object that belong to the same domain.

        Args:
            soup (BeautifulSoup): Parsed HTML content.
            base_url (str): The base URL for resolving relative links.

        Returns:
            List[str]: List of extracted full URLs.
        """
        links = []
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            full_url = urljoin(base_url, href)
            if urlparse(full_url).netloc == self.domain:
                links.append(full_url)
        return links

    def save_text(self, url: str, text: str):
        """
        Save the extracted text content to a file.

        Args:
            url (str): The URL of the page (used for the filename).
            text (str): The text content to save.
        """
        filename = url.replace('https://', '').replace('http://', '').replace('/', '_') + '.txt'
        filepath = os.path.join(self.output_dir, filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(text)
        logger.info(f"Saved content from {url} to {filepath}")
def main():
    """
    Main function to set up and run the crawler.
    """
    parser = argparse.ArgumentParser(description="Web Crawler")
    parser.add_argument("start_url", help="The URL to start crawling from")
    parser.add_argument("--output", default="crawled_text", help="Output directory for crawled text")
    parser.add_argument("--rate-limit", type=int, default=5, help="Maximum number of requests per second")
    args = parser.parse_args()
    crawler = AsyncCrawler(args.start_url, args.output, args.rate_limit)
    asyncio.run(crawler.crawl())


if __name__ == "__main__":
    main()
"""
Usage:
pip install aiohttp beautifulsoup4 aiolimiter
python async_crawler.py https:URL --output crawled_data --rate-limit 3
"""