-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmovies.py
68 lines (51 loc) · 2.29 KB
/
movies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import asyncio
from content_fetcher import ContentFetcher
async def fetch_movie_data(base_url, html_fetcher):
    """
    Collect Oscar-winning film data for every year listed on the page at ``base_url``.

    The landing page is fetched first and the stripped text of each ``.year-link``
    element is taken as a year. One request per year is then issued concurrently via
    ``fetch_movies_for_year`` and the per-year results are flattened into one list.

    :param base_url: The base URL to fetch movie data from.
    :param html_fetcher: An instance of ContentFetcher (or any object with an awaitable
        ``fetch_content(url, params=...)``). The landing-page result must support
        ``.select(css_selector)`` and yield elements with a ``.text`` attribute.
    :return: A flat list of movie entries. An empty list is returned when the landing
        page cannot be fetched or parsed, so callers always receive a list.
    """
    try:
        # Fetch HTML content from the base URL
        html_content = await html_fetcher.fetch_content(base_url, params={})
        # Extract movie years from HTML content
        movie_year_list = [link.text.strip() for link in html_content.select('.year-link')]
        # Fetch JSON content for each movie year concurrently
        tasks = [fetch_movies_for_year(base_url, html_fetcher, movie_year) for movie_year in movie_year_list]
        per_year_results = await asyncio.gather(*tasks)
        # Flatten the list of lists into a single list. A year whose fetch produced
        # no payload (None) is skipped so it cannot abort the whole aggregation.
        return [
            movie
            for sublist in per_year_results
            if sublist is not None
            for movie in sublist
        ]
    except Exception as e:
        print("Error:", e)
        # Honour the documented contract: always hand back a list, matching the
        # empty-list fallback used by fetch_movies_for_year.
        return []
async def fetch_movies_for_year(base_url, html_fetcher, movie_year):
    """
    Request the movie data for a single year.

    The request goes to ``base_url`` with the query parameters ``ajax=true`` and
    ``year=<movie_year>``; whatever ``html_fetcher.fetch_content`` yields is returned
    unchanged.

    :param base_url: The base URL to fetch movie data from.
    :param html_fetcher: An instance of ContentFetcher for fetching HTML and JSON content.
    :param movie_year: The year for which to fetch movie data.
    :return: The fetched content for the year, or an empty list if the fetch raised.
    """
    query = {"ajax": "true", "year": movie_year}
    try:
        payload = await html_fetcher.fetch_content(base_url, params=query)
    except Exception as err:
        # Best effort: report the failure and let the other years proceed.
        print(f"Error fetching data for year {movie_year}: {err}")
        return []
    return payload
async def main():
    """
    Program entry coroutine.

    Builds a ContentFetcher around 'movie_cache.json', gathers the movie data from
    the scrapethissite AJAX page, and prints the result. Any exception raised along
    the way is printed rather than propagated.
    """
    try:
        fetcher = ContentFetcher('movie_cache.json')
        # Gather everything from the target page and dump it to stdout.
        collected = await fetch_movie_data(
            "http://www.scrapethissite.com/pages/ajax-javascript/",
            fetcher,
        )
        print(collected)
    except Exception as err:
        print("Error:", err)
if __name__ == "__main__":
    # Executed as a script: drive the main() coroutine to completion with asyncio.run.
    asyncio.run(main())