scrape.py
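"""Scrape articles from several Hong Kong news sources and upload each one as a
CSV file to a Hugging Face dataset repository.

Expects the environment variables HF_REPO_NAME (target dataset repo id) and
HF_TOKEN (a Hugging Face token with write access to that repo) to be set.
"""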
import os
import hashlib
import asyncio
import time
import tempfile
from typing import Dict, TYPE_CHECKING

import pandas as pd
import requests
from tqdm.auto import tqdm
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
)
from huggingface_hub import HfApi

from scraper.rthk_zh_telegram import RTHKChineseTelegramScraper
from scraper.inmediahknet import InMediaHKNetTelegramScraper
from scraper.hk01 import HK01Scraper
from scraper.stheadline import HeadlineScraper
from scraper.rthk_zh import RTHKChineseScraper
from scraper.rthk_en import RTHKEnglishScraper

if TYPE_CHECKING:
    from scraper.scraper import Scraper

# Target Hugging Face dataset repo and access token, read from the environment.
REPO_NAME = os.getenv("HF_REPO_NAME")
HF_TOKEN = os.getenv("HF_TOKEN")

api = HfApi(token=HF_TOKEN)


def is_rate_limit_error(exception):
    """Return True if the exception is an HTTP 429 (rate-limited) response."""
    # Note: not referenced by the retry policy below, which retries on any
    # HTTPError or ConnectionError.
    return (
        isinstance(exception, requests.exceptions.HTTPError)
        and exception.response.status_code == 429
    )


@retry(
    retry=retry_if_exception_type((requests.exceptions.HTTPError, ConnectionError)),
    stop=stop_after_attempt(7),  # at most 7 attempts
    wait=wait_exponential(
        multiplier=1, min=4, max=60
    ),  # start at 4s, double each time, cap at 60s
    before_sleep=lambda retry_state: print(
        f"Rate limited. Retry attempt {retry_state.attempt_number}. "
        f"Waiting {retry_state.next_action.sleep} seconds..."
    ),
)
def upload_to_hf(local_path, path_in_repo, repo_id, repo_type="dataset", wait_time=3):
    """Upload a file to the Hugging Face Hub, retrying with backoff on failure."""
    # Wait before every call, even the first attempt, to stay under the Hub's rate limit.
    time.sleep(wait_time)
    return api.upload_file(
        path_or_fileobj=local_path,
        path_in_repo=path_in_repo,
        repo_id=repo_id,
        repo_type=repo_type,
    )


def main(num_proc=3):
    scrapers: Dict[str, "Scraper"] = {
        "InMediaHKNet": InMediaHKNetTelegramScraper(num_proc=num_proc),
        "RTHKChineseTelegram": RTHKChineseTelegramScraper(num_proc=num_proc),
        "RTHKChinese": RTHKChineseScraper(num_proc=num_proc),
        "RTHKEnglish": RTHKEnglishScraper(num_proc=num_proc),
        "HK01": HK01Scraper(num_proc=num_proc),
        "Headline": HeadlineScraper(num_proc=num_proc),
    }
    temp_dir = tempfile.TemporaryDirectory()
    for key in tqdm(scrapers.keys(), desc="Scraping"):
        scraper = scrapers[key]
        articles = asyncio.run(scraper.get_articles())
        for article in tqdm(articles, desc=f"Uploading {key}"):
            # Convert the article to a single-row DataFrame
            article_dict = article.to_dict()
            df = pd.DataFrame([article_dict])
            # Use the MD5 hash of the article id as a stable file name
            article_id = hashlib.md5(article.id.encode()).hexdigest()
            # Save the DataFrame to a temporary CSV file
            temp_file_name = f"{article_id}.csv"
            temp_file_path = os.path.join(temp_dir.name, temp_file_name)
            df.to_csv(temp_file_path, index=False)
            # Upload the CSV file to the Hugging Face dataset repo
            upload_to_hf(
                temp_file_path,
                f"articles/{key}/{article_id}.csv",
                REPO_NAME,
                "dataset",
            )
    temp_dir.cleanup()


if __name__ == "__main__":
    main()
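# Example invocation (a sketch, assuming the scraper package is importable and
# the environment variables are exported in the shell beforehand):
#   HF_REPO_NAME=<user>/<dataset> HF_TOKEN=<token> python scrape.py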