Skip to content

Commit

Permalink
add new scrapers to pipeline
Browse files: browse the repository at this point in the history
  • Loading branch information
indiejoseph committed Feb 9, 2025
1 parent 74d937f commit ea907c3
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 2 deletions.
4 changes: 4 additions & 0 deletions scrape.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -7,6 +7,8 @@
from typing import Dict, TYPE_CHECKING
from scraper.rthk_zh import RTHKChineseTelegramScraper
from scraper.inmediahknet import InMediaHKNetTelegramScraper
from scraper.hk01 import HK01Scraper
from scraper.stheadline import HeadlineScraper
from huggingface_hub import HfApi

if TYPE_CHECKING:
Expand All @@ -22,6 +24,8 @@ def main(num_proc=3):
scrapers: Dict[str, Scraper] = {
"InMediaHKNet": InMediaHKNetTelegramScraper(num_proc=num_proc),
"RTHKChinese": RTHKChineseTelegramScraper(num_proc=num_proc),
"HK01": HK01Scraper(num_proc=num_proc),
"Headline": HeadlineScraper(num_proc=num_proc),
}
temp_dir = tempfile.TemporaryDirectory()

Expand Down
4 changes: 2 additions & 2 deletions scraper/hk01.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,7 +3,7 @@
from scraper.api_scraper import APIScraper


class HK01(APIScraper):
class HK01Scraper(APIScraper):
def __init__(self, **kwargs):
super().__init__(
index_url="https://web-data.api.hk01.com/v2/feed/category/0?bucketId=00000",
Expand Down Expand Up @@ -33,7 +33,7 @@ def parse_article(self, item: dict) -> dict:
import asyncio

# Example usage
scraper = HK01(
scraper = HK01Scraper(
num_proc=1,
)

Expand Down

0 comments on commit ea907c3

Please sign in to comment.