Commit 1be09ae

refactor: remove Selenium dependency and simplify scraping logic with requests
Flexicon committed Jan 10, 2025
1 parent 42624f1 commit 1be09ae
Showing 4 changed files with 38 additions and 68 deletions.
2 changes: 0 additions & 2 deletions requirements.txt
@@ -7,7 +7,5 @@ python-dotenv==0.11.0
 requests==2.22.0
 requests-cache==0.9.7
 responses==0.14.0
-selenium==3.141.0
-tenacity==9.0.0
 uvicorn==0.15.0
 gunicorn==20.0.4
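An aside: requests-cache stays pinned even though helpers.py keeps its own traits_cache dict. If HTTP-level caching were ever preferred, the already-pinned package could do it transparently; a hypothetical sketch, not something this commit does:

import requests_cache

# Route every requests.get() through a local SQLite cache for an hour.
requests_cache.install_cache("tft_scraper", expire_after=3600)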
54 changes: 8 additions & 46 deletions scraper/helpers.py
@@ -4,74 +4,36 @@
 import requests
 from bs4 import BeautifulSoup
 from bs4.element import Tag
-from selenium import webdriver
-from selenium.common.exceptions import TimeoutException
-from tenacity import (
-    retry,
-    retry_if_exception_type,
-    stop_after_attempt,
-    wait_exponential,
-)
 
 from common.models import Champion
 
 traits_cache = {}
 
 
-class ScraperWebDriver:
-    def __init__(self) -> None:
-        chrome_options = webdriver.ChromeOptions()
-        chrome_options.add_argument("--no-sandbox")
-        chrome_options.add_argument("--headless")
-        self.driver = webdriver.Chrome(options=chrome_options)
-        self.driver.set_page_load_timeout(10)
-        self.driver.implicitly_wait(5)
-
-    @retry(
-        stop=stop_after_attempt(3),
-        retry=retry_if_exception_type(TimeoutException),
-        wait=wait_exponential(min=1, max=5),
-        reraise=True,
-    )
-    def fetch_content_html(self, url: str, *, selector: str = ".main") -> str:
-        print(f"Fetching html from: {url}")
-        self.driver.get(url)
-        return self.driver.find_element_by_css_selector(selector).get_attribute(
-            "innerHTML"
-        )
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.driver.close()
-
-
-def _build_champion_from_character(
-    driver: ScraperWebDriver, character: Tag
-) -> Champion:
-    traits = _scrape_traits_for_character(driver, character)
+def _build_champion_from_character(character: Tag) -> Champion:
+    traits = _scrape_traits_for_character(character)
     img_tag = character.find("img")
     name = img_tag["alt"]
     icon = img_tag["src"]
     cost = _price_from_character_class(" ".join(character["class"]))
     return Champion(name=name, image=icon, cost=cost, traits=traits)
 
 
-def _scrape_traits_for_character(driver: ScraperWebDriver, character: Tag) -> list[str]:
+def _scrape_traits_for_character(character: Tag) -> list[str]:
     href = character["href"]
     url = f"https://tftactics.gg{href}" if href.startswith("/") else href
 
     if url in traits_cache:
         print(f"Using cached traits for: {url}")
         return traits_cache[url]
 
-    try:
-        html = driver.fetch_content_html(url)
-    except TimeoutException:
-        print(f"Timeout fetching champion traits from: {url}")
+    print(f"Fetching champion traits from: {url}")
+    response = requests.get(url)
+    if response.status_code != 200:
+        print(f"Failed to fetch champion traits from: {url}")
         return []
 
+    html = response.text
     traits = traits_cache[url] = _extract_traits_from_character_html(html)
     return traits
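The removed ScraperWebDriver retried timed-out page loads up to three times with exponential backoff (1-5s) via tenacity; the new requests path fetches each URL exactly once. If that resilience is wanted again, a small wrapper reproduces it with plain requests; a hypothetical sketch, not part of this commit:

import time

import requests


def get_with_retry(url: str, attempts: int = 3, timeout: float = 10.0) -> requests.Response:
    """Fetch url, retrying transient failures with capped exponential backoff."""
    for attempt in range(attempts):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.RequestException:
            if attempt == attempts - 1:
                raise  # out of attempts: propagate the last error, like reraise=True
            time.sleep(min(2**attempt, 5))  # 1s, 2s, ... capped, like wait_exponential(min=1, max=5)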
27 changes: 17 additions & 10 deletions scraper/scrape_champions.py
@@ -1,36 +1,43 @@
+import requests
+
 from typing import List
 
 from bs4 import BeautifulSoup
 from pymongo.collection import Collection
 
 from common.models import Champion
 from common.db import DB
-from .helpers import ScraperWebDriver, _build_champion_from_character
+from .helpers import _build_champion_from_character
 
-TFTChampionsURL = r'https://tftactics.gg/tierlist/champions'
+TFTChampionsURL = r"https://tftactics.gg/tierlist/champions"
+ChampionsSelector = ".characters-list > .characters-item"
 
 
 def scrape_champions() -> List[Champion]:
-    with ScraperWebDriver() as driver:
-        html = driver.fetch_content_html(TFTChampionsURL)
-        characters = BeautifulSoup(html, 'html.parser').select('.characters-list > .characters-item')
-        champions = [_build_champion_from_character(driver, c) for c in characters]
+    print(f"Fetching champions from: {TFTChampionsURL}")
+    res = requests.get(TFTChampionsURL)
+    res.raise_for_status()
+
+    characters = BeautifulSoup(res.text, "html.parser").select(ChampionsSelector)
+    champions = [_build_champion_from_character(c) for c in characters]
     return champions
 
 
 def scrape_and_persist(collection: Collection):
     result = scrape_champions()
-    print('Found {count} champions\n{separator}\n'.format(count=len(result), separator="-" * 15))
+    print(f'Found {len(result)} champions\n{"-" * 15}\n')
 
     for champion in result:
-        print(f'Name: {champion.name}\nImage: {champion.image}\nCost: {champion.cost}\nTraits: {champion.traits}\n')
+        print(
+            f"Name: {champion.name}\nImage: {champion.image}\nCost: {champion.cost}\nTraits: {champion.traits}\n"
+        )
 
     collection.drop()
     collection.insert_many([comp.dict() for comp in result])
-    print('Saved latest champions to db successfully!')
+    print("Saved latest champions to db successfully!")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     print("Scraping Champions 🕷️")
     db = DB().connect()
     scrape_and_persist(db.get_champions_collection())
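Worth noting: the two fetch paths now handle failures differently. _scrape_traits_for_character logs a non-200 reply and returns an empty trait list, while the list pages use raise_for_status(), which converts any 4xx/5xx response into a requests.HTTPError and aborts the run. A minimal illustration of the latter (URL hypothetical):

import requests

res = requests.get("https://example.com/missing")
try:
    res.raise_for_status()  # raises requests.HTTPError on any 4xx/5xx status
except requests.HTTPError as err:
    print(f"Aborting scrape: {err}")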
23 changes: 13 additions & 10 deletions scraper/scrape_comps.py
@@ -1,3 +1,5 @@
+import requests
+
 from typing import List, Optional
 
 from bs4 import BeautifulSoup
@@ -6,28 +8,29 @@
 
 from common.models import Champion, Comp, Item, ItemRecommendation
 from common.db import DB
-from .helpers import ScraperWebDriver, _build_champion_from_character
+from .helpers import _build_champion_from_character
 
 TFTCompsURL = r"https://tftactics.gg/tierlist/team-comps"
 
 
 def scrape_comps() -> List[Comp]:
-    with ScraperWebDriver() as driver:
-        html = driver.fetch_content_html(TFTCompsURL)
-        teams = BeautifulSoup(html, "html.parser").find_all(
-            "div", class_="team-portrait"
-        )
-        comps = [_build_comp_from_team(driver, t) for t in teams]
-        return comps
+    print(f"Fetching comps from: {TFTCompsURL}")
+    res = requests.get(TFTCompsURL)
+    res.raise_for_status()
+
+    teams = BeautifulSoup(res.text, "html.parser").find_all(
+        "div", class_="team-portrait"
+    )
+    return [_build_comp_from_team(t) for t in teams]
 
 
-def _build_comp_from_team(driver: ScraperWebDriver, team: Tag) -> Comp:
+def _build_comp_from_team(team: Tag) -> Comp:
     playstyle = team.find_next(class_="team-playstyle").get_text()
     name = team.find_next(class_="team-name-elipsis").get_text().replace(playstyle, "")
 
     tier = team.find_next(class_="team-rank").get_text()
     characters = team.select(".team-characters > .characters-item")
-    champions = [_build_champion_from_character(driver, c) for c in characters]
+    champions = [_build_champion_from_character(c) for c in characters]
     items = list(map(_build_item_recommendation, characters, champions))
 
     return Comp(
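One detail in _build_comp_from_team: map with two iterables consumes them in lockstep, pairing each character tag with the champion built from it, so _build_item_recommendation receives one of each per call. A standalone illustration of the pattern (example values are illustrative):

# map over two sequences calls the function pairwise, like zip:
pairs = list(map(lambda tag, champ: (tag, champ), ["c1", "c2"], ["Ahri", "Jinx"]))
assert pairs == [("c1", "Ahri"), ("c2", "Jinx")]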
