Commit 1be09ae

refactor: remove Selenium dependency and simplify scraping logic with requests
Flexicon committed Jan 10, 2025
1 parent 42624f1 commit 1be09ae
Showing 4 changed files with 38 additions and 68 deletions.
2 changes: 0 additions & 2 deletions requirements.txt
@@ -7,7 +7,5 @@ python-dotenv==0.11.0
 requests==2.22.0
 requests-cache==0.9.7
 responses==0.14.0
-selenium==3.141.0
-tenacity==9.0.0
 uvicorn==0.15.0
 gunicorn==20.0.4
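An aside: requests-cache stays pinned even though helpers.py keeps its own traits_cache dict. If HTTP-level caching were ever preferred, the already-pinned package could do it transparently; a hypothetical sketch, not something this commit does:

import requests_cache

# Route every requests.get() through a local SQLite cache for an hour.
requests_cache.install_cache("tft_scraper", expire_after=3600)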
54 changes: 8 additions & 46 deletions scraper/helpers.py
@@ -4,74 +4,36 @@
 import requests
 from bs4 import BeautifulSoup
 from bs4.element import Tag
-from selenium import webdriver
-from selenium.common.exceptions import TimeoutException
-from tenacity import (
-    retry,
-    retry_if_exception_type,
-    stop_after_attempt,
-    wait_exponential,
-)
 
 from common.models import Champion
 
 traits_cache = {}
 
 
-class ScraperWebDriver:
-    def __init__(self) -> None:
-        chrome_options = webdriver.ChromeOptions()
-        chrome_options.add_argument("--no-sandbox")
-        chrome_options.add_argument("--headless")
-        self.driver = webdriver.Chrome(options=chrome_options)
-        self.driver.set_page_load_timeout(10)
-        self.driver.implicitly_wait(5)
-
-    @retry(
-        stop=stop_after_attempt(3),
-        retry=retry_if_exception_type(TimeoutException),
-        wait=wait_exponential(min=1, max=5),
-        reraise=True,
-    )
-    def fetch_content_html(self, url: str, *, selector: str = ".main") -> str:
-        print(f"Fetching html from: {url}")
-        self.driver.get(url)
-        return self.driver.find_element_by_css_selector(selector).get_attribute(
-            "innerHTML"
-        )
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.driver.close()
-
-
-def _build_champion_from_character(
-    driver: ScraperWebDriver, character: Tag
-) -> Champion:
-    traits = _scrape_traits_for_character(driver, character)
+def _build_champion_from_character(character: Tag) -> Champion:
+    traits = _scrape_traits_for_character(character)
     img_tag = character.find("img")
     name = img_tag["alt"]
     icon = img_tag["src"]
     cost = _price_from_character_class(" ".join(character["class"]))
     return Champion(name=name, image=icon, cost=cost, traits=traits)
 
 
-def _scrape_traits_for_character(driver: ScraperWebDriver, character: Tag) -> list[str]:
+def _scrape_traits_for_character(character: Tag) -> list[str]:
     href = character["href"]
     url = f"https://tftactics.gg{href}" if href.startswith("/") else href
 
     if url in traits_cache:
         print(f"Using cached traits for: {url}")
         return traits_cache[url]
 
-    try:
-        html = driver.fetch_content_html(url)
-    except TimeoutException:
-        print(f"Timeout fetching champion traits from: {url}")
+    print(f"Fetching champion traits from: {url}")
+    response = requests.get(url)
+    if response.status_code != 200:
+        print(f"Failed to fetch champion traits from: {url}")
         return []
 
+    html = response.text
     traits = traits_cache[url] = _extract_traits_from_character_html(html)
     return traits
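The removed ScraperWebDriver retried timed-out page loads up to three times with exponential backoff (1-5s) via tenacity; the new requests path fetches each URL exactly once. If that resilience is wanted again, a small wrapper reproduces it with plain requests; a hypothetical sketch, not part of this commit:

import time

import requests


def get_with_retry(url: str, attempts: int = 3, timeout: float = 10.0) -> requests.Response:
    """Fetch url, retrying transient failures with capped exponential backoff."""
    for attempt in range(attempts):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.RequestException:
            if attempt == attempts - 1:
                raise  # out of attempts: propagate the last error, like reraise=True
            time.sleep(min(2**attempt, 5))  # 1s, 2s, ... capped, like wait_exponential(min=1, max=5)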
27 changes: 17 additions & 10 deletions scraper/scrape_champions.py
@@ -1,36 +1,43 @@
+import requests
+
 from typing import List
 
 from bs4 import BeautifulSoup
 from pymongo.collection import Collection
 
 from common.models import Champion
 from common.db import DB
-from .helpers import ScraperWebDriver, _build_champion_from_character
+from .helpers import _build_champion_from_character
 
-TFTChampionsURL = r'https://tftactics.gg/tierlist/champions'
+TFTChampionsURL = r"https://tftactics.gg/tierlist/champions"
+ChampionsSelector = ".characters-list > .characters-item"
 
 
 def scrape_champions() -> List[Champion]:
-    with ScraperWebDriver() as driver:
-        html = driver.fetch_content_html(TFTChampionsURL)
-        characters = BeautifulSoup(html, 'html.parser').select('.characters-list > .characters-item')
-        champions = [_build_champion_from_character(driver, c) for c in characters]
+    print(f"Fetching champions from: {TFTChampionsURL}")
+    res = requests.get(TFTChampionsURL)
+    res.raise_for_status()
+
+    characters = BeautifulSoup(res.text, "html.parser").select(ChampionsSelector)
+    champions = [_build_champion_from_character(c) for c in characters]
     return champions
 
 
 def scrape_and_persist(collection: Collection):
     result = scrape_champions()
-    print('Found {count} champions\n{separator}\n'.format(count=len(result), separator="-" * 15))
+    print(f'Found {len(result)} champions\n{"-" * 15}\n')
 
     for champion in result:
-        print(f'Name: {champion.name}\nImage: {champion.image}\nCost: {champion.cost}\nTraits: {champion.traits}\n')
+        print(
+            f"Name: {champion.name}\nImage: {champion.image}\nCost: {champion.cost}\nTraits: {champion.traits}\n"
+        )
 
     collection.drop()
     collection.insert_many([comp.dict() for comp in result])
-    print('Saved latest champions to db successfully!')
+    print("Saved latest champions to db successfully!")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     print("Scraping Champions 🕷️")
     db = DB().connect()
     scrape_and_persist(db.get_champions_collection())
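Worth noting: the two fetch paths now handle failures differently. _scrape_traits_for_character logs a non-200 reply and returns an empty trait list, while the list pages use raise_for_status(), which converts any 4xx/5xx response into a requests.HTTPError and aborts the run. A minimal illustration of the latter (URL hypothetical):

import requests

res = requests.get("https://example.com/missing")
try:
    res.raise_for_status()  # raises requests.HTTPError on any 4xx/5xx status
except requests.HTTPError as err:
    print(f"Aborting scrape: {err}")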
23 changes: 13 additions & 10 deletions scraper/scrape_comps.py
@@ -1,3 +1,5 @@
+import requests
+
 from typing import List, Optional
 
 from bs4 import BeautifulSoup
@@ -6,28 +8,29 @@
 
 from common.models import Champion, Comp, Item, ItemRecommendation
 from common.db import DB
-from .helpers import ScraperWebDriver, _build_champion_from_character
+from .helpers import _build_champion_from_character
 
 TFTCompsURL = r"https://tftactics.gg/tierlist/team-comps"
 
 
 def scrape_comps() -> List[Comp]:
-    with ScraperWebDriver() as driver:
-        html = driver.fetch_content_html(TFTCompsURL)
-        teams = BeautifulSoup(html, "html.parser").find_all(
-            "div", class_="team-portrait"
-        )
-        comps = [_build_comp_from_team(driver, t) for t in teams]
-        return comps
+    print(f"Fetching comps from: {TFTCompsURL}")
+    res = requests.get(TFTCompsURL)
+    res.raise_for_status()
+
+    teams = BeautifulSoup(res.text, "html.parser").find_all(
+        "div", class_="team-portrait"
+    )
+    return [_build_comp_from_team(t) for t in teams]
 
 
-def _build_comp_from_team(driver: ScraperWebDriver, team: Tag) -> Comp:
+def _build_comp_from_team(team: Tag) -> Comp:
     playstyle = team.find_next(class_="team-playstyle").get_text()
     name = team.find_next(class_="team-name-elipsis").get_text().replace(playstyle, "")
 
     tier = team.find_next(class_="team-rank").get_text()
     characters = team.select(".team-characters > .characters-item")
-    champions = [_build_champion_from_character(driver, c) for c in characters]
+    champions = [_build_champion_from_character(c) for c in characters]
     items = list(map(_build_item_recommendation, characters, champions))
 
     return Comp(
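One detail in _build_comp_from_team: map with two iterables consumes them in lockstep, pairing each character tag with the champion built from it, so _build_item_recommendation receives one of each per call. A standalone illustration of the pattern (example values are illustrative):

# map over two sequences calls the function pairwise, like zip:
pairs = list(map(lambda tag, champ: (tag, champ), ["c1", "c2"], ["Ahri", "Jinx"]))
assert pairs == [("c1", "Ahri"), ("c2", "Jinx")]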
