Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose timeout parameter #42

Merged
merged 3 commits into from
Jan 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions elm/web/file_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ class AsyncFileLoader:
}
"""Default header"""

PAGE_LOAD_TIMEOUT = 90_000
"""Default page load timeout value in milliseconds"""

def __init__(
self,
header_template=None,
Expand Down Expand Up @@ -205,9 +208,9 @@ async def _fetch_doc(self, url):
if doc.pages:
return doc, url_bytes

text = await load_html_with_pw(
url, self.browser_semaphore, **self.pw_launch_kwargs
)
text = await load_html_with_pw(url, self.browser_semaphore,
timeout=self.PAGE_LOAD_TIMEOUT,
**self.pw_launch_kwargs)
doc = await self.html_read_coroutine(text, **self.html_read_kwargs)
if doc.pages:
return doc, doc.text
Expand Down
9 changes: 6 additions & 3 deletions elm/web/google_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ class PlaywrightGoogleLinkSearch:
EXPECTED_RESULTS_PER_PAGE = 10
"""Number of results displayed per Google page. """

PAGE_LOAD_TIMEOUT = 90_000
"""Default page load timeout value in milliseconds"""

def __init__(self, **launch_kwargs):
"""
Expand Down Expand Up @@ -68,7 +71,7 @@ async def _search(self, query, num_results=10):
num_results = min(num_results, self.EXPECTED_RESULTS_PER_PAGE)

page = await self._browser.new_page()
await _navigate_to_google(page)
await _navigate_to_google(page, timeout=self.PAGE_LOAD_TIMEOUT)
await _perform_google_search(page, query)
return await _extract_links(page, num_results)

Expand Down Expand Up @@ -282,10 +285,10 @@ async def _load_docs(urls, browser_semaphore=None, **kwargs):
return [doc for doc in docs if not doc.empty]


async def _navigate_to_google(page):
async def _navigate_to_google(page, timeout=90_000):
"""Navigate to Google domain."""
await page.goto("https://www.google.com")
await page.wait_for_load_state("networkidle")
await page.wait_for_load_state("networkidle", timeout=timeout)


async def _perform_google_search(page, search_query):
Expand Down
19 changes: 12 additions & 7 deletions elm/web/html_pw.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ async def _intercept_route(route): # pragma: no cover


async def load_html_with_pw( # pragma: no cover
url, browser_semaphore=None, **pw_launch_kwargs
url, browser_semaphore=None, timeout=90_000, **pw_launch_kwargs
):
"""Extract HTML from URL using Playwright.
Expand All @@ -72,6 +72,10 @@ async def load_html_with_pw( # pragma: no cover
Semaphore instance that can be used to limit the number of
playwright browsers open concurrently. If ``None``, no limits
are applied. By default, ``None``.
timeout : int, optional
Maximum time to wait for the page to reach the requested load
state, in milliseconds. Pass `0` to disable the timeout.
By default, ``90,000``.
**pw_launch_kwargs
Keyword-value argument pairs to pass to
:meth:`async_playwright.chromium.launch`.
Expand All @@ -82,25 +86,26 @@ async def load_html_with_pw( # pragma: no cover
HTML from page.
"""
try:
text = await _load_html(url, browser_semaphore, **pw_launch_kwargs)
text = await _load_html(url, browser_semaphore=browser_semaphore,
timeout=timeout, **pw_launch_kwargs)
except (PlaywrightError, PlaywrightTimeoutError):
text = ""
return text


async def _load_html( # pragma: no cover
url, browser_sem=None, **pw_launch_kwargs
url, browser_semaphore=None, timeout=90_000, **pw_launch_kwargs
):
"""Load html using playwright"""
if browser_sem is None:
browser_sem = AsyncExitStack()
if browser_semaphore is None:
browser_semaphore = AsyncExitStack()

async with async_playwright() as p, browser_sem:
async with async_playwright() as p, browser_semaphore:
browser = await p.chromium.launch(**pw_launch_kwargs)
page = await browser.new_page()
await page.route("**/*", _intercept_route)
await page.goto(url)
await page.wait_for_load_state("networkidle", timeout=90_000)
await page.wait_for_load_state("networkidle", timeout=timeout)
text = await page.content()

return text
Loading