Skip to content

Commit

Permalink
Merge pull request #42 from NREL/pp/flexible_timeout
Browse files Browse the repository at this point in the history
Expose timeout parameter
  • Loading branch information
ppinchuk authored Jan 10, 2025
2 parents 6119620 + c20bb32 commit 42030a6
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 13 deletions.
9 changes: 6 additions & 3 deletions elm/web/file_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ class AsyncFileLoader:
}
"""Default header"""

PAGE_LOAD_TIMEOUT = 90_000
"""Default page load timeout value in milliseconds"""

def __init__(
self,
header_template=None,
Expand Down Expand Up @@ -205,9 +208,9 @@ async def _fetch_doc(self, url):
if doc.pages:
return doc, url_bytes

text = await load_html_with_pw(
url, self.browser_semaphore, **self.pw_launch_kwargs
)
text = await load_html_with_pw(url, self.browser_semaphore,
timeout=self.PAGE_LOAD_TIMEOUT,
**self.pw_launch_kwargs)
doc = await self.html_read_coroutine(text, **self.html_read_kwargs)
if doc.pages:
return doc, doc.text
Expand Down
9 changes: 6 additions & 3 deletions elm/web/google_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ class PlaywrightGoogleLinkSearch:
EXPECTED_RESULTS_PER_PAGE = 10
"""Number of results displayed per Google page. """

PAGE_LOAD_TIMEOUT = 90_000
"""Default page load timeout value in milliseconds"""

def __init__(self, **launch_kwargs):
"""
Expand Down Expand Up @@ -68,7 +71,7 @@ async def _search(self, query, num_results=10):
num_results = min(num_results, self.EXPECTED_RESULTS_PER_PAGE)

page = await self._browser.new_page()
await _navigate_to_google(page)
await _navigate_to_google(page, timeout=self.PAGE_LOAD_TIMEOUT)
await _perform_google_search(page, query)
return await _extract_links(page, num_results)

Expand Down Expand Up @@ -282,10 +285,10 @@ async def _load_docs(urls, browser_semaphore=None, **kwargs):
return [doc for doc in docs if not doc.empty]


async def _navigate_to_google(page, timeout=90_000):
    """Navigate to the Google home page and wait for it to settle.

    Parameters
    ----------
    page : playwright.async_api.Page
        Playwright page instance used to perform the navigation.
    timeout : int, optional
        Maximum time to wait for the ``"networkidle"`` load state, in
        milliseconds. Pass ``0`` to disable the timeout.
        By default, ``90_000``.
    """
    await page.goto("https://www.google.com")
    # Wait until network traffic quiets down so search elements exist
    await page.wait_for_load_state("networkidle", timeout=timeout)


async def _perform_google_search(page, search_query):
Expand Down
19 changes: 12 additions & 7 deletions elm/web/html_pw.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ async def _intercept_route(route): # pragma: no cover


async def load_html_with_pw(  # pragma: no cover
    url, browser_semaphore=None, timeout=90_000, **pw_launch_kwargs
):
    """Extract HTML from URL using Playwright.

    Parameters
    ----------
    url : str
        URL to pull HTML from.
    browser_semaphore : asyncio.Semaphore, optional
        Semaphore instance that can be used to limit the number of
        playwright browsers open concurrently. If ``None``, no limits
        are applied. By default, ``None``.
    timeout : int, optional
        Maximum time to wait for the page loading state, in
        milliseconds. Pass ``0`` to disable the timeout.
        By default, ``90_000``.
    **pw_launch_kwargs
        Keyword-value argument pairs to pass to
        :meth:`async_playwright.chromium.launch`.

    Returns
    -------
    str
        HTML from page, or an empty string if the page could not be
        loaded (navigation error or timeout).
    """
    try:
        text = await _load_html(url, browser_semaphore=browser_semaphore,
                                timeout=timeout, **pw_launch_kwargs)
    except (PlaywrightError, PlaywrightTimeoutError):
        # Best-effort loader: treat any playwright failure as "no
        # content" rather than propagating the error to callers
        text = ""
    return text


async def _load_html(  # pragma: no cover
    url, browser_semaphore=None, timeout=90_000, **pw_launch_kwargs
):
    """Load HTML from a URL using a fresh playwright chromium browser.

    Parameters
    ----------
    url : str
        URL to pull HTML from.
    browser_semaphore : asyncio.Semaphore, optional
        Optional semaphore limiting concurrent browser instances.
        If ``None``, no limit is applied. By default, ``None``.
    timeout : int, optional
        Maximum time to wait for the ``"networkidle"`` load state, in
        milliseconds. Pass ``0`` to disable the timeout.
        By default, ``90_000``.
    **pw_launch_kwargs
        Keyword-value argument pairs to pass to
        :meth:`async_playwright.chromium.launch`.

    Returns
    -------
    str
        Page content (HTML) after the page finished loading.
    """
    # AsyncExitStack is a no-op async context manager, so it stands in
    # for the semaphore when no concurrency limit was requested
    if browser_semaphore is None:
        browser_semaphore = AsyncExitStack()

    async with async_playwright() as p, browser_semaphore:
        browser = await p.chromium.launch(**pw_launch_kwargs)
        page = await browser.new_page()
        # Intercept requests (e.g. to skip unneeded resources) before
        # navigating -- see ``_intercept_route``
        await page.route("**/*", _intercept_route)
        await page.goto(url)
        await page.wait_for_load_state("networkidle", timeout=timeout)
        text = await page.content()

    return text

0 comments on commit 42030a6

Please sign in to comment.