Skip to content

Commit

Permalink
Merge pull request #43 from NREL/pp/trace_logging
Browse files Browse the repository at this point in the history
Add trace logging
  • Loading branch information
ppinchuk authored Jan 10, 2025
2 parents 42030a6 + 41165d4 commit 6346376
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 9 deletions.
7 changes: 7 additions & 0 deletions elm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
"""

import os
import logging
from functools import partial, partialmethod
from elm.base import ApiBase
from elm.chunk import Chunker
from elm.embed import ChunkAndEmbed
Expand All @@ -18,3 +20,8 @@

# Absolute path to the installed ``elm`` package directory.
ELM_DIR = os.path.dirname(os.path.realpath(__file__))
# Path to the ``tests/data`` directory that sits next to the package dir.
# NOTE(review): presumably only resolvable in a source checkout, not an
# installed wheel — confirm before relying on it outside the test suite.
TEST_DATA_DIR = os.path.join(os.path.dirname(ELM_DIR), 'tests', 'data')

# Register a custom TRACE logging level on the stdlib ``logging`` module.
# Numeric value 5 sits below DEBUG (10), so TRACE messages are only shown
# when a handler/logger is configured for level <= 5.
logging.TRACE = 5
logging.addLevelName(logging.TRACE, 'TRACE')
# Give every ``logging.Logger`` instance a ``.trace(msg, *args)`` method
# that forwards to ``Logger.log`` with the TRACE level pre-bound.
logging.Logger.trace = partialmethod(logging.Logger.log, logging.TRACE)
# Module-level convenience function mirroring ``logging.debug`` et al.
logging.trace = partial(logging.log, logging.TRACE)
3 changes: 3 additions & 0 deletions elm/utilities/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,7 @@ def read_pdf(pdf_bytes, verbose=True):
"""
import pdftotext

logger.trace("Attempting to load %d bytes as pdf", len(pdf_bytes))
try:
pages = _load_pdf_possibly_multi_col(pdf_bytes)
except pdftotext.Error as e:
Expand All @@ -380,6 +381,8 @@ def read_pdf(pdf_bytes, verbose=True):
logger.exception(e)
pages = []

logger.trace("`read_pdf` loaded %d bytes (%d pages)", len(pdf_bytes),
len(pages))
return pages


Expand Down
3 changes: 3 additions & 0 deletions elm/web/file_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,14 +200,17 @@ async def _fetch_doc(self, url):

async with aiohttp.ClientSession() as session:
try:
logger.trace("Fetching content from %r", url)
url_bytes = await self._fetch_content_with_retry(url, session)
except ELMRuntimeError:
return PDFDocument(pages=[]), None

logger.trace("Got content from %r", url)
doc = await self.pdf_read_coroutine(url_bytes, **self.pdf_read_kwargs)
if doc.pages:
return doc, url_bytes

logger.trace("PDF read failed; fetching HTML content from %r", url)
text = await load_html_with_pw(url, self.browser_semaphore,
timeout=self.PAGE_LOAD_TIMEOUT,
**self.pw_launch_kwargs)
Expand Down
30 changes: 21 additions & 9 deletions elm/web/google_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ async def _load_browser(self, pw_instance):

async def _close_browser(self):
    """Tear down the active browser session.

    Closes the Playwright browser held by this instance, then clears the
    cached handle so a fresh browser can be launched on the next search.
    """
    logger.trace("Closing browser...")
    browser = self._browser
    await browser.close()
    self._browser = None

Expand All @@ -70,9 +71,13 @@ async def _search(self, query, num_results=10):
logger.debug("Searching Google: %r", query)
num_results = min(num_results, self.EXPECTED_RESULTS_PER_PAGE)

logger.trace("Loading browser page for query: %r", query)
page = await self._browser.new_page()
logger.trace("Navigating to google for query: %r", query)
await _navigate_to_google(page, timeout=self.PAGE_LOAD_TIMEOUT)
logger.trace("Performing google search for query: %r", query)
await _perform_google_search(page, query)
logger.trace("Extracting links for query: %r", query)
return await _extract_links(page, num_results)

async def _skip_exc_search(self, query, num_results=10):
Expand All @@ -95,7 +100,9 @@ async def _get_links(self, queries, num_results):
)
for query in queries
]
logger.trace("Kicking off search for %d queries", len(searches))
results = await asyncio.gather(*searches)
logger.trace("Got results for link search:\n%r", results)
await self._close_browser()
return results

Expand Down Expand Up @@ -247,7 +254,10 @@ async def _search_single(question, browser_sem, num_results=10, **kwargs):
browser_sem = AsyncExitStack()

search_engine = PlaywrightGoogleLinkSearch(**kwargs)
logger.trace("Single search browser_semaphore=%r", browser_sem)
async with browser_sem:
logger.trace("Starting search for %r with browser_semaphore=%r",
question, browser_sem)
return await search_engine.results(question, num_results=num_results)


Expand All @@ -268,33 +278,35 @@ def _down_select_urls(search_results, num_urls=5):

async def _load_docs(urls, browser_semaphore=None, **kwargs):
"""Load a document for each input URL."""
logger.trace("Downloading docs for the following URL's:\n%r", urls)
logger.trace("kwargs for AsyncFileLoader:\n%s",
pprint.PrettyPrinter().pformat(kwargs))
file_loader = AsyncFileLoader(
browser_semaphore=browser_semaphore, **kwargs
)
docs = await file_loader.fetch_all(*urls)

logger.debug(
"Loaded the following number of pages for docs: %s",
pprint.PrettyPrinter().pformat(
{
doc.metadata.get("source", "Unknown"): len(doc.pages)
for doc in docs
}
),
)
page_lens = {doc.metadata.get("source", "Unknown"): len(doc.pages)
for doc in docs}
logger.debug("Loaded the following number of pages for docs:\n%s",
pprint.PrettyPrinter().pformat(page_lens))
return [doc for doc in docs if not doc.empty]


async def _navigate_to_google(page, timeout=90_000):
    """Open the Google homepage on *page* and wait for it to settle.

    Parameters
    ----------
    page : playwright page
        Browser page to navigate.
    timeout : int, optional
        Milliseconds to wait for the "networkidle" load state before
        timing out. By default, ``90_000``.
    """
    google_home = "https://www.google.com"
    await page.goto(google_home)
    logger.trace("Waiting for google to load")
    # Wait until network activity quiets down so the search bar is ready
    await page.wait_for_load_state("networkidle", timeout=timeout)


async def _perform_google_search(page, search_query):
    """Type *search_query* into the Google search bar and submit it.

    Fills the search input, dismisses the autofill suggestion overlay
    (which would otherwise intercept the click), then clicks the
    "Google Search" button.
    """
    logger.trace("Finding search bar for query: %r", search_query)
    search_box = page.get_by_label("Search", exact=True)
    await search_box.fill(search_query)
    logger.trace("Closing autofill for query: %r", search_query)
    await _close_autofill_suggestions(page)
    logger.trace("Hitting search button for query: %r", search_query)
    search_button = page.get_by_role("button", name="Google Search")
    await search_button.click()


Expand Down
10 changes: 10 additions & 0 deletions elm/web/html_pw.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,24 @@ async def _load_html( # pragma: no cover
url, browser_semaphore=None, timeout=90_000, **pw_launch_kwargs
):
"""Load html using playwright"""
logger.trace("`_load_html` pw_launch_kwargs=%r", pw_launch_kwargs)
logger.trace("browser_semaphore=%r", browser_semaphore)

if browser_semaphore is None:
browser_semaphore = AsyncExitStack()

logger.trace("Loading HTML using playwright")
async with async_playwright() as p, browser_semaphore:
logger.trace("launching chromium; browser_semaphore=%r",
browser_semaphore)
browser = await p.chromium.launch(**pw_launch_kwargs)
logger.trace("Loading new page")
page = await browser.new_page()
logger.trace("Intercepting requests and aborting blocked ones")
await page.route("**/*", _intercept_route)
logger.trace("Navigating to: %r", url)
await page.goto(url)
logger.trace("Waiting for load with timeout: %r", timeout)
await page.wait_for_load_state("networkidle", timeout=timeout)
text = await page.content()

Expand Down

0 comments on commit 6346376

Please sign in to comment.