
fix(scrapper): manage audio and video requests (#58)
* fix(scrapper): manage audio and video requests

* refactor(scrapper): modify request_headers_size into response_headers_size

* feat(scrapper): create custom get_request_size method, and appropriate tests

* refactor(scrapper): use lambda function to filter headers

* feat(scrapper): check page method, status 200 and content type html

* fix(scrapper): text/html mimetype check failed because of a specific suffix appended to the mimetype
PaulPHPE authored Feb 28, 2024
1 parent 7f3d7b3 commit 771ad9b
Showing 2 changed files with 182 additions and 12 deletions.
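
The main behavioural change is how a request's transferred size is resolved when the HAR entry reports a `_transferSize` of -1, which is what happens for audio and video range requests. Below is a minimal sketch of that fallback order as a hypothetical standalone helper (illustrative names, not the project's API), assuming a HAR entry dict as recorded by Playwright:

```python
import json


def resolve_transfer_size(entry: dict) -> int:
    # Illustrative fallback order, mirroring the get_request_size logic in the diff below.
    response = entry["response"]
    # 1. Trust the browser-reported transfer size when it is available.
    if response["_transferSize"] != -1:
        return response["_transferSize"]
    # 2. For partial content (HTTP 206, typical of audio/video range requests),
    #    fall back to the Content-Length header (case-insensitive lookup).
    for header in response["headers"]:
        if header["name"].lower() == "content-length" and response["status"] == 206:
            return int(header["value"])
    # 3. Last resort: approximate with the size of the serialized response entry.
    return len(json.dumps(response).encode("utf-8"))
```

For example, an entry with `"_transferSize": -1`, status 206 and a Content-Length header of 7347 resolves to 7347, matching the third test case added in test_scraper.py.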
49 changes: 37 additions & 12 deletions components/ecoindex/scraper/scrap.py
@@ -72,23 +72,17 @@ async def scrap_page(self) -> PageMetrics:
)
await stealth_async(self.page)
response = await self.page.goto(self.url)
if response and response.status != 200:
raise EcoindexScraperStatusException(
url=self.url,
status=response.status,
message=response.status_text,
)
await self.check_page_response(response)

await self.page.wait_for_load_state()
sleep(self.wait_before_scroll)
await self.generate_screenshot()
await self.page.keyboard.press('ArrowDown')
await self.page.keyboard.press("ArrowDown")
await self.page.evaluate(
"window.scrollTo({ top: document.body.scrollHeight, behavior: 'smooth' })"
)
sleep(self.wait_after_scroll)
total_nodes = await self.get_nodes_count()

await self.page.close()
await browser.close()

@@ -119,9 +113,11 @@ async def get_requests_from_har_file(self):
url = entry["request"]["url"]
mime_type = entry["response"]["content"]["mimeType"]
category = await MimetypeAggregation.get_category_of_resource(mime_type)
size = entry["response"]["_transferSize"]
aggregation[category]["total_count"] += 1
size = self.get_request_size(entry)
aggregation[category]["total_size"] += size
self.all_requests.total_count += 1
self.all_requests.total_size += size
self.all_requests.items.append(
RequestItem(
url=url,
@@ -131,9 +127,6 @@ async def get_requests_from_har_file(self):
category=category,
)
)

self.all_requests.total_count += 1
self.all_requests.total_size += entry["response"]["_transferSize"]
self.all_requests.aggregation = MimetypeAggregation(**aggregation)
os.remove(self.har_temp_file_path)

@@ -142,3 +135,35 @@ async def get_nodes_count(self) -> int:
svgs = await self.page.locator("//*[local-name()='svg']//*").all()

return len(nodes) - len(svgs)

def get_request_size(self, entry) -> int:
if entry["response"]["_transferSize"] != -1:
return entry["response"]["_transferSize"]
headers = entry["response"]["headers"]
content_length_header = list(
filter(lambda header: (header["name"].lower() == "content-length"), headers)
)
if len(content_length_header) > 0 and entry["response"]["status"] == 206:
return int(content_length_header[0]["value"])
else:
return len(json.dumps(entry["response"]).encode("utf-8"))

async def check_page_response(self, response) -> None:
if response and response.status != 200:
raise EcoindexScraperStatusException(
url=self.url,
status=response.status,
message=response.status_text,
)
headers = response.headers
content_type = next((value for key, value in headers.items() if key.lower() == 'content-type'), None)
if content_type and "text/html" not in content_type:
raise TypeError(
{
"mimetype": content_type,
"message": (
"This resource is not "
"a standard page with mimeType 'text/html'"
),
}
)
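
The last commit in the series addresses the content-type check: real responses usually carry a parameter suffix such as "; charset=utf-8" after the media type, so check_page_response uses a substring test instead of strict equality. A minimal illustration (the header value below is an assumption, not taken from the diff):

```python
# Hypothetical header value; many servers append a charset parameter to the mimetype.
content_type = "text/html; charset=utf-8"

assert "text/html" in content_type   # the substring test accepts the suffixed value
assert content_type != "text/html"   # strict equality would have rejected it
```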
145 changes: 145 additions & 0 deletions test/components/ecoindex/scraper/test_scraper.py
@@ -1,3 +1,5 @@
import json
from ecoindex.exceptions.scraper import EcoindexScraperStatusException
from ecoindex.models import ScreenShot, WindowSize
from ecoindex.scraper import EcoindexScraper

@@ -45,3 +47,146 @@ def test_scraper_init_with_options():
assert scraper.screenshot.get_webp() == f"{screenshot_folder}/{screenshot_id}.webp" # type: ignore
assert scraper.screenshot_gid == screenshot_gid
assert scraper.page_load_timeout == page_load_timeout


def test_get_request_size():
mock_stripped_har_entry = (
{
"request": {
"url": "https://www.ecoindex.fr/",
},
"response": {
"status": 200,
"headers": [
{"name": "content-length", "value": "7347"},
],
"content": {
"mimeType": "text/html",
},
"_transferSize": 7772,
},
},
{
"request": {
"url": "https://www.ecoindex.fr/",
},
"response": {
"status": 200,
"headers": [
{"name": "content-length", "value": "7347"},
],
"content": {
"mimeType": "text/html",
},
"_transferSize": -1,
},
},
{
"request": {
"url": "https://www.ecoindex.fr/",
},
"response": {
"status": 206,
"headers": [
{"name": "Content-Length", "value": "7347"},
],
"content": {
"mimeType": "text/html",
},
"_transferSize": -1,
},
},
)
url = "https://www.example.com"
window_size = WindowSize(width=800, height=600)
wait_before_scroll = 2
wait_after_scroll = 2
screenshot_uid = 123
screenshot_gid = 456
page_load_timeout = 30
screenshot_id = "123"
screenshot_folder = "/tmp/screenshots"

scraper = EcoindexScraper(
url=url, # type: ignore
window_size=window_size,
wait_before_scroll=wait_before_scroll,
wait_after_scroll=wait_after_scroll,
screenshot=ScreenShot(id=screenshot_id, folder=screenshot_folder),
screenshot_uid=screenshot_uid,
screenshot_gid=screenshot_gid,
page_load_timeout=page_load_timeout,
)
assert scraper.get_request_size(mock_stripped_har_entry[0]) == 7772
assert scraper.get_request_size(mock_stripped_har_entry[1]) == len(
json.dumps(mock_stripped_har_entry[1]["response"]).encode("utf-8")
)
assert scraper.get_request_size(mock_stripped_har_entry[1]) == len(
json.dumps(mock_stripped_har_entry[1]["response"]).encode("utf-8")
)
assert scraper.get_request_size(mock_stripped_har_entry[2]) == 7347


async def test_check_page_response():
mock_stripped_har_entry = (
{
"response": {
"status": 200,
"headers": {"content-type": "audio/mpeg"},
}
},
{
"response": {
"status": 404,
"headers": {"content-type": "text/html"},
"status_text": "Not Found",
}
},
{
"response": {
"status": 200,
"headers": {"content-type": "text/html"},
}
},
)
url = "https://www.example.com"
window_size = WindowSize(width=800, height=600)
wait_before_scroll = 2
wait_after_scroll = 2
screenshot_uid = 123
screenshot_gid = 456
page_load_timeout = 30
screenshot_id = "123"
screenshot_folder = "/tmp/screenshots"

scraper = EcoindexScraper(
url=url, # type: ignore
window_size=window_size,
wait_before_scroll=wait_before_scroll,
wait_after_scroll=wait_after_scroll,
screenshot=ScreenShot(id=screenshot_id, folder=screenshot_folder),
screenshot_uid=screenshot_uid,
screenshot_gid=screenshot_gid,
page_load_timeout=page_load_timeout,
)
try:
scraper.check_page_response(mock_stripped_har_entry[0])
except TypeError as e:
assert str(e) == {
"mimetype": "audio/mpeg",
"message": (
"This resource is not "
"a standard page with mimeType 'text/html'"
),
}

try:
scraper.check_page_response(mock_stripped_har_entry[1])
except EcoindexScraperStatusException as e:
assert str(e) == {
"url": "https://www.example.com",
"status": 404,
"message": mock_stripped_har_entry[1]["response"]["status_text"],
}

assert scraper.check_page_response(mock_stripped_har_entry[2]) is None

1 comment on commit 771ad9b

@github-actions

Coverage PR

Branch coverage

File                                   Stmts  Miss  Cover  Missing
components/ecoindex/scraper/scrap.py     95    51    46%   45, 48–49, 51, 60, 63, 66–68, 73–75, 77–81, 84–87, 89, 91, 98–101, 108–110, 112–121, 130–131, 134–135, 137, 152–153, 158–161
TOTAL                                   699   230    67%
