Commit 918c799

Add download stats and estimate to fetch-data (#149)

willkg committed Sep 18, 2024
1 parent eef8a0e commit 918c799

Showing 5 changed files with 111 additions and 15 deletions.
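A usage sketch for the new flag, mirroring the arguments exercised by the new test_stats test below (this assumes the fetch-data console script that crashstats-tools installs; ./outputdir is a placeholder directory):

    fetch-data --raw --no-dumps --no-processed --stats ./outputdir 2ac9a763-83d2-4dca-89bb-091bd0220630

With --stats enabled, the per-crash "fetching ..." messages are suppressed and a progress line with a rough time-remaining estimate is printed every 100 crash IDs instead.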
69 changes: 57 additions & 12 deletions src/crashstats_tools/cmd_fetch_data.py
@@ -2,11 +2,13 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

from datetime import timedelta
from functools import partial
import json
from multiprocessing import Pool
import os
import sys
import time

import click
from rich.console import Console
@@ -35,9 +37,10 @@ def fetch_crash(
fetchraw,
fetchdumps,
fetchprocessed,
outputdir,
color,
overwrite,
stats,
outputdir,
):
"""Fetch crash data and save to correct place on the file system
@@ -59,9 +62,11 @@
# Fetch raw crash metadata to OUTPUTDIR/raw_crash/DATE/CRASHID
fn = os.path.join(outputdir, "raw_crash", "20" + crash_id[-6:], crash_id)
if os.path.exists(fn) and not overwrite:
console.print(f"{crash_id}: fetching raw crash -- already exists")
if not stats:
console.print(f"{crash_id}: fetching raw crash -- already exists")
else:
console.print(f"{crash_id}: fetching raw crash")
if not stats:
console.print(f"{crash_id}: fetching raw crash")
raw_crash = get_crash_annotations(crash_id, host=host, api_token=api_token)

# Save raw crash to file system
@@ -87,11 +92,13 @@

fn = os.path.join(outputdir, dump_name, crash_id)
if os.path.exists(fn) and not overwrite:
console.print(
f"{crash_id}: fetching dump: {dump_name} -- already exists"
)
if not stats:
console.print(
f"{crash_id}: fetching dump: {dump_name} -- already exists"
)
else:
console.print(f"{crash_id}: fetching dump: {dump_name}")
if not stats:
console.print(f"{crash_id}: fetching dump: {dump_name}")
dump_content = get_dump(
crash_id, dump_name=file_name, api_token=api_token, host=host
)
@@ -103,9 +110,11 @@
# Fetch processed crash data
fn = os.path.join(outputdir, "processed_crash", crash_id)
if os.path.exists(fn) and not overwrite:
console.print(f"{crash_id}: fetching processed crash -- already exists")
if not stats:
console.print(f"{crash_id}: fetching processed crash -- already exists")
else:
console.print(f"{crash_id}: fetching processed crash")
if not stats:
console.print(f"{crash_id}: fetching processed crash")
processed_crash = get_processed_crash(
crash_id, api_token=api_token, host=host
)
@@ -153,6 +162,14 @@ def fetch_crash(
type=click.IntRange(1, 10, clamp=True),
help="how many workers to use to download data; requires CRASHSTATS_API_TOKEN",
)
@click.option(
"--stats/--no-stats",
default=False,
help=(
"prints download stats for large fetch-data jobs; if it's printing download "
"stats, it's not printing other things"
),
)
@click.option(
"--color/--no-color",
default=True,
@@ -172,6 +189,7 @@ def fetch_data(
fetchdumps,
fetchprocessed,
workers,
stats,
color,
outputdir,
crash_ids,
@@ -259,17 +277,44 @@
fetchdumps=fetchdumps,
fetchprocessed=fetchprocessed,
color=color,
outputdir=outputdir,
overwrite=overwrite,
stats=stats,
outputdir=outputdir,
)

start_time = time.time()
total = len(crash_ids)
i = 0

if workers > 1:
with Pool(workers) as p:
p.map(fetch_crash_partial, crash_ids)
with Pool(workers) as pool:
for _ in pool.imap(fetch_crash_partial, crash_ids):
if stats:
# Print something every 100
if i % 100 == 0:
seconds_per_item = (time.time() - start_time) / (i + 1)
estimate_left = str(
timedelta(seconds=seconds_per_item * (total - i + 1))
)
console.print(
(f"Downloaded ({i}/{total}) {estimate_left}").strip()
)
i += 1

else:
for crash_id in crash_ids:
fetch_crash_partial(crash_id)
if stats:
if i % 100 == 0:
seconds_per_item = (time.time() - start_time) / (i + 1)
estimate_left = str(
timedelta(seconds=int(seconds_per_item * (total - i + 1)))
)
console.print((f"Downloaded ({i}/{total}) {estimate_left}").strip())
i += 1

total_time = timedelta(seconds=int(time.time() - start_time))
console.print(f"Completed in {total_time}.")


if __name__ == "__main__":
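For reference, a minimal standalone sketch of the every-100-items progress estimate that the new loop above performs. The work_items list and do_fetch helper are illustrative stand-ins, not crashstats-tools code; the arithmetic mirrors the commit.

# Illustrative sketch only; not part of crashstats-tools.
import time
from datetime import timedelta


def do_fetch(item):
    # Stand-in for fetching one crash; sleep to simulate network time.
    time.sleep(0.01)


def fetch_all(work_items):
    start_time = time.time()
    total = len(work_items)
    for i, item in enumerate(work_items):
        do_fetch(item)
        # Every 100 items, estimate time remaining from the average seconds
        # per item so far (same arithmetic as the commit).
        if i % 100 == 0:
            seconds_per_item = (time.time() - start_time) / (i + 1)
            estimate_left = timedelta(seconds=int(seconds_per_item * (total - i + 1)))
            print(f"Downloaded ({i}/{total}) {estimate_left}")
    print(f"Completed in {timedelta(seconds=int(time.time() - start_time))}.")


if __name__ == "__main__":
    fetch_all(list(range(250)))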
54 changes: 54 additions & 0 deletions tests/test_fetch_data.py
@@ -61,6 +61,7 @@ def test_fetch_raw(tmpdir):
No API token provided. Set CRASHSTATS_API_TOKEN in the environment.
Skipping dumps and protected data.
2ac9a763-83d2-4dca-89bb-091bd0220630: fetching raw crash
Completed in 0:00:00.
"""
)
data = pathlib.Path(
@@ -110,6 +111,7 @@ def test_fetch_raw_with_token(tmpdir):
"""\
Using API token: 935exxxxxxxxxxxxxxxxxxxxxxxxxxxx
2ac9a763-83d2-4dca-89bb-091bd0220630: fetching raw crash
Completed in 0:00:00.
"""
)
data = pathlib.Path(
@@ -236,6 +238,7 @@ def test_fetch_dumps(tmpdir):
Using API token: 935exxxxxxxxxxxxxxxxxxxxxxxxxxxx
2ac9a763-83d2-4dca-89bb-091bd0220630: fetching raw crash
2ac9a763-83d2-4dca-89bb-091bd0220630: fetching dump: upload_file_minidump
Completed in 0:00:00.
"""
)
data = pathlib.Path(
@@ -283,6 +286,7 @@ def test_fetch_processed(tmpdir):
No API token provided. Set CRASHSTATS_API_TOKEN in the environment.
Skipping dumps and protected data.
2ac9a763-83d2-4dca-89bb-091bd0220630: fetching processed crash
Completed in 0:00:00.
"""
)
data = pathlib.Path(tmpdir / "processed_crash" / crash_id).read_bytes()
@@ -329,6 +333,7 @@ def test_fetch_processed_with_token(tmpdir):
"""\
Using API token: 935exxxxxxxxxxxxxxxxxxxxxxxxxxxx
2ac9a763-83d2-4dca-89bb-091bd0220630: fetching processed crash
Completed in 0:00:00.
"""
)
data = pathlib.Path(tmpdir / "processed_crash" / crash_id).read_bytes()
@@ -379,9 +384,58 @@ def test_host(tmpdir):
No API token provided. Set CRASHSTATS_API_TOKEN in the environment.
Skipping dumps and protected data.
2ac9a763-83d2-4dca-89bb-091bd0220630: fetching raw crash
Completed in 0:00:00.
"""
)
data = pathlib.Path(
tmpdir / "raw_crash" / f"20{crash_id[-6:]}" / crash_id
).read_bytes()
assert json.loads(data) == raw_crash


@responses.activate
def test_stats(tmpdir):
crash_id = "2ac9a763-83d2-4dca-89bb-091bd0220630"
raw_crash = {
"ProductName": "Firefox",
"Version": "100.0",
}

responses.add(
responses.GET,
DEFAULT_HOST + "/api/RawCrash/",
match=[
responses.matchers.query_param_matcher(
{
"crash_id": crash_id,
"format": "meta",
}
)
],
status=200,
json=raw_crash,
)

runner = CliRunner()
args = [
"--raw",
"--no-dumps",
"--no-processed",
"--stats",
str(tmpdir),
crash_id,
]
result = runner.invoke(
cli=cmd_fetch_data.fetch_data,
args=args,
env={"COLUMNS": "100"},
)
assert result.exit_code == 0
assert result.output == dedent(
"""\
No API token provided. Set CRASHSTATS_API_TOKEN in the environment.
Skipping dumps and protected data.
Downloaded (0/1) 0:00:00
Completed in 0:00:00.
"""
)
1 change: 0 additions & 1 deletion tests/test_reprocess.py
@@ -284,7 +284,6 @@ def test_reprocess_tenthousand_allowmany():
"""
)
)
print(result.output.splitlines()[-3:])
assert result.output.endswith(
dedent(
f"""\
1 change: 0 additions & 1 deletion tests/test_supersearch.py
@@ -249,7 +249,6 @@ def test_json():
args=["--_columns=uuid", "--_columns=signature", "--format=json"],
env={"COLUMNS": "100"},
)
print(result.output)
assert result.exit_code == 0
assert result.output == dedent(
"""\
1 change: 0 additions & 1 deletion tests/test_supersearchfacet.py
@@ -412,7 +412,6 @@ def test_supersearch_url():
],
env={"COLUMNS": "100"},
)
print(result.output)
assert result.exit_code == 0
assert result.output == dedent(
"""\
