Commit 918c799

Add download stats and estimate to fetch-data (#149)

willkg committed Sep 18, 2024
1 parent eef8a0e commit 918c799

Showing 5 changed files with 111 additions and 15 deletions.
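A usage sketch for the new flag, mirroring the arguments exercised by the new test_stats test below (this assumes the fetch-data console script that crashstats-tools installs; ./outputdir is a placeholder directory):

    fetch-data --raw --no-dumps --no-processed --stats ./outputdir 2ac9a763-83d2-4dca-89bb-091bd0220630

With --stats enabled, the per-crash "fetching ..." messages are suppressed and a progress line with a rough time-remaining estimate is printed every 100 crash IDs instead.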
69 changes: 57 additions & 12 deletions src/crashstats_tools/cmd_fetch_data.py
@@ -2,11 +2,13 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

from datetime import timedelta
from functools import partial
import json
from multiprocessing import Pool
import os
import sys
import time

import click
from rich.console import Console
@@ -35,9 +37,10 @@ def fetch_crash(
fetchraw,
fetchdumps,
fetchprocessed,
outputdir,
color,
overwrite,
stats,
outputdir,
):
"""Fetch crash data and save to correct place on the file system
@@ -59,9 +62,11 @@
# Fetch raw crash metadata to OUTPUTDIR/raw_crash/DATE/CRASHID
fn = os.path.join(outputdir, "raw_crash", "20" + crash_id[-6:], crash_id)
if os.path.exists(fn) and not overwrite:
console.print(f"{crash_id}: fetching raw crash -- already exists")
if not stats:
console.print(f"{crash_id}: fetching raw crash -- already exists")
else:
console.print(f"{crash_id}: fetching raw crash")
if not stats:
console.print(f"{crash_id}: fetching raw crash")
raw_crash = get_crash_annotations(crash_id, host=host, api_token=api_token)

# Save raw crash to file system
@@ -87,11 +92,13 @@

fn = os.path.join(outputdir, dump_name, crash_id)
if os.path.exists(fn) and not overwrite:
console.print(
f"{crash_id}: fetching dump: {dump_name} -- already exists"
)
if not stats:
console.print(
f"{crash_id}: fetching dump: {dump_name} -- already exists"
)
else:
console.print(f"{crash_id}: fetching dump: {dump_name}")
if not stats:
console.print(f"{crash_id}: fetching dump: {dump_name}")
dump_content = get_dump(
crash_id, dump_name=file_name, api_token=api_token, host=host
)
@@ -103,9 +110,11 @@
# Fetch processed crash data
fn = os.path.join(outputdir, "processed_crash", crash_id)
if os.path.exists(fn) and not overwrite:
console.print(f"{crash_id}: fetching processed crash -- already exists")
if not stats:
console.print(f"{crash_id}: fetching processed crash -- already exists")
else:
console.print(f"{crash_id}: fetching processed crash")
if not stats:
console.print(f"{crash_id}: fetching processed crash")
processed_crash = get_processed_crash(
crash_id, api_token=api_token, host=host
)
@@ -153,6 +162,14 @@ def fetch_crash(
type=click.IntRange(1, 10, clamp=True),
help="how many workers to use to download data; requires CRASHSTATS_API_TOKEN",
)
@click.option(
"--stats/--no-stats",
default=False,
help=(
"prints download stats for large fetch-data jobs; if it's printing download "
"stats, it's not printing other things"
),
)
@click.option(
"--color/--no-color",
default=True,
@@ -172,6 +189,7 @@ def fetch_data(
fetchdumps,
fetchprocessed,
workers,
stats,
color,
outputdir,
crash_ids,
@@ -259,17 +277,44 @@
fetchdumps=fetchdumps,
fetchprocessed=fetchprocessed,
color=color,
outputdir=outputdir,
overwrite=overwrite,
stats=stats,
outputdir=outputdir,
)

start_time = time.time()
total = len(crash_ids)
i = 0

if workers > 1:
with Pool(workers) as p:
p.map(fetch_crash_partial, crash_ids)
with Pool(workers) as pool:
for _ in pool.imap(fetch_crash_partial, crash_ids):
if stats:
# Print something every 100
if i % 100 == 0:
seconds_per_item = (time.time() - start_time) / (i + 1)
estimate_left = str(
timedelta(seconds=seconds_per_item * (total - i + 1))
)
console.print(
(f"Downloaded ({i}/{total}) {estimate_left}").strip()
)
i += 1

else:
for crash_id in crash_ids:
fetch_crash_partial(crash_id)
if stats:
if i % 100 == 0:
seconds_per_item = (time.time() - start_time) / (i + 1)
estimate_left = str(
timedelta(seconds=int(seconds_per_item * (total - i + 1)))
)
console.print((f"Downloaded ({i}/{total}) {estimate_left}").strip())
i += 1

total_time = timedelta(seconds=int(time.time() - start_time))
console.print(f"Completed in {total_time}.")


if __name__ == "__main__":
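For reference, a minimal standalone sketch of the every-100-items progress estimate that the new loop above performs. The work_items list and do_fetch helper are illustrative stand-ins, not crashstats-tools code; the arithmetic mirrors the commit.

# Illustrative sketch only; not part of crashstats-tools.
import time
from datetime import timedelta


def do_fetch(item):
    # Stand-in for fetching one crash; sleep to simulate network time.
    time.sleep(0.01)


def fetch_all(work_items):
    start_time = time.time()
    total = len(work_items)
    for i, item in enumerate(work_items):
        do_fetch(item)
        # Every 100 items, estimate time remaining from the average seconds
        # per item so far (same arithmetic as the commit).
        if i % 100 == 0:
            seconds_per_item = (time.time() - start_time) / (i + 1)
            estimate_left = timedelta(seconds=int(seconds_per_item * (total - i + 1)))
            print(f"Downloaded ({i}/{total}) {estimate_left}")
    print(f"Completed in {timedelta(seconds=int(time.time() - start_time))}.")


if __name__ == "__main__":
    fetch_all(list(range(250)))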
54 changes: 54 additions & 0 deletions tests/test_fetch_data.py
@@ -61,6 +61,7 @@ def test_fetch_raw(tmpdir):
No API token provided. Set CRASHSTATS_API_TOKEN in the environment.
Skipping dumps and protected data.
2ac9a763-83d2-4dca-89bb-091bd0220630: fetching raw crash
Completed in 0:00:00.
"""
)
data = pathlib.Path(
@@ -110,6 +111,7 @@ def test_fetch_raw_with_token(tmpdir):
"""\
Using API token: 935exxxxxxxxxxxxxxxxxxxxxxxxxxxx
2ac9a763-83d2-4dca-89bb-091bd0220630: fetching raw crash
Completed in 0:00:00.
"""
)
data = pathlib.Path(
@@ -236,6 +238,7 @@ def test_fetch_dumps(tmpdir):
Using API token: 935exxxxxxxxxxxxxxxxxxxxxxxxxxxx
2ac9a763-83d2-4dca-89bb-091bd0220630: fetching raw crash
2ac9a763-83d2-4dca-89bb-091bd0220630: fetching dump: upload_file_minidump
Completed in 0:00:00.
"""
)
data = pathlib.Path(
@@ -283,6 +286,7 @@ def test_fetch_processed(tmpdir):
No API token provided. Set CRASHSTATS_API_TOKEN in the environment.
Skipping dumps and protected data.
2ac9a763-83d2-4dca-89bb-091bd0220630: fetching processed crash
Completed in 0:00:00.
"""
)
data = pathlib.Path(tmpdir / "processed_crash" / crash_id).read_bytes()
@@ -329,6 +333,7 @@ def test_fetch_processed_with_token(tmpdir):
"""\
Using API token: 935exxxxxxxxxxxxxxxxxxxxxxxxxxxx
2ac9a763-83d2-4dca-89bb-091bd0220630: fetching processed crash
Completed in 0:00:00.
"""
)
data = pathlib.Path(tmpdir / "processed_crash" / crash_id).read_bytes()
@@ -379,9 +384,58 @@ def test_host(tmpdir):
No API token provided. Set CRASHSTATS_API_TOKEN in the environment.
Skipping dumps and protected data.
2ac9a763-83d2-4dca-89bb-091bd0220630: fetching raw crash
Completed in 0:00:00.
"""
)
data = pathlib.Path(
tmpdir / "raw_crash" / f"20{crash_id[-6:]}" / crash_id
).read_bytes()
assert json.loads(data) == raw_crash


@responses.activate
def test_stats(tmpdir):
crash_id = "2ac9a763-83d2-4dca-89bb-091bd0220630"
raw_crash = {
"ProductName": "Firefox",
"Version": "100.0",
}

responses.add(
responses.GET,
DEFAULT_HOST + "/api/RawCrash/",
match=[
responses.matchers.query_param_matcher(
{
"crash_id": crash_id,
"format": "meta",
}
)
],
status=200,
json=raw_crash,
)

runner = CliRunner()
args = [
"--raw",
"--no-dumps",
"--no-processed",
"--stats",
str(tmpdir),
crash_id,
]
result = runner.invoke(
cli=cmd_fetch_data.fetch_data,
args=args,
env={"COLUMNS": "100"},
)
assert result.exit_code == 0
assert result.output == dedent(
"""\
No API token provided. Set CRASHSTATS_API_TOKEN in the environment.
Skipping dumps and protected data.
Downloaded (0/1) 0:00:00
Completed in 0:00:00.
"""
)
1 change: 0 additions & 1 deletion tests/test_reprocess.py
@@ -284,7 +284,6 @@ def test_reprocess_tenthousand_allowmany():
"""
)
)
print(result.output.splitlines()[-3:])
assert result.output.endswith(
dedent(
f"""\
1 change: 0 additions & 1 deletion tests/test_supersearch.py
@@ -249,7 +249,6 @@ def test_json():
args=["--_columns=uuid", "--_columns=signature", "--format=json"],
env={"COLUMNS": "100"},
)
print(result.output)
assert result.exit_code == 0
assert result.output == dedent(
"""\
1 change: 0 additions & 1 deletion tests/test_supersearchfacet.py
@@ -412,7 +412,6 @@ def test_supersearch_url():
],
env={"COLUMNS": "100"},
)
print(result.output)
assert result.exit_code == 0
assert result.output == dedent(
"""\
