From a527e766e13ae6d87f50b20113f5ece7fd479b09 Mon Sep 17 00:00:00 2001
From: Dan LaManna <dan.lamanna@kitware.com>
Date: Mon, 24 Apr 2023 09:11:25 -0400
Subject: [PATCH] Add batched image downloading

---
 isic_cli/cli/image.py | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/isic_cli/cli/image.py b/isic_cli/cli/image.py
index 41c9c6b..d206042 100644
--- a/isic_cli/cli/image.py
+++ b/isic_cli/cli/image.py
@@ -10,6 +10,7 @@
 import click
 from click.types import IntRange
 from humanize import intcomma
+from more_itertools.more import chunked
 from rich.console import Console
 from rich.progress import Progress
 
@@ -98,28 +99,22 @@ def download(
         archive_num_images = get_num_images(ctx.session, search, collections)
         download_num_images = archive_num_images if limit == 0 else min(archive_num_images, limit)
         nice_num_images = intcomma(download_num_images)
-
-        task1 = progress.add_task(
-            f"Downloading image information ({nice_num_images} total)",
+        task = progress.add_task(
+            f"Downloading images (and metadata) ({nice_num_images} total)",
             total=download_num_images,
         )
-        task2 = progress.add_task(
-            f"Downloading image files ({nice_num_images} total)", total=download_num_images
-        )
+        # concurrent.futures' ThreadPoolExecutor can't easily be interrupted with Ctrl-C
+        thread_pool = ThreadPool(max(10, os.cpu_count() or 10))
         images_iterator = itertools.islice(
             get_images(ctx.session, search, collections), download_num_images
         )
-        images = []
 
         # See comment above _extract_metadata for why this is necessary
-        for image in images_iterator:
-            images.append(image)
-            progress.update(task1, advance=1)
-
-        # the futures ThreadPoolExecutor doesn't allow one to easily Ctrl-c
-        thread_pool = ThreadPool(max(10, os.cpu_count() or 10))
-        func = functools.partial(download_image, to=outdir, progress=progress, task=task2)
-        thread_pool.map(func, images)
+        images = []
+        func = functools.partial(download_image, to=outdir, progress=progress, task=task)
+        for image_chunk in chunked(images_iterator, 100):
+            images.extend(image_chunk)
+            thread_pool.map(func, image_chunk)
 
         headers, records = _extract_metadata(images)
         with (outdir / "metadata.csv").open("w", encoding="utf8") as outfile: