diff --git a/isic_cli/cli/image.py b/isic_cli/cli/image.py index 6b518b2..335b935 100644 --- a/isic_cli/cli/image.py +++ b/isic_cli/cli/image.py @@ -12,7 +12,7 @@ from isic_cli.cli.context import IsicContext from isic_cli.cli.types import SearchString -from isic_cli.cli.utils import suggest_guest_login +from isic_cli.cli.utils import _extract_metadata, get_attributions, suggest_guest_login from isic_cli.io.http import download_image, get_images, get_num_images @@ -88,39 +88,25 @@ def download( images_iterator = itertools.islice( get_images(ctx.session, search, collections), download_num_images ) - - # This is memory inefficient but unavoidable since the CSV needs to look at ALL - # records to determine what the final headers should be. The alternative would - # be to iterate through all images_iterator twice (hitting the API each time). images = [] - fieldnames = set() + + # See comment above _extract_metadata for why this is necessary for image in images_iterator: - progress.update(task1, advance=1) - fieldnames |= set(image.get('metadata', {}).keys()) images.append(image) + progress.update(task1, advance=1) with parallel_backend('threading'): Parallel()(delayed(download_image)(image, outdir, progress, task2) for image in images) + headers, records = _extract_metadata(images) with (outdir / 'metadata.csv').open('w') as outfile: - writer = csv.DictWriter(outfile, ['isic_id'] + list(sorted(fieldnames))) - writer.writeheader() - - for image in images: - writer.writerow({**{'isic_id': image['isic_id']}, **image['metadata']}) - - with (outdir / 'attributions.csv').open('w') as outfile: - writer = csv.DictWriter(outfile, ['isic_id', 'license', 'attribution']) + writer = csv.DictWriter(outfile, headers) writer.writeheader() + writer.writerows(records) - for image in images: - writer.writerow( - { - 'isic_id': image['isic_id'], - 'license': image['copyright_license'], - 'attribution': image['attribution'], - } - ) + with (outdir / 'attribution.txt').open('w') as outfile: + # TODO: os.linesep? + outfile.write('\n\n'.join(get_attributions(records))) click.echo() click.secho(f'Successfully downloaded {nice_num_images} images to {outdir}/.', fg='green') @@ -129,6 +115,6 @@ def download( fg='green', ) click.secho( - f'Successfully wrote {nice_num_images} attribution records to {outdir/"attribution.csv"}.', + f'Successfully wrote attributions to {outdir/"attribution.txt"}.', fg='green', ) diff --git a/isic_cli/cli/metadata.py b/isic_cli/cli/metadata.py index 9dc7f7b..e8703fc 100644 --- a/isic_cli/cli/metadata.py +++ b/isic_cli/cli/metadata.py @@ -16,7 +16,7 @@ from isic_cli.cli.context import IsicContext from isic_cli.cli.types import CommaSeparatedIdentifiers, SearchString -from isic_cli.cli.utils import suggest_guest_login +from isic_cli.cli.utils import _extract_metadata, suggest_guest_login from isic_cli.io.http import get_images, get_num_images @@ -136,16 +136,10 @@ def download(ctx: IsicContext, search: Union[None, str], collections: Union[None task = progress.add_task( f'Downloading metadata records ({nice_num_images})', total=download_num_images ) - - fieldnames = set() - records = [] - for image in images: - fieldnames |= set(image.get('metadata', {}).keys()) - records.append({**{'isic_id': image['isic_id']}, **image['metadata']}) - progress.update(task, advance=1) + headers, records = _extract_metadata(images, progress, task) if records: - writer = csv.DictWriter(sys.stdout, ['isic_id'] + list(sorted(fieldnames))) + writer = csv.DictWriter(sys.stdout, headers) writer.writeheader() for record in records: writer.writerow(record) diff --git a/isic_cli/cli/utils.py b/isic_cli/cli/utils.py index ba1c17d..805f1dc 100644 --- a/isic_cli/cli/utils.py +++ b/isic_cli/cli/utils.py @@ -1,4 +1,6 @@ +from collections import Counter import sys +from typing import Iterable import click @@ -29,3 +31,41 @@ def decorator(ctx: IsicContext, **kwargs): f(ctx, **kwargs) return decorator + + +# This is memory inefficient but unavoidable since the CSV needs to look at ALL +# records to determine what the final headers should be. The alternative would +# be to iterate through all images_iterator twice (hitting the API each time). +def _extract_metadata( + images: Iterable[dict], progress=None, task=None +) -> tuple[list[str], list[dict]]: + metadata = [] + base_fields = ['isic_id', 'attribution', 'copyright_license'] + metadata_fields = set() + + for image in images: + metadata_fields |= set(image.get('metadata', {}).keys()) + metadata.append( + { + **{ + 'isic_id': image['isic_id'], + 'attribution': image['attribution'], + 'copyright_license': image['copyright_license'], + }, + **image['metadata'], + } + ) + + if progress and task: + progress.update(task, advance=1) + + return base_fields + list(sorted(metadata_fields)), metadata + + +def get_attributions(images: Iterable[dict]) -> list[str]: + counter = Counter(r['attribution'] for r in images) + # sort by the number of images descending, then the name of the institution ascending + attributions = sorted(counter.most_common(), key=lambda v: (-v[1], v[0])) + # push anonymous attributions to the end + attributions = sorted(attributions, key=lambda v: 1 if v[0] == 'Anonymous' else 0) + return [x[0] for x in attributions] diff --git a/tests/test_cli_metadata.py b/tests/test_cli_metadata.py index d31f0e4..8640a9b 100644 --- a/tests/test_cli_metadata.py +++ b/tests/test_cli_metadata.py @@ -18,11 +18,18 @@ def test_metadata_download(cli_run, mocker): mocker.patch( 'isic_cli.cli.metadata.get_images', return_value=iter( - [{'isic_id': 'ISIC_0000000', 'metadata': {'sex': 'male', 'diagnosis': 'melanoma'}}] + [ + { + 'isic_id': 'ISIC_0000000', + 'attribution': 'Foo', + 'copyright_license': 'CC-0', + 'metadata': {'sex': 'male', 'diagnosis': 'melanoma'}, + } + ] ), ) result = cli_run(['metadata', 'download']) assert result.exit_code == 0, result.exception - assert re.search(r'ISIC_0000000.*melanoma.*male', result.output), result.output + assert re.search(r'ISIC_0000000.*Foo.*CC-0.*melanoma.*male', result.output), result.output diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..484bb3b --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,49 @@ +import pytest + +from isic_cli.cli.utils import get_attributions + + +@pytest.mark.parametrize( + 'images,attributions', + [ + [ + [ + {'attribution': 'foo'}, + {'attribution': 'bar'}, + {'attribution': 'foo'}, + {'attribution': 'bar'}, + ], + ['bar', 'foo'], + ], + [ + [ + {'attribution': 'foo'}, + {'attribution': 'foo'}, + {'attribution': 'bar'}, + ], + ['foo', 'bar'], + ], + [ + [ + {'attribution': 'foo'}, + {'attribution': 'foo'}, + {'attribution': 'bar'}, + {'attribution': 'Anonymous'}, + ], + ['foo', 'bar', 'Anonymous'], + ], + [ + [ + {'attribution': 'foo'}, + {'attribution': 'foo'}, + {'attribution': 'bar'}, + {'attribution': 'Anonymous'}, + {'attribution': 'Anonymous'}, + {'attribution': 'Anonymous'}, + ], + ['foo', 'bar', 'Anonymous'], + ], + ], +) +def test_get_attributions(images, attributions): + assert get_attributions(images) == attributions