Skip to content

Commit

Permalink
Correctly handle attribution information
Browse files Browse the repository at this point in the history
  • Loading branch information
danlamanna committed Mar 6, 2022
1 parent 026cf42 commit 89153bb
Show file tree
Hide file tree
Showing 5 changed files with 112 additions and 36 deletions.
36 changes: 11 additions & 25 deletions isic_cli/cli/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from isic_cli.cli.context import IsicContext
from isic_cli.cli.types import SearchString
from isic_cli.cli.utils import suggest_guest_login
from isic_cli.cli.utils import _extract_metadata, get_attributions, suggest_guest_login
from isic_cli.io.http import download_image, get_images, get_num_images


Expand Down Expand Up @@ -88,39 +88,25 @@ def download(
images_iterator = itertools.islice(
get_images(ctx.session, search, collections), download_num_images
)

# This is memory inefficient but unavoidable since the CSV needs to look at ALL
# records to determine what the final headers should be. The alternative would
# be to iterate through all images_iterator twice (hitting the API each time).
images = []
fieldnames = set()

# See comment above _extract_metadata for why this is necessary
for image in images_iterator:
progress.update(task1, advance=1)
fieldnames |= set(image.get('metadata', {}).keys())
images.append(image)
progress.update(task1, advance=1)

with parallel_backend('threading'):
Parallel()(delayed(download_image)(image, outdir, progress, task2) for image in images)

headers, records = _extract_metadata(images)
with (outdir / 'metadata.csv').open('w') as outfile:
writer = csv.DictWriter(outfile, ['isic_id'] + list(sorted(fieldnames)))
writer.writeheader()

for image in images:
writer.writerow({**{'isic_id': image['isic_id']}, **image['metadata']})

with (outdir / 'attributions.csv').open('w') as outfile:
writer = csv.DictWriter(outfile, ['isic_id', 'license', 'attribution'])
writer = csv.DictWriter(outfile, headers)
writer.writeheader()
writer.writerows(records)

for image in images:
writer.writerow(
{
'isic_id': image['isic_id'],
'license': image['copyright_license'],
'attribution': image['attribution'],
}
)
with (outdir / 'attribution.txt').open('w') as outfile:
# TODO: os.linesep?
outfile.write('\n\n'.join(get_attributions(records)))

click.echo()
click.secho(f'Successfully downloaded {nice_num_images} images to {outdir}/.', fg='green')
Expand All @@ -129,6 +115,6 @@ def download(
fg='green',
)
click.secho(
f'Successfully wrote {nice_num_images} attribution records to {outdir/"attribution.csv"}.',
f'Successfully wrote attributions to {outdir/"attribution.txt"}.',
fg='green',
)
12 changes: 3 additions & 9 deletions isic_cli/cli/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from isic_cli.cli.context import IsicContext
from isic_cli.cli.types import CommaSeparatedIdentifiers, SearchString
from isic_cli.cli.utils import suggest_guest_login
from isic_cli.cli.utils import _extract_metadata, suggest_guest_login
from isic_cli.io.http import get_images, get_num_images


Expand Down Expand Up @@ -136,16 +136,10 @@ def download(ctx: IsicContext, search: Union[None, str], collections: Union[None
task = progress.add_task(
f'Downloading metadata records ({nice_num_images})', total=download_num_images
)

fieldnames = set()
records = []
for image in images:
fieldnames |= set(image.get('metadata', {}).keys())
records.append({**{'isic_id': image['isic_id']}, **image['metadata']})
progress.update(task, advance=1)
headers, records = _extract_metadata(images, progress, task)

if records:
writer = csv.DictWriter(sys.stdout, ['isic_id'] + list(sorted(fieldnames)))
writer = csv.DictWriter(sys.stdout, headers)
writer.writeheader()
for record in records:
writer.writerow(record)
40 changes: 40 additions & 0 deletions isic_cli/cli/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from collections import Counter
import sys
from typing import Iterable

import click

Expand Down Expand Up @@ -29,3 +31,41 @@ def decorator(ctx: IsicContext, **kwargs):
f(ctx, **kwargs)

return decorator


# This is memory inefficient but unavoidable since the CSV needs to look at ALL
# records to determine what the final headers should be. The alternative would
# be to iterate through all images_iterator twice (hitting the API each time).
def _extract_metadata(
images: Iterable[dict], progress=None, task=None
) -> tuple[list[str], list[dict]]:
metadata = []
base_fields = ['isic_id', 'attribution', 'copyright_license']
metadata_fields = set()

for image in images:
metadata_fields |= set(image.get('metadata', {}).keys())
metadata.append(
{
**{
'isic_id': image['isic_id'],
'attribution': image['attribution'],
'copyright_license': image['copyright_license'],
},
**image['metadata'],
}
)

if progress and task:
progress.update(task, advance=1)

return base_fields + list(sorted(metadata_fields)), metadata


def get_attributions(images: Iterable[dict]) -> list[str]:
counter = Counter(r['attribution'] for r in images)
# sort by the number of images descending, then the name of the institution ascending
attributions = sorted(counter.most_common(), key=lambda v: (-v[1], v[0]))
# push anonymous attributions to the end
attributions = sorted(attributions, key=lambda v: 1 if v[0] == 'Anonymous' else 0)
return [x[0] for x in attributions]
11 changes: 9 additions & 2 deletions tests/test_cli_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,18 @@ def test_metadata_download(cli_run, mocker):
mocker.patch(
'isic_cli.cli.metadata.get_images',
return_value=iter(
[{'isic_id': 'ISIC_0000000', 'metadata': {'sex': 'male', 'diagnosis': 'melanoma'}}]
[
{
'isic_id': 'ISIC_0000000',
'attribution': 'Foo',
'copyright_license': 'CC-0',
'metadata': {'sex': 'male', 'diagnosis': 'melanoma'},
}
]
),
)

result = cli_run(['metadata', 'download'])

assert result.exit_code == 0, result.exception
assert re.search(r'ISIC_0000000.*melanoma.*male', result.output), result.output
assert re.search(r'ISIC_0000000.*Foo.*CC-0.*melanoma.*male', result.output), result.output
49 changes: 49 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import pytest

from isic_cli.cli.utils import get_attributions


@pytest.mark.parametrize(
'images,attributions',
[
[
[
{'attribution': 'foo'},
{'attribution': 'bar'},
{'attribution': 'foo'},
{'attribution': 'bar'},
],
['bar', 'foo'],
],
[
[
{'attribution': 'foo'},
{'attribution': 'foo'},
{'attribution': 'bar'},
],
['foo', 'bar'],
],
[
[
{'attribution': 'foo'},
{'attribution': 'foo'},
{'attribution': 'bar'},
{'attribution': 'Anonymous'},
],
['foo', 'bar', 'Anonymous'],
],
[
[
{'attribution': 'foo'},
{'attribution': 'foo'},
{'attribution': 'bar'},
{'attribution': 'Anonymous'},
{'attribution': 'Anonymous'},
{'attribution': 'Anonymous'},
],
['foo', 'bar', 'Anonymous'],
],
],
)
def test_get_attributions(images, attributions):
assert get_attributions(images) == attributions

0 comments on commit 89153bb

Please sign in to comment.