Skip to content

Commit

Permalink
Merge pull request #9 from Imageomics/6-download-original
Browse files Browse the repository at this point in the history
Fix hash value mismatch
  • Loading branch information
johnbradley authored Oct 6, 2022
2 parents f628cb2 + 18008a5 commit 03c3ec6
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 21 deletions.
26 changes: 26 additions & 0 deletions .github/workflows/run-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
name: run-tests

on:
pull_request:
push:
branches: main

jobs:
run-tests:
runs-on: ubuntu-latest

steps:
- name: Checkout repository
uses: actions/checkout@v2

- uses: actions/setup-python@v4
with:
python-version: '3.8'

- name: Install tox
run: pip install tox
shell: bash

- name: Run tests
run: tox
shell: bash
39 changes: 31 additions & 8 deletions src/dva/api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import re
import hashlib
from pyDataverse.api import NativeApi, DataAccessApi
from pyDataverse.models import Datafile
Expand All @@ -18,13 +19,28 @@ def get_files_for_doi(self, doi):
dataset = self._api.get_dataset(doi).json()
return dataset['data']['latestVersion']['files']

def download_file(self, dvfile, path):
def _get_datafile_response(self, dvfile):
    """Fetch a Dataverse file and return the response from the data API.

    When the file entry carries a non-empty "originalFileFormat", the
    original upload is requested, since that is the version the recorded
    MD5 checksum matches. Otherwise no data format is specified.
    """
    data_file = dvfile["dataFile"]
    wants_original = bool(data_file.get("originalFileFormat"))
    # NOTE: get_datafile blocks until the entire file is retrieved (in memory)
    return self._data_api.get_datafile(
        data_file["id"],
        data_format="original" if wants_original else None,
    )

def download_file(self, dvfile, dest):
    """Download one Dataverse file beneath the ``dest`` directory.

    The local filename is taken from the response's Content-disposition
    header (via ``get_download_filename``). When the file entry has a
    "directoryLabel", the file is placed in that sub-directory of ``dest``.
    Missing parent directories are created.

    Returns the path the content was written to.
    """
    # NOTE: the response holds the entire file in memory before it is written.
    # It is fetched exactly once: a second plain get_datafile() call would
    # discard the "original" format and bring back the checksum mismatch.
    response = self._get_datafile_response(dvfile)
    filename = self.get_download_filename(response)
    directory_label = dvfile.get("directoryLabel", "")
    if directory_label:
        path = os.path.join(dest, directory_label, filename)
    else:
        path = os.path.join(dest, filename)
    parent_dir = os.path.dirname(path)
    if parent_dir:
        # os.makedirs("") raises, which would happen for an empty dest.
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, "wb") as f:
        f.write(response.content)
    return path

@staticmethod
def verify_checksum(dvfile, path):
Expand All @@ -51,21 +67,28 @@ def upload_file(self, doi, path, dirname=""):
raise APIException(f"Uploading failed with status {status}.")

@staticmethod
def get_dvfile_path(dvfile, parent_dir=None):
def get_remote_path(dvfile):
    """Return the path of a file within its dataset.

    The result is the entry's filename, prefixed with "<directoryLabel>/"
    when the entry has a non-empty "directoryLabel".
    """
    filename = dvfile["dataFile"]["filename"]
    directory_label = dvfile.get("directoryLabel", "")
    if directory_label:
        return f"{directory_label}/{filename}"
    return filename

@staticmethod
def get_download_filename(response):
    """Extract the filename from a response's Content-disposition header.

    Handles values such as ``attachment; filename="data.txt"``. The
    ``filename`` parameter name is matched case-insensitively, and
    surrounding whitespace and double quotes are removed from its value.

    Raises APIException when the header is absent or has no non-empty
    ``filename`` parameter.
    """
    # .get() so a missing header is reported as APIException, not KeyError.
    content_disposition = response.headers.get('Content-disposition')
    if content_disposition:
        for part in content_disposition.split(';'):
            name, separator, value = part.partition('=')
            if separator and name.strip().lower() == 'filename':
                # Strip whitespace first so a trailing space cannot shield
                # the closing quote from being removed.
                filename = value.strip().strip('"')
                if filename:
                    return filename
    raise APIException(f"Invalid Content-disposition {content_disposition}")

@staticmethod
def get_dvfile_size(dvfile):
    """Return the "filesize" value recorded in the entry's "dataFile"."""
    data_file = dvfile["dataFile"]
    return data_file["filesize"]



def get_api(url):
config = Config(url)
return API(
Expand Down
13 changes: 7 additions & 6 deletions src/dva/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ def ls(doi, json_format, url):
click.echo(json.dumps(dvfiles, indent=4))
else:
for dvfile in dvfiles:
path = api.get_dvfile_path(dvfile)
click.echo(path)
remote_path = api.get_remote_path(dvfile)
click.echo(remote_path)


@click.command()
Expand All @@ -43,11 +43,12 @@ def download(doi, dest, url):
"""
api = get_api(url)
for dvfile in api.get_files_for_doi(doi):
path = api.get_dvfile_path(dvfile, dest)
click.echo(f"Downloading {path}")
api.download_file(dvfile, path)
file_id = dvfile["dataFile"]["id"]
click.echo(f"Downloading file {file_id}")
path = api.download_file(dvfile, dest)
click.echo(f"Downloaded file {file_id} to {path}")
api.verify_checksum(dvfile, path)
click.echo(f"Verified file checksum for {path}.")
click.echo(f"Verified file checksum for {path}")


@click.command()
Expand Down
49 changes: 43 additions & 6 deletions tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,17 @@ def test_get_files_for_doi(self, mock_data_api, mock_native_api, mock_config):
self.assertEqual(len(result), 1)
self.assertEqual(result[0]["dataFile"]["id"], 2222)

def test_get_dvfile_path(self):
def test_get_remote_path(self):
    """get_remote_path returns the filename, prefixed by directoryLabel if set."""
    dvfile = {
        "dataFile": {
            "filename": "data.txt"
        }
    }
    # Without a directory label the remote path is just the filename.
    self.assertEqual(API.get_remote_path(dvfile), "data.txt")
    # With a directory label the filename is nested beneath it.
    dvfile["directoryLabel"] = "results"
    self.assertEqual(API.get_remote_path(dvfile), "results/data.txt")

@patch('dva.api.Config')
@patch('dva.api.NativeApi')
Expand All @@ -62,9 +62,35 @@ def test_download_file(self, mock_data_api, mock_native_api, mock_config):
"id": 2222
}
}
mock_data_api.return_value.get_datafile.return_value = Mock(headers={
'Content-disposition': 'attachment; filename="data.txt"'
})
api = get_api(url=None)
with patch("builtins.open", mock_open()) as mock_file:
api.download_file(dvfile, path="/tmp/data.txt")
api.download_file(dvfile, dest="/tmp")
mock_data_api.return_value.get_datafile.assert_called_with(2222, data_format=None)
mock_file.assert_called_with("/tmp/data.txt", "wb")
mock_file.return_value.write.assert_called_with(
mock_data_api.return_value.get_datafile.return_value.content
)

@patch('dva.api.Config')
@patch('dva.api.NativeApi')
@patch('dva.api.DataAccessApi')
def test_download_file_original(self, mock_data_api, mock_native_api, mock_config):
dvfile = {
"dataFile": {
"id": 2222,
"originalFileFormat": "CSV"
}
}
mock_data_api.return_value.get_datafile.return_value = Mock(headers={
'Content-disposition': 'attachment; filename="data.txt"'
})
api = get_api(url=None)
with patch("builtins.open", mock_open()) as mock_file:
api.download_file(dvfile, dest="/tmp")
mock_data_api.return_value.get_datafile.assert_called_with(2222, data_format='original')
mock_file.assert_called_with("/tmp/data.txt", "wb")
mock_file.return_value.write.assert_called_with(
mock_data_api.return_value.get_datafile.return_value.content
Expand Down Expand Up @@ -108,3 +134,14 @@ def test_verify_checksum(self, mock_datafile, mock_data_api, mock_native_api, mo
with patch("builtins.open", mock_open(read_data=b"123")) as mock_file:
api.upload_file(doi='doi:10.70122/FK2/WUU4DM', path="/tmp/data.txt")
self.assertEqual(str(raised_exception.exception), "Uploading failed with status bad.")

def test_get_download_filename(self):
    """Well-formed Content-disposition values yield the bare filename."""
    # Maps each header value to the filename expected to be extracted from it.
    expected_by_header = {
        'attachment; filename=bob.txt': 'bob.txt',
        'attachment; filename="tom.txt"': 'tom.txt',
        'attachment; filename="file1.txt"; other="a"': 'file1.txt',
    }
    response = Mock()
    for header_value, expected_filename in expected_by_header.items():
        response.headers = {'Content-disposition': header_value}
        actual_filename = API.get_download_filename(response)
        self.assertEqual(expected_filename, actual_filename)
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[tox]
envlist = py36
envlist = py38

[testenv]
deps = pytest
Expand Down

0 comments on commit 03c3ec6

Please sign in to comment.