From a25d742f424e8800fee47b9b1eef9b84111fb0b1 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Tue, 22 Sep 2020 12:23:59 -0400
Subject: [PATCH] Add largest files report

---
 AIPscan/API/namespace_data.py                 |  30 ++++
 AIPscan/Data/data.py                          |  94 +++++++++++-
 AIPscan/Data/tests/__init__.py                |   0
 AIPscan/Data/tests/test_largest_files.py      | 136 ++++++++++++++++++
 AIPscan/Reporter/helpers.py                   |  10 +-
 AIPscan/Reporter/report_largest_files.py      |  40 ++++++
 AIPscan/Reporter/templates/file.html          |   3 +-
 .../templates/report_largest_files.html       | 123 ++++++++++++++++
 AIPscan/Reporter/templates/reports.html       |  18 +++
 AIPscan/Reporter/views.py                     |   1 +
 10 files changed, 449 insertions(+), 6 deletions(-)
 create mode 100644 AIPscan/Data/tests/__init__.py
 create mode 100644 AIPscan/Data/tests/test_largest_files.py
 create mode 100644 AIPscan/Reporter/report_largest_files.py
 create mode 100644 AIPscan/Reporter/templates/report_largest_files.html

diff --git a/AIPscan/API/namespace_data.py b/AIPscan/API/namespace_data.py
index f2f952d3..a45dcc41 100644
--- a/AIPscan/API/namespace_data.py
+++ b/AIPscan/API/namespace_data.py
@@ -84,3 +84,53 @@ def get(self, storage_service_id):
         """AIP overview two"""
         aip_data = data.derivative_overview(storage_service_id=storage_service_id)
         return aip_data
+
+
+# NOTE(review): get() requires storage_service_id, but no URL variable for it
+# is visible in this route - confirm the rule against the real module.
+@api.route("/largest-files/")
+class LargestFileList(Resource):
+    """Largest files held in a Storage Service, ordered by size."""
+
+    # Used when "limit" is missing, not an integer, or less than one.
+    DEFAULT_LIMIT = 20
+
+    @api.doc(
+        "list_largest_files",
+        params={
+            "file_type": {
+                "description": "Optional file type filter (original or preservation)",
+                "in": "query",
+                "type": "string",
+            },
+            "limit": {
+                "description": "Number of results to return (default is 20)",
+                "in": "query",
+                "type": "integer",
+            },
+        },
+    )
+    def get(self, storage_service_id, file_type=None, limit=20):
+        """Largest files
+
+        Query-string values for "file_type" and "limit" take precedence
+        over the keyword arguments of the same name.
+
+        :param storage_service_id: Storage Service ID.
+        :param file_type: Fallback file type filter.
+        :param limit: Fallback maximum number of results.
+
+        :returns: Report dict produced by data.largest_files().
+        """
+        file_type = request.args.get("file_type", file_type)
+        try:
+            limit = int(request.args.get("limit", limit))
+        except (TypeError, ValueError):
+            limit = self.DEFAULT_LIMIT
+        # A zero or negative LIMIT is never a meaningful page size.
+        if limit < 1:
+            limit = self.DEFAULT_LIMIT
+        file_data = data.largest_files(
+            storage_service_id=storage_service_id, file_type=file_type, limit=limit
+        )
+        return file_data
diff --git 
a/AIPscan/Data/data.py b/AIPscan/Data/data.py
index a4c62581..43b0d7bf 100644
--- a/AIPscan/Data/data.py
+++ b/AIPscan/Data/data.py
@@ -5,10 +5,13 @@
 from AIPscan.models import AIP, File, FileType, StorageService


-FIELD_AIP_NAME = "AipName"
+FIELD_AIP = "AIP"
+FIELD_AIP_ID = "AIPID"
+FIELD_AIP_NAME = "AIPName"
+FIELD_AIP_SIZE = "AIPSize"
+FIELD_AIP_UUID = "AIPUUID"
 FIELD_AIPS = "AIPs"
-FIELD_AIP_SIZE = "AipSize"
-FIELD_ALL_AIPS = "AllAips"
+FIELD_ALL_AIPS = "AllAIPs"

 FIELD_COUNT = "Count"
 FIELD_CREATED_DATE = "CreatedDate"
@@ -17,7 +20,11 @@
 FIELD_DERIVATIVE_FORMAT = "DerivativeFormat"
 FIELD_DERIVATIVE_UUID = "DerivativeUUID"

+FIELD_FILES = "Files"
 FIELD_FILE_COUNT = "FileCount"
+FIELD_FILE_TYPE = "FileType"
+FIELD_FILENAME = "Filename"
+FIELD_FORMAT = "Format"
 FIELD_FORMATS = "Formats"

 FIELD_NAME = "Name"
@@ -25,8 +32,11 @@
 FIELD_ORIGINAL_UUID = "OriginalUUID"
 FIELD_ORIGINAL_FORMAT = "OriginalFormat"

+FIELD_PUID = "PUID"
+
 FIELD_RELATED_PAIRING = "RelatedPairing"

+FIELD_SIZE = "Size"
 FIELD_STORAGE_NAME = "StorageName"

 FIELD_TRANSFER_NAME = "TransferName"
@@ -204,3 +214,83 @@ def derivative_overview(storage_service_id):
     report[FIELD_STORAGE_NAME] = storage_service.name

     return report
+
+
+def _largest_files_query(storage_service_id, file_type, limit):
+    """Fetch file information from database for largest files query
+
+    This is separated into its own helper function to aid in testing.
+
+    :param storage_service_id: Storage Service ID.
+    :param file_type: File type to filter on; a value that is not a
+        FileType value (including None) disables the filter.
+    :param limit: Upper limit of number of results to return.
+
+    :returns: Query for File rows ordered desc by size.
+    """
+    query = (
+        File.query.join(AIP)
+        .join(StorageService)
+        .filter(StorageService.id == storage_service_id)
+    )
+    valid_file_types = {item.value for item in FileType}
+    if file_type in valid_file_types:
+        query = query.filter(File.file_type == file_type)
+    return query.order_by(File.size.desc()).limit(limit)
+
+
+def largest_files(storage_service_id, file_type=None, limit=20):
+    """Return a summary of the largest files in a given Storage Service
+
+    :param storage_service_id: Storage Service ID.
+    :param file_type: Optional filter for type of file to return
+        (acceptable values are "original" or "preservation").
+    :param limit: Upper limit of number of results to return.
+
+    :returns: "report" dict containing following fields:
+        report["StorageName"]: Name of Storage Service queried
+        report["Files"]: List of result files ordered desc by size
+    """
+    report = {}
+    report[FIELD_FILES] = []
+    storage_service = _get_storage_service(storage_service_id)
+    report[FIELD_STORAGE_NAME] = storage_service.name
+
+    # Optional fields: left out of a file's entry when the File object
+    # has no such attribute.
+    optional_fields = (
+        (FIELD_FORMAT, "file_format"),
+        (FIELD_VERSION, "format_version"),
+        (FIELD_PUID, "puid"),
+    )
+    # Files can share an AIP, so each AIP is fetched at most once.
+    aips_by_id = {}
+
+    files = _largest_files_query(storage_service_id, file_type, limit)
+
+    for file_ in files:
+        file_info = {
+            "id": file_.id,
+            FIELD_UUID: file_.uuid,
+            FIELD_NAME: file_.name,
+            FIELD_SIZE: int(file_.size),
+            FIELD_AIP_ID: file_.aip_id,
+            FIELD_FILE_TYPE: file_.file_type.value,
+        }
+
+        for field, attribute in optional_fields:
+            try:
+                file_info[field] = getattr(file_, attribute)
+            except AttributeError:
+                pass
+
+        if file_.aip_id not in aips_by_id:
+            aips_by_id[file_.aip_id] = AIP.query.get(file_.aip_id)
+        matching_aip = aips_by_id[file_.aip_id]
+        if matching_aip is not None:
+            file_info[FIELD_AIP_NAME] = matching_aip.transfer_name
+            file_info[FIELD_AIP_UUID] = matching_aip.uuid
+
+        report[FIELD_FILES].append(file_info)
+
+    return report
diff --git a/AIPscan/Data/tests/__init__.py b/AIPscan/Data/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/AIPscan/Data/tests/test_largest_files.py b/AIPscan/Data/tests/test_largest_files.py
new file mode 100644
index 00000000..7a373333
--- /dev/null
+++ b/AIPscan/Data/tests/test_largest_files.py
@@ -0,0 +1,151 @@
+# -*- coding: utf-8 -*-
+
+import datetime
+import pytest
+import uuid
+
+from AIPscan.Data import data
+from AIPscan.models import AIP, File, FileType, StorageService
+
+TEST_FILES = [
+    File(
+        uuid=uuid.uuid4(),
+        name="test.csv",
+        size=1234567,
+        aip_id=1,
+        file_type=FileType.original,
+        file_format="Comma Separated Values",
+        filepath="/path/to/file.csv",
+        date_created=datetime.datetime.now(),
+        checksum_type="md5",
+        checksum_value="fakemd5",
+    ),
+    File(
+        uuid=uuid.uuid4(),
+        name="test.txt",
+        size=12345,
+        aip_id=2,
+        file_type=FileType.original,
+        file_format="Plain Text File",
+        puid="x-fmt/111",
+        filepath="/path/to/file.txt",
+        date_created=datetime.datetime.now(),
+        checksum_type="md5",
+        checksum_value="anotherfakemd5",
+    ),
+    File(
+        uuid=uuid.uuid4(),
+        name="test.pdf",
+        size=12345678,
+        aip_id=1,
+        file_type=FileType.preservation,
+        file_format="Acrobat PDF/A - Portable Document Format",
+        format_version="1b",
+        filepath="/path/to/test.pdf",
+        date_created=datetime.datetime.now(),
+        checksum_type="md5",
+        checksum_value="yetanotherfakemd5",
+        original_file_id=1,
+    ),
+]
+
+MOCK_STORAGE_SERVICE_ID = 1
+MOCK_STORAGE_SERVICE_NAME = "some name"
+TEST_STORAGE_SERVICE = StorageService(
+    name=MOCK_STORAGE_SERVICE_NAME,
+    url="http://example.com",
+    user_name="test",
+    api_key="test",
+    download_limit=20,
+    download_offset=10,
+    default=False,
+)
+
+MOCK_AIP_NAME = "Test transfer"
+MOCK_AIP_UUID = uuid.uuid4()
+TEST_AIP = AIP(
+    uuid=MOCK_AIP_UUID,
+    transfer_name=MOCK_AIP_NAME,
+    create_date=datetime.datetime.now(),
+    storage_service_id=MOCK_STORAGE_SERVICE_ID,
+    fetch_job_id=1,
+)
+
+
+def _mock_report_dependencies(mocker, files):
+    """Patch the database access used by data.largest_files.
+
+    :param mocker: pytest-mock "mocker" fixture.
+    :param files: File objects the largest files query should return.
+
+    :returns: Mock that replaces the AIP lookup (Query.get).
+    """
+    mock_query = mocker.patch("AIPscan.Data.data._largest_files_query")
+    mock_query.return_value = files
+
+    mock_get_ss = mocker.patch("AIPscan.Data.data._get_storage_service")
+    mock_get_ss.return_value = TEST_STORAGE_SERVICE
+
+    mock_get_aip = mocker.patch("sqlalchemy.orm.query.Query.get")
+    mock_get_aip.return_value = TEST_AIP
+    return mock_get_aip
+
+
+@pytest.mark.parametrize(
+    "file_data, file_count", [([], 0), (TEST_FILES, 3), (TEST_FILES[:2], 2)]
+)
+def test_largest_files(mocker, file_data, file_count):
+    """Test that return value conforms to expected structure."""
+    _mock_report_dependencies(mocker, file_data)
+
+    report = data.largest_files(MOCK_STORAGE_SERVICE_ID)
+    report_files = report[data.FIELD_FILES]
+    assert report[data.FIELD_STORAGE_NAME] == MOCK_STORAGE_SERVICE_NAME
+    assert len(report_files) == file_count
+
+
+def test_largest_files_aip_lookup(mocker):
+    """Test that each distinct AIP is fetched only once."""
+    mock_get_aip = _mock_report_dependencies(mocker, TEST_FILES)
+
+    data.largest_files(MOCK_STORAGE_SERVICE_ID)
+
+    # TEST_FILES refer to two distinct AIPs (aip_id 1 and 2).
+    assert mock_get_aip.call_count == 2
+
+
+@pytest.mark.parametrize(
+    "test_file, has_format_version, has_puid",
+    [
+        (TEST_FILES[0], False, False),
+        (TEST_FILES[1], False, True),
+        (TEST_FILES[2], True, False),
+    ],
+)
+def test_largest_files_elements(mocker, test_file, has_format_version, has_puid):
+    """Test that returned file data matches expected values."""
+    _mock_report_dependencies(mocker, [test_file])
+
+    report = data.largest_files(MOCK_STORAGE_SERVICE_ID)
+    report_file = report[data.FIELD_FILES][0]
+
+    # Required elements
+    assert test_file.name == report_file.get(data.FIELD_NAME)
+    assert test_file.file_format == report_file.get(data.FIELD_FORMAT)
+    assert test_file.size == report_file.get(data.FIELD_SIZE)
+    assert test_file.file_type.value == report_file.get(data.FIELD_FILE_TYPE)
+
+    # Optional elements
+    if has_format_version:
+        assert test_file.format_version == report_file.get(data.FIELD_VERSION)
+    else:
+        assert report_file.get(data.FIELD_VERSION) is None
+
+    if has_puid:
+        assert test_file.puid == report_file.get(data.FIELD_PUID)
+    else:
+        assert report_file.get(data.FIELD_PUID) is None
+
+    # AIP information
+    assert report_file.get(data.FIELD_AIP_NAME) == MOCK_AIP_NAME
+    assert report_file.get(data.FIELD_AIP_UUID) == MOCK_AIP_UUID
diff --git a/AIPscan/Reporter/helpers.py b/AIPscan/Reporter/helpers.py
index b8cfa023..6d17e4b4 100644
--- a/AIPscan/Reporter/helpers.py
+++ b/AIPscan/Reporter/helpers.py
@@ -12,20 +12,26 @@ def translate_headers(headers):
     """
     field_lookup = {
         data.FIELD_AIP_NAME: "AIP Name",
+        data.FIELD_AIP: "AIP",
         data.FIELD_AIPS: "AIPs",
-        data.FIELD_AIP_SIZE: "Aip Size",
-        data.FIELD_ALL_AIPS: "All Aips",
+        data.FIELD_AIP_SIZE: "AIP Size",
+        data.FIELD_ALL_AIPS: "All AIPs",
         data.FIELD_COUNT: "Count",
         data.FIELD_CREATED_DATE: "Created Date",
         data.FIELD_DERIVATIVE_COUNT: "Derivative Count",
         data.FIELD_DERIVATIVE_FORMAT: "Derivative Format",
         data.FIELD_DERIVATIVE_UUID: "Derivative UUID",
         data.FIELD_FILE_COUNT: "File Count",
+        data.FIELD_FILE_TYPE: "Type",
+        data.FIELD_FILENAME: "Filename",
+        data.FIELD_FORMAT: "Format",
         data.FIELD_FORMATS: "Formats",
         data.FIELD_NAME: "Name",
         data.FIELD_ORIGINAL_UUID: "Original UUID",
         data.FIELD_ORIGINAL_FORMAT: "Original Format",
+        data.FIELD_PUID: "PUID",
         data.FIELD_RELATED_PAIRING: "Related Pairing",
+        data.FIELD_SIZE: "Size",
         data.FIELD_STORAGE_NAME: "Storage Service Name",
         data.FIELD_TRANSFER_NAME: "Transfer Name",
         data.FIELD_VERSION: "Version",
diff --git a/AIPscan/Reporter/report_largest_files.py b/AIPscan/Reporter/report_largest_files.py
new file mode 100644
index 00000000..f0d45532
--- /dev/null
+++ b/AIPscan/Reporter/report_largest_files.py
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+
+from flask import render_template, request
+
+from AIPscan.Data import data
+from AIPscan.Reporter import reporter, translate_headers
+
+# Number of files shown when no usable "limit" argument is supplied.
+DEFAULT_LIMIT = 20
+
+
+@reporter.route("/largest_files/", methods=["GET"])
+def largest_files():
+    """Return largest files.
+
+    Reads "amss_id" (Storage Service ID), "file_type" and "limit" from the
+    query string and renders the report_largest_files.html template.
+    """
+    storage_service_id = request.args.get("amss_id")
+    file_type = request.args.get("file_type")
+    try:
+        limit = int(request.args.get("limit", DEFAULT_LIMIT))
+    except ValueError:
+        limit = DEFAULT_LIMIT
+    # A zero or negative limit cannot produce a meaningful report.
+    if limit < 1:
+        limit = DEFAULT_LIMIT
+    file_data = data.largest_files(
+        storage_service_id=storage_service_id, file_type=file_type, limit=limit
+    )
+    storage_service_name = file_data[data.FIELD_STORAGE_NAME]
+    headers = [
+        data.FIELD_FILENAME,
+        data.FIELD_SIZE,
+        data.FIELD_FORMAT,
+        data.FIELD_PUID,
+        data.FIELD_FILE_TYPE,
+        data.FIELD_AIP,
+    ]
+    return render_template(
+        "report_largest_files.html",
+        storage_service_id=storage_service_id,
+        storage_service_name=storage_service_name,
+        columns=translate_headers(headers),
+        files=file_data[data.FIELD_FILES],
+        file_type=file_type,
+        limit=limit,
+    )
diff --git a/AIPscan/Reporter/templates/file.html b/AIPscan/Reporter/templates/file.html
index e14b740d..44128807 100644
--- a/AIPscan/Reporter/templates/file.html
+++ b/AIPscan/Reporter/templates/file.html
@@ -5,13 +5,14 @@

File: {{ file_.name }}

+ diff --git a/AIPscan/Reporter/templates/report_largest_files.html b/AIPscan/Reporter/templates/report_largest_files.html new file mode 100644 index 00000000..265ffb32 --- /dev/null +++ b/AIPscan/Reporter/templates/report_largest_files.html @@ -0,0 +1,123 @@ + + + + {{ storage_service_name }}: Largest files + + + + + + + + + {% include "datatable.html" %} + + + +
+ +
+ + + +Report: Largest files +
+Storage Service: {{ storage_service_name }} +
+{% if file_type == "original" %} + File type: Original files +{% elif file_type == "preservation" %} + File type: Preservation files +{% else %} + File type: All files +{% endif %} + + + + +
+ +
+ File type: + +
+ +{% if files %} +
Filepath{{ file_.filepath }}
AIP - {{ aip.transfer_name }} {{ aip.uuid }} + {{ aip.transfer_name }}-{{ aip.uuid }}
+ + + {% for column in columns %} + + {% endfor %} + + + + {% for file_ in files %} + + + + {% if file_["Version"] %} + + {% else %} + + {% endif %} + {% if file_["PUID"] %} + + {% else %} + + {% endif %} + + + + {% endfor %} + +
{{ column }}
+ {{ file_["Name"] }} +
+ UUID: {{ file_["UUID"] }} +
{{ file_["Size"] | filesizeformat }}{{ file_["Format"] }} ({{ file_["Version"] }}){{ file_["Format"] }}{{ file_["PUID"] }}n/a{{ file_["FileType"] }} + {{ file_["AIPName"] }} +
+ UUID: {{ file_["AIPUUID"] }} +
+{% else %} +

No files to display.

+{% endif %} + + + + + + diff --git a/AIPscan/Reporter/templates/reports.html b/AIPscan/Reporter/templates/reports.html index 84e0ca6f..7ea89dc3 100644 --- a/AIPscan/Reporter/templates/reports.html +++ b/AIPscan/Reporter/templates/reports.html @@ -65,6 +65,13 @@ + + Largest files + + + + + @@ -192,6 +199,17 @@ ); window.open(url); }); + $("#report9a").on("click", function() { + const URL_LARGEST_FILES = "/reporter/largest_files/"; + var storageServiceID = $('#ss').val(); + var url = ( + window.location.origin + + URL_LARGEST_FILES + + '?amss_id=' + + storageServiceID + ); + window.open(url); + }); }); diff --git a/AIPscan/Reporter/views.py b/AIPscan/Reporter/views.py index 23ca35bd..7c6b26ef 100644 --- a/AIPscan/Reporter/views.py +++ b/AIPscan/Reporter/views.py @@ -18,6 +18,7 @@ report_aip_contents, report_formats_count, report_originals_with_derivatives, + report_largest_files, )