From a9b7b2601a4f44ce4c789507cb4adcbf3ec8c2e6 Mon Sep 17 00:00:00 2001 From: Mike Cantelon Date: Sat, 30 Sep 2023 16:36:29 -0700 Subject: [PATCH] Add test data generation tool. (#217) Added a tool to populate AIPscan with randomly generated example data. --- README.md | 23 ++++++++ requirements/test.txt | 1 + tools/__init__.py | 0 tools/app/__init__.py | 0 tools/app/init.py | 7 +++ tools/generate-test-data | 100 ++++++++++++++++++++++++++++++++ tools/helpers/__init__.py | 0 tools/helpers/data.py | 119 ++++++++++++++++++++++++++++++++++++++ tools/tests/test_data.py | 88 ++++++++++++++++++++++++++++ 9 files changed, 338 insertions(+) create mode 100644 tools/__init__.py create mode 100644 tools/app/__init__.py create mode 100644 tools/app/init.py create mode 100755 tools/generate-test-data create mode 100644 tools/helpers/__init__.py create mode 100644 tools/helpers/data.py create mode 100644 tools/tests/test_data.py diff --git a/README.md b/README.md index 1239adc1..b6577ab4 100644 --- a/README.md +++ b/README.md @@ -155,6 +155,29 @@ Shut down the AIPscan Docker containers and remove the rabbitmq volumes: docker-composer down --volumes ``` +## Tools + +The `tools` directory contains scripts that can be run by developers and system +administrators. + +### Test data generator + +The test data generator tool, `tools/generate-test-data`, populates +AIPscan's database with randomly generated example data. + +### Running tools + +These should be run using the same system user and virtual environment that +AIPscan is running under. + +Here's how you would run the `generate-test-data` tool, for example: + + $ cd + $ sudo -u /bin/bash + $ source /bin/activate + $ ./tools/generate-test-data + + # Usage * Ensure that the Flask Server, RabbitMQ server, and Celery worker queue are up and running. 
diff --git a/requirements/test.txt b/requirements/test.txt
index 89fd6608..e518ef95 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -1,5 +1,6 @@
 -r base.txt
 
+faker==14.2.1
 flake8==5.0.4
 pytest==6.2.5
 pytest_cov==2.11.1
diff --git a/tools/__init__.py b/tools/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tools/app/__init__.py b/tools/app/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tools/app/init.py b/tools/app/init.py
new file mode 100644
index 00000000..38c1e801
--- /dev/null
+++ b/tools/app/init.py
@@ -0,0 +1,11 @@
+"""Import-path and configuration setup shared by the scripts in tools/."""
+
+import os
+import sys
+
+# Append the "AIPscan" directory, resolved relative to this file, to sys.path
+relpath = f"{os.path.dirname(__file__)}/../../../AIPscan"
+sys.path.append(os.path.abspath(relpath))
+
+# Key used by the tools to pick their configuration out of config.CONFIGS
+config_name = "default"
diff --git a/tools/generate-test-data b/tools/generate-test-data
new file mode 100755
index 00000000..1f960fc6
--- /dev/null
+++ b/tools/generate-test-data
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+import sys
+
+import click
+from app import init
+from faker import Faker
+from flask import Flask
+from helpers import data
+
+from AIPscan import db
+from AIPscan.models import FetchJob
+from config import CONFIGS
+
+
+@click.command()
+@click.option("--storage-services-to-create", default=2)
+@click.option("--locations-per-storage-service", default=2)
+@click.option("--locations-min-aip-count", default=10)
+@click.option("--locations-max-aip-count", default=30)
+@click.option("--aip-min-file-count", default=10)
+@click.option("--aip-max-file-count", default=30)
+def main(
+    storage_services_to_create,
+    locations_per_storage_service,
+    locations_min_aip_count,
+    locations_max_aip_count,
+    aip_min_file_count,
+    aip_max_file_count,
+):
+    """Populate AIPscan's database with randomly generated example data."""
+    # Initialize Flask app context
+    app = Flask(__name__)
+    app.config.from_object(CONFIGS[init.config_name])
+
+    db.init_app(app)
+
+    # Seeded so the number of AIPs drawn per location is repeatable
+    fake = Faker()
+    fake.seed_instance(0)
+    randint = fake.random.randint
+
+    with app.app_context():
+        # Add example storage services
+        print(f"Creating pipeline and {storage_services_to_create} storage services...")
+        pipeline = data.create_fake_pipeline()
+
+        ss_ids = []
+        # Maps each storage service ID to the ID of its fetch job
+        fetch_jobs = {}
+
+        for _ in range(storage_services_to_create):
+            # Only the first storage service created is flagged as default
+            is_default = not ss_ids
+
+            ss = data.create_fake_storage_service(is_default)
+            ss_ids.append(ss.id)
+
+            fetch_job = data.create_fake_fetch_job(ss.id)
+            fetch_jobs[ss.id] = fetch_job.id
+
+        # Populate storage service locations
+        ss_locations_to_create = (
+            storage_services_to_create * locations_per_storage_service
+        )
+
+        print(
+            f"Creating {ss_locations_to_create} storage service locations (and their AIPs)..."
+        )
+
+        aip_batches_created = 0
+        total_aip_batches = len(ss_ids) * locations_per_storage_service
+        for ss_id in ss_ids:
+            for _ in range(locations_per_storage_service):
+                # Add location
+                sl = data.create_fake_location(ss_id)
+
+                # Add AIPs
+                aip_batches_created += 1
+
+                print(f"Creating AIPs ({aip_batches_created}/{total_aip_batches})...")
+
+                # Draw a count within the requested bounds (both inclusive)
+                aipcount = 0
+                for _ in range(
+                    randint(locations_min_aip_count, locations_max_aip_count)
+                ):
+                    # Use this storage service's own fetch job, keyed by ss_id
+                    aip = data.create_fake_aip(
+                        pipeline.id, ss_id, sl.id, fetch_jobs[ss_id]
+                    )
+                    data.create_fake_aip_files(
+                        aip_min_file_count, aip_max_file_count, aip.id
+                    )
+                    aipcount += 1
+
+                # Update package/AIP counts in fetch job
+                fetch_job = FetchJob.query.get(fetch_jobs[ss_id])
+                fetch_job.total_packages += aipcount
+                fetch_job.total_aips += aipcount
+                db.session.commit()
+
+    print("Done.")
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tools/helpers/__init__.py b/tools/helpers/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tools/helpers/data.py b/tools/helpers/data.py
new file mode 100644
index 00000000..3f7f001c
--- /dev/null
+++ b/tools/helpers/data.py
@@ -0,0 +1,141 @@
+"""Helpers that create randomly generated AIPscan database records."""
+
+import os
+from datetime import date
+
+from faker import Faker
+
+from AIPscan import db
+from AIPscan.models import (
+    AIP,
+    FetchJob,
+    File,
+    Pipeline,
+    StorageLocation,
+    StorageService,
+)
+
+fake = Faker()
+randint = fake.random.randint
+
+
+def create_fake_pipeline():
+    """Create, commit and return a Pipeline with a fake UUID and URL."""
+    pipeline = Pipeline(origin_pipeline=fake.uuid4(), dashboard_url=fake.url())
+
+    db.session.add(pipeline)
+    db.session.commit()
+
+    return pipeline
+
+
+def create_fake_storage_service(default):
+    """Create, commit and return a StorageService with fake details.
+
+    The ``default`` argument is passed through as the ``default`` field.
+    """
+    ss = StorageService(
+        name=fake.text(20)[:-1],
+        url=fake.url(),
+        user_name=fake.profile()["username"],
+        api_key=fake.password(),
+        download_limit=0,
+        download_offset=0,
+        default=default,
+    )
+
+    db.session.add(ss)
+    db.session.commit()
+
+    return ss
+
+
+def create_fake_fetch_job(storage_service_id):
+    """Create, commit and return a FetchJob for the given storage service.
+
+    Its package/AIP/DIP/SIP/replica counters start at zero; both dates are today.
+    """
+    fetch_job = FetchJob(
+        total_packages=0,
+        total_aips=0,
+        total_deleted_aips=0,
+        download_start=date.today(),
+        download_end=date.today(),
+        download_directory=fake.file_path(),
+        storage_service_id=storage_service_id,
+    )
+    fetch_job.total_dips = 0
+    fetch_job.total_sips = 0
+    fetch_job.total_replicas = 0
+
+    db.session.add(fetch_job)
+    db.session.commit()
+
+    return fetch_job
+
+
+def create_fake_location(storage_service_id):
+    """Create, commit and return a StorageLocation for a storage service.
+
+    The location path is a fake directory path with a fake UUID appended.
+    """
+    current_location = os.path.join(os.path.dirname(fake.file_path(3)), fake.uuid4())
+
+    location = StorageLocation(
+        current_location=current_location,
+        description=fake.text(20)[:-1],
+        storage_service_id=storage_service_id,
+    )
+
+    db.session.add(location)
+    db.session.commit()
+
+    return location
+
+
+def create_fake_aip(pipeline_id, storage_service_id, storage_location_id, fetch_job_id):
+    """Create, commit and return an AIP linked to the given record IDs."""
+    aip = AIP(
+        uuid=fake.uuid4(),
+        transfer_name=fake.text(20)[:-1],
+        create_date=date.today(),
+        mets_sha256=fake.sha256(),
+        size=randint(10000, 100_000_000),
+        storage_service_id=storage_service_id,
+        storage_location_id=storage_location_id,
+        fetch_job_id=fetch_job_id,
+        origin_pipeline_id=pipeline_id,
+    )
+
+    db.session.add(aip)
+    db.session.commit()
+
+    return aip
+
+
+def create_fake_aip_files(min, max, aip_id):
+    """Create and commit between ``min`` and ``max`` fake Files for an AIP.
+
+    Both bounds are inclusive. Nothing is returned.
+    """
+    for _ in range(randint(min, max)):
+        aipfile = File(
+            aip_id=aip_id,
+            name=fake.text(20)[:-1],
+            filepath=fake.file_path(),
+            uuid=fake.uuid4(),
+            file_type="original",
+            size=randint(1000, 1_000_000),
+            date_created=date.today(),
+            puid=fake.text(20)[:-1],
+            file_format=fake.text(20)[:-1],
+            format_version=fake.text(20)[:-1],
+            checksum_type=fake.text(20)[:-1],
+            checksum_value=fake.text(20)[:-1],
+            premis_object="",
+        )
+
+        db.session.add(aipfile)
+
+    # Commit once for the whole batch rather than once per file
+    db.session.commit()
diff --git a/tools/tests/test_data.py b/tools/tests/test_data.py
new file mode 100644
index 00000000..2c97458f
--- /dev/null
+++ b/tools/tests/test_data.py
@@ -0,0 +1,88 @@
+import datetime
+
+import pytest
+
+from .tools.helpers import data
+
+
+@pytest.fixture
+def mock_db_add(mocker):
+    """Replace the database session's add and commit with mocks."""
+    mocker.patch("AIPscan.db.session.add")
+    mocker.patch("AIPscan.db.session.commit")
+
+
+def test_create_fake_storage_service(mock_db_add):
+    ss = data.create_fake_storage_service(True)
+
+    assert ss.name
+    assert isinstance(ss.name, str)
+
+    assert ss.url
+    assert isinstance(ss.url, str)
+
+    assert ss.user_name
+    assert isinstance(ss.user_name, str)
+
+    assert ss.api_key
+    assert isinstance(ss.api_key, str)
+
+    assert ss.default
+    assert isinstance(ss.default, bool)
+
+    ss = data.create_fake_storage_service(False)
+    assert not ss.default
+
+
+def test_create_fake_fetch_job(mock_db_add):
+    ss = data.create_fake_storage_service(True)
+    ss.id = 1
+
+    fetch_job = data.create_fake_fetch_job(ss.id)
+
+    assert fetch_job.download_start
+    assert isinstance(fetch_job.download_start, datetime.date)
+
+    assert fetch_job.download_end
+    assert isinstance(fetch_job.download_end, datetime.date)
+
+    assert fetch_job.download_directory
+    assert isinstance(fetch_job.download_directory, str)
+
+    assert fetch_job.storage_service_id == ss.id
+
+
+def test_create_fake_location(mock_db_add):
+    location = data.create_fake_location(1)
+
+    assert location.current_location
+    assert isinstance(location.current_location, str)
+
+    assert location.description
+    assert isinstance(location.description, str)
+
+    assert location.storage_service_id == 1
+
+
+def test_create_fake_aip(mock_db_add):
+    aip = data.create_fake_aip(1, 2, 3, 4)
+
+    assert aip.uuid
+    assert isinstance(aip.uuid, str)
+
+    assert aip.transfer_name
+    assert isinstance(aip.transfer_name, str)
+
+    assert aip.create_date
+    assert isinstance(aip.create_date, datetime.date)
+
+    assert aip.mets_sha256
+    assert isinstance(aip.mets_sha256, str)
+
+    assert aip.size
+    assert isinstance(aip.size, int)
+
+    assert aip.origin_pipeline_id == 1
+    assert aip.storage_service_id == 2
+    assert aip.storage_location_id == 3
+    assert aip.fetch_job_id == 4