Skip to content

Commit

Permalink
Add test data generation tool. (#217)
Browse files Browse the repository at this point in the history
Added a tool to populate AIPscan with randomly generated example data.
  • Loading branch information
mcantelon committed Oct 24, 2023
1 parent 112d31b commit a9b7b26
Show file tree
Hide file tree
Showing 9 changed files with 338 additions and 0 deletions.
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,29 @@ Shut down the AIPscan Docker containers and remove the rabbitmq volumes:
docker-compose down --volumes
```

## Tools

The `tools` directory contains scripts that can be run by developers and system
administrators.

#### Test data generator

The test data generator tool, `tools/generate-test-data`, populates
AIPscan's database with randomly generated example data.

### Running tools

These should be run using the same system user and virtual environment that
AIPscan is running under.

Here's how you would run the `generate-test-data` tool, for example:

$ cd <path to AIPscan base directory>
$ sudo -u <AIPscan system user> /bin/bash
$ source <path to AIPscan virtual environment>/bin/activate
$ ./tools/generate-test-data


# Usage

* Ensure that the Flask Server, RabbitMQ server, and Celery worker queue are up and running.
Expand Down
1 change: 1 addition & 0 deletions requirements/test.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
-r base.txt

faker==14.2.1
flake8==5.0.4
pytest==6.2.5
pytest_cov==2.11.1
Expand Down
Empty file added tools/__init__.py
Empty file.
Empty file added tools/app/__init__.py
Empty file.
7 changes: 7 additions & 0 deletions tools/app/init.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import os
import sys

# Make the AIPscan project root (two levels above this file's directory)
# importable so the tools can import the project's packages and config.
# The path is derived from this file's absolute location, so it works
# regardless of the current working directory or of what the checkout
# directory happens to be called.
_here = os.path.dirname(os.path.abspath(__file__))
relpath = os.path.join(_here, "..", "..")
sys.path.append(os.path.abspath(relpath))

# Name of the configuration the tools select from the project's CONFIGS.
config_name = "default"
100 changes: 100 additions & 0 deletions tools/generate-test-data
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#!/usr/bin/env python3
import sys

import click
from app import init
from faker import Faker
from flask import Flask
from helpers import data

from AIPscan import db
from AIPscan.models import FetchJob
from config import CONFIGS


@click.command()
@click.option("--storage-services-to-create", default=2)
@click.option("--locations-per-storage-service", default=2)
@click.option("--locations-min-aip-count", default=10)
@click.option("--locations-max-aip-count", default=30)
@click.option("--aip-min-file-count", default=10)
@click.option("--aip-max-file-count", default=30)
def main(
    storage_services_to_create,
    locations_per_storage_service,
    locations_min_aip_count,
    locations_max_aip_count,
    aip_min_file_count,
    aip_max_file_count,
):
    """Populate AIPscan's database with randomly generated example data."""
    # Reject inverted ranges before anything is written: randint() would
    # otherwise raise only after the pipeline and storage services had
    # already been committed.
    if locations_min_aip_count > locations_max_aip_count:
        raise click.UsageError(
            "--locations-min-aip-count must not exceed --locations-max-aip-count"
        )
    if aip_min_file_count > aip_max_file_count:
        raise click.UsageError(
            "--aip-min-file-count must not exceed --aip-max-file-count"
        )

    # Initialize Flask app context
    app = Flask(__name__)
    app.config.from_object(CONFIGS[init.config_name])

    db.init_app(app)

    # Fixed seed for the Faker instance used to draw per-location AIP counts.
    fake = Faker()
    fake.seed_instance(0)
    randint = fake.random.randint

    with app.app_context():
        # Add example storage services
        print(f"Creating pipeline and {storage_services_to_create} storage services...")
        pipeline = data.create_fake_pipeline()

        ss_ids = []
        fetch_jobs = {}  # storage service ID -> ID of its fetch job

        for _ in range(storage_services_to_create):
            # Only the first storage service created is flagged as default.
            is_default = not ss_ids

            ss = data.create_fake_storage_service(is_default)
            ss_ids.append(ss.id)

            fetch_job = data.create_fake_fetch_job(ss.id)
            fetch_jobs[ss.id] = fetch_job.id

        # Populate storage service locations; one batch of AIPs is created
        # per location, so both progress messages share this total.
        total_aip_batches = len(ss_ids) * locations_per_storage_service

        print(
            f"Creating {total_aip_batches} storage service locations (and their AIPs)..."
        )

        aip_batches_created = 0
        for ss_id in ss_ids:
            fetch_job_id = fetch_jobs[ss_id]

            for _ in range(locations_per_storage_service):
                # Add location
                sl = data.create_fake_location(ss_id)

                # Add AIPs
                aip_batches_created += 1

                print(f"Creating AIPs ({aip_batches_created}/{total_aip_batches})...")

                # randint() is inclusive at both ends, so this creates between
                # the configured minimum and maximum number of AIPs.
                aipcount = randint(locations_min_aip_count, locations_max_aip_count)
                for _ in range(aipcount):
                    # Link each AIP to the fetch job of the storage service
                    # being populated (not the last one created above).
                    aip = data.create_fake_aip(
                        pipeline.id, ss_id, sl.id, fetch_job_id
                    )
                    data.create_fake_aip_files(
                        aip_min_file_count, aip_max_file_count, aip.id
                    )

                # Update package/AIP counts in fetch job
                fetch_job = FetchJob.query.get(fetch_job_id)
                fetch_job.total_packages += aipcount
                fetch_job.total_aips += aipcount
                db.session.commit()

        print("Done.")


# Run the command only when this file is executed as a script, passing
# main()'s result on as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
Empty file added tools/helpers/__init__.py
Empty file.
119 changes: 119 additions & 0 deletions tools/helpers/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import os
from datetime import date

from faker import Faker

from AIPscan import db
from AIPscan.models import (
AIP,
FetchJob,
File,
Pipeline,
StorageLocation,
StorageService,
)

# Module-level Faker instance shared by every helper in this module.
# NOTE(review): this instance is not seeded here — confirm whether the
# generated records are expected to be reproducible between runs.
fake = Faker()
# Shortcut to the randint method of the Faker instance's random generator.
randint = fake.random.randint


def create_fake_pipeline():
    """Create, persist and return a Pipeline with a fake UUID and URL."""
    origin_uuid = fake.uuid4()
    dashboard = fake.url()
    new_pipeline = Pipeline(origin_pipeline=origin_uuid, dashboard_url=dashboard)

    session = db.session
    session.add(new_pipeline)
    session.commit()

    return new_pipeline


def create_fake_storage_service(default):
    """Create, persist and return a StorageService filled with fake values.

    ``default`` is stored as-is in the storage service's ``default`` field;
    the download limit and offset are both set to zero.
    """
    attributes = {
        # Fake text with its final character dropped.
        "name": fake.text(20)[:-1],
        "url": fake.url(),
        "user_name": fake.profile()["username"],
        "api_key": fake.password(),
        "download_limit": 0,
        "download_offset": 0,
        "default": default,
    }
    storage_service = StorageService(**attributes)

    session = db.session
    session.add(storage_service)
    session.commit()

    return storage_service


def create_fake_fetch_job(storage_service_id):
    """Create, persist and return a FetchJob for the given storage service.

    All counters start at zero and both download timestamps are set to
    today's date; the download directory is a fake file path.
    """
    job = FetchJob(
        total_packages=0,
        total_aips=0,
        total_deleted_aips=0,
        download_start=date.today(),
        download_end=date.today(),
        download_directory=fake.file_path(),
        storage_service_id=storage_service_id,
    )

    # These counters are assigned after construction rather than passed to
    # the constructor.
    for counter in ("total_dips", "total_sips", "total_replicas"):
        setattr(job, counter, 0)

    session = db.session
    session.add(job)
    session.commit()

    return job


def create_fake_location(storage_service_id):
    """Create, persist and return a StorageLocation for a storage service.

    The location path is the directory part of a fake file path (requested
    with depth 3) joined with a fake UUID.
    """
    parent_dir = os.path.dirname(fake.file_path(3))
    location_path = os.path.join(parent_dir, fake.uuid4())
    summary = fake.text(20)[:-1]

    storage_location = StorageLocation(
        current_location=location_path,
        description=summary,
        storage_service_id=storage_service_id,
    )

    session = db.session
    session.add(storage_location)
    session.commit()

    return storage_location


def create_fake_aip(pipeline_id, storage_service_id, storage_location_id, fetch_job_id):
    """Create, persist and return an AIP linked to the given records.

    The UUID, transfer name, METS checksum and size (10000 to 100,000,000)
    are randomly generated; the creation date is today.
    """
    attributes = {
        "uuid": fake.uuid4(),
        "transfer_name": fake.text(20)[:-1],
        "create_date": date.today(),
        "mets_sha256": fake.sha256(),
        "size": randint(10000, 100_000_000),
        "storage_service_id": storage_service_id,
        "storage_location_id": storage_location_id,
        "fetch_job_id": fetch_job_id,
        "origin_pipeline_id": pipeline_id,
    }
    new_aip = AIP(**attributes)

    session = db.session
    session.add(new_aip)
    session.commit()

    return new_aip


def create_fake_aip_files(min, max, aip_id):
    """Create and persist a random number of fake files for an AIP.

    Between ``min`` and ``max`` files (both inclusive) are created, each an
    "original" file belonging to the AIP identified by ``aip_id``. Nothing
    is returned.

    ``min`` and ``max`` shadow the builtins of the same name; the names are
    kept so existing callers continue to work.
    """
    # randint() is inclusive at both ends, so iterating over range(count)
    # creates exactly the number of files drawn (previously one fewer).
    file_count = randint(min, max)

    for _ in range(file_count):
        aipfile = File(
            aip_id=aip_id,
            name=fake.text(20)[:-1],
            filepath=fake.file_path(),
            uuid=fake.uuid4(),
            file_type="original",
            size=randint(1000, 1_000_000),
            date_created=date.today(),
            puid=fake.text(20)[:-1],
            file_format=fake.text(20)[:-1],
            format_version=fake.text(20)[:-1],
            checksum_type=fake.text(20)[:-1],
            checksum_value=fake.text(20)[:-1],
            premis_object="",
        )

        db.session.add(aipfile)

    # Commit the whole batch once rather than once per file.
    db.session.commit()
88 changes: 88 additions & 0 deletions tools/tests/test_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import datetime

import pytest

from .tools.helpers import data


@pytest.fixture
def mock_db_add(mocker):
    """Patch the database session's add and commit so tests never hit a DB."""
    for target in ("AIPscan.db.session.add", "AIPscan.db.session.commit"):
        mocker.patch(target)


def test_create_fake_storage_service(mock_db_add):
    """Storage services get non-empty string fields and the requested default flag."""
    ss = data.create_fake_storage_service(True)

    assert ss.name
    assert isinstance(ss.name, str)

    assert ss.url
    assert isinstance(ss.url, str)

    assert ss.user_name
    assert isinstance(ss.user_name, str)

    assert ss.api_key
    assert isinstance(ss.api_key, str)

    # Truthy and of type bool means exactly True.
    assert ss.default is True

    ss = data.create_fake_storage_service(False)
    assert not ss.default


def test_create_fake_fetch_job(mock_db_add):
    """Fetch jobs get dates, a download directory and the storage service ID."""
    ss = data.create_fake_storage_service(True)
    # Assign an ID by hand for the fetch job to reference.
    ss.id = 1

    fetch_job = data.create_fake_fetch_job(ss.id)

    # Exact type check on purpose: isinstance() would also accept
    # datetime.datetime, which subclasses datetime.date.
    assert fetch_job.download_start
    assert type(fetch_job.download_start) is datetime.date

    assert fetch_job.download_end
    assert type(fetch_job.download_end) is datetime.date

    assert fetch_job.download_directory
    assert isinstance(fetch_job.download_directory, str)

    assert fetch_job.storage_service_id == ss.id


def test_create_fake_location(mock_db_add):
    """Locations get a non-empty path and description and the given service ID."""
    location = data.create_fake_location(1)

    assert location.current_location
    assert isinstance(location.current_location, str)

    assert location.description
    assert isinstance(location.description, str)

    assert location.storage_service_id == 1


def test_create_fake_aip(mock_db_add):
    """AIPs get generated attributes and the foreign keys passed in."""
    aip = data.create_fake_aip(1, 2, 3, 4)

    assert aip.uuid
    assert isinstance(aip.uuid, str)

    assert aip.transfer_name
    assert isinstance(aip.transfer_name, str)

    # Exact type check on purpose: isinstance() would also accept
    # datetime.datetime, which subclasses datetime.date.
    assert aip.create_date
    assert type(aip.create_date) is datetime.date

    assert aip.mets_sha256
    assert isinstance(aip.mets_sha256, str)

    assert aip.size
    assert isinstance(aip.size, int)

    # Positional arguments are (pipeline, storage service, location, fetch job).
    assert aip.origin_pipeline_id == 1
    assert aip.storage_service_id == 2
    assert aip.storage_location_id == 3
    assert aip.fetch_job_id == 4

0 comments on commit a9b7b26

Please sign in to comment.