diff --git a/api/bin/setup_localstack.py b/api/bin/setup_localstack.py index 2c78861a6..2603bd39c 100644 --- a/api/bin/setup_localstack.py +++ b/api/bin/setup_localstack.py @@ -6,6 +6,7 @@ import src.logging from src.adapters.aws import S3Config, get_s3_client +from src.util import file_util from src.util.local import error_if_not_local logger = logging.getLogger(__name__) @@ -26,6 +27,14 @@ def does_s3_bucket_exist(s3_client: botocore.client.BaseClient, bucket_name: str return False +def create_bucket_if_not_exists(s3_client: botocore.client.BaseClient, bucket_name: str) -> None: + if not does_s3_bucket_exist(s3_client, bucket_name): + logger.info("Creating S3 bucket %s", bucket_name) + s3_client.create_bucket(Bucket=bucket_name) + else: + logger.info("S3 bucket %s already exists - skipping creation", bucket_name) + + def setup_s3() -> None: s3_config = S3Config() # This is only used locally - to avoid any accidental running of commands here @@ -35,14 +44,12 @@ def setup_s3() -> None: s3_config, boto3.Session(aws_access_key_id="NO_CREDS", aws_secret_access_key="NO_CREDS") ) - if s3_config.s3_opportunity_bucket is None: - raise Exception("S3_OPPORTUNITY_BUCKET env var must be set") - - if not does_s3_bucket_exist(s3_client, s3_config.s3_opportunity_bucket): - logger.info("Creating S3 bucket %s", s3_config.s3_opportunity_bucket) - s3_client.create_bucket(Bucket=s3_config.s3_opportunity_bucket) - else: - logger.info("S3 bucket %s already exists - skipping", s3_config.s3_opportunity_bucket) + create_bucket_if_not_exists( + s3_client, file_util.get_s3_bucket(s3_config.public_files_bucket_path) + ) + create_bucket_if_not_exists( + s3_client, file_util.get_s3_bucket(s3_config.draft_files_bucket_path) + ) def main() -> None: diff --git a/api/local.env b/api/local.env index 3dcd72f69..80502e371 100644 --- a/api/local.env +++ b/api/local.env @@ -99,7 +99,9 @@ S3_ENDPOINT_URL=http://localstack:4566 # S3 ############################ -S3_OPPORTUNITY_BUCKET=local-opportunities +# Our terraform sets these as s3 paths, so include s3:// on the bucket name +PUBLIC_FILES_BUCKET=s3://local-mock-public-bucket +DRAFT_FILES_BUCKET=s3://local-mock-draft-bucket # This env var is used to set local AWS credentials IS_LOCAL_AWS=1 diff --git a/api/src/adapters/aws/s3_adapter.py b/api/src/adapters/aws/s3_adapter.py index 83f6273bf..c06fac62c 100644 --- a/api/src/adapters/aws/s3_adapter.py +++ b/api/src/adapters/aws/s3_adapter.py @@ -1,6 +1,7 @@ import boto3 import botocore.client import botocore.config +from pydantic import Field from src.util.env_config import PydanticBaseEnvConfig @@ -16,9 +17,9 @@ class S3Config(PydanticBaseEnvConfig): # so that we don't need to set all of these for every # process that uses S3 - # TODO - I'm not sure how we want to organize our - # s3 buckets so this will likely change in the future - s3_opportunity_bucket: str | None = None + # Note these env vars get set as "s3://..." 
+ public_files_bucket_path: str = Field(alias="PUBLIC_FILES_BUCKET") + draft_files_bucket_path: str = Field(alias="DRAFT_FILES_BUCKET") def get_s3_client( diff --git a/api/src/data_migration/transformation/subtask/transform_opportunity.py b/api/src/data_migration/transformation/subtask/transform_opportunity.py index 236d942f0..a547e65b1 100644 --- a/api/src/data_migration/transformation/subtask/transform_opportunity.py +++ b/api/src/data_migration/transformation/subtask/transform_opportunity.py @@ -1,18 +1,31 @@ import logging -from typing import Tuple +from typing import Tuple, cast import src.data_migration.transformation.transform_constants as transform_constants import src.data_migration.transformation.transform_util as transform_util +from src.adapters.aws import S3Config from src.data_migration.transformation.subtask.abstract_transform_subtask import ( AbstractTransformSubTask, ) from src.db.models.opportunity_models import Opportunity from src.db.models.staging.opportunity import Topportunity +from src.services.opportunity_attachments import attachment_util +from src.task.task import Task +from src.util import file_util logger = logging.getLogger(__name__) class TransformOpportunity(AbstractTransformSubTask): + + def __init__(self, task: Task, s3_config: S3Config | None = None): + super().__init__(task) + + if s3_config is None: + s3_config = S3Config() + + self.s3_config = s3_config + def transform_records(self) -> None: # Fetch all opportunities that were modified # Alongside that, grab the existing opportunity record @@ -53,11 +66,18 @@ def process_opportunity( extra, ) + # Cleanup the attachments from s3 + if target_opportunity is not None: + for attachment in target_opportunity.opportunity_attachments: + file_util.delete_file(attachment.file_location) + else: # To avoid incrementing metrics for records we fail to transform, record # here whether it's an insert/update and we'll increment after transforming is_insert = target_opportunity is None + was_draft = target_opportunity.is_draft if target_opportunity else None + logger.info("Transforming and upserting opportunity", extra=extra) transformed_opportunity = transform_util.transform_opportunity( source_opportunity, target_opportunity @@ -76,5 +96,23 @@ def process_opportunity( ) self.db_session.merge(transformed_opportunity) + # If an opportunity went from being a draft to not a draft (published) + # then we need to move all of its attachments to the public bucket + # from the draft s3 bucket. 
+ if was_draft and transformed_opportunity.is_draft is False: + for attachment in cast(Opportunity, target_opportunity).opportunity_attachments: + # Determine the new path + file_name = attachment_util.adjust_legacy_file_name(attachment.file_name) + s3_path = attachment_util.get_s3_attachment_path( + file_name, + attachment.attachment_id, + transformed_opportunity, + self.s3_config, + ) + + # Move the file + file_util.move_file(attachment.file_location, s3_path) + attachment.file_location = s3_path + logger.info("Processed opportunity", extra=extra) source_opportunity.transformed_at = self.transform_time diff --git a/api/src/data_migration/transformation/subtask/transform_opportunity_attachment.py b/api/src/data_migration/transformation/subtask/transform_opportunity_attachment.py index 82162fb17..a9d26f4dd 100644 --- a/api/src/data_migration/transformation/subtask/transform_opportunity_attachment.py +++ b/api/src/data_migration/transformation/subtask/transform_opportunity_attachment.py @@ -3,18 +3,30 @@ import src.data_migration.transformation.transform_constants as transform_constants import src.data_migration.transformation.transform_util as transform_util +from src.adapters.aws import S3Config from src.constants.lookup_constants import OpportunityAttachmentType from src.data_migration.transformation.subtask.abstract_transform_subtask import ( AbstractTransformSubTask, ) from src.db.models.opportunity_models import Opportunity, OpportunityAttachment from src.db.models.staging.attachment import TsynopsisAttachment +from src.services.opportunity_attachments import attachment_util +from src.task.task import Task +from src.util import file_util logger = logging.getLogger(__name__) class TransformOpportunityAttachment(AbstractTransformSubTask): + def __init__(self, task: Task, s3_config: S3Config | None = None): + super().__init__(task) + + if s3_config is None: + s3_config = S3Config() + + self.s3_config = s3_config + def transform_records(self) -> None: # Fetch staging attachment / our attachment / opportunity groups @@ -63,9 +75,6 @@ def process_opportunity_attachment( logger.info("Processing opportunity attachment", extra=extra) if source_attachment.is_deleted: - # TODO - https://github.com/HHS/simpler-grants-gov/issues/3322 - # deletes are more complex because of s3 - # this just handles deleting the DB record at the moment self._handle_delete( source=source_attachment, target=target_attachment, @@ -73,6 +82,10 @@ def process_opportunity_attachment( extra=extra, ) + # Delete the file from s3 as well + if target_attachment is not None: + file_util.delete_file(target_attachment.file_location) + elif opportunity is None: # This shouldn't be possible as the incoming data has foreign keys, but as a safety net # we'll make sure the opportunity actually exists @@ -85,13 +98,29 @@ def process_opportunity_attachment( # here whether it's an insert/update and we'll increment after transforming is_insert = target_attachment is None + prior_attachment_location = ( + target_attachment.file_location if target_attachment else None + ) + logger.info("Transforming and upserting opportunity attachment", extra=extra) transformed_opportunity_attachment = transform_opportunity_attachment( - source_attachment, target_attachment + source_attachment, target_attachment, opportunity, self.s3_config ) - # TODO - we'll need to handle more with the s3 files here + # Write the file to s3 + write_file(source_attachment, transformed_opportunity_attachment) + + # If this was an update, and the file name changed + # 
Cleanup the old file from s3. + if ( + prior_attachment_location is not None + and prior_attachment_location != transformed_opportunity_attachment.file_location + ): + file_util.delete_file(prior_attachment_location) + + logger.info("Transforming and upserting opportunity attachment", extra=extra) + if is_insert: self.increment( transform_constants.Metrics.TOTAL_RECORDS_INSERTED, @@ -110,7 +139,10 @@ def process_opportunity_attachment( def transform_opportunity_attachment( - source_attachment: TsynopsisAttachment, incoming_attachment: OpportunityAttachment | None + source_attachment: TsynopsisAttachment, + incoming_attachment: OpportunityAttachment | None, + opportunity: Opportunity, + s3_config: S3Config, ) -> OpportunityAttachment: log_extra = transform_util.get_log_extra_opportunity_attachment(source_attachment) @@ -118,6 +150,15 @@ def transform_opportunity_attachment( if incoming_attachment is None: logger.info("Creating new opportunity attachment record", extra=log_extra) + # Adjust the file_name to remove characters clunky in URLs + if source_attachment.file_name is None: + raise ValueError("Opportunity attachment does not have a file name, cannot process.") + file_name = attachment_util.adjust_legacy_file_name(source_attachment.file_name) + + file_location = attachment_util.get_s3_attachment_path( + file_name, source_attachment.syn_att_id, opportunity, s3_config + ) + # We always create a new record here and merge it in the calling function # this way if there is any error doing the transformation, we don't modify the existing one. target_attachment = OpportunityAttachment( @@ -125,11 +166,11 @@ def transform_opportunity_attachment( opportunity_id=source_attachment.opportunity_id, # TODO - we'll eventually remove attachment type, for now just arbitrarily set the value opportunity_attachment_type=OpportunityAttachmentType.OTHER, - # TODO - in https://github.com/HHS/simpler-grants-gov/issues/3322 - # we'll actually handle the file location logic with s3 - file_location="TODO", # TODO - next PR + # Note we calculate the file location here, but haven't yet done anything + # with s3, the calling function, will handle writing the file to s3. 
+ file_location=file_location, mime_type=source_attachment.mime_type, - file_name=source_attachment.file_name, + file_name=file_name, file_description=source_attachment.file_desc, file_size_bytes=source_attachment.file_lob_size, created_by=source_attachment.creator_id, @@ -142,3 +183,10 @@ def transform_opportunity_attachment( ) return target_attachment + + +def write_file( + source_attachment: TsynopsisAttachment, destination_attachment: OpportunityAttachment +) -> None: + with file_util.open_stream(destination_attachment.file_location, "wb") as outfile: + outfile.write(source_attachment.file_lob) diff --git a/api/src/search/backend/load_opportunities_to_index.py b/api/src/search/backend/load_opportunities_to_index.py index a1e7d0c9c..5b0e5ba80 100644 --- a/api/src/search/backend/load_opportunities_to_index.py +++ b/api/src/search/backend/load_opportunities_to_index.py @@ -3,7 +3,6 @@ from enum import StrEnum from typing import Iterator, Sequence -import smart_open from opensearchpy.exceptions import ConnectionTimeout, TransportError from pydantic import Field from pydantic_settings import SettingsConfigDict @@ -22,11 +21,16 @@ OpportunitySearchIndexQueue, ) from src.task.task import Task +from src.util import file_util from src.util.datetime_util import get_now_us_eastern_datetime from src.util.env_config import PydanticBaseEnvConfig logger = logging.getLogger(__name__) +ALLOWED_ATTACHMENT_SUFFIXES = set( + ["txt", "pdf", "docx", "doc", "xlsx", "xlsm", "html", "htm", "pptx", "ppt", "rtf"] +) + class LoadOpportunitiesToIndexConfig(PydanticBaseEnvConfig): model_config = SettingsConfigDict(env_prefix="LOAD_OPP_SEARCH_") @@ -97,6 +101,7 @@ def _create_multi_attachment_pipeline(self) -> None: "field": "_ingest._value.data", } }, + "ignore_missing": True, } } ], @@ -275,10 +280,9 @@ def fetch_existing_opportunity_ids_in_index(self) -> set[int]: return opportunity_ids - def filter_attachments( - self, attachments: list[OpportunityAttachment] - ) -> list[OpportunityAttachment]: - return [attachment for attachment in attachments] + def filter_attachment(self, attachment: OpportunityAttachment) -> bool: + file_suffix = attachment.file_name.lower().split(".")[-1] + return file_suffix in ALLOWED_ATTACHMENT_SUFFIXES def get_attachment_json_for_opportunity( self, opp_attachments: list[OpportunityAttachment] @@ -286,17 +290,18 @@ def get_attachment_json_for_opportunity( attachments = [] for att in opp_attachments: - with smart_open.open( - att.file_location, - "rb", - ) as file: - file_content = file.read() - attachments.append( - { - "filename": att.file_name, - "data": base64.b64encode(file_content).decode("utf-8"), - } - ) + if self.filter_attachment(att): + with file_util.open_stream( + att.file_location, + "rb", + ) as file: + file_content = file.read() + attachments.append( + { + "filename": att.file_name, + "data": base64.b64encode(file_content).decode("utf-8"), + } + ) return attachments diff --git a/api/src/services/opportunity_attachments/__init__.py b/api/src/services/opportunity_attachments/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/api/src/services/opportunity_attachments/attachment_util.py b/api/src/services/opportunity_attachments/attachment_util.py new file mode 100644 index 000000000..5dcdca30d --- /dev/null +++ b/api/src/services/opportunity_attachments/attachment_util.py @@ -0,0 +1,60 @@ +import re + +from src.adapters.aws import S3Config +from src.db.models.opportunity_models import Opportunity +from src.util import file_util + + +def 
get_s3_attachment_path(
+    file_name: str, opportunity_attachment_id: int, opportunity: Opportunity, s3_config: S3Config
+) -> str:
+    """Construct a path to the attachments on s3
+
+    Will be formatted like:
+
+        s3://<bucket>/opportunities/<opportunity_id>/attachments/<attachment_id>/<file_name>
+
+    Note that we store the files under a "folder" with the attachment ID as
+    the legacy system doesn't guarantee file names are unique within an opportunity.
+    """
+
+    base_path = (
+        s3_config.draft_files_bucket_path
+        if opportunity.is_draft
+        else s3_config.public_files_bucket_path
+    )
+
+    return file_util.join(
+        base_path,
+        "opportunities",
+        str(opportunity.opportunity_id),
+        "attachments",
+        str(opportunity_attachment_id),
+        file_name,
+    )
+
+
+def adjust_legacy_file_name(existing_file_name: str) -> str:
+    """Correct the file names to remove any characters problematic for URL/s3 processing.
+
+    We only keep the following characters:
+    * A-Z
+    * a-z
+    * 0-9
+    * _
+    * -
+    * ~
+    * .
+
+    Whitespace will be replaced with underscores.
+
+    All other characters will be removed.
+    """
+
+    # Replace one-or-more whitespace with a single underscore
+    file_name = re.sub(r"\s+", "_", existing_file_name)
+
+    # Remove all non-accepted characters
+    file_name = re.sub(r"[^a-zA-Z0-9_.\-~]", "", file_name)
+
+    return file_name
diff --git a/api/src/util/file_util.py b/api/src/util/file_util.py
index b99653ad6..2943f65c9 100644
--- a/api/src/util/file_util.py
+++ b/api/src/util/file_util.py
@@ -1,8 +1,10 @@
 import os
-from pathlib import PosixPath
-from typing import Any, Optional, Tuple
+import shutil
+from pathlib import Path
+from typing import Any
 from urllib.parse import urlparse
 
+import botocore
 import smart_open
 from botocore.config import Config
 
@@ -13,23 +15,23 @@
 ##################################
 
 
-def is_s3_path(path: str | PosixPath) -> bool:
+def is_s3_path(path: str | Path) -> bool:
     return str(path).startswith("s3://")
 
 
-def split_s3_url(path: str) -> Tuple[str, str]:
-    parts = urlparse(path)
+def split_s3_url(path: str | Path) -> tuple[str, str]:
+    parts = urlparse(str(path))
     bucket_name = parts.netloc
     prefix = parts.path.lstrip("/")
-    return (bucket_name, prefix)
+    return bucket_name, prefix
 
 
-def get_s3_bucket(path: str) -> Optional[str]:
-    return urlparse(path).hostname
+def get_s3_bucket(path: str | Path) -> str:
+    return split_s3_url(path)[0]
 
 
-def get_s3_file_key(path: str) -> str:
-    return urlparse(path).path[1:]
+def get_s3_file_key(path: str | Path) -> str:
+    return split_s3_url(path)[1]
 
 
 def get_file_name(path: str) -> str:
@@ -45,15 +47,17 @@ def join(*parts: str) -> str:
 ##################################
 
 
-def open_stream(path: str, mode: str = "r", encoding: str | None = None) -> Any:
+def open_stream(path: str | Path, mode: str = "r", encoding: str | None = None) -> Any:
     if is_s3_path(path):
+        s3_client = get_s3_client()
+
         so_config = Config(
             max_pool_connections=10,
             connect_timeout=60,
             read_timeout=60,
             retries={"max_attempts": 10},
         )
-        so_transport_params = {"client_kwargs": {"config": so_config}}
+        so_transport_params = {"client_kwargs": {"config": so_config}, "client": s3_client}
 
         return smart_open.open(path, mode, transport_params=so_transport_params, encoding=encoding)
     else:
@@ -90,3 +94,75 @@ def get_file_length_bytes(path: str) -> int:
     file_stats = os.stat(path)
 
     return file_stats.st_size
+
+
+def copy_file(source_path: str | Path, destination_path: str | Path) -> None:
+    is_source_s3 = is_s3_path(source_path)
+    is_dest_s3 = is_s3_path(destination_path)
+
+    # This isn't a download or upload method
+    # Don't allow "copying" between
mismatched locations + if is_source_s3 != is_dest_s3: + raise Exception("Cannot download/upload between disk and S3 using this method") + + if is_source_s3: + s3_client = get_s3_client() + + source_bucket, source_path = split_s3_url(source_path) + dest_bucket, dest_path = split_s3_url(destination_path) + + s3_client.copy({"Bucket": source_bucket, "Key": source_path}, dest_bucket, dest_path) + else: + os.makedirs(os.path.dirname(destination_path), exist_ok=True) + shutil.copy2(source_path, destination_path) + + +def delete_file(path: str | Path) -> None: + """Delete a file from s3 or local disk""" + if is_s3_path(path): + bucket, s3_path = split_s3_url(path) + + s3_client = get_s3_client() + s3_client.delete_object(Bucket=bucket, Key=s3_path) + else: + os.remove(path) + + +def move_file(source_path: str | Path, destination_path: str | Path) -> None: + is_source_s3 = is_s3_path(source_path) + is_dest_s3 = is_s3_path(destination_path) + + # This isn't a download or upload method + # Don't allow "copying" between mismatched locations + if is_source_s3 != is_dest_s3: + raise Exception("Cannot download/upload between disk and S3 using this method") + + if is_source_s3: + copy_file(source_path, destination_path) + delete_file(source_path) + + else: + os.renames(source_path, destination_path) + + +def file_exists(path: str | Path) -> bool: + """Get whether a file exists or not""" + if is_s3_path(path): + s3_client = get_s3_client() + + bucket, key = split_s3_url(path) + + try: + s3_client.head_object(Bucket=bucket, Key=key) + return True + except botocore.exceptions.ClientError: + return False + + # Local file system + return Path(path).exists() + + +def read_file(path: str | Path, mode: str = "r", encoding: str | None = None) -> str: + """Simple function for just getting all of the contents of a file""" + with open_stream(path, mode, encoding) as input_file: + return input_file.read() diff --git a/api/tests/conftest.py b/api/tests/conftest.py index 85d2dadb5..43aa2590f 100644 --- a/api/tests/conftest.py +++ b/api/tests/conftest.py @@ -18,6 +18,7 @@ import src.auth.login_gov_jwt_auth as login_gov_jwt_auth import tests.src.db.models.factories as factories from src.adapters import search +from src.adapters.aws import S3Config from src.adapters.oauth.login_gov.mock_login_gov_oauth_client import MockLoginGovOauthClient from src.auth.api_jwt_auth import create_jwt_for_user from src.constants.schema import Schemas @@ -382,6 +383,26 @@ def mock_s3_bucket(mock_s3_bucket_resource): yield mock_s3_bucket_resource.name +@pytest.fixture +def other_mock_s3_bucket_resource(mock_s3): + bucket = mock_s3.Bucket("other_test_bucket") + bucket.create() + yield bucket + + +@pytest.fixture +def other_mock_s3_bucket(other_mock_s3_bucket_resource): + yield other_mock_s3_bucket_resource.name + + +@pytest.fixture +def s3_config(mock_s3_bucket, other_mock_s3_bucket): + return S3Config( + PUBLIC_FILES_BUCKET=f"s3://{mock_s3_bucket}", + DRAFT_FILES_BUCKET=f"s3://{other_mock_s3_bucket}", + ) + + #################### # Class-based testing #################### diff --git a/api/tests/lib/seed_local_db.py b/api/tests/lib/seed_local_db.py index 2c96efe42..4439227a9 100644 --- a/api/tests/lib/seed_local_db.py +++ b/api/tests/lib/seed_local_db.py @@ -14,6 +14,7 @@ from src.adapters.aws import S3Config, get_s3_client from src.adapters.db import PostgresDBClient from src.db.models.opportunity_models import Opportunity +from src.util import file_util from src.util.local import error_if_not_local from tests.lib.seed_agencies import 
_build_agencies @@ -35,7 +36,11 @@ def _upload_opportunity_attachments_s3(): object_name = os.path.relpath(file_path, test_folder_path) try: - s3_client.upload_file(file_path, s3_config.s3_opportunity_bucket, object_name) + s3_client.upload_file( + file_path, + file_util.get_s3_bucket(s3_config.public_files_bucket_path), + object_name, + ) logger.info("Successfully uploaded files") except ClientError as e: logger.error( diff --git a/api/tests/src/data_migration/transformation/conftest.py b/api/tests/src/data_migration/transformation/conftest.py index 47a320b51..4ee0bb281 100644 --- a/api/tests/src/data_migration/transformation/conftest.py +++ b/api/tests/src/data_migration/transformation/conftest.py @@ -4,6 +4,7 @@ import pytest import tests.src.db.models.factories as f +from src.adapters.aws import S3Config from src.constants.lookup_constants import ApplicantType, FundingCategory, FundingInstrument from src.data_migration.transformation.transform_oracle_data_task import TransformOracleDataTask from src.db.models import staging @@ -17,6 +18,8 @@ OpportunityAttachment, OpportunitySummary, ) +from src.services.opportunity_attachments import attachment_util +from src.util import file_util from tests.conftest import BaseTestClass @@ -49,6 +52,7 @@ def setup_opportunity( if create_existing: f.OpportunityFactory.create( opportunity_id=source_opportunity.opportunity_id, + opportunity_attachments=[], # set created_at/updated_at to an earlier time so its clear # when they were last updated timestamps_in_past=True, @@ -334,6 +338,7 @@ def setup_agency( def setup_opportunity_attachment( create_existing: bool, opportunity: Opportunity, + config: S3Config, is_delete: bool = False, is_already_processed: bool = False, source_values: dict | None = None, @@ -350,9 +355,17 @@ def setup_opportunity_attachment( ) if create_existing: + s3_path = attachment_util.get_s3_attachment_path( + synopsis_attachment.file_name, synopsis_attachment.syn_att_id, opportunity, config + ) + + with file_util.open_stream(s3_path, "w") as outfile: + outfile.write(f.fake.sentence(25)) + f.OpportunityAttachmentFactory.create( attachment_id=synopsis_attachment.syn_att_id, opportunity=opportunity, + file_location=s3_path, ) return synopsis_attachment @@ -824,3 +837,12 @@ def validate_opportunity_attachment( ], expect_values_to_match, ) + + # Validate the contents of the file and that the file exists on s3 + with file_util.open_stream(opportunity_attachment.file_location) as s3_file: + contents = s3_file.read() + + if expect_values_to_match: + assert contents.encode() == source_attachment.file_lob + else: + assert contents.encode() != source_attachment.file_lob diff --git a/api/tests/src/data_migration/transformation/subtask/test_transform_opportunity.py b/api/tests/src/data_migration/transformation/subtask/test_transform_opportunity.py index 92177d301..28f05f442 100644 --- a/api/tests/src/data_migration/transformation/subtask/test_transform_opportunity.py +++ b/api/tests/src/data_migration/transformation/subtask/test_transform_opportunity.py @@ -2,17 +2,20 @@ import src.data_migration.transformation.transform_constants as transform_constants from src.data_migration.transformation.subtask.transform_opportunity import TransformOpportunity +from src.services.opportunity_attachments import attachment_util +from src.util import file_util from tests.src.data_migration.transformation.conftest import ( BaseTransformTestClass, setup_opportunity, validate_opportunity, ) +from tests.src.db.models.factories import OpportunityAttachmentFactory, 
OpportunityFactory class TestTransformOpportunity(BaseTransformTestClass): @pytest.fixture() - def transform_opportunity(self, transform_oracle_data_task, truncate_staging_tables): - return TransformOpportunity(transform_oracle_data_task) + def transform_opportunity(self, transform_oracle_data_task, truncate_staging_tables, s3_config): + return TransformOpportunity(transform_oracle_data_task, s3_config) def test_process_opportunities(self, db_session, transform_opportunity): ordinary_delete = setup_opportunity( @@ -97,3 +100,73 @@ def test_process_opportunity_invalid_category(self, db_session, transform_opport transform_opportunity.process_opportunity(insert_that_will_fail, None) validate_opportunity(db_session, insert_that_will_fail, expect_in_db=False) + + def test_process_opportunity_delete_with_attachments( + self, db_session, transform_opportunity, s3_config + ): + + source_opportunity = setup_opportunity(create_existing=False, is_delete=True) + + target_opportunity = OpportunityFactory.create( + opportunity_id=source_opportunity.opportunity_id, opportunity_attachments=[] + ) + + attachments = [] + for i in range(10): + s3_path = attachment_util.get_s3_attachment_path( + f"my_file{i}.txt", i, target_opportunity, s3_config + ) + + with file_util.open_stream(s3_path, "w") as outfile: + outfile.write(f"This is the {i}th file") + + attachment = OpportunityAttachmentFactory.create( + opportunity=target_opportunity, file_location=s3_path + ) + attachments.append(attachment) + + transform_opportunity.process_opportunity(source_opportunity, target_opportunity) + + validate_opportunity(db_session, source_opportunity, expect_in_db=False) + + # Verify all of the files were deleted + for attachment in attachments: + assert file_util.file_exists(attachment.file_location) is False + + def test_process_opportunity_update_to_non_draft_with_attachments( + self, db_session, transform_opportunity, s3_config + ): + + source_opportunity = setup_opportunity( + create_existing=False, source_values={"is_draft": "N"} + ) + + target_opportunity = OpportunityFactory.create( + opportunity_id=source_opportunity.opportunity_id, + is_draft=True, + opportunity_attachments=[], + ) + + attachments = [] + for i in range(10): + s3_path = attachment_util.get_s3_attachment_path( + f"my_file{i}.txt", i, target_opportunity, s3_config + ) + assert s3_path.startswith(s3_config.draft_files_bucket_path) is True + + with file_util.open_stream(s3_path, "w") as outfile: + outfile.write(f"This is the {i}th file") + + attachment = OpportunityAttachmentFactory.create( + opportunity=target_opportunity, file_location=s3_path + ) + attachments.append(attachment) + + transform_opportunity.process_opportunity(source_opportunity, target_opportunity) + + validate_opportunity(db_session, source_opportunity) + + # Verify all of the files were moved to the public bucket + for attachment in attachments: + assert attachment.file_location.startswith(s3_config.public_files_bucket_path) is True + assert file_util.file_exists(attachment.file_location) is True diff --git a/api/tests/src/data_migration/transformation/subtask/test_transform_opportunity_attachment.py b/api/tests/src/data_migration/transformation/subtask/test_transform_opportunity_attachment.py index 0a44a3f7d..a40462c56 100644 --- a/api/tests/src/data_migration/transformation/subtask/test_transform_opportunity_attachment.py +++ b/api/tests/src/data_migration/transformation/subtask/test_transform_opportunity_attachment.py @@ -5,6 +5,8 @@ from 
src.data_migration.transformation.subtask.transform_opportunity_attachment import ( TransformOpportunityAttachment, ) +from src.services.opportunity_attachments import attachment_util +from src.util import file_util from tests.src.data_migration.transformation.conftest import ( BaseTransformTestClass, setup_opportunity_attachment, @@ -13,41 +15,86 @@ class TestTransformOpportunitySummary(BaseTransformTestClass): + @pytest.fixture() - def transform_opportunity_attachment(self, transform_oracle_data_task): - return TransformOpportunityAttachment(transform_oracle_data_task) + def transform_opportunity_attachment(self, transform_oracle_data_task, s3_config): + return TransformOpportunityAttachment(transform_oracle_data_task, s3_config) - def test_transform_opportunity_attachment(self, db_session, transform_opportunity_attachment): + def test_transform_opportunity_attachment( + self, db_session, transform_opportunity_attachment, s3_config + ): opportunity1 = f.OpportunityFactory.create(opportunity_attachments=[]) - insert1 = setup_opportunity_attachment(create_existing=False, opportunity=opportunity1) - insert2 = setup_opportunity_attachment(create_existing=False, opportunity=opportunity1) + insert1 = setup_opportunity_attachment( + create_existing=False, opportunity=opportunity1, config=s3_config + ) + insert2 = setup_opportunity_attachment( + create_existing=False, opportunity=opportunity1, config=s3_config + ) - update1 = setup_opportunity_attachment(create_existing=True, opportunity=opportunity1) - update2 = setup_opportunity_attachment(create_existing=True, opportunity=opportunity1) + update1 = setup_opportunity_attachment( + create_existing=True, opportunity=opportunity1, config=s3_config + ) + update2 = setup_opportunity_attachment( + create_existing=True, opportunity=opportunity1, config=s3_config + ) delete1 = setup_opportunity_attachment( - create_existing=True, is_delete=True, opportunity=opportunity1 + create_existing=True, + is_delete=True, + opportunity=opportunity1, + config=s3_config, ) opportunity2 = f.OpportunityFactory.create(opportunity_attachments=[]) - insert3 = setup_opportunity_attachment(create_existing=False, opportunity=opportunity2) - update3 = setup_opportunity_attachment(create_existing=True, opportunity=opportunity2) + insert3 = setup_opportunity_attachment( + create_existing=False, opportunity=opportunity2, config=s3_config + ) + update3 = setup_opportunity_attachment( + create_existing=True, opportunity=opportunity2, config=s3_config + ) delete2 = setup_opportunity_attachment( - create_existing=True, is_delete=True, opportunity=opportunity2 + create_existing=True, + is_delete=True, + opportunity=opportunity2, + config=s3_config, ) already_processed_insert = setup_opportunity_attachment( - create_existing=False, opportunity=opportunity2, is_already_processed=True + create_existing=False, + opportunity=opportunity2, + is_already_processed=True, + config=s3_config, ) already_processed_update = setup_opportunity_attachment( - create_existing=True, opportunity=opportunity2, is_already_processed=True + create_existing=True, + opportunity=opportunity2, + is_already_processed=True, + config=s3_config, ) delete_but_current_missing = setup_opportunity_attachment( - create_existing=False, opportunity=opportunity2, is_delete=True + create_existing=False, + opportunity=opportunity2, + is_delete=True, + config=s3_config, + ) + + # Draft opportunity + opportunity3 = f.OpportunityFactory.create(is_draft=True, opportunity_attachments=[]) + insert4 = setup_opportunity_attachment( 
+ create_existing=False, opportunity=opportunity3, config=s3_config + ) + update4 = setup_opportunity_attachment( + create_existing=True, opportunity=opportunity3, config=s3_config + ) + delete3 = setup_opportunity_attachment( + create_existing=True, + is_delete=True, + opportunity=opportunity3, + config=s3_config, ) transform_opportunity_attachment.run_subtask() @@ -55,13 +102,16 @@ def test_transform_opportunity_attachment(self, db_session, transform_opportunit validate_opportunity_attachment(db_session, insert1) validate_opportunity_attachment(db_session, insert2) validate_opportunity_attachment(db_session, insert3) + validate_opportunity_attachment(db_session, insert4) validate_opportunity_attachment(db_session, update1) validate_opportunity_attachment(db_session, update2) validate_opportunity_attachment(db_session, update3) + validate_opportunity_attachment(db_session, update4) validate_opportunity_attachment(db_session, delete1, expect_in_db=False) validate_opportunity_attachment(db_session, delete2, expect_in_db=False) + validate_opportunity_attachment(db_session, delete3, expect_in_db=False) validate_opportunity_attachment(db_session, already_processed_insert, expect_in_db=False) validate_opportunity_attachment( @@ -71,26 +121,29 @@ def test_transform_opportunity_attachment(self, db_session, transform_opportunit validate_opportunity_attachment(db_session, delete_but_current_missing, expect_in_db=False) metrics = transform_opportunity_attachment.metrics - assert metrics[transform_constants.Metrics.TOTAL_RECORDS_PROCESSED] == 9 - assert metrics[transform_constants.Metrics.TOTAL_RECORDS_DELETED] == 2 - assert metrics[transform_constants.Metrics.TOTAL_RECORDS_INSERTED] == 3 - assert metrics[transform_constants.Metrics.TOTAL_RECORDS_UPDATED] == 3 + assert metrics[transform_constants.Metrics.TOTAL_RECORDS_PROCESSED] == 12 + assert metrics[transform_constants.Metrics.TOTAL_RECORDS_DELETED] == 3 + assert metrics[transform_constants.Metrics.TOTAL_RECORDS_INSERTED] == 4 + assert metrics[transform_constants.Metrics.TOTAL_RECORDS_UPDATED] == 4 assert metrics[transform_constants.Metrics.TOTAL_DELETE_ORPHANS_SKIPPED] == 1 db_session.commit() # commit to end any existing transactions as run_subtask starts a new one transform_opportunity_attachment.run_subtask() - assert metrics[transform_constants.Metrics.TOTAL_RECORDS_PROCESSED] == 9 - assert metrics[transform_constants.Metrics.TOTAL_RECORDS_DELETED] == 2 - assert metrics[transform_constants.Metrics.TOTAL_RECORDS_INSERTED] == 3 - assert metrics[transform_constants.Metrics.TOTAL_RECORDS_UPDATED] == 3 + assert metrics[transform_constants.Metrics.TOTAL_RECORDS_PROCESSED] == 12 + assert metrics[transform_constants.Metrics.TOTAL_RECORDS_DELETED] == 3 + assert metrics[transform_constants.Metrics.TOTAL_RECORDS_INSERTED] == 4 + assert metrics[transform_constants.Metrics.TOTAL_RECORDS_UPDATED] == 4 assert metrics[transform_constants.Metrics.TOTAL_DELETE_ORPHANS_SKIPPED] == 1 def test_transform_opportunity_attachment_delete_but_current_missing( - self, db_session, transform_opportunity_attachment + self, db_session, transform_opportunity_attachment, s3_config ): opportunity = f.OpportunityFactory.create(opportunity_attachments=[]) delete_but_current_missing = setup_opportunity_attachment( - create_existing=False, opportunity=opportunity, is_delete=True + create_existing=False, + opportunity=opportunity, + is_delete=True, + config=s3_config, ) transform_opportunity_attachment.process_opportunity_attachment( @@ -102,10 +155,12 @@ def 
test_transform_opportunity_attachment_delete_but_current_missing( assert delete_but_current_missing.transformation_notes == "orphaned_delete_record" def test_transform_opportunity_attachment_no_opportunity( - self, db_session, transform_opportunity_attachment + self, db_session, transform_opportunity_attachment, s3_config ): opportunity = f.OpportunityFactory.create(opportunity_attachments=[]) - insert = setup_opportunity_attachment(create_existing=False, opportunity=opportunity) + insert = setup_opportunity_attachment( + create_existing=False, opportunity=opportunity, config=s3_config + ) # Don't pass the opportunity in - as if it wasn't found with pytest.raises( @@ -115,3 +170,66 @@ def test_transform_opportunity_attachment_no_opportunity( transform_opportunity_attachment.process_opportunity_attachment(insert, None, None) assert insert.transformed_at is None + + def test_transform_opportunity_attachment_update_file_renamed( + self, db_session, transform_opportunity_attachment, s3_config + ): + opportunity = f.OpportunityFactory.create(is_draft=False, opportunity_attachments=[]) + update = setup_opportunity_attachment( + # Don't create the existing, we'll do that below + create_existing=False, + opportunity=opportunity, + config=s3_config, + ) + + old_s3_path = attachment_util.get_s3_attachment_path( + "old_file_name.txt", update.syn_att_id, opportunity, s3_config + ) + + with file_util.open_stream(old_s3_path, "w") as outfile: + outfile.write(f.fake.sentence(25)) + + target_attachment = f.OpportunityAttachmentFactory.create( + attachment_id=update.syn_att_id, + opportunity=opportunity, + file_location=old_s3_path, + file_name="old_file_name.txt", + ) + + transform_opportunity_attachment.process_opportunity_attachment( + update, target_attachment, opportunity + ) + + validate_opportunity_attachment(db_session, update) + + # Verify the old file name was deleted + assert file_util.file_exists(old_s3_path) is False + + def test_transform_opportunity_attachment_delete_file_missing_on_s3( + self, db_session, transform_opportunity_attachment, s3_config + ): + opportunity = f.OpportunityFactory.create(opportunity_attachments=[]) + + synopsis_attachment = f.StagingTsynopsisAttachmentFactory.create( + opportunity=None, + opportunity_id=opportunity.opportunity_id, + is_deleted=True, + ) + + # Make a realistic path, but don't actually create the file + s3_path = attachment_util.get_s3_attachment_path( + synopsis_attachment.file_name, synopsis_attachment.syn_att_id, opportunity, s3_config + ) + + target_attachment = f.OpportunityAttachmentFactory.create( + attachment_id=synopsis_attachment.syn_att_id, + opportunity=opportunity, + file_location=s3_path, + ) + + # This won't error, s3 delete object doesn't error if the object doesn't exist + transform_opportunity_attachment.process_opportunity_attachment( + synopsis_attachment, target_attachment, opportunity + ) + + validate_opportunity_attachment(db_session, synopsis_attachment, expect_in_db=False) diff --git a/api/tests/src/data_migration/transformation/test_transform_oracle_data_task.py b/api/tests/src/data_migration/transformation/test_transform_oracle_data_task.py index 40c5dc512..0cf9be3f8 100644 --- a/api/tests/src/data_migration/transformation/test_transform_oracle_data_task.py +++ b/api/tests/src/data_migration/transformation/test_transform_oracle_data_task.py @@ -36,7 +36,12 @@ def truncate_all_staging_tables(self, db_session): @pytest.fixture() def transform_oracle_data_task( - self, db_session, enable_factory_create, truncate_opportunities, 
truncate_all_staging_tables + self, + db_session, + enable_factory_create, + truncate_opportunities, + truncate_all_staging_tables, + s3_config, ) -> TransformOracleDataTask: return TransformOracleDataTask(db_session) @@ -466,7 +471,10 @@ def test_delete_opportunity_with_deleted_children(self, db_session, transform_or # but we'll still have delete events for the others - this verfies how we handle that. existing_opportunity = f.OpportunityFactory( - no_current_summary=True, opportunity_assistance_listings=[], agency_code="AGENCYXYZ" + no_current_summary=True, + opportunity_assistance_listings=[], + agency_code="AGENCYXYZ", + opportunity_attachments=[], ) opportunity = f.StagingTopportunityFactory( opportunity_id=existing_opportunity.opportunity_id, cfdas=[], is_deleted=True diff --git a/api/tests/src/db/models/factories.py b/api/tests/src/db/models/factories.py index a64476dea..16012a838 100644 --- a/api/tests/src/db/models/factories.py +++ b/api/tests/src/db/models/factories.py @@ -227,11 +227,11 @@ class CustomProvider(BaseProvider): YN_YESNO_BOOLEAN_VALUES = ["Y", "N", "Yes", "No"] OPPORTUNITY_ATTACHMENT_S3_PATHS = [ - "s3://local-opportunities/test_file_1.txt", - "s3://local-opportunities/test_file_2.txt", - "s3://local-opportunities/test_file_3.txt", - "s3://local-opportunities/test_file_4.pdf", - "s3://local-opportunities/test_file_5.pdf", + "s3://local-mock-public-bucket/test_file_1.txt", + "s3://local-mock-public-bucket/test_file_2.txt", + "s3://local-mock-public-bucket/test_file_3.txt", + "s3://local-mock-public-bucket/test_file_4.pdf", + "s3://local-mock-public-bucket/test_file_5.pdf", ] def agency_code(self) -> str: @@ -332,7 +332,7 @@ class Meta: file_location = factory.Faker("s3_file_location") mime_type = factory.Faker("mime_type") - file_name = factory.Faker("file_name") + file_name = factory.Faker("file_name", category="text") file_description = factory.Faker("sentence") file_size_bytes = factory.Faker("random_int", min=1000, max=10000000) opportunity_attachment_type = factory.fuzzy.FuzzyChoice(OpportunityAttachmentType) @@ -1006,9 +1006,9 @@ class Meta: att_type: factory.Faker("att_type") mime_type = factory.Faker("mime_type") link_url = factory.Faker("relevant_url") - file_name = factory.Faker("file_name") + file_name = factory.Faker("file_name", category="text") file_desc = factory.Faker("sentence") - file_lob = b"Test attachment" + file_lob = factory.LazyFunction(lambda: fake.sentence(25).encode()) file_lob_size = factory.LazyAttribute(lambda x: len(x.file_lob)) create_date = factory.Faker("date_time_between", start_date="-1y", end_date="now") created_date = factory.LazyAttribute( diff --git a/api/tests/src/search/backend/test_load_opportunities_to_index.py b/api/tests/src/search/backend/test_load_opportunities_to_index.py index 64a5190f1..36296baaa 100644 --- a/api/tests/src/search/backend/test_load_opportunities_to_index.py +++ b/api/tests/src/search/backend/test_load_opportunities_to_index.py @@ -149,18 +149,27 @@ def test_opportunity_attachment_pipeline( opportunity_index_alias, search_client, ): - filename = "test_file_1.txt" - file_path = f"s3://{mock_s3_bucket}/{filename}" + filename_1 = "test_file_1.txt" + file_path_1 = f"s3://{mock_s3_bucket}/{filename_1}" content = "I am a file" - with file_util.open_stream(file_path, "w") as outfile: + + with file_util.open_stream(file_path_1, "w") as outfile: outfile.write(content) + filename_2 = "test_file_2.css" + file_path_2 = f"s3://{mock_s3_bucket}/{filename_2}" + opportunity = 
OpportunityFactory.create(opportunity_attachments=[]) OpportunityAttachmentFactory.create( - mime_type="text/plain", opportunity=opportunity, - file_location=file_path, - file_name=filename, + file_location=file_path_1, + file_name=filename_1, + ) + + OpportunityAttachmentFactory.create( + opportunity=opportunity, + file_location=file_path_2, + file_name=filename_2, ) load_opportunities_to_index.index_name = ( @@ -172,11 +181,14 @@ def test_opportunity_attachment_pipeline( resp = search_client.search(opportunity_index_alias, {"size": 100}) record = [d for d in resp.records if d.get("opportunity_id") == opportunity.opportunity_id] - attachment = record[0]["attachments"][0] + attachments = record[0]["attachments"] + + # assert only one (allowed) opportunity attachment was uploaded + assert len(attachments) == 1 # assert correct attachment was uploaded - assert attachment["filename"] == filename + assert attachments[0]["filename"] == filename_1 # assert data was b64encoded - assert attachment["attachment"]["content"] == content # decoded b64encoded attachment + assert attachments[0]["attachment"]["content"] == content # decoded b64encoded attachment class TestLoadOpportunitiesToIndexPartialRefresh(BaseTestClass): diff --git a/api/tests/src/services/opportunity_attachments/__init__.py b/api/tests/src/services/opportunity_attachments/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/api/tests/src/services/opportunity_attachments/test_attachment_util.py b/api/tests/src/services/opportunity_attachments/test_attachment_util.py new file mode 100644 index 000000000..6be17a793 --- /dev/null +++ b/api/tests/src/services/opportunity_attachments/test_attachment_util.py @@ -0,0 +1,61 @@ +import pytest + +from src.adapters.aws import S3Config +from src.services.opportunity_attachments import attachment_util +from tests.src.db.models.factories import OpportunityFactory + + +@pytest.mark.parametrize( + "is_draft,opportunity_id,attachment_id,file_name,expected_path", + [ + ( + False, + 123, + 456, + "my_file.txt", + "s3://test-public-bucket/opportunities/123/attachments/456/my_file.txt", + ), + ( + True, + 12345, + 45678, + "example.pdf", + "s3://test-draft-bucket/opportunities/12345/attachments/45678/example.pdf", + ), + ( + False, + 1, + 1, + "example.docx", + "s3://test-public-bucket/opportunities/1/attachments/1/example.docx", + ), + ], +) +def test_get_s3_attachment_path(is_draft, opportunity_id, attachment_id, file_name, expected_path): + config = S3Config( + PUBLIC_FILES_BUCKET="s3://test-public-bucket", DRAFT_FILES_BUCKET="s3://test-draft-bucket" + ) + + opp = OpportunityFactory.build(opportunity_id=opportunity_id, is_draft=is_draft) + + assert ( + attachment_util.get_s3_attachment_path(file_name, attachment_id, opp, config) + == expected_path + ) + + +@pytest.mark.parametrize( + "existing_file_name,expected_file_name", + [ + ("abc.txt", "abc.txt"), + ("my file.pdf", "my_file.pdf"), + ("a.b.c.wav", "a.b.c.wav"), + ("my-valid~file_is.good.txt", "my-valid~file_is.good.txt"), + ("!@#$%^&*()'\",/;'myfile.txt", "myfile.txt"), + ("0123456789 |[]", "0123456789_"), + ("many spaces.txt", "many_spaces.txt"), + ("other\t\twhitespace\n\nremoved.txt", "other_whitespace_removed.txt"), + ], +) +def test_adjust_legacy_file_name(existing_file_name, expected_file_name): + assert attachment_util.adjust_legacy_file_name(existing_file_name) == expected_file_name diff --git a/api/tests/src/util/test_file_util.py b/api/tests/src/util/test_file_util.py index c2a18ca4d..06cddbf0c 100644 --- 
a/api/tests/src/util/test_file_util.py +++ b/api/tests/src/util/test_file_util.py @@ -5,6 +5,7 @@ from smart_open import open as smart_open import src.util.file_util as file_util +import tests.src.db.models.factories as f def create_file(root_path, file_path): @@ -62,7 +63,7 @@ def test_get_s3_bucket(path, bucket): ("s3://bucket/folder/test.txt", "folder/test.txt"), ("s3://bucket_x/file.csv", "file.csv"), ("s3://bucket-y/folder/path/to/abc.zip", "folder/path/to/abc.zip"), - ("./folder/path", "/folder/path"), + ("./folder/path", "./folder/path"), ("sftp://folder/filename", "filename"), ], ) @@ -110,3 +111,103 @@ def test_get_file_length_bytes_s3_with_content(mock_s3_bucket): # Verify size matches content length assert size == len(test_content) + + +def test_file_exists_local_filesystem(tmp_path): + file_path1 = tmp_path / "test.txt" + file_path2 = tmp_path / "test2.txt" + file_path3 = tmp_path / "test3.txt" + + with file_util.open_stream(file_path1, "w") as outfile: + outfile.write("hello") + with file_util.open_stream(file_path2, "w") as outfile: + outfile.write("hello") + with file_util.open_stream(file_path3, "w") as outfile: + outfile.write("hello") + + assert file_util.file_exists(file_path1) is True + assert file_util.file_exists(file_path2) is True + assert file_util.file_exists(file_path3) is True + assert file_util.file_exists(tmp_path / "test4.txt") is False + assert file_util.file_exists(tmp_path / "test5.txt") is False + + +def test_file_exists_s3(mock_s3_bucket): + file_path1 = f"s3://{mock_s3_bucket}/test.txt" + file_path2 = f"s3://{mock_s3_bucket}/test2.txt" + file_path3 = f"s3://{mock_s3_bucket}/test3.txt" + + with file_util.open_stream(file_path1, "w") as outfile: + outfile.write("hello") + with file_util.open_stream(file_path2, "w") as outfile: + outfile.write("hello") + with file_util.open_stream(file_path3, "w") as outfile: + outfile.write("hello") + + assert file_util.file_exists(file_path1) is True + assert file_util.file_exists(file_path2) is True + assert file_util.file_exists(file_path3) is True + assert file_util.file_exists(f"s3://{mock_s3_bucket}/test4.txt") is False + assert file_util.file_exists(f"s3://{mock_s3_bucket}/test5.txt") is False + + +def test_copy_file_s3(mock_s3_bucket, other_mock_s3_bucket): + file_path = f"s3://{mock_s3_bucket}/my_file.txt" + + with file_util.open_stream(file_path, "w") as outfile: + outfile.write(f.fake.sentence(25)) + + other_file_path = f"s3://{other_mock_s3_bucket}/my_new_file.txt" + file_util.copy_file(file_path, other_file_path) + + assert file_util.file_exists(file_path) is True + assert file_util.file_exists(other_file_path) is True + + assert file_util.read_file(file_path) == file_util.read_file(other_file_path) + + +def test_copy_file_local_disk(tmp_path): + file_path = tmp_path / "my_file.txt" + + with file_util.open_stream(file_path, "w") as outfile: + outfile.write(f.fake.sentence(25)) + + other_file_path = tmp_path / "my_file2.txt" + file_util.copy_file(file_path, other_file_path) + + assert file_util.file_exists(file_path) is True + assert file_util.file_exists(other_file_path) is True + + assert file_util.read_file(file_path) == file_util.read_file(other_file_path) + + +def test_move_file_s3(mock_s3_bucket, other_mock_s3_bucket): + file_path = f"s3://{mock_s3_bucket}/my_file_to_copy.txt" + + contents = f.fake.sentence(25) + with file_util.open_stream(file_path, "w") as outfile: + outfile.write(contents) + + other_file_path = f"s3://{other_mock_s3_bucket}/my_destination_file.txt" + file_util.move_file(file_path, 
other_file_path) + + assert file_util.file_exists(file_path) is False + assert file_util.file_exists(other_file_path) is True + + assert file_util.read_file(other_file_path) == contents + + +def test_move_file_local_disk(tmp_path): + file_path = tmp_path / "my_file_to_move.txt" + + contents = f.fake.sentence(25) + with file_util.open_stream(file_path, "w") as outfile: + outfile.write(contents) + + other_file_path = tmp_path / "my_moved_file.txt" + file_util.move_file(file_path, other_file_path) + + assert file_util.file_exists(file_path) is False + assert file_util.file_exists(other_file_path) is True + + assert file_util.read_file(other_file_path) == contents
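
Usage sketch (illustrative only, not part of the change): how the helpers added above are expected to compose when a draft opportunity is published and its attachments move from the draft bucket to the public one. The bucket values below are the api/local.env values from this PR, and `attachment` / `opportunity` are stand-ins for the OpportunityAttachment and Opportunity ORM objects; only functions introduced in this change are used.

# Hedged sketch - mirrors the publish path in TransformOpportunity.process_opportunity,
# using only helpers added in this PR. Bucket values are assumptions taken from api/local.env.
from src.adapters.aws import S3Config
from src.services.opportunity_attachments import attachment_util
from src.util import file_util

s3_config = S3Config(
    PUBLIC_FILES_BUCKET="s3://local-mock-public-bucket",
    DRAFT_FILES_BUCKET="s3://local-mock-draft-bucket",
)


def publish_attachment(attachment, opportunity) -> None:
    """Move one attachment from the draft bucket to the public bucket.

    `attachment` and `opportunity` stand in for the ORM objects; only the
    attributes used below (file_name, attachment_id, file_location, is_draft,
    opportunity_id) are assumed.
    """
    # Strip characters that are awkward in URLs / s3 keys (whitespace becomes "_")
    file_name = attachment_util.adjust_legacy_file_name(attachment.file_name)

    # With opportunity.is_draft now False, this resolves under the public bucket:
    # s3://local-mock-public-bucket/opportunities/<opportunity_id>/attachments/<attachment_id>/<file_name>
    new_location = attachment_util.get_s3_attachment_path(
        file_name, attachment.attachment_id, opportunity, s3_config
    )

    # move_file copies the object to the new key, then deletes the old one
    file_util.move_file(attachment.file_location, new_location)
    attachment.file_location = new_location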