From ff3370835c48459038c4f48f69db86df85a1a599 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Mon, 6 Nov 2023 17:51:47 -0800 Subject: [PATCH 01/42] Add RikoltiStorage utility class --- utils/rikolti_storage.py | 138 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 utils/rikolti_storage.py diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py new file mode 100644 index 000000000..0f84be0fe --- /dev/null +++ b/utils/rikolti_storage.py @@ -0,0 +1,138 @@ +import os +import boto3 +from urllib.parse import urlparse +from typing import Optional + +class RikoltiStorage(): + def __init__(self, data_url: str): + self.data_url = data_url + data_loc = urlparse(data_url) + self.data_store = data_loc.scheme + self.data_bucket = data_loc.netloc + self.data_path = data_loc.path + + self.s3 = boto3.client('s3') + + def list_pages(self) -> list: + if self.data_store == 's3': + return self.list_s3_pages() + elif self.data_store == 'file': + return self.list_file_pages() + else: + raise Exception(f"Unknown data store: {self.data_store}") + + def list_s3_pages(self) -> list: + """ + List all objects in s3_bucket with prefix s3_prefix + """ + keys = self.s3.list_objects_v2( + Bucket=self.data_bucket, + Prefix=self.data_path + ) + return keys + + def list_file_pages(self) -> list: + """ + List all files in file_path + """ + file_objects = [] + for root, dirs, files in os.walk(self.data_path): + for file in files: + file_objects.append(os.path.join(root, file)) + return file_objects + + def search_page(self, search_str: str, page: str) -> bool: + if self.data_store == 's3': + return self.search_s3_contents(search_str, page) + elif self.data_store == 'file': + return self.search_file_contents(search_str, page) + else: + raise Exception(f"Unknown data store: {self.data_store}") + + def search_s3_page(self, search_str: str, s3_key: str) -> bool: + """ + Check if search_str is in the body of the object located at s3_key + Returns the s3_key of the object if so, otherwise returns None + """ + obj = self.s3.get_object(Bucket=self.data_bucket, Key=s3_key) + body = obj['Body'].read().decode('utf-8') + if search_str in body: + return True + else: + return False + + def search_file_page(self, search_str: str, file_path: str) -> bool: + """ + Check if search_str is in the body of the file located at file_path + """ + with open(file_path, 'r') as f: + body = f.read() + if search_str in body: + return True + else: + return False + + def get_page_content(self, page: str): + if self.data_store == 's3': + return self.get_s3_contents(page) + elif self.data_store == 'file': + return self.get_file_contents(page) + else: + raise Exception(f"Unknown data store: {self.data_store}") + + def get_s3_contents(self, s3_key: str): + """ + Get the body of the object located at s3_key + """ + obj = self.s3.get_object(Bucket=self.data_bucket, Key=s3_key) + return obj['Body'].read().decode('utf-8') + + def get_file_contents(self, file_path: str): + """ + Get the body of the file located at file_path + """ + with open(file_path, 'r') as f: + return f.read() + + def put_page_content(self, content:str, relative_path: Optional[str]=None): + """ + Write content to a file at relative_path (relative to data_path). + relative_path is a list of strings, each string is a directory name + representing a directory tree. 
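A minimal usage sketch of the RikoltiStorage class introduced above; the local path, bucket name, and collection id are illustrative placeholders, not values taken from this patch:

from rikolti.utils.rikolti_storage import RikoltiStorage

page_json = '{"records": []}'  # placeholder page content

# write a fetched page under a local data directory...
RikoltiStorage("file:///usr/local/rikolti_data").put_page_content(
    page_json, relative_path="/466/vernacular_metadata/0")

# ...or to an s3 bucket, where the appended path becomes part of the object key
RikoltiStorage("s3://rikolti-data").put_page_content(
    page_json, relative_path="/466/vernacular_metadata/0")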
+ handle s3 or file storage, use '/' as separator for s3 key and os.sep + as separtors for file storage + """ + path = self.data_path + if relative_path: + path += relative_path + + if self.data_store == 's3': + return self.put_s3_content(path, content) + elif self.data_store == 'file': + return self.put_file_content(path, content) + else: + raise Exception(f"Unknown data store: {self.data_store}") + + def put_file_content(self, file_path, content): + """ + Write content to a file at file_path + """ + file_path = os.sep.join(file_path.split('/')) + directory_path = os.path.dirname(file_path) + if not os.path.exists(directory_path): + os.makedirs(directory_path) + + with open(file_path, 'w') as f: + f.write(content) + + def put_s3_content(self, s3_key, content): + """ + Write content to an object named s3_key + """ + self.s3.put_object( + ACL='bucket-owner-full-control', + Bucket=self.data_bucket, + Key=s3_key, + Body=content + ) + From 09c1c29ee2eb40070a6db5814dad4bd99f39f4dc Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Mon, 6 Nov 2023 17:52:25 -0800 Subject: [PATCH 02/42] Factor out storage considerations from metadata fetcher --- metadata_fetcher/fetchers/Fetcher.py | 62 ++++--------------- metadata_fetcher/fetchers/nuxeo_fetcher.py | 5 -- metadata_fetcher/fetchers/ucd_json_fetcher.py | 14 +++-- metadata_fetcher/settings.py | 5 -- 4 files changed, 21 insertions(+), 65 deletions(-) diff --git a/metadata_fetcher/fetchers/Fetcher.py b/metadata_fetcher/fetchers/Fetcher.py index 43c16c41c..0d27e78d3 100644 --- a/metadata_fetcher/fetchers/Fetcher.py +++ b/metadata_fetcher/fetchers/Fetcher.py @@ -1,12 +1,9 @@ import logging -import os -import sys - -import boto3 import requests from .. import settings from requests.adapters import HTTPAdapter, Retry +from rikolti.utils.rikolti_storage import RikoltiStorage logger = logging.getLogger(__name__) @@ -29,53 +26,11 @@ def __init__(self, params): self.harvest_type = params.get('harvest_type') self.collection_id = params.get('collection_id') self.write_page = params.get('write_page', 0) - bucket = settings.DATA_DEST["BUCKET"] + self.data_destination = RikoltiStorage(settings.DATA_DEST_URL) - self.s3_data = { - "ACL": 'bucket-owner-full-control', - "Bucket": bucket, - "Key": f"{self.collection_id}/vernacular_metadata/" - } if not self.collection_id: raise CollectionIdRequired("collection_id is required") - def fetchtolocal(self, page): - path = self.get_local_path() - - filename = os.path.join(path, f"{self.write_page}") - f = open(filename, "w+") - - f.write(page) - - def get_local_path(self): - local_path = os.sep.join([ - settings.DATA_DEST["PATH"], - str(self.collection_id), - 'vernacular_metadata', - ]) - if not os.path.exists(local_path): - os.makedirs(local_path) - - return local_path - - def fetchtos3(self, page): - s3_client = boto3.client('s3') - s3_key = self.s3_data['Key'] - - try: - # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.put_object - s3_client.put_object( - ACL=self.s3_data['ACL'], - Bucket=self.s3_data['Bucket'], - Key=( - f"{s3_key}" - f"{self.write_page}" - ), - Body=page) - except Exception as e: - print(f"Metadata Fetcher: {e}", file=sys.stderr) - raise(e) - def fetch_page(self): page = self.build_fetch_request() logger.debug( @@ -92,10 +47,15 @@ def fetch_page(self): record_count = self.check_page(response) if record_count: content = self.aggregate_vernacular_content(response.text) - if settings.DATA_DEST["STORE"] != 's3': - self.fetchtolocal(content) - else: - 
self.fetchtos3(content) + try: + self.data_destination.put_page_content( + content, relative_path=( + f"{self.collection_id}/vernacular_metadata/{self.write_page}" + ) + ) + except Exception as e: + print(f"Metadata Fetcher: {e}") + raise(e) self.increment(response) diff --git a/metadata_fetcher/fetchers/nuxeo_fetcher.py b/metadata_fetcher/fetchers/nuxeo_fetcher.py index b1a301971..754c88390 100644 --- a/metadata_fetcher/fetchers/nuxeo_fetcher.py +++ b/metadata_fetcher/fetchers/nuxeo_fetcher.py @@ -79,11 +79,6 @@ def __init__(self, params): } if self.nuxeo['query_type'] == 'children': - if settings.DATA_DEST != 's3': - path = self.get_local_path() - children_path = os.path.join(path, "children") - if not os.path.exists(children_path): - os.mkdir(children_path) self.write_page = ( "children/" f"{self.nuxeo['current_path']['uid']}-" diff --git a/metadata_fetcher/fetchers/ucd_json_fetcher.py b/metadata_fetcher/fetchers/ucd_json_fetcher.py index a8b99d6c8..7b646b192 100644 --- a/metadata_fetcher/fetchers/ucd_json_fetcher.py +++ b/metadata_fetcher/fetchers/ucd_json_fetcher.py @@ -1,4 +1,5 @@ import json +import sys from .Fetcher import Fetcher, FetchError import requests from xml.etree import ElementTree @@ -63,10 +64,15 @@ def fetch_all_pages(self, response: requests.Response) -> int: records = list(filter(None, [self.fetch_json_ld(url.text) for url in urls])) content = json.dumps(records) - if settings.DATA_DEST.get("STORE") == "file": - self.fetchtolocal(content) - else: - self.fetchtos3(content) + try: + self.data_destination.put_page_content( + content, relative_path=( + f"{self.collection_id}/vernacular_metadata/{self.write_page}" + ) + ) + except Exception as e: + print(f"Metadata Fetcher: {e}", file=sys.stderr) + raise(e) self.write_page += 1 return len(loc_nodes) diff --git a/metadata_fetcher/settings.py b/metadata_fetcher/settings.py index 7c6c2f729..e18110918 100644 --- a/metadata_fetcher/settings.py +++ b/metadata_fetcher/settings.py @@ -13,11 +13,6 @@ FLICKR_API_KEY = os.environ.get('FLICKR_API_KEY') DATA_DEST_URL = os.environ.get("FETCHER_DATA_DEST", "file:///tmp") -DATA_DEST = { - "STORE": urlparse(DATA_DEST_URL).scheme, - "BUCKET": urlparse(DATA_DEST_URL).netloc, - "PATH": urlparse(DATA_DEST_URL).path -} for key, value in os.environ.items(): logger.debug(f"{key}={value}") From 21576bc5e850094a0f08a0288d8279960daca5b0 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 12:05:48 -0800 Subject: [PATCH 03/42] Factor out metadata mapper source data storage considerations --- metadata_mapper/lambda_function.py | 5 +++- metadata_mapper/lambda_shepherd.py | 40 +++++++++--------------------- metadata_mapper/mappers/mapper.py | 31 ----------------------- utils/rikolti_storage.py | 28 +++++++++++---------- 4 files changed, 31 insertions(+), 73 deletions(-) diff --git a/metadata_mapper/lambda_function.py b/metadata_mapper/lambda_function.py index 0b1676c84..2c2763480 100644 --- a/metadata_mapper/lambda_function.py +++ b/metadata_mapper/lambda_function.py @@ -7,6 +7,7 @@ from . 
import settings from .mappers.mapper import Record, UCLDCWriter, Vernacular +from rikolti.utils.rikolti_storage import RikoltiStorage logger = logging.getLogger(__name__) @@ -78,7 +79,9 @@ def map_page(collection_id: int, page_filename: str, collection: Union[dict, str vernacular_reader = import_vernacular_reader( collection.get('rikolti_mapper_type')) source_vernacular = vernacular_reader(collection_id, page_filename) - api_resp = source_vernacular.get_api_response() + storage = RikoltiStorage(f"{settings.DATA_SRC_URL}/{collection_id}/vernacular_metadata/{page_filename}") + api_resp = storage.get_page_content() + source_metadata_records = source_vernacular.parse(api_resp) source_metadata_records = run_enrichments( diff --git a/metadata_mapper/lambda_shepherd.py b/metadata_mapper/lambda_shepherd.py index 19d09b369..a371c2a1c 100644 --- a/metadata_mapper/lambda_shepherd.py +++ b/metadata_mapper/lambda_shepherd.py @@ -1,8 +1,6 @@ import json -import os import sys -import boto3 import requests from urllib.parse import urlparse @@ -10,6 +8,7 @@ from . import settings, validate_mapping from .lambda_function import map_page from .mappers.mapper import Record +from rikolti.utils.rikolti_storage import RikoltiStorage def get_collection(collection_id): @@ -39,34 +38,19 @@ def check_for_missing_enrichments(collection): def get_vernacular_pages(collection_id): - page_list = [] + rikolti_data = RikoltiStorage( + f"{settings.DATA_SRC_URL}/{collection_id}/vernacular_metadata") - if settings.DATA_SRC["STORE"] == 'file': - vernacular_path = settings.local_path( - collection_id, 'vernacular_metadata') - try: - page_list = [f for f in os.listdir(vernacular_path) - if os.path.isfile(os.path.join(vernacular_path, f))] - children_path = os.path.join(vernacular_path, 'children') - if os.path.exists(children_path): - page_list += [os.path.join('children', f) - for f in os.listdir(children_path) - if os.path.isfile(os.path.join(children_path, f))] - except FileNotFoundError as e: - print( - f"{e} - have you fetched {collection_id}? " - f"looked in dir {e.filename}" - ) - raise(e) - elif settings.DATA_SRC["STORE"] == 's3': - s3_client = boto3.client('s3') - resp = s3_client.list_objects_v2( - Bucket=settings.DATA_SRC["BUCKET"], - Prefix=f"{collection_id}/vernacular_metadata" + try: + page_list = rikolti_data.list_pages() + except FileNotFoundError as e: + print( + f"{e} - have you fetched {collection_id}? 
" + f"looked in dir {e.filename} for vernacular pages" ) - # TODO: check resp['IsTruncated'] and use ContinuationToken if needed - page_list = [page['Key'] for page in resp['Contents']] - # TODO: split page_list into pages and children + raise(e) + + # TODO: split page_list into pages and children return page_list diff --git a/metadata_mapper/mappers/mapper.py b/metadata_mapper/mappers/mapper.py index 116b232ac..1bea9af1d 100644 --- a/metadata_mapper/mappers/mapper.py +++ b/metadata_mapper/mappers/mapper.py @@ -19,7 +19,6 @@ from .iso639_1 import iso_639_1 from .iso639_3 import iso_639_3, language_regexes, wb_language_regexes - class UCLDCWriter(object): def __init__(self, collection_id: int, page_filename: str): self.collection_id = collection_id @@ -56,36 +55,6 @@ def __init__(self, collection_id: int, page_filename: str) -> None: self.collection_id = collection_id self.page_filename = page_filename - def get_api_response(self) -> dict: - if settings.DATA_SRC["STORE"] == 'file': - return self.get_local_api_response() - else: - return self.get_s3_api_response() - - def get_local_api_response(self) -> str: - local_path = settings.local_path( - self.collection_id, 'vernacular_metadata') - page_path = os.sep.join([local_path, str(self.page_filename)]) - page = open(page_path, "r") - api_response = page.read() - return api_response - - def get_s3_api_response(self) -> str: - s3_client = boto3.client('s3') - if not self.page_filename.startswith( - f'{self.collection_id}/vernacular_metadata'): - self.page_filename = ( - f"{self.collection_id}/vernacular_metadata/" - f"{self.page_filename}" - ) - - page = s3_client.get_object( - Bucket=settings.DATA_SRC["BUCKET"], - Key=self.page_filename - ) - api_response = page['Body'].read() - return api_response - def get_records(self, records): return [ self.record_cls(self.collection_id, record) diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py index 0f84be0fe..4964b363a 100644 --- a/utils/rikolti_storage.py +++ b/utils/rikolti_storage.py @@ -25,10 +25,12 @@ def list_s3_pages(self) -> list: """ List all objects in s3_bucket with prefix s3_prefix """ - keys = self.s3.list_objects_v2( + s3_objects = self.s3.list_objects_v2( Bucket=self.data_bucket, Prefix=self.data_path ) + # TODO: check resp['IsTruncated'] and use ContinuationToken if needed + keys = [obj['Key'] for obj in s3_objects['Contents']] return keys def list_file_pages(self) -> list: @@ -43,12 +45,12 @@ def list_file_pages(self) -> list: def search_page(self, search_str: str, page: str) -> bool: if self.data_store == 's3': - return self.search_s3_contents(search_str, page) + return self.search_s3_page(search_str, page) elif self.data_store == 'file': - return self.search_file_contents(search_str, page) + return self.search_file_page(search_str, page) else: raise Exception(f"Unknown data store: {self.data_store}") - + def search_s3_page(self, search_str: str, s3_key: str) -> bool: """ Check if search_str is in the body of the object located at s3_key @@ -60,7 +62,7 @@ def search_s3_page(self, search_str: str, s3_key: str) -> bool: return True else: return False - + def search_file_page(self, search_str: str, file_path: str) -> bool: """ Check if search_str is in the body of the file located at file_path @@ -72,26 +74,26 @@ def search_file_page(self, search_str: str, file_path: str) -> bool: else: return False - def get_page_content(self, page: str): + def get_page_content(self): if self.data_store == 's3': - return self.get_s3_contents(page) + return self.get_s3_contents() elif 
self.data_store == 'file': - return self.get_file_contents(page) + return self.get_file_contents() else: raise Exception(f"Unknown data store: {self.data_store}") - - def get_s3_contents(self, s3_key: str): + + def get_s3_contents(self): """ Get the body of the object located at s3_key """ - obj = self.s3.get_object(Bucket=self.data_bucket, Key=s3_key) + obj = self.s3.get_object(Bucket=self.data_bucket, Key=self.data_path) return obj['Body'].read().decode('utf-8') - def get_file_contents(self, file_path: str): + def get_file_contents(self): """ Get the body of the file located at file_path """ - with open(file_path, 'r') as f: + with open(self.data_path, 'r') as f: return f.read() def put_page_content(self, content:str, relative_path: Optional[str]=None): From d36ba39e8da926e6e580244ce9eaa1ce2478b57b Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 12:12:50 -0800 Subject: [PATCH 04/42] factor out metadata_mapper.settings.local_path --- metadata_mapper/mappers/mapper.py | 16 ++++++++++------ metadata_mapper/mappers/oai/oai_mapper.py | 10 +++++++--- metadata_mapper/settings.py | 7 ------- metadata_mapper/utilities.py | 17 ++++++++++++++--- .../validate_registry_collections.py | 7 ++++++- 5 files changed, 37 insertions(+), 20 deletions(-) diff --git a/metadata_mapper/mappers/mapper.py b/metadata_mapper/mappers/mapper.py index 1bea9af1d..ee4abdbb9 100644 --- a/metadata_mapper/mappers/mapper.py +++ b/metadata_mapper/mappers/mapper.py @@ -25,13 +25,17 @@ def __init__(self, collection_id: int, page_filename: str): self.page_filename = page_filename def write_local_mapped_metadata(self, mapped_metadata): - local_path = settings.local_path( - self.collection_id, 'mapped_metadata') - if not os.path.exists(local_path): - os.makedirs(local_path) - page_path = os.sep.join([local_path, str(self.page_filename)]) + mapped_data_path = os.sep.join([ + settings.DATA_SRC["PATH"], + str(self.collection_id), + 'mapped_metadata', + ]) + + if not os.path.exists(mapped_data_path): + os.makedirs(mapped_data_path) + page_path = os.sep.join([mapped_data_path, str(self.page_filename)]) if 'children' in page_path: - local_children_path = os.path.join(local_path, 'children') + local_children_path = os.path.join(mapped_data_path, 'children') if not os.path.exists(local_children_path): os.makedirs(local_children_path) page = open(page_path, "w+") diff --git a/metadata_mapper/mappers/oai/oai_mapper.py b/metadata_mapper/mappers/oai/oai_mapper.py index 0472a2167..5c9f0d152 100644 --- a/metadata_mapper/mappers/oai/oai_mapper.py +++ b/metadata_mapper/mappers/oai/oai_mapper.py @@ -127,9 +127,13 @@ def strip_metadata(self, record_metadata): # lxml parser requires bytes input or XML fragments without declaration, # so use 'rb' mode def get_local_api_response(self): - local_path = settings.local_path( - self.collection_id, 'vernacular_metadata') - page_path = os.sep.join([local_path, str(self.page_filename)]) + vernacular_path = os.sep.join([ + settings.DATA_SRC["PATH"], + str(self.collection_id), + 'vernacular_metadata', + ]) + + page_path = os.sep.join([vernacular_path, str(self.page_filename)]) page = open(page_path, "rb") api_response = page.read() return api_response diff --git a/metadata_mapper/settings.py b/metadata_mapper/settings.py index adca3d6ab..ec5a989d8 100644 --- a/metadata_mapper/settings.py +++ b/metadata_mapper/settings.py @@ -26,10 +26,3 @@ SOLR_API_KEY = os.environ.get('UCLDC_SOLR_API_KEY', False) COUCH_URL = os.environ.get('UCLDC_COUCH_URL', False) -def local_path(collection_id, folder): - 
local_path = os.sep.join([ - DATA_SRC["PATH"], - str(collection_id), - folder, - ]) - return local_path diff --git a/metadata_mapper/utilities.py b/metadata_mapper/utilities.py index 03dd4e3fa..47f29a6e6 100644 --- a/metadata_mapper/utilities.py +++ b/metadata_mapper/utilities.py @@ -52,7 +52,12 @@ def get_files(collection_id: int, directory: str) -> list[str]: Gets a list of filenames in a given directory. """ if settings.DATA_SRC["STORE"] == "file": - path = settings.local_path(collection_id, directory) + path = os.sep.join([ + settings.DATA_SRC["PATH"], + str(collection_id), + directory, + ]) + try: return [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))] @@ -103,7 +108,9 @@ def read_from_bucket(collection_id: int, directory: str, """ if settings.DATA_SRC["STORE"] == 'file': page_path = os.sep.join([ - settings.local_path(collection_id, directory), + settings.DATA_SRC["PATH"], + str(collection_id), + directory, str(file_name) ]) try: @@ -173,7 +180,11 @@ def write_to_bucket(collection_id: int, directory: str, content = json.dumps(content) if settings.DATA_SRC["STORE"] == 'file': - dir_path = settings.local_path(collection_id, directory) + dir_path = os.sep.join([ + settings.DATA_SRC["PATH"], + str(collection_id), + directory, + ]) if not os.path.exists(dir_path): os.makedirs(dir_path) page_path = os.sep.join([dir_path, str(file_name)]) diff --git a/metadata_mapper/validate_registry_collections.py b/metadata_mapper/validate_registry_collections.py index 1242f8361..df5a7f6d1 100644 --- a/metadata_mapper/validate_registry_collections.py +++ b/metadata_mapper/validate_registry_collections.py @@ -59,7 +59,12 @@ def validate_endpoint(url): continue results.append(collection_validation) - validation_path = settings.local_path(collection_id, 'validation') + validation_path = os.sep.join([ + settings.DATA_SRC["PATH"], + str(collection_id), + 'validation', + ]) + if not os.path.exists(validation_path): os.makedirs(validation_path) page_path = os.sep.join([ From ac36c0ad3970f4643804bbbc45ec062d3ceeed1b Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 16:38:57 -0800 Subject: [PATCH 05/42] Add recursive and relative flags to list_pages --- metadata_mapper/lambda_function.py | 2 +- metadata_mapper/lambda_shepherd.py | 2 +- utils/rikolti_storage.py | 45 ++++++++++++++++++++++++------ 3 files changed, 38 insertions(+), 11 deletions(-) diff --git a/metadata_mapper/lambda_function.py b/metadata_mapper/lambda_function.py index 2c2763480..2812d7905 100644 --- a/metadata_mapper/lambda_function.py +++ b/metadata_mapper/lambda_function.py @@ -78,10 +78,10 @@ def map_page(collection_id: int, page_filename: str, collection: Union[dict, str vernacular_reader = import_vernacular_reader( collection.get('rikolti_mapper_type')) - source_vernacular = vernacular_reader(collection_id, page_filename) storage = RikoltiStorage(f"{settings.DATA_SRC_URL}/{collection_id}/vernacular_metadata/{page_filename}") api_resp = storage.get_page_content() + source_vernacular = vernacular_reader(collection_id, page_filename) source_metadata_records = source_vernacular.parse(api_resp) source_metadata_records = run_enrichments( diff --git a/metadata_mapper/lambda_shepherd.py b/metadata_mapper/lambda_shepherd.py index a371c2a1c..91cd5999d 100644 --- a/metadata_mapper/lambda_shepherd.py +++ b/metadata_mapper/lambda_shepherd.py @@ -42,7 +42,7 @@ def get_vernacular_pages(collection_id): f"{settings.DATA_SRC_URL}/{collection_id}/vernacular_metadata") try: - page_list = rikolti_data.list_pages() + 
page_list = rikolti_data.list_pages(relative=True) except FileNotFoundError as e: print( f"{e} - have you fetched {collection_id}? " diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py index 4964b363a..0c924e796 100644 --- a/utils/rikolti_storage.py +++ b/utils/rikolti_storage.py @@ -1,5 +1,8 @@ import os +import re + import boto3 + from urllib.parse import urlparse from typing import Optional @@ -13,15 +16,15 @@ def __init__(self, data_url: str): self.s3 = boto3.client('s3') - def list_pages(self) -> list: + def list_pages(self, recursive=True, relative=True) -> list: if self.data_store == 's3': - return self.list_s3_pages() + return self.list_s3_pages(recursive=recursive, relative=relative) elif self.data_store == 'file': - return self.list_file_pages() + return self.list_file_pages(recursive=recursive, relative=relative) else: raise Exception(f"Unknown data store: {self.data_store}") - def list_s3_pages(self) -> list: + def list_s3_pages(self, recursive=True, relative=True) -> list: """ List all objects in s3_bucket with prefix s3_prefix """ @@ -30,17 +33,41 @@ def list_s3_pages(self) -> list: Prefix=self.data_path ) # TODO: check resp['IsTruncated'] and use ContinuationToken if needed - keys = [obj['Key'] for obj in s3_objects['Contents']] + + keys = [f"s3://{self.data_bucket}/{obj['Key']}" for obj in s3_objects['Contents']] + prefix = "s3://{self.data_bucket}/{self.data_path}" + + if not recursive: + # prune deeper branches + leaf_regex = re.escape(prefix) + r"^\/?[\w!'_.*()-]+\/?$" + keys = [key for key in keys if re.match(leaf_regex, key)] + + if relative: + keys = [key[len(prefix):] for key in keys] + return keys - def list_file_pages(self) -> list: + def list_file_pages(self, recursive=True, relative=True) -> list: """ List all files in file_path """ file_objects = [] - for root, dirs, files in os.walk(self.data_path): - for file in files: - file_objects.append(os.path.join(root, file)) + if recursive: + for root, dirs, files in os.walk(self.data_path): + root_uri = "file://{root}/" if root[-1] != '/' else "file://{root}" + for file in files: + file_objects.append(f"{root_uri}{file}") + + if not recursive: + for file in os.listdir(self.data_path): + if os.path.isfile(os.path.join(self.data_path, file)): + root_uri = "file://{self.data_path}/" if self.data_path[-1] != '/' else "file://{self.data_path}" + file_objects.append(f"{root_uri}{file}") + + if relative: + prefix = "file://{self.data_path}/" + file_objects = [file[len(prefix):] for file in file_objects] + return file_objects def search_page(self, search_str: str, page: str) -> bool: From 80e5f0a279452568e25c5f6f131be1ce9c34d450 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 17:54:37 -0800 Subject: [PATCH 06/42] factor out metadata mapper source data from utilities --- metadata_mapper/lambda_function.py | 5 +- metadata_mapper/mappers/mapper.py | 2 +- metadata_mapper/mappers/oai/oai_mapper.py | 17 +--- metadata_mapper/utilities.py | 113 +++------------------- utils/rikolti_storage.py | 38 ++++++-- 5 files changed, 51 insertions(+), 124 deletions(-) diff --git a/metadata_mapper/lambda_function.py b/metadata_mapper/lambda_function.py index 2812d7905..566f282bb 100644 --- a/metadata_mapper/lambda_function.py +++ b/metadata_mapper/lambda_function.py @@ -78,7 +78,10 @@ def map_page(collection_id: int, page_filename: str, collection: Union[dict, str vernacular_reader = import_vernacular_reader( collection.get('rikolti_mapper_type')) - storage = 
RikoltiStorage(f"{settings.DATA_SRC_URL}/{collection_id}/vernacular_metadata/{page_filename}") + storage = RikoltiStorage( + f"{settings.DATA_SRC_URL}/{collection_id}/" + f"vernacular_metadata/{page_filename}" + ) api_resp = storage.get_page_content() source_vernacular = vernacular_reader(collection_id, page_filename) diff --git a/metadata_mapper/mappers/mapper.py b/metadata_mapper/mappers/mapper.py index ee4abdbb9..a40da6dde 100644 --- a/metadata_mapper/mappers/mapper.py +++ b/metadata_mapper/mappers/mapper.py @@ -26,7 +26,7 @@ def __init__(self, collection_id: int, page_filename: str): def write_local_mapped_metadata(self, mapped_metadata): mapped_data_path = os.sep.join([ - settings.DATA_SRC["PATH"], + settings.DATA_DEST["PATH"], str(self.collection_id), 'mapped_metadata', ]) diff --git a/metadata_mapper/mappers/oai/oai_mapper.py b/metadata_mapper/mappers/oai/oai_mapper.py index 5c9f0d152..033f4a5bb 100644 --- a/metadata_mapper/mappers/oai/oai_mapper.py +++ b/metadata_mapper/mappers/oai/oai_mapper.py @@ -1,10 +1,8 @@ -import os from typing import Union from lxml import etree from sickle import models -from ... import settings from ..mapper import Record, Vernacular @@ -83,6 +81,7 @@ def map_is_shown_by(self): class OaiVernacular(Vernacular): def parse(self, api_response): + api_response = bytes(api_response, 'utf-8') namespace = {'oai2': 'http://www.openarchives.org/OAI/2.0/'} page = etree.XML(api_response) @@ -123,17 +122,3 @@ def strip_metadata(self, record_metadata): stripped[key] = value return stripped - - # lxml parser requires bytes input or XML fragments without declaration, - # so use 'rb' mode - def get_local_api_response(self): - vernacular_path = os.sep.join([ - settings.DATA_SRC["PATH"], - str(self.collection_id), - 'vernacular_metadata', - ]) - - page_path = os.sep.join([vernacular_path, str(self.page_filename)]) - page = open(page_path, "rb") - api_response = page.read() - return api_response diff --git a/metadata_mapper/utilities.py b/metadata_mapper/utilities.py index 47f29a6e6..6a2573751 100644 --- a/metadata_mapper/utilities.py +++ b/metadata_mapper/utilities.py @@ -1,11 +1,9 @@ import importlib import json -import os from typing import Callable, Union -import boto3 - from . import settings +from rikolti.utils.rikolti_storage import RikoltiStorage def returns_callable(func: Callable) -> Callable: @@ -51,43 +49,10 @@ def get_files(collection_id: int, directory: str) -> list[str]: """ Gets a list of filenames in a given directory. 
""" - if settings.DATA_SRC["STORE"] == "file": - path = os.sep.join([ - settings.DATA_SRC["PATH"], - str(collection_id), - directory, - ]) - - try: - return [f for f in os.listdir(path) - if os.path.isfile(os.path.join(path, f))] - except Exception as e: - raise Exception( - f"{collection_id:<6}: Error listing files in {path}\n" - f"{collection_id:<6}: {e}" - ) - elif settings.DATA_SRC["STORE"] == "s3": - s3_client = boto3.client('s3') - try: - resp = s3_client.list_objects_v2( - Bucket=settings.DATA_SRC["BUCKET"], - Prefix=f"{collection_id}/{directory}" - ) - # TODO: check resp['IsTruncated'] and use ContinuationToken if needed - return [page['Key'] for page in resp['Contents']] - except Exception as e: - s3_url = ( - f"s3://{settings.DATA_SRC['BUCKET']}/{collection_id}/" - f"{directory}/") - url = ( - f"https://{settings.DATA_SRC['BUCKET']}.s3.us-west-2.amazonaws" - ".com/index.html#{collection_id}/" - ) - raise Exception( - f"{collection_id<6}: Error listing files at {s3_url}\n" - f"{collection_id<6}: Check that {directory} exists at {url}\n" - f"{collection_id<6}: {e}" - ) + rikolti_data = RikoltiStorage( + f"{settings.DATA_SRC_URL}/{collection_id}/{directory}") + rikolti_data.list_pages(recursive=False, relative=True) + def read_from_bucket(collection_id: int, directory: str, file_name: Union[str, int]) -> str: @@ -106,40 +71,10 @@ def read_from_bucket(collection_id: int, directory: str, Returns: str The file contents """ - if settings.DATA_SRC["STORE"] == 'file': - page_path = os.sep.join([ - settings.DATA_SRC["PATH"], - str(collection_id), - directory, - str(file_name) - ]) - try: - with open(page_path, "r") as metadata_file: - return metadata_file.read() - except Exception as e: - raise Exception( - f"{collection_id:<6}: Error reading {page_path}\n" - f"{collection_id:<6}: {e}" - ) - elif settings.DATA_SRC["STORE"] == 's3': - s3_client = boto3.client('s3') - try: - s3_obj_summary = s3_client.get_object( - Bucket=settings.DATA_SRC["BUCKET"], - Key=f"{file_name}" - ) - return s3_obj_summary['Body'].read() - except Exception as e: - s3_url = (f"s3://{settings.DATA_SRC['BUCKET']}/{file_name}") - url = ( - f"https://{settings.DATA_SRC['BUCKET']}.s3.us-west-2.amazonaws" - ".com/index.html#{file_name}/" - ) - raise Exception( - f"{collection_id<6}: Error reading file at {s3_url}\n" - f"{collection_id<6}: Check {url}\n" - f"{collection_id<6}: {e}" - ) + rikolti_data = RikoltiStorage( + f"{settings.DATA_SRC_URL}/{collection_id}/{directory}/{file_name}") + return rikolti_data.get_page_content() + def read_mapped_metadata(collection_id: int, page_id: int) -> list[dict]: """ @@ -174,34 +109,12 @@ def read_vernacular_metadata(collection_id: int, page_id: int) -> list[dict]: def write_to_bucket(collection_id: int, directory: str, - file_name: Union[str, int], content: str, - append: bool = False) -> None: + file_name: Union[str, int], content: str) -> None: if isinstance(content, list) or isinstance(content, dict): content = json.dumps(content) - if settings.DATA_SRC["STORE"] == 'file': - dir_path = os.sep.join([ - settings.DATA_SRC["PATH"], - str(collection_id), - directory, - ]) - if not os.path.exists(dir_path): - os.makedirs(dir_path) - page_path = os.sep.join([dir_path, str(file_name)]) - - with open(page_path, "a" if append else "w") as file: - file.write(content) - file_location = f"file://{page_path}" - elif settings.DATA_SRC["STORE"] == 's3': - s3_client = boto3.client('s3') - key = ( - f"{collection_id}/{directory}/" - f"{file_name}" - ) - s3_client.put_object( - 
Bucket=settings.DATA_DEST["BUCKET"], - Key=key, - Body=content) - file_location = f"s3://{settings.DATA_DEST['BUCKET']}/{key}" + rikolti_data = RikoltiStorage(f"{settings.DATA_SRC_URL}/{collection_id}/{directory}") + rikolti_data.put_page_content(content, str(file_name)) + file_location = f"{settings.DATA_SRC_URL}/{collection_id}/{directory}/{file_name}" return file_location diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py index 0c924e796..722d4ae5f 100644 --- a/utils/rikolti_storage.py +++ b/utils/rikolti_storage.py @@ -18,9 +18,22 @@ def __init__(self, data_url: str): def list_pages(self, recursive=True, relative=True) -> list: if self.data_store == 's3': - return self.list_s3_pages(recursive=recursive, relative=relative) + try: + return self.list_s3_pages(recursive=recursive, relative=relative) + except Exception as e: + url = ( + f"https://{self.data_bucket}.s3.us-west-2.amazonaws" + ".com/index.html#{self.data_path}/" + ) + raise Exception( + f"Error listing files at {self.data_url}\n" + f"Check that {self.data_path} exists at {url}\n{e}" + ) elif self.data_store == 'file': - return self.list_file_pages(recursive=recursive, relative=relative) + try: + return self.list_file_pages(recursive=recursive, relative=relative) + except Exception as e: + raise Exception(f"Error listing files in {path}\n{e}") else: raise Exception(f"Unknown data store: {self.data_store}") @@ -113,15 +126,28 @@ def get_s3_contents(self): """ Get the body of the object located at s3_key """ - obj = self.s3.get_object(Bucket=self.data_bucket, Key=self.data_path) - return obj['Body'].read().decode('utf-8') + try: + obj = self.s3.get_object(Bucket=self.data_bucket, Key=self.data_path) + return obj['Body'].read().decode('utf-8') + except Exception as e: + url = ( + f"https://{self.data_bucket}.s3.us-west-2.amazonaws.com/" + "index.html#{self.data_path}/" + ) + raise Exception( + f"Error reading file at {self.data_url}\nCheck: {url}\n{e}" + ) def get_file_contents(self): """ Get the body of the file located at file_path """ - with open(self.data_path, 'r') as f: - return f.read() + try: + with open(self.data_path, 'r') as f: + return f.read() + except Exception as e: + raise Exception(f"Error reading {self.data_path}\n{e}") + def put_page_content(self, content:str, relative_path: Optional[str]=None): """ From 29ced6a829f4495946c2c9903aae0f440809c033 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 17:55:42 -0800 Subject: [PATCH 07/42] remove validate_registry_collections and tests.py --- metadata_mapper/tests.py | 60 ------------- .../validate_registry_collections.py | 90 ------------------- 2 files changed, 150 deletions(-) delete mode 100644 metadata_mapper/tests.py delete mode 100644 metadata_mapper/validate_registry_collections.py diff --git a/metadata_mapper/tests.py b/metadata_mapper/tests.py deleted file mode 100644 index 9bc75e812..000000000 --- a/metadata_mapper/tests.py +++ /dev/null @@ -1,60 +0,0 @@ -import argparse -import json -import logging -import os - -from . 
import settings -from .lambda_shepherd import map_collection -from .map_registry_collections import map_endpoint -from .sample_data.islandora_harvests import islandora_harvests -from .sample_data.nuxeo_harvests import (nuxeo_complex_object_harvests, - nuxeo_harvests, - nuxeo_nested_complex_object_harvests) -from .sample_data.oac_harvests import oac_harvests -from .validate_mapping import validate_collection -from .validate_registry_collections import validate_endpoint - - -def main(): - vernacular_path = settings.DATA_SRC["PATH"] - urls = [ - f"https://registry.cdlib.org/api/v1/rikoltimapper/{f}/?format=json" - for f in os.listdir(vernacular_path) - ] - for url in urls: - map_endpoint(url) - - for url in urls: - validate_endpoint(url) - - -def test_static_samples(): - harvests = [ - oac_harvests[0], islandora_harvests[0], - nuxeo_harvests[0], nuxeo_complex_object_harvests[0], - nuxeo_nested_complex_object_harvests[0] - ] - - for harvest in harvests: - print(f"tests.py: {json.dumps(harvest)}") - status = map_collection(json.dumps(harvest), {}) - print(f"Map status: {status}") - - for harvest in harvests: - print(f"validate mapping: {json.dumps(harvest)}") - validate_collection(json.dumps(harvest)) - print(f"validated: {str(harvest)}") - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '-log', - '--loglevel', - default='warning', - help='log level (default: warning)' - ) - args = parser.parse_args() - logging.basicConfig(level=args.loglevel.upper()) - logging.info('logging now set up') - main() diff --git a/metadata_mapper/validate_registry_collections.py b/metadata_mapper/validate_registry_collections.py deleted file mode 100644 index df5a7f6d1..000000000 --- a/metadata_mapper/validate_registry_collections.py +++ /dev/null @@ -1,90 +0,0 @@ -import argparse -import json -import logging -import os -import sys -from datetime import datetime - -import requests -import urllib3 - -from . 
import settings -from .validate_mapping import validate_collection - -urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - - -def validate_endpoint(url): - collection_page = url - results = [] - - while collection_page: - try: - response = requests.get(url=collection_page) - response.raise_for_status() - except requests.exceptions.HTTPError as err: - msg = ( - f"[{collection_page}]: " - f"{err}; A valid collection id is required for validation" - ) - print(msg) - collection_page = None - break - - total_collections = response.json().get('meta', {}).get('total_count', 1) - print( - f">>> Validating {total_collections} collections " - f"described at {collection_page}" - ) - - collection_page = response.json().get('meta', {}).get('next') - if collection_page: - collection_page = f"https://registry.cdlib.org{collection_page}" - logging.debug(f"Next page: {collection_page}") - collections = response.json().get('objects', [response.json()]) - for collection in collections: - collection_id = collection['collection_id'] - log_msg = f"[{collection_id}]: " + "{}" - print(log_msg.format( - f"Validating collection {collection_id} - " - f"{collection['solr_count']} items in solr as of " - f"{collection['solr_last_updated']}" - )) - logging.debug(log_msg.format(f"lambda payload: {collection}")) - try: - collection_validation = validate_collection( - json.dumps(collection)) - except FileNotFoundError: - print(f"[{collection_id}]: not fetched yet") - continue - results.append(collection_validation) - - validation_path = os.sep.join([ - settings.DATA_SRC["PATH"], - str(collection_id), - 'validation', - ]) - - if not os.path.exists(validation_path): - os.makedirs(validation_path) - page_path = os.sep.join([ - validation_path, - f"{datetime.now().strftime('%Y%m%d-%H%M%S')}.csv" - ]) - output = open(page_path, "w") - for field_validation in collection_validation: - output.write(field_validation) - output.write('\n') - output.close() - - return results - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Run mapper for registry endpoint") - parser.add_argument('endpoint', help='registry api endpoint') - args = parser.parse_args(sys.argv[1:]) - validation_errors = validate_endpoint(args.endpoint) - # print(validation_errors) - sys.exit(0) From c4a0b5091a2f2496969e0b70c74b93a379d3a435 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 18:06:31 -0800 Subject: [PATCH 08/42] factor out metadata mapper dest data considerations --- metadata_mapper/lambda_function.py | 8 ++------ metadata_mapper/mappers/mapper.py | 32 +++++------------------------- metadata_mapper/settings.py | 13 ------------ 3 files changed, 7 insertions(+), 46 deletions(-) diff --git a/metadata_mapper/lambda_function.py b/metadata_mapper/lambda_function.py index 566f282bb..b9509f8f5 100644 --- a/metadata_mapper/lambda_function.py +++ b/metadata_mapper/lambda_function.py @@ -96,8 +96,7 @@ def map_page(collection_id: int, page_filename: str, collection: Union[dict, str writer = UCLDCWriter(collection_id, page_filename) # TODO: write interim mapped but not enriched metadata to s3? 
- # if settings.DATA_DEST["STORE"] == 'file': - # writer.write_local_mapped_metadata( + # writer.write_mapped_metadata( # [record.to_dict() for record in mapped_records]) mapped_records = run_enrichments( @@ -126,10 +125,7 @@ def map_page(collection_id: int, page_filename: str, collection: Union[dict, str # for record in mapped_records] mapped_metadata = [record.to_dict() for record in mapped_records] - if settings.DATA_DEST["STORE"] == 'file': - writer.write_local_mapped_metadata(mapped_metadata) - else: - writer.write_s3_mapped_metadata(mapped_metadata) + writer.write_mapped_metadata(mapped_metadata) return { 'status': 'success', diff --git a/metadata_mapper/mappers/mapper.py b/metadata_mapper/mappers/mapper.py index a40da6dde..70e0c5d32 100644 --- a/metadata_mapper/mappers/mapper.py +++ b/metadata_mapper/mappers/mapper.py @@ -24,34 +24,12 @@ def __init__(self, collection_id: int, page_filename: str): self.collection_id = collection_id self.page_filename = page_filename - def write_local_mapped_metadata(self, mapped_metadata): - mapped_data_path = os.sep.join([ - settings.DATA_DEST["PATH"], - str(self.collection_id), - 'mapped_metadata', - ]) - - if not os.path.exists(mapped_data_path): - os.makedirs(mapped_data_path) - page_path = os.sep.join([mapped_data_path, str(self.page_filename)]) - if 'children' in page_path: - local_children_path = os.path.join(mapped_data_path, 'children') - if not os.path.exists(local_children_path): - os.makedirs(local_children_path) - page = open(page_path, "w+") - page.write(json.dumps(mapped_metadata)) - - def write_s3_mapped_metadata(self, mapped_metadata): - s3_client = boto3.client('s3') - key = ( - f"{self.collection_id}/mapped_metadata/" - f"{self.page_filename.split('/')[-1]}" + def write_mapped_metadata(self, mapped_metadata): + rikolti_data = RikoltiStorage( + f"{settings.DATA_DEST_URL}/{self.collection_id}/" + f"mapped_metadata/{self.page_filename}" ) - s3_client.put_object( - ACL='bucket-owner-full-control', - Bucket=settings.DATA_DEST["BUCKET"], - Key=key, - Body=json.dumps(mapped_metadata)) + rikolti_data.write_page_content(json.dumps(mapped_metadata)) class Vernacular(ABC, object): diff --git a/metadata_mapper/settings.py b/metadata_mapper/settings.py index ec5a989d8..aaecef5fc 100644 --- a/metadata_mapper/settings.py +++ b/metadata_mapper/settings.py @@ -1,24 +1,11 @@ import os -from urllib.parse import urlparse - from dotenv import load_dotenv load_dotenv() DATA_SRC_URL = os.environ.get('MAPPER_DATA_SRC', 'file:///tmp') -DATA_SRC = { - "STORE": urlparse(DATA_SRC_URL).scheme, - "BUCKET": urlparse(DATA_SRC_URL).netloc, - "PATH": urlparse(DATA_SRC_URL).path -} - DATA_DEST_URL = os.environ.get('MAPPER_DATA_DEST', 'file:///tmp') -DATA_DEST = { - "STORE": urlparse(DATA_DEST_URL).scheme, - "BUCKET": urlparse(DATA_DEST_URL).netloc, - "PATH": urlparse(DATA_DEST_URL).path -} SKIP_UNDEFINED_ENRICHMENTS = os.environ.get('SKIP_UNDEFINED_ENRICHMENTS', False) From 5b4317b374320b8582b04333817efe3d8305e834 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 18:10:25 -0800 Subject: [PATCH 09/42] UCLDCWriter no longer does anything --- metadata_mapper/lambda_function.py | 17 ++++++++++++----- metadata_mapper/mappers/mapper.py | 14 -------------- 2 files changed, 12 insertions(+), 19 deletions(-) diff --git a/metadata_mapper/lambda_function.py b/metadata_mapper/lambda_function.py index b9509f8f5..71da9db65 100644 --- a/metadata_mapper/lambda_function.py +++ b/metadata_mapper/lambda_function.py @@ -6,7 +6,7 @@ from urllib.parse import parse_qs, 
urlparse from . import settings -from .mappers.mapper import Record, UCLDCWriter, Vernacular +from .mappers.mapper import Record, Vernacular from rikolti.utils.rikolti_storage import RikoltiStorage logger = logging.getLogger(__name__) @@ -94,10 +94,13 @@ def map_page(collection_id: int, page_filename: str, collection: Union[dict, str record.to_UCLDC() mapped_records = source_metadata_records - writer = UCLDCWriter(collection_id, page_filename) # TODO: write interim mapped but not enriched metadata to s3? - # writer.write_mapped_metadata( - # [record.to_dict() for record in mapped_records]) + # rikolti_data = RikoltiStorage( + # f"{settings.DATA_DEST_URL}/{collection_id}/" + # f"interim_mapped_metadata/{page_filename}" + # ) + # rikolti_data.put_page_content(json.dumps( + # [record.to_dict() for record in mapped_records])) mapped_records = run_enrichments( mapped_records, collection, 'rikolti__enrichments', page_filename) @@ -125,7 +128,11 @@ def map_page(collection_id: int, page_filename: str, collection: Union[dict, str # for record in mapped_records] mapped_metadata = [record.to_dict() for record in mapped_records] - writer.write_mapped_metadata(mapped_metadata) + rikolti_data = RikoltiStorage( + f"{settings.DATA_DEST_URL}/{collection_id}/" + f"mapped_metadata/{page_filename}" + ) + rikolti_data.put_page_content(json.dumps(mapped_metadata)) return { 'status': 'success', diff --git a/metadata_mapper/mappers/mapper.py b/metadata_mapper/mappers/mapper.py index 70e0c5d32..bb5ae3981 100644 --- a/metadata_mapper/mappers/mapper.py +++ b/metadata_mapper/mappers/mapper.py @@ -8,10 +8,8 @@ from datetime import timezone from typing import Any, Callable -import boto3 from markupsafe import Markup -from .. import settings from ..utilities import returns_callable from ..validator.validation_log import ValidationLog # noqa: F401 from ..validator.validator import Validator @@ -19,18 +17,6 @@ from .iso639_1 import iso_639_1 from .iso639_3 import iso_639_3, language_regexes, wb_language_regexes -class UCLDCWriter(object): - def __init__(self, collection_id: int, page_filename: str): - self.collection_id = collection_id - self.page_filename = page_filename - - def write_mapped_metadata(self, mapped_metadata): - rikolti_data = RikoltiStorage( - f"{settings.DATA_DEST_URL}/{self.collection_id}/" - f"mapped_metadata/{self.page_filename}" - ) - rikolti_data.write_page_content(json.dumps(mapped_metadata)) - class Vernacular(ABC, object): def __init__(self, collection_id: int, page_filename: str) -> None: From da89de336c680284ca22d39d77898824010fc872 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 18:17:28 -0800 Subject: [PATCH 10/42] factor out content_harvester.settings.local_path --- content_harvester/by_collection.py | 6 +++++- content_harvester/by_page.py | 30 +++++++++++++++++++++++++----- content_harvester/settings.py | 8 -------- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/content_harvester/by_collection.py b/content_harvester/by_collection.py index ea420e6a4..a7987be3a 100644 --- a/content_harvester/by_collection.py +++ b/content_harvester/by_collection.py @@ -10,7 +10,11 @@ def get_mapped_pages(collection_id): page_list = [] if settings.DATA_SRC['STORE'] == 'file': - mapped_path = settings.local_path(collection_id, 'mapped_metadata') + mapped_path = os.sep.join([ + settings.DATA_SRC["PATH"], + str(collection_id), + 'mapped_metadata', + ]) try: page_list = [f for f in os.listdir(mapped_path) if os.path.isfile(os.path.join(mapped_path, f))] diff --git 
a/content_harvester/by_page.py b/content_harvester/by_page.py index a3e48f1e4..1bef1378e 100644 --- a/content_harvester/by_page.py +++ b/content_harvester/by_page.py @@ -26,8 +26,13 @@ class UnsupportedMimetype(Exception): def get_mapped_records(collection_id, page_filename, s3_client) -> list: mapped_records = [] if settings.DATA_SRC["STORE"] == 'file': - local_path = settings.local_path(collection_id, 'mapped_metadata') - page_path = os.path.join(local_path, str(page_filename)) + local_mapped_data_path = os.sep.join([ + settings.DATA_SRC["PATH"], + str(collection_id), + 'mapped_metadata', + ]) + + page_path = os.path.join(local_mapped_data_path, str(page_filename)) page = open(page_path, "r") mapped_records = json.loads(page.read()) else: @@ -41,7 +46,12 @@ def get_mapped_records(collection_id, page_filename, s3_client) -> list: def write_mapped_record(collection_id, record, s3_client): if settings.DATA_DEST["STORE"] == 'file': - local_path = settings.local_path(collection_id, 'mapped_with_content') + local_path = os.sep.join([ + settings.DATA_SRC["PATH"], + str(collection_id), + 'mapped_with_content', + ]) + if not os.path.exists(local_path): os.makedirs(local_path) @@ -67,7 +77,12 @@ def write_mapped_record(collection_id, record, s3_client): def write_mapped_page(collection_id, page, records): if settings.DATA_DEST["STORE"] == 'file': - local_path = settings.local_path(collection_id, 'mapped_with_content') + local_path = os.sep.join([ + settings.DATA_SRC["PATH"], + str(collection_id), + 'mapped_with_content', + ]) + if not os.path.exists(local_path): os.makedirs(local_path) page_path = os.path.join(local_path, page) @@ -78,7 +93,12 @@ def write_mapped_page(collection_id, page, records): def get_child_records(collection_id, parent_id, s3_client) -> list: mapped_child_records = [] if settings.DATA_SRC["STORE"] == 'file': - local_path = settings.local_path(collection_id, 'mapped_metadata') + local_path = os.sep.join([ + settings.DATA_SRC["PATH"], + str(collection_id), + 'mapped_metadata', + ]) + children_path = os.path.join(local_path, 'children') if os.path.exists(children_path): diff --git a/content_harvester/settings.py b/content_harvester/settings.py index 0a99cfc9c..6a614ad61 100644 --- a/content_harvester/settings.py +++ b/content_harvester/settings.py @@ -41,11 +41,3 @@ 'ffmpeg': '/usr/bin/ffmpeg', 'ffprobe': '/usr/bin/ffprobe', } - -def local_path(collection_id, folder): - local_path = os.sep.join([ - DATA_SRC["PATH"], - str(collection_id), - folder, - ]) - return local_path From bd758837c74bf0846c75eb0a76bd45dd31419dcd Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 21:21:56 -0800 Subject: [PATCH 11/42] factor out content harvester's data source concerns --- content_harvester/by_collection.py | 37 +++------- content_harvester/by_page.py | 114 +++++++---------------------- content_harvester/settings.py | 5 -- utils/rikolti_storage.py | 5 +- 4 files changed, 38 insertions(+), 123 deletions(-) diff --git a/content_harvester/by_collection.py b/content_harvester/by_collection.py index a7987be3a..31dcc51f6 100644 --- a/content_harvester/by_collection.py +++ b/content_harvester/by_collection.py @@ -1,38 +1,19 @@ import json -import os - -import boto3 from . 
import settings from .by_page import harvest_page_content - +from rikolti.utils.rikolti_storage import RikoltiStorage def get_mapped_pages(collection_id): page_list = [] - if settings.DATA_SRC['STORE'] == 'file': - mapped_path = os.sep.join([ - settings.DATA_SRC["PATH"], - str(collection_id), - 'mapped_metadata', - ]) - try: - page_list = [f for f in os.listdir(mapped_path) - if os.path.isfile(os.path.join(mapped_path, f))] - except FileNotFoundError as e: - print(f"{e} - have you mapped {collection_id}?") - else: - s3_client = boto3.client( - 's3', - aws_access_key_id=settings.AWS_ACCESS_KEY_ID, - aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY, - aws_session_token=settings.AWS_SESSION_TOKEN, - region_name=settings.AWS_REGION - ) - response = s3_client.list_objects_v2( - Bucket=settings.DATA_SRC["BUCKET"], - Prefix=f'{collection_id}/mapped_metadata/' - ) - page_list = [obj['Key'].split('/')[-1] for obj in response['Contents']] + rikolti_data = RikoltiStorage( + f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata", + aws_access_key_id=settings.AWS_ACCESS_KEY_ID, + aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY, + aws_session_token=settings.AWS_SESSION_TOKEN, + region_name=settings.AWS_REGION + ) + page_list = rikolti_data.list_pages(recursive=False, relative=True) return page_list diff --git a/content_harvester/by_page.py b/content_harvester/by_page.py index 1bef1378e..acc064891 100644 --- a/content_harvester/by_page.py +++ b/content_harvester/by_page.py @@ -14,6 +14,7 @@ from . import derivatives from . import settings +from rikolti.utils.rikolti_storage import RikoltiStorage class DownloadError(Exception): pass @@ -23,103 +24,40 @@ class UnsupportedMimetype(Exception): pass -def get_mapped_records(collection_id, page_filename, s3_client) -> list: +def get_mapped_records(collection_id, page_filename) -> list: mapped_records = [] - if settings.DATA_SRC["STORE"] == 'file': - local_mapped_data_path = os.sep.join([ - settings.DATA_SRC["PATH"], - str(collection_id), - 'mapped_metadata', - ]) - - page_path = os.path.join(local_mapped_data_path, str(page_filename)) - page = open(page_path, "r") - mapped_records = json.loads(page.read()) - else: - page = s3_client.get_object( - Bucket=settings.DATA_SRC["BUCKET"], - Key=f"{collection_id}/mapped_metadata/{page_filename}" - ) - mapped_records = json.loads(page['Body'].read()) + rikolti_data = RikoltiStorage( + f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata/{page_filename}") + mapped_records = json.loads(rikolti_data.get_page_content()) return mapped_records -def write_mapped_record(collection_id, record, s3_client): - if settings.DATA_DEST["STORE"] == 'file': - local_path = os.sep.join([ - settings.DATA_SRC["PATH"], - str(collection_id), - 'mapped_with_content', - ]) - - if not os.path.exists(local_path): - os.makedirs(local_path) - - # some ids have slashes - page_path = os.path.join( - local_path, - record.get('calisphere-id').replace(os.sep, '_') - ) - - page = open(page_path, "w") - page.write(json.dumps(record)) - else: - upload_status = s3_client.put_object( - Bucket=settings.DATA_DEST["BUCKET"], - Key=( - f"{collection_id}/mapped_with_content/" - f"{record.get('calisphere-id')}" - ), - Body=json.dumps(record) - ) - print(f"Upload status: {upload_status}") +def write_mapped_record(collection_id, record): + rikolti_data = RikoltiStorage( + f"{settings.DATA_DEST_URL}/{collection_id}/mapped_with_content/" + f"{record.get('calisphere-id').replace(os.sep, '_')}" + ) + rikolti_data.put_page_content(json.dumps(record)) def 
write_mapped_page(collection_id, page, records): - if settings.DATA_DEST["STORE"] == 'file': - local_path = os.sep.join([ - settings.DATA_SRC["PATH"], - str(collection_id), - 'mapped_with_content', - ]) - - if not os.path.exists(local_path): - os.makedirs(local_path) - page_path = os.path.join(local_path, page) - page = open(page_path, "w") - page.write(json.dumps(records)) + rikolti_data = RikoltiStorage( + f"{settings.DATA_DEST_URL}/{collection_id}/mapped_with_content/{page}" + ) + rikolti_data.put_page_content(json.dumps(records)) def get_child_records(collection_id, parent_id, s3_client) -> list: mapped_child_records = [] - if settings.DATA_SRC["STORE"] == 'file': - local_path = os.sep.join([ - settings.DATA_SRC["PATH"], - str(collection_id), - 'mapped_metadata', - ]) - - children_path = os.path.join(local_path, 'children') - - if os.path.exists(children_path): - child_pages = [file for file in os.listdir(children_path) - if file.startswith(parent_id)] - for child_page in child_pages: - child_page_path = os.path.join(children_path, child_page) - page = open(child_page_path, "r") - mapped_child_records.extend(json.loads(page.read())) - else: - child_pages = s3_client.list_objects_v2( - Bucket=settings.DATA_SRC["BUCKET"], - Prefix=f"{collection_id}/mapped_metadata/children/{parent_id}" - ) - for child_page in child_pages['Contents']: - page = s3_client.get_object( - Bucket=settings.DATA_SRC["BUCKET"], - Key=child_page['Key'] - ) - mapped_child_records.extend(json.loads(page['Body'].read())) - + rikolti_data = RikoltiStorage( + f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata/children") + children = rikolti_data.list_pages(recursive=False, relative=False) + if rikolti_data.data_store == 'file': + children = [page for page in children + if os.path.basename(page).startswith(parent_id)] + for child in children: + child_data = RikoltiStorage(child) + mapped_child_records.extend(json.loads(child_data.get_page_content())) return mapped_child_records @@ -423,7 +361,7 @@ def harvest_page_content(collection_id, page_filename, **kwargs): src_auth=auth ) - records = get_mapped_records(collection_id, page_filename, harvester.s3) + records = get_mapped_records(collection_id, page_filename) print( f"[{collection_id}, {page_filename}]: " f"Harvesting content for {len(records)} records" @@ -438,7 +376,7 @@ def harvest_page_content(collection_id, page_filename, **kwargs): try: record_with_content = harvester.harvest(record) # write_mapped_record( - # collection_id, record_with_content, harvester.s3) + # collection_id, record_with_content) if not record_with_content.get('thumbnail'): warn_level = "ERROR" if 'sound' in record.get('type', []): diff --git a/content_harvester/settings.py b/content_harvester/settings.py index 6a614ad61..e54c550d4 100644 --- a/content_harvester/settings.py +++ b/content_harvester/settings.py @@ -7,11 +7,6 @@ load_dotenv() DATA_SRC_URL = os.environ.get('CONTENT_DATA_SRC', 'file:///tmp') -DATA_SRC = { - "STORE": urlparse(DATA_SRC_URL).scheme, - "BUCKET": urlparse(DATA_SRC_URL).netloc, - "PATH": urlparse(DATA_SRC_URL).path -} DATA_DEST_URL = os.environ.get('CONTENT_DATA_DEST', 'file:///tmp') DATA_DEST = { diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py index 722d4ae5f..1b419838f 100644 --- a/utils/rikolti_storage.py +++ b/utils/rikolti_storage.py @@ -7,14 +7,15 @@ from typing import Optional class RikoltiStorage(): - def __init__(self, data_url: str): + def __init__(self, data_url: str, **kwargs): self.data_url = data_url data_loc = urlparse(data_url) 
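A sketch of the new **kwargs pass-through added here, mirroring the call in by_collection.py; the bucket, collection id, and credential values are placeholders. The keyword arguments are only handed to boto3.client('s3', ...) when the data_url scheme is s3:

storage = RikoltiStorage(
    "s3://rikolti-data/3433/mapped_metadata",
    aws_access_key_id="AKIA...",        # placeholder credentials
    aws_secret_access_key="****",
    region_name="us-west-2",
)
mapped_pages = storage.list_pages(recursive=False, relative=True)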
self.data_store = data_loc.scheme self.data_bucket = data_loc.netloc self.data_path = data_loc.path - self.s3 = boto3.client('s3') + if self.data_store == 's3': + self.s3 = boto3.client('s3', **kwargs) def list_pages(self, recursive=True, relative=True) -> list: if self.data_store == 's3': From bb16c3fae44b5b983a3c6427de0868bc6f22490f Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 21:22:41 -0800 Subject: [PATCH 12/42] factor out content harvester's data dest concerns --- content_harvester/settings.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/content_harvester/settings.py b/content_harvester/settings.py index e54c550d4..60b825605 100644 --- a/content_harvester/settings.py +++ b/content_harvester/settings.py @@ -7,13 +7,7 @@ load_dotenv() DATA_SRC_URL = os.environ.get('CONTENT_DATA_SRC', 'file:///tmp') - DATA_DEST_URL = os.environ.get('CONTENT_DATA_DEST', 'file:///tmp') -DATA_DEST = { - "STORE": urlparse(DATA_DEST_URL).scheme, - "BUCKET": urlparse(DATA_DEST_URL).netloc, - "PATH": urlparse(DATA_DEST_URL).path -} CONTENT_DEST_URL = os.environ.get("CONTENT_DEST", 'file:///tmp') CONTENT_DEST = { From 39a03a3cf581ff3cb4c218388cc4f4808a72f217 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 21:23:16 -0800 Subject: [PATCH 13/42] factor out content harvester's content dest --- content_harvester/settings.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/content_harvester/settings.py b/content_harvester/settings.py index 60b825605..f7bebe969 100644 --- a/content_harvester/settings.py +++ b/content_harvester/settings.py @@ -8,13 +8,7 @@ DATA_SRC_URL = os.environ.get('CONTENT_DATA_SRC', 'file:///tmp') DATA_DEST_URL = os.environ.get('CONTENT_DATA_DEST', 'file:///tmp') - CONTENT_DEST_URL = os.environ.get("CONTENT_DEST", 'file:///tmp') -CONTENT_DEST = { - "STORE": urlparse(CONTENT_DEST_URL).scheme, - "BUCKET": urlparse(CONTENT_DEST_URL).netloc, - "PATH": urlparse(CONTENT_DEST_URL).path -} AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', False) AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', False) From 4cd9471ece9ad419d82dbe29b7ad12b01c0f2af7 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 7 Nov 2023 21:33:21 -0800 Subject: [PATCH 14/42] use registry_endpoint generator in content_harvester --- content_harvester/by_registry_endpoint.py | 65 ++++++++++------------- 1 file changed, 28 insertions(+), 37 deletions(-) diff --git a/content_harvester/by_registry_endpoint.py b/content_harvester/by_registry_endpoint.py index b5b73b453..d60ac8761 100644 --- a/content_harvester/by_registry_endpoint.py +++ b/content_harvester/by_registry_endpoint.py @@ -4,50 +4,41 @@ from .by_collection import harvest_collection - -def harvest_endpoint(url): - registry_page = url - results = [] - - while registry_page: - try: - response = requests.get(url=registry_page) - response.raise_for_status() - except requests.exceptions.HTTPError as err: - print( - f"[{registry_page}]: {err}" - ) - registry_page = None - break - - total_collections = response.json().get( - 'meta', {}).get('total_count', 1) - print( - f">>> Harvesting content for {total_collections} collections " - f"described at {registry_page}" - ) +def registry_endpoint(url): + page = url + while page: + response = requests.get(url=page) + response.raise_for_status() + page = response.json().get('meta', {}).get('next', None) + if page: + page = f"https://registry.cdlib.org{page}" collections = response.json().get('objects', [response.json()]) for collection in collections: - print( - f"> 
Harvesting content from collection "
-                f"{collection['collection_id']} - {collection['solr_count']} "
-                f"items in solr as of {collection['solr_last_updated']}"
-            )
+            yield collection
 
-            # TODO: what is return val?
-            collection_stats = harvest_collection(collection)
-            collection_stats.update({'solr_count': collection['solr_count']})
 
+def harvest_endpoint(url, limit=None):
+    response = requests.get(url=url)
+    response.raise_for_status()
+    total = response.json().get('meta', {}).get('total_count', 1)
+    if not limit:
+        limit = total
 
-            results.append(collection_stats)
+    print(
+        f">>> Content harvest for {limit}/{total} collections described at {url}"
+    )
+    results = []
 
-        print(f">>> Harvested {len(results)} collections")
+    for collection in registry_endpoint(url):
+        print(
+            f"{collection['id']:<6}: {collection['solr_count']} items in solr "
+            f"as of {collection['solr_last_updated']}"
+        )
 
-        registry_page = response.json().get('meta', {}).get('next')
-        if registry_page:
-            registry_page = f"https://registry.cdlib.org{registry_page}"
-            print(f">>> Next page: {registry_page}")
+        # TODO: what is return val?
+        collection_stats = harvest_collection(collection)
+        collection_stats.update({'solr_count': collection['solr_count']})
+        results.append(collection_stats)
 
     return results
 
From 3d062686eb4874dc88c31f187326945a7d117e4b Mon Sep 17 00:00:00 2001
From: amy wieliczka
Date: Wed, 8 Nov 2023 12:33:17 -0800
Subject: [PATCH 15/42] fetcher, mapper, content harvester all use RikoltiStorage now

---
 content_harvester/by_page.py | 4 ++--
 metadata_fetcher/fetchers/Fetcher.py | 9 +++++----
 metadata_fetcher/fetchers/ucd_json_fetcher.py | 4 +---
 metadata_mapper/utilities.py | 4 ++--
 4 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/content_harvester/by_page.py b/content_harvester/by_page.py
index acc064891..ae3b9bf87 100644
--- a/content_harvester/by_page.py
+++ b/content_harvester/by_page.py
@@ -47,7 +47,7 @@ def write_mapped_page(collection_id, page, records):
     rikolti_data.put_page_content(json.dumps(records))
 
 
-def get_child_records(collection_id, parent_id, s3_client) -> list:
+def get_child_records(collection_id, parent_id) -> list:
     mapped_child_records = []
     rikolti_data = RikoltiStorage(
         f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata/children")
@@ -249,7 +249,7 @@ def harvest(self, record: dict, download_cache: Optional[dict] = None) -> dict:
 
         # Recurse through the record's children (if any)
         child_records = get_child_records(
-            self.collection_id, calisphere_id, self.s3)
+            self.collection_id, calisphere_id)
         if child_records:
             print(
                 f"[{self.collection_id}, {self.page_filename}, {calisphere_id}]: "
diff --git a/metadata_fetcher/fetchers/Fetcher.py b/metadata_fetcher/fetchers/Fetcher.py
index 0d27e78d3..58e42a04c 100644
--- a/metadata_fetcher/fetchers/Fetcher.py
+++ b/metadata_fetcher/fetchers/Fetcher.py
@@ -26,7 +26,10 @@ def __init__(self, params):
         self.harvest_type = params.get('harvest_type')
         self.collection_id = params.get('collection_id')
         self.write_page = params.get('write_page', 0)
-        self.data_destination = RikoltiStorage(settings.DATA_DEST_URL)
+        self.data_destination = RikoltiStorage(
+            f"{settings.DATA_DEST_URL}/{self.collection_id}/"
+            "vernacular_metadata/"
+        )
 
         if not self.collection_id:
             raise CollectionIdRequired("collection_id is required")
@@ -49,9 +52,7 @@ def fetch_page(self):
             content = self.aggregate_vernacular_content(response.text)
             try:
                 self.data_destination.put_page_content(
-                    content, relative_path=(
-                        f"{self.collection_id}/vernacular_metadata/{self.write_page}"
) + content, relative_path=f"{self.write_page}" ) except Exception as e: print(f"Metadata Fetcher: {e}") diff --git a/metadata_fetcher/fetchers/ucd_json_fetcher.py b/metadata_fetcher/fetchers/ucd_json_fetcher.py index 7b646b192..9e3936be0 100644 --- a/metadata_fetcher/fetchers/ucd_json_fetcher.py +++ b/metadata_fetcher/fetchers/ucd_json_fetcher.py @@ -66,9 +66,7 @@ def fetch_all_pages(self, response: requests.Response) -> int: try: self.data_destination.put_page_content( - content, relative_path=( - f"{self.collection_id}/vernacular_metadata/{self.write_page}" - ) + content, relative_path=f"{self.write_page}" ) except Exception as e: print(f"Metadata Fetcher: {e}", file=sys.stderr) diff --git a/metadata_mapper/utilities.py b/metadata_mapper/utilities.py index 6a2573751..0ba9f45b8 100644 --- a/metadata_mapper/utilities.py +++ b/metadata_mapper/utilities.py @@ -51,7 +51,7 @@ def get_files(collection_id: int, directory: str) -> list[str]: """ rikolti_data = RikoltiStorage( f"{settings.DATA_SRC_URL}/{collection_id}/{directory}") - rikolti_data.list_pages(recursive=False, relative=True) + return rikolti_data.list_pages(recursive=False, relative=True) def read_from_bucket(collection_id: int, directory: str, @@ -113,7 +113,7 @@ def write_to_bucket(collection_id: int, directory: str, if isinstance(content, list) or isinstance(content, dict): content = json.dumps(content) - rikolti_data = RikoltiStorage(f"{settings.DATA_SRC_URL}/{collection_id}/{directory}") + rikolti_data = RikoltiStorage(f"{settings.DATA_SRC_URL}/{collection_id}/{directory}/") rikolti_data.put_page_content(content, str(file_name)) file_location = f"{settings.DATA_SRC_URL}/{collection_id}/{directory}/{file_name}" From a00f75ba6e156eb0542feb47dd2cef52988df0a1 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Thu, 9 Nov 2023 11:33:41 -0800 Subject: [PATCH 16/42] make paths more absolute throughout codebase --- content_harvester/by_collection.py | 10 +- content_harvester/by_page.py | 45 +-- dags/shared_tasks.py | 12 +- metadata_fetcher/fetchers/Fetcher.py | 20 +- metadata_fetcher/fetchers/ucd_json_fetcher.py | 39 +- metadata_fetcher/lambda_function.py | 19 +- metadata_mapper/lambda_function.py | 36 +- metadata_mapper/lambda_shepherd.py | 10 +- metadata_mapper/utilities.py | 75 ---- metadata_mapper/validate_mapping.py | 24 +- metadata_mapper/validator/validation_log.py | 15 +- utils/rikolti_storage.py | 351 +++++++++++------- 12 files changed, 340 insertions(+), 316 deletions(-) diff --git a/content_harvester/by_collection.py b/content_harvester/by_collection.py index 31dcc51f6..9d7215b5d 100644 --- a/content_harvester/by_collection.py +++ b/content_harvester/by_collection.py @@ -2,18 +2,18 @@ from . 
import settings from .by_page import harvest_page_content -from rikolti.utils.rikolti_storage import RikoltiStorage +from rikolti.utils.rikolti_storage import list_pages def get_mapped_pages(collection_id): page_list = [] - rikolti_data = RikoltiStorage( + page_list = list_pages( f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata", + recursive=False, aws_access_key_id=settings.AWS_ACCESS_KEY_ID, aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY, aws_session_token=settings.AWS_SESSION_TOKEN, region_name=settings.AWS_REGION ) - page_list = rikolti_data.list_pages(recursive=False, relative=True) return page_list @@ -32,8 +32,8 @@ def harvest_collection(collection): print(f"[{collection_id}]: Harvesting content for {len(page_list)} pages") collection_stats = {} - for page in page_list: - collection.update({'page_filename': page}) + for page_path in page_list: + collection.update({'page_path': page_path}) page_stats = harvest_page_content(**collection) # in some cases, value is int and in some cases, value is Counter diff --git a/content_harvester/by_page.py b/content_harvester/by_page.py index ae3b9bf87..4789a4cbd 100644 --- a/content_harvester/by_page.py +++ b/content_harvester/by_page.py @@ -14,7 +14,7 @@ from . import derivatives from . import settings -from rikolti.utils.rikolti_storage import RikoltiStorage +from rikolti.utils.rikolti_storage import list_pages, get_page_content, put_page_content class DownloadError(Exception): pass @@ -24,40 +24,40 @@ class UnsupportedMimetype(Exception): pass -def get_mapped_records(collection_id, page_filename) -> list: +def get_mapped_records(page_path) -> list: mapped_records = [] - rikolti_data = RikoltiStorage( - f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata/{page_filename}") - mapped_records = json.loads(rikolti_data.get_page_content()) + mapped_records = json.loads(get_page_content(page_path)) return mapped_records def write_mapped_record(collection_id, record): - rikolti_data = RikoltiStorage( - f"{settings.DATA_DEST_URL}/{collection_id}/mapped_with_content/" - f"{record.get('calisphere-id').replace(os.sep, '_')}" - ) - rikolti_data.put_page_content(json.dumps(record)) + put_page_content( + json.dumps(record), + ( + f"{settings.DATA_DEST_URL}/{collection_id}/mapped_with_content/" + f"{record.get('calisphere-id').replace(os.sep, '_')}" + ) +) def write_mapped_page(collection_id, page, records): - rikolti_data = RikoltiStorage( + put_page_content( + json.dumps(records), f"{settings.DATA_DEST_URL}/{collection_id}/mapped_with_content/{page}" ) - rikolti_data.put_page_content(json.dumps(records)) def get_child_records(collection_id, parent_id) -> list: mapped_child_records = [] - rikolti_data = RikoltiStorage( - f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata/children") - children = rikolti_data.list_pages(recursive=False, relative=False) + children = list_pages( + f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata/children", + recursive=False + ) if rikolti_data.data_store == 'file': children = [page for page in children if os.path.basename(page).startswith(parent_id)] for child in children: - child_data = RikoltiStorage(child) - mapped_child_records.extend(json.loads(child_data.get_page_content())) + mapped_child_records.extend(json.loads(get_page_content(child))) return mapped_child_records @@ -346,9 +346,10 @@ def _upload(self, dest_prefix, dest_filename, filepath, cache: Optional[dict] = return dest_path -# {"collection_id": 26098, "rikolti_mapper_type": "nuxeo.nuxeo", "page_filename": "r-0"} -def 
harvest_page_content(collection_id, page_filename, **kwargs): +# {"collection_id": 26098, "rikolti_mapper_type": "nuxeo.nuxeo", "page_filename": "file:///rikolti_data/r-0"} +def harvest_page_content(collection_id, page_path, **kwargs): rikolti_mapper_type = kwargs.get('rikolti_mapper_type') + page_filename = os.path.basename(page_path) # Weird how we have to use username/pass to hit this endpoint # but we have to use auth token to hit API endpoint @@ -361,7 +362,7 @@ def harvest_page_content(collection_id, page_filename, **kwargs): src_auth=auth ) - records = get_mapped_records(collection_id, page_filename) + records = get_mapped_records(page_path) print( f"[{collection_id}, {page_filename}]: " f"Harvesting content for {len(records)} records" @@ -454,12 +455,12 @@ def harvest_page_content(collection_id, page_filename, **kwargs): parser = argparse.ArgumentParser( description="Harvest content using a page of mapped metadata") parser.add_argument('collection_id', help="Collection ID") - parser.add_argument('page_filename', help="Page Filename") + parser.add_argument('page_path', help="URI-formatted path to a mapped metadata page") parser.add_argument('--nuxeo', action="store_true", help="Use Nuxeo auth") args = parser.parse_args() arguments = { 'collection_id': args.collection_id, - 'page_filename': args.page_filename, + 'page_filename': args.page_path, } if args.nuxeo: arguments['rikolti_mapper_type'] = 'nuxeo.nuxeo' diff --git a/dags/shared_tasks.py b/dags/shared_tasks.py index 76ce61f64..04e72d62c 100644 --- a/dags/shared_tasks.py +++ b/dags/shared_tasks.py @@ -39,10 +39,9 @@ def get_collection_fetchdata_task(params=None): @task() def fetch_collection_task(collection: dict): fetch_status = fetch_collection(collection, {}) - success = all([page['status'] == 'success' for page in fetch_status]) total_items = sum([page['document_count'] for page in fetch_status]) - total_pages = fetch_status[-1]['page'] + 1 + total_pages = len(fetch_status) diff_items = total_items - collection['solr_count'] date = datetime.strptime( collection['solr_last_updated'], @@ -67,9 +66,12 @@ def fetch_collection_task(collection: dict): f"{'more' if diff_items > 0 else 'fewer'} items." ) - return [ - str(page['page']) for page in fetch_status if page['status']=='success' - ] + vernacular_filepaths = [page['vernacular_filepath'] for page in fetch_status] + if not vernacular_filepaths or not success: + raise Exception( + 'vernacular metadata not successfully fetched\n{fetch_status}') + + return vernacular_filepaths @task() diff --git a/metadata_fetcher/fetchers/Fetcher.py b/metadata_fetcher/fetchers/Fetcher.py index 58e42a04c..a6ecd15ef 100644 --- a/metadata_fetcher/fetchers/Fetcher.py +++ b/metadata_fetcher/fetchers/Fetcher.py @@ -1,7 +1,7 @@ import logging import requests +import os -from .. 
import settings from requests.adapters import HTTPAdapter, Retry from rikolti.utils.rikolti_storage import RikoltiStorage @@ -26,10 +26,7 @@ def __init__(self, params): self.harvest_type = params.get('harvest_type') self.collection_id = params.get('collection_id') self.write_page = params.get('write_page', 0) - self.data_destination = RikoltiStorage( - f"{settings.DATA_DEST_URL}/{self.collection_id}/" - "vernacular_metadata/" - ) + self.data_destination = RikoltiStorage(self.collection_id) if not self.collection_id: raise CollectionIdRequired("collection_id is required") @@ -48,19 +45,24 @@ def fetch_page(self): f"[{self.collection_id}]: unable to fetch page {page}") record_count = self.check_page(response) + filepath = None if record_count: content = self.aggregate_vernacular_content(response.text) try: - self.data_destination.put_page_content( - content, relative_path=f"{self.write_page}" - ) + filepath = self.data_destination.save_fetched_content( + content, self.write_page) + print(filepath) except Exception as e: print(f"Metadata Fetcher: {e}") raise(e) self.increment(response) - return record_count + return { + 'document_count': record_count, + 'vernacular_filepath': filepath, + 'status': 'success' + } def aggregate_vernacular_content(self, response): return response diff --git a/metadata_fetcher/fetchers/ucd_json_fetcher.py b/metadata_fetcher/fetchers/ucd_json_fetcher.py index 9e3936be0..4db788462 100644 --- a/metadata_fetcher/fetchers/ucd_json_fetcher.py +++ b/metadata_fetcher/fetchers/ucd_json_fetcher.py @@ -1,13 +1,16 @@ import json +import math +import os import sys -from .Fetcher import Fetcher, FetchError + +from typing import Optional + import requests + from xml.etree import ElementTree from bs4 import BeautifulSoup -from .. import settings -import math -from typing import Optional +from .Fetcher import Fetcher, FetchError class UcdJsonFetcher(Fetcher): def __init__(self, params: dict[str]): @@ -21,7 +24,7 @@ def __init__(self, params: dict[str]): self.url = params.get("harvest_data").get("url") self.per_page = 10 - def fetch_page(self) -> int: + def fetch_page(self) -> dict[str, int or str]: """ UCD's harvesting endpoint gets us an XML document listing a URL for every record in a collection, but not the actual metadata records themselves. 
fetch_page @@ -43,7 +46,7 @@ def fetch_page(self) -> int: return self.fetch_all_pages(response) - def fetch_all_pages(self, response: requests.Response) -> int: + def fetch_all_pages(self, response: requests.Response) -> list: """ Parameters: response: requests.Response @@ -56,24 +59,28 @@ def fetch_all_pages(self, response: requests.Response) -> int: loc_nodes = xml.findall(".//ns:loc", ns) pages = math.ceil(len(loc_nodes) / self.per_page) + fetch_status = [] for page in range(pages): print(f"[{self.collection_id}]: Fetching URLs for page {page + 1} " f"({page + 1}/{pages})") - skip = self.write_page * self.per_page - urls = loc_nodes[skip:(skip + self.per_page)] - records = list(filter(None, [self.fetch_json_ld(url.text) for url in urls])) - content = json.dumps(records) - + offset = self.write_page * self.per_page + urls = loc_nodes[offset:(offset + self.per_page)] + urls = list(filter(None, [url.text for url in urls])) + records = [self.fetch_json_ld(url) for url in urls] + document_count = len(records) try: - self.data_destination.put_page_content( - content, relative_path=f"{self.write_page}" - ) + filepath = self.data_destination.save_fetched_content( + json.dumps(records), self.write_page) + fetch_status.append({ + 'document_count': document_count, + 'vernacular_filepath': filepath, + 'status': 'success' + }) except Exception as e: print(f"Metadata Fetcher: {e}", file=sys.stderr) raise(e) - self.write_page += 1 - return len(loc_nodes) + return fetch_status def fetch_json_ld(self, url: str) -> Optional[dict]: """ diff --git a/metadata_fetcher/lambda_function.py b/metadata_fetcher/lambda_function.py index db12b0bc6..e6b2f3376 100644 --- a/metadata_fetcher/lambda_function.py +++ b/metadata_fetcher/lambda_function.py @@ -28,28 +28,29 @@ def fetch_collection(payload, context): fetcher_class = import_fetcher(payload.get('harvest_type')) - fetch_status = {'page': payload.get('write_page', 0), 'document_count': 0} + fetch_status = [] try: fetcher = fetcher_class(payload) - fetch_status['document_count'] = fetcher.fetch_page() + fetch_status.append(fetcher.fetch_page()) except InvalidHarvestEndpoint as e: logger.error(e) - fetch_status.update({ + fetch_status.append({ 'status': 'error', 'body': json.dumps({ 'error': repr(e), 'payload': payload }) }) - return [fetch_status] + return fetch_status next_page = fetcher.json() - fetch_status.update({ - 'status': 'success', - 'next_page': next_page - }) - fetch_status = [fetch_status] + # this is a ucd json fetcher workaround + # TODO: could be cleaner to stash ucd's table of contents in a known + # location and have each iteration of the fetcher reference that location, + # then we could resolve this difference in return values + if len(fetch_status) == 1 and type(fetch_status[0]) == list: + fetch_status = fetch_status[0] if not json.loads(next_page).get('finished'): fetch_status.extend(fetch_collection(next_page, {})) diff --git a/metadata_mapper/lambda_function.py b/metadata_mapper/lambda_function.py index 71da9db65..1d63d85de 100644 --- a/metadata_mapper/lambda_function.py +++ b/metadata_mapper/lambda_function.py @@ -7,7 +7,7 @@ from . 
import settings
 from .mappers.mapper import Record, Vernacular
-from rikolti.utils.rikolti_storage import RikoltiStorage
+from rikolti.utils.rikolti_storage import get_page_content, put_page_content
 
 logger = logging.getLogger(__name__)
@@ -72,17 +72,14 @@ def run_enrichments(records, collection, enrichment_set, page_filename):
     return records
 
 
-def map_page(collection_id: int, page_filename: str, collection: Union[dict, str]):
+def map_page(collection_id: int, page_path: str, collection: Union[dict, str]):
     if isinstance(collection, str):
         collection = json.loads(collection)
 
     vernacular_reader = import_vernacular_reader(
        collection.get('rikolti_mapper_type'))
-    storage = RikoltiStorage(
-        f"{settings.DATA_SRC_URL}/{collection_id}/"
-        f"vernacular_metadata/{page_filename}"
-    )
-    api_resp = storage.get_page_content()
+    page_filename = os.path.basename(page_path)
+    api_resp = get_page_content(page_path)
     source_vernacular = vernacular_reader(collection_id, page_filename)
     source_metadata_records = source_vernacular.parse(api_resp)
 
@@ -95,12 +92,13 @@ def map_page(collection_id: int, page_filename: str, collection: Union[dict, str
         mapped_records = source_metadata_records
 
     # TODO: write interim mapped but not enriched metadata to s3?
-    # rikolti_data = RikoltiStorage(
-    #     f"{settings.DATA_DEST_URL}/{collection_id}/"
-    #     f"interim_mapped_metadata/{page_filename}"
+    # put_page_content(
+    #     json.dumps([record.to_dict() for record in mapped_records]),
+    #     (
+    #         f"{settings.DATA_DEST_URL}/{collection_id}/"
+    #         f"interim_mapped_metadata/{page_filename}"
+    #     )
     # )
-    # rikolti_data.put_page_content(json.dumps(
-    #     [record.to_dict() for record in mapped_records]))
 
     mapped_records = run_enrichments(
         mapped_records, collection, 'rikolti__enrichments', page_filename)
@@ -128,11 +126,13 @@ def map_page(collection_id: int, page_filename: str, collection: Union[dict, str
     #     for record in mapped_records]
     mapped_metadata = [record.to_dict() for record in mapped_records]
 
-    rikolti_data = RikoltiStorage(
-        f"{settings.DATA_DEST_URL}/{collection_id}/"
-        f"mapped_metadata/{page_filename}"
+    put_page_content(
+        json.dumps(mapped_metadata),
+        (
+            f"{settings.DATA_DEST_URL}/{collection_id}/"
+            f"mapped_metadata/{page_filename}"
+        )
     )
-    rikolti_data.put_page_content(json.dumps(mapped_metadata))
 
     return {
         'status': 'success',
@@ -147,11 +147,11 @@ def map_page(collection_id: int, page_filename: str, collection: Union[dict, str
     parser = argparse.ArgumentParser(
         description="Map metadata from the institution's vernacular")
     parser.add_argument('collection_id', help='collection id')
-    parser.add_argument('page_filename', help='vernauclar metadata page filename')
+    parser.add_argument('page_path', help='URI file path to vernacular metadata page filename')
    parser.add_argument('collection', help='json collection metadata from registry')
     args = parser.parse_args(sys.argv[1:])
 
-    mapped_page = map_page(args.collection_id, args.page_filename, args.collection)
+    mapped_page = map_page(args.collection_id, args.page_path, args.collection)
 
     print(f"{mapped_page.get('num_records_mapped')} records mapped")
 
diff --git a/metadata_mapper/lambda_shepherd.py b/metadata_mapper/lambda_shepherd.py
index 91cd5999d..c40c9dd2f 100644
--- a/metadata_mapper/lambda_shepherd.py
+++ b/metadata_mapper/lambda_shepherd.py
@@ -8,7 +8,7 @@
 
 from . 
import settings, validate_mapping from .lambda_function import map_page from .mappers.mapper import Record -from rikolti.utils.rikolti_storage import RikoltiStorage +from rikolti.utils.rikolti_storage import list_pages def get_collection(collection_id): @@ -38,11 +38,11 @@ def check_for_missing_enrichments(collection): def get_vernacular_pages(collection_id): - rikolti_data = RikoltiStorage( - f"{settings.DATA_SRC_URL}/{collection_id}/vernacular_metadata") - try: - page_list = rikolti_data.list_pages(relative=True) + page_list = list_pages( + f"{settings.DATA_SRC_URL}/{collection_id}/vernacular_metadata", + recursive=True + ) except FileNotFoundError as e: print( f"{e} - have you fetched {collection_id}? " diff --git a/metadata_mapper/utilities.py b/metadata_mapper/utilities.py index 0ba9f45b8..af188f419 100644 --- a/metadata_mapper/utilities.py +++ b/metadata_mapper/utilities.py @@ -3,7 +3,6 @@ from typing import Callable, Union from . import settings -from rikolti.utils.rikolti_storage import RikoltiStorage def returns_callable(func: Callable) -> Callable: @@ -44,77 +43,3 @@ def import_vernacular_reader(mapper_type): exit() return vernacular_class - -def get_files(collection_id: int, directory: str) -> list[str]: - """ - Gets a list of filenames in a given directory. - """ - rikolti_data = RikoltiStorage( - f"{settings.DATA_SRC_URL}/{collection_id}/{directory}") - return rikolti_data.list_pages(recursive=False, relative=True) - - -def read_from_bucket(collection_id: int, directory: str, - file_name: Union[str, int]) -> str: - """ - Reads the contents of a file from the appropriate content bucket. - - Data comes from local filesystem or S3, depending on ENV vars. - - Parameters: - directory: str - collection_id: str - Files are separated into directories by collection_id - file_name: Union[str, int] - The name of the file to read - - Returns: str - The file contents - """ - rikolti_data = RikoltiStorage( - f"{settings.DATA_SRC_URL}/{collection_id}/{directory}/{file_name}") - return rikolti_data.get_page_content() - - -def read_mapped_metadata(collection_id: int, page_id: int) -> list[dict]: - """ - Reads and parses the content of a mapped metadata file. - - Parameters: - collection_id: int - The collection ID - page_id: int - The page ID (filename) to read and parse - - Returns: list[dict] - The parsed data - """ - return json.loads(read_from_bucket(collection_id, "mapped_metadata", page_id)) - - -def read_vernacular_metadata(collection_id: int, page_id: int) -> list[dict]: - """ - Reads and parses the content of a vernacular (unmapped) metadata file. 
- - Parameters: - collection_id: int - The collection ID - page_id: int - The page ID (filename) to read and parse - - Returns: list[dict] - The parsed data - """ - return json.loads(read_from_bucket(collection_id, "vernacular_metadata", page_id)) - - -def write_to_bucket(collection_id: int, directory: str, - file_name: Union[str, int], content: str) -> None: - if isinstance(content, list) or isinstance(content, dict): - content = json.dumps(content) - - rikolti_data = RikoltiStorage(f"{settings.DATA_SRC_URL}/{collection_id}/{directory}/") - rikolti_data.put_page_content(content, str(file_name)) - file_location = f"{settings.DATA_SRC_URL}/{collection_id}/{directory}/{file_name}" - - return file_location diff --git a/metadata_mapper/validate_mapping.py b/metadata_mapper/validate_mapping.py index 96779b0d3..584e53aa7 100644 --- a/metadata_mapper/validate_mapping.py +++ b/metadata_mapper/validate_mapping.py @@ -10,6 +10,7 @@ from .validator.validation_log import ValidationLogLevel from .validator.validation_mode import ValidationMode from .validator.validator import Validator +from rikolti.utils.rikolti_storage import list_pages, get_page_content urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) @@ -49,13 +50,18 @@ def validate_collection(collection_id: int, log_level = log_level, verbose = verbose) - for page_id in utilities.get_files(collection_id, "mapped_metadata"): - validate_page(collection_id, page_id, validator) + mapped_pages = list_pages( + f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata/", + recursive=False + ) + + for page_path in mapped_pages: + validate_page(collection_id, page_path, validator) return validator -def validate_page(collection_id: int, page_id: int, +def validate_page(collection_id: int, page_path: str, validator: Validator) -> Validator: """ Validates a provided page of a provided collection of mapped data. @@ -63,8 +69,8 @@ def validate_page(collection_id: int, page_id: int, Parameters: collection_id: int The collection ID - page_id: int - The page number within the collection + page_path: str + The absolute path to a page within the collection validator: Validator The validator instance to use @@ -73,10 +79,10 @@ def validate_page(collection_id: int, page_id: int, """ context = { "collection_id": collection_id, - "page_id": page_id + "page_path": page_path } mapped_metadata = validator.generate_keys( - get_mapped_data(collection_id, page_id), + get_mapped_data(page_path), type="Rikolti", context=context ) @@ -117,8 +123,8 @@ def create_collection_validation_csv(collection_id: int, **options) -> tuple[int ## Private-ish -def get_mapped_data(collection_id: int, page_id: int) -> dict: - return utilities.read_mapped_metadata(collection_id, page_id) +def get_mapped_data(page_path: str) -> list[dict]: + return json.loads(get_page_content(page_path)) def get_comparison_data(collection_id: int, harvest_ids: list[str]) -> list[dict]: diff --git a/metadata_mapper/validator/validation_log.py b/metadata_mapper/validator/validation_log.py index 41ef2e9fc..72e6eeeae 100644 --- a/metadata_mapper/validator/validation_log.py +++ b/metadata_mapper/validator/validation_log.py @@ -2,7 +2,8 @@ from enum import Enum from typing import IO, Any -from .. import utilities +from .. 
import settings +from rikolti.utils.rikolti_storage import put_page_content class ValidationLogLevel(Enum): @@ -110,7 +111,7 @@ def output_csv_to_file(self, file: IO[str], append: bool = False, f.write(self._csv_content_string(include_fields, append)) def output_csv_to_bucket(self, collection_id: int, filename: str = None, - include_fields: list[str] = None) -> None: + include_fields: list[str] = None) -> str: """ Writes a CSV to the env-appropriate bucket (local or S3). @@ -126,12 +127,16 @@ def output_csv_to_bucket(self, collection_id: int, filename: str = None, if not filename: filename = f"{datetime.now().strftime('%m-%d-%YT%H:%M:%S')}.csv" - file_location = utilities.write_to_bucket( - collection_id, "validation", filename, - self._csv_content_string(include_fields)) + content = self._csv_content_string(include_fields) + if isinstance(content, list) or isinstance(content, dict): + content = json.dumps(content) + + file_location = f"{settings.DATA_SRC_URL}/{collection_id}/validation/{filename}" + put_page_content(content, file_location) return file_location + def _csv_content(self, include_fields: list[str] = None, include_headers: bool = True) -> list[list[str]]: """ diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py index 1b419838f..2be23c9ef 100644 --- a/utils/rikolti_storage.py +++ b/utils/rikolti_storage.py @@ -2,87 +2,236 @@ import re import boto3 +from datetime import datetime from urllib.parse import urlparse from typing import Optional +from collections import namedtuple -class RikoltiStorage(): - def __init__(self, data_url: str, **kwargs): - self.data_url = data_url - data_loc = urlparse(data_url) - self.data_store = data_loc.scheme - self.data_bucket = data_loc.netloc - self.data_path = data_loc.path +DataStorage = namedtuple( + "DateStorage", "uri, store, bucket, path" +) - if self.data_store == 's3': - self.s3 = boto3.client('s3', **kwargs) +def parse_data_uri(data_uri: str): + data_loc = urlparse(data_uri) + return DataStorage( + data_uri, data_loc.scheme, data_loc.netloc, data_loc.path) - def list_pages(self, recursive=True, relative=True) -> list: - if self.data_store == 's3': - try: - return self.list_s3_pages(recursive=recursive, relative=relative) - except Exception as e: - url = ( - f"https://{self.data_bucket}.s3.us-west-2.amazonaws" - ".com/index.html#{self.data_path}/" - ) - raise Exception( - f"Error listing files at {self.data_url}\n" - f"Check that {self.data_path} exists at {url}\n{e}" + +def list_dirs(data_uri: str, **kwargs) -> list[str]: + data = parse_data_uri(data_uri) + if data.store == 's3': + s3 = boto3.client('s3', **kwargs) + s3_objects = s3.list_objects_v2( + Bucket=data.bucket, + Prefix=data.path, + Delimiter='/' + ) + keys = [ + obj['Prefix'][len(data.path):-1] + for obj in s3_objects['CommonPrefixes'] + ] + return keys + elif data.store == 'file': + dir_contents = os.listdir(data.path) + dirs = [ + file for file in dir_contents + if os.path.isdir(os.path.join(data.path, file)) + ] + return dirs + else: + raise Exception(f"Unknown data store: {data.store}") + + +def list_pages(data_uri: str, recursive: bool=True, **kwargs) -> list: + data = parse_data_uri(data_uri) + + if data.store == 's3': + try: + return list_s3_pages(data, recursive=recursive, **kwargs) + except Exception as e: + url = ( + f"https://{data.bucket}.s3.us-west-2.amazonaws" + ".com/index.html#{data.path}/" ) - elif self.data_store == 'file': - try: - return self.list_file_pages(recursive=recursive, relative=relative) - except Exception as e: - raise 
Exception(f"Error listing files in {path}\n{e}") - else: - raise Exception(f"Unknown data store: {self.data_store}") + raise Exception( + f"Error listing files at {data.uri}\n" + f"Check that {data.path} exists at {url}\n{e}" + ) + elif data.store == 'file': + try: + return list_file_pages(data, recursive=recursive) + except Exception as e: + raise Exception(f"Error listing files in {data.path}\n{e}") + else: + raise Exception(f"Unknown data store: {data.store}") - def list_s3_pages(self, recursive=True, relative=True) -> list: - """ - List all objects in s3_bucket with prefix s3_prefix - """ - s3_objects = self.s3.list_objects_v2( - Bucket=self.data_bucket, - Prefix=self.data_path + +def list_s3_pages(data: DataStorage, recursive: bool=True, **kwargs) -> list: + """ + List all objects in s3_bucket with prefix s3_prefix + """ + s3 = boto3.client('s3', **kwargs) + + s3_objects = s3.list_objects_v2( + Bucket=data.bucket, + Prefix=data.path + ) + # TODO: check resp['IsTruncated'] and use ContinuationToken if needed + + keys = [f"s3://{data.bucket}/{obj['Key']}" for obj in s3_objects['Contents']] + prefix = f"s3://{data.bucket}/{data.path}" + + if not recursive: + # prune deeper branches + leaf_regex = re.escape(prefix) + r"^\/?[\w!'_.*()-]+\/?$" + keys = [key for key in keys if re.match(leaf_regex, key)] + + return keys + + +def list_file_pages(data: DataStorage, recursive: bool=True) -> list: + """ + List all files in file_path + """ + file_objects = [] + if recursive: + for root, dirs, files in os.walk(data.path): + root_uri = "file://{root}/" if root[-1] != '/' else "file://{root}" + for file in files: + file_objects.append(f"{root_uri}{file}") + + if not recursive: + for file in os.listdir(data.path): + if os.path.isfile(os.path.join(data.path, file)): + root_uri = "file://{data.path}/" if data.path[-1] != '/' else "file://{data.path}" + file_objects.append(f"{root_uri}{file}") + + return file_objects + + +def get_page_content(data_uri: str, **kwargs): + data = parse_data_uri(data_uri) + if data.store == 's3': + return get_s3_contents(data) + elif data.store == 'file': + return get_file_contents(data) + else: + raise Exception(f"Unknown data store: {data.store}") + + +def get_s3_contents(data: DataStorage, **kwargs): + """ + Get the body of the object located at data.path + """ + s3 = boto3.client('s3', **kwargs) + + try: + obj = s3.get_object(Bucket=data.bucket, Key=data.path) + return obj['Body'].read().decode('utf-8') + except Exception as e: + url = ( + f"https://{data.bucket}.s3.us-west-2.amazonaws.com/" + f"index.html#{data.path}/" + ) + raise Exception( + f"Error reading file at {data.uri}\nCheck: {url}\n{e}" ) - # TODO: check resp['IsTruncated'] and use ContinuationToken if needed - keys = [f"s3://{self.data_bucket}/{obj['Key']}" for obj in s3_objects['Contents']] - prefix = "s3://{self.data_bucket}/{self.data_path}" - if not recursive: - # prune deeper branches - leaf_regex = re.escape(prefix) + r"^\/?[\w!'_.*()-]+\/?$" - keys = [key for key in keys if re.match(leaf_regex, key)] +def get_file_contents(data: DataStorage): + """ + Get the body of the file located at file_path + """ + try: + with open(data.path, 'r') as f: + return f.read() + except Exception as e: + raise Exception(f"Error reading {data.path}\n{e}") - if relative: - keys = [key[len(prefix):] for key in keys] - return keys +def put_page_content(content:str, data_uri: str, **kwargs) -> str: + """ + Write content to a file at relative_path (relative to data_path). 
+ relative_path is a list of strings, each string is a directory name + representing a directory tree. + handle s3 or file storage, use '/' as separator for s3 key and os.sep + as separtors for file storage + """ + data = parse_data_uri(data_uri) + + if data.store == 's3': + return put_s3_content(data, content, **kwargs) + elif data.store == 'file': + return put_file_content(data, content) + else: + raise Exception(f"Unknown data store: {data.store}") - def list_file_pages(self, recursive=True, relative=True) -> list: - """ - List all files in file_path - """ - file_objects = [] - if recursive: - for root, dirs, files in os.walk(self.data_path): - root_uri = "file://{root}/" if root[-1] != '/' else "file://{root}" - for file in files: - file_objects.append(f"{root_uri}{file}") - if not recursive: - for file in os.listdir(self.data_path): - if os.path.isfile(os.path.join(self.data_path, file)): - root_uri = "file://{self.data_path}/" if self.data_path[-1] != '/' else "file://{self.data_path}" - file_objects.append(f"{root_uri}{file}") +def put_s3_content(data: DataStorage, content, **kwargs) -> str: + """ + Write content to an object named data.path + """ + s3 = boto3.client('s3', **kwargs) + s3.put_object( + ACL='bucket-owner-full-control', + Bucket=data.bucket, + Key=data.path, + Body=content + ) + return data.uri - if relative: - prefix = "file://{self.data_path}/" - file_objects = [file[len(prefix):] for file in file_objects] +def put_file_content(data: DataStorage, content) -> str: + """ + Write content to a file at data.path + """ + file_path = os.sep.join(data.path.split('/')) + directory_path = os.path.dirname(file_path) + if not os.path.exists(directory_path): + os.makedirs(directory_path) - return file_objects + with open(file_path, 'w') as f: + f.write(content) + return data.uri + +class RikoltiStorage(): + def __init__( + self, + collection_id: int or str, + vernacular_suffix: Optional[str] = None, + vernacular_path: Optional[str] = None, + **kwargs): + + self.collection_id = collection_id + + fetcher_data_dest = os.environ.get("FETCHER_DATA_DEST", "file:///tmp") + vernacular_root = ( + f"{fetcher_data_dest.rstrip('/')}/{collection_id}/" + ) + if not vernacular_path: + if not vernacular_suffix: + vernacular_suffix = datetime.now().strftime('%Y-%m-%dT%H:%M:%S') + vernacular_path = ( + f"vernacular_metadata_{vernacular_suffix}/" + ) + + self.vernacular = f"{vernacular_root}{vernacular_path.rstrip('/')}/" + + # mapped_data_src = os.environ.get("MAPPED_DATA_SRC", fetcher_data_dest) + # mapped_root = ( + # f"{mapped_data_src.rstrip('/')}/{self.collection_id}/" + # ) + + + def save_fetched_content(self, content: str, filename: str): + return put_page_content(content, f"{self.vernacular}data/{filename}") + + # def list_fetched_content(self, recursive: bool=True, **kwargs) -> list: + # return list_pages( + # f"{self.vernacular_data}/{self.collection_id}/" + # f"vernacular_metadata{self.suffix}/", + # recursive=recursive + # ) def search_page(self, search_str: str, page: str) -> bool: if self.data_store == 's3': @@ -115,80 +264,6 @@ def search_file_page(self, search_str: str, file_path: str) -> bool: else: return False - def get_page_content(self): - if self.data_store == 's3': - return self.get_s3_contents() - elif self.data_store == 'file': - return self.get_file_contents() - else: - raise Exception(f"Unknown data store: {self.data_store}") - - def get_s3_contents(self): - """ - Get the body of the object located at s3_key - """ - try: - obj = 
self.s3.get_object(Bucket=self.data_bucket, Key=self.data_path) - return obj['Body'].read().decode('utf-8') - except Exception as e: - url = ( - f"https://{self.data_bucket}.s3.us-west-2.amazonaws.com/" - "index.html#{self.data_path}/" - ) - raise Exception( - f"Error reading file at {self.data_url}\nCheck: {url}\n{e}" - ) - - def get_file_contents(self): - """ - Get the body of the file located at file_path - """ - try: - with open(self.data_path, 'r') as f: - return f.read() - except Exception as e: - raise Exception(f"Error reading {self.data_path}\n{e}") - - - def put_page_content(self, content:str, relative_path: Optional[str]=None): - """ - Write content to a file at relative_path (relative to data_path). - relative_path is a list of strings, each string is a directory name - representing a directory tree. - handle s3 or file storage, use '/' as separator for s3 key and os.sep - as separtors for file storage - """ - path = self.data_path - if relative_path: - path += relative_path - if self.data_store == 's3': - return self.put_s3_content(path, content) - elif self.data_store == 'file': - return self.put_file_content(path, content) - else: - raise Exception(f"Unknown data store: {self.data_store}") - def put_file_content(self, file_path, content): - """ - Write content to a file at file_path - """ - file_path = os.sep.join(file_path.split('/')) - directory_path = os.path.dirname(file_path) - if not os.path.exists(directory_path): - os.makedirs(directory_path) - - with open(file_path, 'w') as f: - f.write(content) - - def put_s3_content(self, s3_key, content): - """ - Write content to an object named s3_key - """ - self.s3.put_object( - ACL='bucket-owner-full-control', - Bucket=self.data_bucket, - Key=s3_key, - Body=content - ) From 59d11cc2c68a84be57c91beb5ca2dca62b75e981 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Thu, 9 Nov 2023 12:01:26 -0800 Subject: [PATCH 17/42] moved RikoltiStorage init into create_vernacular_version --- metadata_fetcher/fetchers/Fetcher.py | 8 ++-- metadata_fetcher/fetchers/ucd_json_fetcher.py | 6 +-- utils/rikolti_storage.py | 43 +++++++++++++------ 3 files changed, 38 insertions(+), 19 deletions(-) diff --git a/metadata_fetcher/fetchers/Fetcher.py b/metadata_fetcher/fetchers/Fetcher.py index a6ecd15ef..cfedbaf12 100644 --- a/metadata_fetcher/fetchers/Fetcher.py +++ b/metadata_fetcher/fetchers/Fetcher.py @@ -3,7 +3,7 @@ import os from requests.adapters import HTTPAdapter, Retry -from rikolti.utils.rikolti_storage import RikoltiStorage +from rikolti.utils.rikolti_storage import create_vernacular_version, put_page_content logger = logging.getLogger(__name__) @@ -26,7 +26,7 @@ def __init__(self, params): self.harvest_type = params.get('harvest_type') self.collection_id = params.get('collection_id') self.write_page = params.get('write_page', 0) - self.data_destination = RikoltiStorage(self.collection_id) + self.data_destination = create_vernacular_version(self.collection_id) if not self.collection_id: raise CollectionIdRequired("collection_id is required") @@ -49,8 +49,8 @@ def fetch_page(self): if record_count: content = self.aggregate_vernacular_content(response.text) try: - filepath = self.data_destination.save_fetched_content( - content, self.write_page) + filepath = put_page_content( + content, f"{self.data_destination}data/{self.write_page}") print(filepath) except Exception as e: print(f"Metadata Fetcher: {e}") diff --git a/metadata_fetcher/fetchers/ucd_json_fetcher.py b/metadata_fetcher/fetchers/ucd_json_fetcher.py index 4db788462..860495116 
100644 --- a/metadata_fetcher/fetchers/ucd_json_fetcher.py +++ b/metadata_fetcher/fetchers/ucd_json_fetcher.py @@ -1,6 +1,5 @@ import json import math -import os import sys from typing import Optional @@ -11,6 +10,7 @@ from bs4 import BeautifulSoup from .Fetcher import Fetcher, FetchError +from rikolti.utils.rikolti_storage import put_page_content class UcdJsonFetcher(Fetcher): def __init__(self, params: dict[str]): @@ -69,8 +69,8 @@ def fetch_all_pages(self, response: requests.Response) -> list: records = [self.fetch_json_ld(url) for url in urls] document_count = len(records) try: - filepath = self.data_destination.save_fetched_content( - json.dumps(records), self.write_page) + filepath = put_page_content( + json.dumps(records), f"{self.data_destination}data/{self.write_page}") fetch_status.append({ 'document_count': document_count, 'vernacular_filepath': filepath, diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py index 2be23c9ef..38e8fb8a7 100644 --- a/utils/rikolti_storage.py +++ b/utils/rikolti_storage.py @@ -194,33 +194,52 @@ def put_file_content(data: DataStorage, content) -> str: f.write(content) return data.uri + +def create_vernacular_version( + collection_id: int or str, + vernacular_suffix: Optional[str] = None + ): + fetcher_data_dest = os.environ.get( + "FETCHER_DATA_DEST", "file:///tmp") + vernacular_root = ( + f"{fetcher_data_dest.rstrip('/')}/{collection_id}/") + if not vernacular_suffix: + vernacular_suffix = ( + datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + vernacular_path = ( + f"{vernacular_root}vernacular_metadata_{vernacular_suffix}/") + return vernacular_path + + class RikoltiStorage(): def __init__( self, collection_id: int or str, vernacular_suffix: Optional[str] = None, vernacular_path: Optional[str] = None, + mapped_data_suffix: Optional[str] = None, + mapped_data_path: Optional[str] = None, **kwargs): self.collection_id = collection_id - fetcher_data_dest = os.environ.get("FETCHER_DATA_DEST", "file:///tmp") - vernacular_root = ( - f"{fetcher_data_dest.rstrip('/')}/{collection_id}/" - ) if not vernacular_path: + fetcher_data_dest = os.environ.get( + "FETCHER_DATA_DEST", "file:///tmp") + vernacular_root = ( + f"{fetcher_data_dest.rstrip('/')}/{collection_id}/") if not vernacular_suffix: - vernacular_suffix = datetime.now().strftime('%Y-%m-%dT%H:%M:%S') + vernacular_suffix = ( + datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) vernacular_path = ( - f"vernacular_metadata_{vernacular_suffix}/" - ) + f"{vernacular_root}vernacular_metadata_{vernacular_suffix}/") - self.vernacular = f"{vernacular_root}{vernacular_path.rstrip('/')}/" + self.vernacular = vernacular_path.rstrip('/')+"/" - # mapped_data_src = os.environ.get("MAPPED_DATA_SRC", fetcher_data_dest) - # mapped_root = ( - # f"{mapped_data_src.rstrip('/')}/{self.collection_id}/" - # ) + mapped_data_dest = os.environ.get("MAPPED_DATA_DEST", "file:///tmp") + mapped_root = ( + f"{mapped_data_dest.rstrip('/')}/{self.collection_id}/" + ) def save_fetched_content(self, content: str, filename: str): From 8f97daf4f82e36b471a60dfd725fbebc8166f5f6 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Thu, 9 Nov 2023 14:34:50 -0800 Subject: [PATCH 18/42] create a vernacular version prior to fetching --- dags/fetcher_dag.py | 5 +- dags/harvest_dag.py | 5 +- dags/shared_tasks.py | 11 +- .../fetch_registry_collections.py | 5 +- metadata_fetcher/fetchers/Fetcher.py | 6 +- metadata_fetcher/lambda_function.py | 10 +- metadata_fetcher/tests.py | 4 +- utils/rikolti_storage.py | 139 +++++++++--------- 8 files changed, 102 
insertions(+), 83 deletions(-) diff --git a/dags/fetcher_dag.py b/dags/fetcher_dag.py index 25e510a30..410eecda2 100644 --- a/dags/fetcher_dag.py +++ b/dags/fetcher_dag.py @@ -5,6 +5,7 @@ from rikolti.dags.shared_tasks import get_collection_fetchdata_task from rikolti.dags.shared_tasks import fetch_collection_task +from rikolti.dags.shared_tasks import create_vernacular_version_task @dag( dag_id="fetch_collection", @@ -16,6 +17,8 @@ ) def fetcher_dag(): fetchdata = get_collection_fetchdata_task() - fetch_collection_task(collection=fetchdata) + vernacular_version = create_vernacular_version_task(collection=fetchdata) + fetch_collection_task( + collection=fetchdata, vernacular_version=vernacular_version) fetcher_dag() diff --git a/dags/harvest_dag.py b/dags/harvest_dag.py index 88c6352b4..78f814ce4 100644 --- a/dags/harvest_dag.py +++ b/dags/harvest_dag.py @@ -4,6 +4,7 @@ from airflow.models.param import Param +from rikolti.dags.shared_tasks import create_vernacular_version_task from rikolti.dags.shared_tasks import fetch_collection_task from rikolti.dags.shared_tasks import get_collection_fetchdata_task from rikolti.dags.shared_tasks import get_collection_metadata_task @@ -34,7 +35,9 @@ def harvest(): fetchdata = get_collection_fetchdata_task() collection = get_collection_metadata_task() - fetched_pages = fetch_collection_task(collection=fetchdata) + vernacular_version = create_vernacular_version_task(collection=fetchdata) + fetched_pages = fetch_collection_task( + collection=fetchdata, vernacular_version=vernacular_version) mapped_pages = ( map_page_task .partial(collection=collection) diff --git a/dags/shared_tasks.py b/dags/shared_tasks.py index 04e72d62c..8102c7324 100644 --- a/dags/shared_tasks.py +++ b/dags/shared_tasks.py @@ -17,6 +17,7 @@ from rikolti.record_indexer.create_collection_index import get_index_name from rikolti.record_indexer.create_collection_index import delete_index from rikolti.record_indexer.move_index_to_prod import move_index_to_prod +from rikolti.utils.rikolti_storage import create_vernacular_version # TODO: remove the rikoltifetcher registry endpoint and restructure @@ -37,8 +38,14 @@ def get_collection_fetchdata_task(params=None): @task() -def fetch_collection_task(collection: dict): - fetch_status = fetch_collection(collection, {}) +def create_vernacular_version_task(collection): + vernacular_version = create_vernacular_version(collection.get('id')) + return vernacular_version + + +@task() +def fetch_collection_task(collection: dict, vernacular_version: str): + fetch_status = fetch_collection(collection, vernacular_version, {}) success = all([page['status'] == 'success' for page in fetch_status]) total_items = sum([page['document_count'] for page in fetch_status]) total_pages = len(fetch_status) diff --git a/metadata_fetcher/fetch_registry_collections.py b/metadata_fetcher/fetch_registry_collections.py index d8bf8ec0a..27a322b54 100644 --- a/metadata_fetcher/fetch_registry_collections.py +++ b/metadata_fetcher/fetch_registry_collections.py @@ -5,6 +5,7 @@ import requests from . 
import lambda_function +from rikolti.utils.rikolti_storage import create_vernacular_version logger = logging.getLogger(__name__) @@ -51,7 +52,9 @@ def fetch_endpoint(url, limit=None, job_logger=logger): job_logger.debug( f"{collection_id:<6}: call lambda with payload: {collection}") - fetch_result = lambda_function.fetch_collection(collection, None) + vernacular_version = create_vernacular_version(collection_id) + fetch_result = lambda_function.fetch_collection( + collection, vernacular_version, None) results[collection_id] = fetch_result success = all([page['status'] == 'success' for page in fetch_result]) diff --git a/metadata_fetcher/fetchers/Fetcher.py b/metadata_fetcher/fetchers/Fetcher.py index cfedbaf12..17f53490a 100644 --- a/metadata_fetcher/fetchers/Fetcher.py +++ b/metadata_fetcher/fetchers/Fetcher.py @@ -3,7 +3,7 @@ import os from requests.adapters import HTTPAdapter, Retry -from rikolti.utils.rikolti_storage import create_vernacular_version, put_page_content +from rikolti.utils.rikolti_storage import put_page_content logger = logging.getLogger(__name__) @@ -22,11 +22,11 @@ class FetchError(Exception): class Fetcher(object): - def __init__(self, params): + def __init__(self, params, vernacular_data_version): self.harvest_type = params.get('harvest_type') self.collection_id = params.get('collection_id') self.write_page = params.get('write_page', 0) - self.data_destination = create_vernacular_version(self.collection_id) + self.data_destination = vernacular_data_version if not self.collection_id: raise CollectionIdRequired("collection_id is required") diff --git a/metadata_fetcher/lambda_function.py b/metadata_fetcher/lambda_function.py index e6b2f3376..5ebd2e1b3 100644 --- a/metadata_fetcher/lambda_function.py +++ b/metadata_fetcher/lambda_function.py @@ -4,6 +4,7 @@ import sys from .fetchers.Fetcher import Fetcher, InvalidHarvestEndpoint +from rikolti.utils.rikolti_storage import create_vernacular_version logger = logging.getLogger(__name__) @@ -20,7 +21,7 @@ def import_fetcher(harvest_type): # AWS Lambda entry point -def fetch_collection(payload, context): +def fetch_collection(payload, vernacular_version, context): if isinstance(payload, str): payload = json.loads(payload) @@ -30,7 +31,7 @@ def fetch_collection(payload, context): fetch_status = [] try: - fetcher = fetcher_class(payload) + fetcher = fetcher_class(payload, vernacular_version) fetch_status.append(fetcher.fetch_page()) except InvalidHarvestEndpoint as e: logger.error(e) @@ -53,7 +54,7 @@ def fetch_collection(payload, context): fetch_status = fetch_status[0] if not json.loads(next_page).get('finished'): - fetch_status.extend(fetch_collection(next_page, {})) + fetch_status.extend(fetch_collection(next_page, vernacular_version, {})) return fetch_status @@ -71,7 +72,8 @@ def fetch_collection(payload, context): encoding='utf-8', level=logging.DEBUG ) + vernacular_version = create_vernacular_version(payload.get('collection_id')) print(f"Starting to fetch collection {payload.get('collection_id')}") - fetch_collection(payload, {}) + fetch_collection(payload, vernacular_version, {}) print(f"Finished fetching collection {payload.get('collection_id')}") sys.exit(0) diff --git a/metadata_fetcher/tests.py b/metadata_fetcher/tests.py index 6f176780b..85f6cebb1 100644 --- a/metadata_fetcher/tests.py +++ b/metadata_fetcher/tests.py @@ -10,6 +10,7 @@ nuxeo_nested_complex_object_harvests) from .sample_data.oac_harvests import oac_harvests from .sample_data.oai_harvests import oai_harvests +from rikolti.utils.rikolti_storage import 
create_vernacular_version
 
 
 def main():
@@ -25,7 +26,8 @@ def main():
 
     for harvest in harvests:
         print(f"tests.py: {json.dumps(harvest)}")
-        status = fetch_collection(json.dumps(harvest), {})
+        vernacular_version = create_vernacular_version(harvest.get('collection_id'))
+        status = fetch_collection(json.dumps(harvest), vernacular_version, {})
         print(f"Harvest status: {status}")
 
     urls = [
diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py
index 38e8fb8a7..f885a3927 100644
--- a/utils/rikolti_storage.py
+++ b/utils/rikolti_storage.py
@@ -18,7 +18,7 @@ def parse_data_uri(data_uri: str):
         data_uri, data_loc.scheme, data_loc.netloc, data_loc.path)
 
 
-def list_dirs(data_uri: str, **kwargs) -> list[str]:
+def list_dirs(data_uri: str, recursive=False, **kwargs) -> list[str]:
     data = parse_data_uri(data_uri)
     if data.store == 's3':
         s3 = boto3.client('s3', **kwargs)
@@ -197,53 +197,52 @@ def put_file_content(data: DataStorage, content) -> str:
         f.write(content)
     return data.uri
 
+
 def create_vernacular_version(
     collection_id: int or str,
-    vernacular_suffix: Optional[str] = None
+    version_suffix: Optional[str] = None
     ):
     fetcher_data_dest = os.environ.get(
         "FETCHER_DATA_DEST", "file:///tmp")
-    vernacular_root = (
+    collection_path = (
         f"{fetcher_data_dest.rstrip('/')}/{collection_id}/")
-    if not vernacular_suffix:
-        vernacular_suffix = (
+    if not version_suffix:
+        version_suffix = (
             datetime.now().strftime('%Y-%m-%dT%H:%M:%S'))
-    vernacular_path = (
-        f"{vernacular_root}vernacular_metadata_{vernacular_suffix}/")
-    return vernacular_path
-
-
-class RikoltiStorage():
-    def __init__(
-        self,
-        collection_id: int or str,
-        vernacular_suffix: Optional[str] = None,
-        vernacular_path: Optional[str] = None,
-        mapped_data_suffix: Optional[str] = None,
-        mapped_data_path: Optional[str] = None,
-        **kwargs):
-
-        self.collection_id = collection_id
-
-        if not vernacular_path:
-            fetcher_data_dest = os.environ.get(
-                "FETCHER_DATA_DEST", "file:///tmp")
-            vernacular_root = (
-                f"{fetcher_data_dest.rstrip('/')}/{collection_id}/")
-            if not vernacular_suffix:
-                vernacular_suffix = (
-                    datetime.now().strftime('%Y-%m-%dT%H:%M:%S'))
-            vernacular_path = (
-                f"{vernacular_root}vernacular_metadata_{vernacular_suffix}/")
-
-        self.vernacular = vernacular_path.rstrip('/')+"/"
-
-        mapped_data_dest = os.environ.get("MAPPED_DATA_DEST", "file:///tmp")
-        mapped_root = (
-            f"{mapped_data_dest.rstrip('/')}/{self.collection_id}/"
-        )
+    vernacular_version_path = (
+        f"{collection_path}vernacular_metadata_{version_suffix}/")
+    return vernacular_version_path
+
+
+def get_most_recent_vernacular_version(collection_id: int or str):
+    mapper_data_src = os.environ.get("MAPPED_DATA_SRC")
+    vernacular_versions = list_dirs(f"{mapper_data_src}/{collection_id}/")
+    if not vernacular_versions:
+        raise Exception(
+            f"No vernacular metadata versions found for {collection_id}")
+    return sorted(vernacular_versions)[-1]
+
+
+def create_mapped_version(
+    collection_id: int or str,
+    vernacular_path: str,
+    mapped_data_suffix: Optional[str] = None,
+):
+    mapper_data_dest = os.environ.get("MAPPED_DATA_DEST")
+    # get path of the vernacular version, not the vernacular data
+    mapped_root = vernacular_path.rsplit('data', 1)[0]
+
+    if mapper_data_dest:
+        # get path relative to collection_id
+        vernacular_path = vernacular_path.split(str(collection_id))[-1]
+        mapped_root = (
+            f"{mapper_data_dest.rstrip('/')}/{collection_id}/{vernacular_path}"
+        )
+    if not mapped_data_suffix:
mapped_data_suffix = ( + datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + mapped_data_path = ( + f"{mapped_root.rstrip('/')}/mapped_metadata_{mapped_data_suffix}/") + return mapped_data_path + # def list_fetched_content(self, recursive: bool=True, **kwargs) -> list: # return list_pages( @@ -252,36 +251,36 @@ def save_fetched_content(self, content: str, filename: str): # recursive=recursive # ) - def search_page(self, search_str: str, page: str) -> bool: - if self.data_store == 's3': - return self.search_s3_page(search_str, page) - elif self.data_store == 'file': - return self.search_file_page(search_str, page) - else: - raise Exception(f"Unknown data store: {self.data_store}") - - def search_s3_page(self, search_str: str, s3_key: str) -> bool: - """ - Check if search_str is in the body of the object located at s3_key - Returns the s3_key of the object if so, otherwise returns None - """ - obj = self.s3.get_object(Bucket=self.data_bucket, Key=s3_key) - body = obj['Body'].read().decode('utf-8') - if search_str in body: - return True - else: - return False - - def search_file_page(self, search_str: str, file_path: str) -> bool: - """ - Check if search_str is in the body of the file located at file_path - """ - with open(file_path, 'r') as f: - body = f.read() - if search_str in body: - return True - else: - return False + # def search_page(self, search_str: str, page: str) -> bool: + # if self.data_store == 's3': + # return self.search_s3_page(search_str, page) + # elif self.data_store == 'file': + # return self.search_file_page(search_str, page) + # else: + # raise Exception(f"Unknown data store: {self.data_store}") + + # def search_s3_page(self, search_str: str, s3_key: str) -> bool: + # """ + # Check if search_str is in the body of the object located at s3_key + # Returns the s3_key of the object if so, otherwise returns None + # """ + # obj = self.s3.get_object(Bucket=self.data_bucket, Key=s3_key) + # body = obj['Body'].read().decode('utf-8') + # if search_str in body: + # return True + # else: + # return False + + # def search_file_page(self, search_str: str, file_path: str) -> bool: + # """ + # Check if search_str is in the body of the file located at file_path + # """ + # with open(file_path, 'r') as f: + # body = f.read() + # if search_str in body: + # return True + # else: + # return False From e81427241c021adb3870198591d840a1abbc4cdd Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Thu, 9 Nov 2023 14:57:55 -0800 Subject: [PATCH 19/42] update metadata mapper to use versioning --- dags/harvest_dag.py | 7 +++++- dags/mapper_dag.py | 15 +++++++++--- dags/shared_tasks.py | 14 ++++++++--- metadata_mapper/lambda_function.py | 22 +++++------------ metadata_mapper/lambda_shepherd.py | 27 ++++++++++++--------- metadata_mapper/map_registry_collections.py | 3 +-- 6 files changed, 51 insertions(+), 37 deletions(-) diff --git a/dags/harvest_dag.py b/dags/harvest_dag.py index 78f814ce4..49a3f420a 100644 --- a/dags/harvest_dag.py +++ b/dags/harvest_dag.py @@ -8,6 +8,7 @@ from rikolti.dags.shared_tasks import fetch_collection_task from rikolti.dags.shared_tasks import get_collection_fetchdata_task from rikolti.dags.shared_tasks import get_collection_metadata_task +from rikolti.dags.shared_tasks import create_mapped_version_task from rikolti.dags.shared_tasks import map_page_task from rikolti.dags.shared_tasks import get_mapping_status_task from rikolti.dags.shared_tasks import validate_collection_task @@ -38,9 +39,13 @@ def harvest(): vernacular_version = 
create_vernacular_version_task(collection=fetchdata) fetched_pages = fetch_collection_task( collection=fetchdata, vernacular_version=vernacular_version) + mapped_data_version = create_mapped_version_task( + collection=collection, + vernacular_pages=fetched_pages + ) mapped_pages = ( map_page_task - .partial(collection=collection) + .partial(collection=collection, mapped_data_version=mapped_data_version) .expand(page=fetched_pages) ) diff --git a/dags/mapper_dag.py b/dags/mapper_dag.py index bb5963490..bcf639796 100644 --- a/dags/mapper_dag.py +++ b/dags/mapper_dag.py @@ -1,22 +1,27 @@ from datetime import datetime +from typing import Optional from airflow.decorators import dag, task from airflow.models.param import Param from rikolti.dags.shared_tasks import get_collection_metadata_task +from rikolti.dags.shared_tasks import create_mapped_version_task from rikolti.dags.shared_tasks import map_page_task from rikolti.dags.shared_tasks import get_mapping_status_task from rikolti.dags.shared_tasks import validate_collection_task from rikolti.metadata_mapper.lambda_shepherd import get_vernacular_pages +from rikolti.utils.rikolti_storage import get_most_recent_vernacular_version @task() -def get_vernacular_pages_task(collection: dict): +def get_vernacular_pages_task(collection: dict, vernacular_version: Optional[str] = None): collection_id = collection.get('id') + if not vernacular_version: + vernacular_version = get_most_recent_vernacular_version(collection_id) if not collection_id: raise ValueError( f"Collection ID not found in collection metadata: {collection}") - pages = get_vernacular_pages(collection_id) + pages = get_vernacular_pages(collection_id, vernacular_version) return pages # This is a functional duplicate of @@ -48,9 +53,13 @@ def get_vernacular_pages_task(collection: dict): def mapper_dag(): collection = get_collection_metadata_task() page_list = get_vernacular_pages_task(collection=collection) + mapped_data_version = create_mapped_version_task( + collection=collection, + vernacular_pages=page_list + ) mapped_pages = ( map_page_task - .partial(collection=collection) + .partial(collection=collection, mapped_data_version=mapped_data_version) .expand(page=page_list) ) diff --git a/dags/shared_tasks.py b/dags/shared_tasks.py index 8102c7324..5975e5027 100644 --- a/dags/shared_tasks.py +++ b/dags/shared_tasks.py @@ -17,6 +17,7 @@ from rikolti.record_indexer.create_collection_index import get_index_name from rikolti.record_indexer.create_collection_index import delete_index from rikolti.record_indexer.move_index_to_prod import move_index_to_prod +from rikolti.utils.rikolti_storage import create_mapped_version from rikolti.utils.rikolti_storage import create_vernacular_version @@ -99,11 +100,11 @@ def get_collection_metadata_task(params=None): # max_active_tis_per_dag - setting on the task to restrict how many # instances can be running at the same time, *across all DAG runs* @task() -def map_page_task(page: str, collection: dict): +def map_page_task(page: str, collection: dict, mapped_data_version: str): collection_id = collection.get('id') - if not collection_id: + if not collection_id or not mapped_data_version: return False - mapped_page = map_page(collection_id, page, collection) + mapped_page = map_page(collection_id, page, mapped_data_version, collection) return mapped_page @@ -113,6 +114,13 @@ def get_mapping_status_task(collection: dict, mapped_pages: list): return mapping_status +@task() +def create_mapped_version_task(collection, vernacular_pages): + mapped_data_version = 
create_mapped_version( + collection.get('id'), vernacular_pages[0]) + return mapped_data_version + + @task() def validate_collection_task(collection_status: dict, params=None) -> str: if not params or not params.get('validate'): diff --git a/metadata_mapper/lambda_function.py b/metadata_mapper/lambda_function.py index 1d63d85de..8e92784a6 100644 --- a/metadata_mapper/lambda_function.py +++ b/metadata_mapper/lambda_function.py @@ -1,6 +1,7 @@ import importlib import json import logging +import os import sys from typing import Union from urllib.parse import parse_qs, urlparse @@ -72,7 +73,7 @@ def run_enrichments(records, collection, enrichment_set, page_filename): return records -def map_page(collection_id: int, page_path: str, collection: Union[dict, str]): +def map_page(collection_id: int, page_path: str, mapped_data_version: str, collection: Union[dict, str]): if isinstance(collection, str): collection = json.loads(collection) @@ -91,15 +92,6 @@ def map_page(collection_id: int, page_path: str, collection: Union[dict, str]): record.to_UCLDC() mapped_records = source_metadata_records - # TODO: write interim mapped but not enriched metadata to s3? - # put_page_content( - # json.dumps([record.to_dict() for record in mapped_records]), - # ( - # f"{settings.DATA_DEST_URL}/{collection_id}/" - # f"interim_mapped_metadata/{page_filename}" - # ) - # ) - mapped_records = run_enrichments( mapped_records, collection, 'rikolti__enrichments', page_filename) @@ -128,10 +120,7 @@ def map_page(collection_id: int, page_path: str, collection: Union[dict, str]): mapped_metadata = [record.to_dict() for record in mapped_records] put_page_content( json.dumps(mapped_metadata), - ( - f"{settings.DATA_DEST_URL}/{collection_id}/" - f"mapped_metadata/{page_filename}" - ) + f"{mapped_data_version.rstrip('/')}/data/{page_filename}.jsonl" ) return { @@ -147,11 +136,12 @@ def map_page(collection_id: int, page_path: str, collection: Union[dict, str]): parser = argparse.ArgumentParser( description="Map metadata from the institution's vernacular") parser.add_argument('collection_id', help='collection id') - parser.add_argument('page_path', help='uri file path to vernauclar metadata page filename') + parser.add_argument('page_path', help='uri file path to vernauclar metadata page filename; ex: file:///rikolti_data_root/3433/vernacular_data_version_1/data/1') + parser.add_argument('mapped_data_version', help='uri file path to mapped data version; ex: file:///rikolti_data_root/3433/vernacular_data_version_1/mapped_data_version_1/') parser.add_argument('collection', help='json collection metadata from registry') args = parser.parse_args(sys.argv[1:]) - mapped_page = map_page(args.collection_id, args.page_path, args.collection) + mapped_page = map_page(args.collection_id, args.page_path, args.mapped_data_path, args.collection) print(f"{mapped_page.get('num_records_mapped')} records mapped") diff --git a/metadata_mapper/lambda_shepherd.py b/metadata_mapper/lambda_shepherd.py index c40c9dd2f..506c35385 100644 --- a/metadata_mapper/lambda_shepherd.py +++ b/metadata_mapper/lambda_shepherd.py @@ -5,10 +5,10 @@ from urllib.parse import urlparse -from . import settings, validate_mapping +from . 
import validate_mapping from .lambda_function import map_page from .mappers.mapper import Record -from rikolti.utils.rikolti_storage import list_pages +from rikolti.utils.rikolti_storage import list_pages, create_mapped_version, get_most_recent_vernacular_version def get_collection(collection_id): @@ -37,12 +37,9 @@ def check_for_missing_enrichments(collection): return not_yet_implemented -def get_vernacular_pages(collection_id): +def get_vernacular_pages(collection_id, vernacular_version): try: - page_list = list_pages( - f"{settings.DATA_SRC_URL}/{collection_id}/vernacular_metadata", - recursive=True - ) + page_list = list_pages(vernacular_version, recursive=True) except FileNotFoundError as e: print( f"{e} - have you fetched {collection_id}? " @@ -50,7 +47,7 @@ def get_vernacular_pages(collection_id): ) raise(e) - # TODO: split page_list into pages and children + # TODO: split page_list into pages and children? return page_list @@ -75,7 +72,7 @@ def get_mapping_status(collection, mapped_pages): 'group_exceptions': group_exceptions } -def map_collection(collection_id, validate=False): +def map_collection(collection_id, vernacular_version=None, validate=False): # This is a functional duplicate of rikolti.d*gs.mapper_d*g.mapper_d*g # Within an airflow runtime context, we take advantage of airflow's dynamic @@ -91,11 +88,16 @@ def map_collection(collection_id, validate=False): collection = get_collection(collection_id) - page_list = get_vernacular_pages(collection_id) + if not vernacular_version: + vernacular_version = get_most_recent_vernacular_version(collection_id) + page_list = get_vernacular_pages(collection_id, vernacular_version) + + mapped_data_version = create_mapped_version(collection_id, page_list[0]) mapped_pages = [] for page in page_list: try: - mapped_page = map_page(collection_id, page, collection) + mapped_page = map_page( + collection_id, page, mapped_data_version, collection) mapped_pages.append(mapped_page) except KeyError: print( @@ -126,8 +128,9 @@ def map_collection(collection_id, validate=False): parser.add_argument('collection_id', help='collection ID from registry') parser.add_argument('--validate', help='validate mapping; may provide json opts', const=True, nargs='?') + parser.add_argument('vernacular_version', help='URI to a folder of vernacular pages to map') args = parser.parse_args(sys.argv[1:]) - mapped_collection = map_collection(args.collection_id, args.validate) + mapped_collection = map_collection(args.collection_id, args.vernacular_version, args.validate) missing_enrichments = mapped_collection.get('missing_enrichments') if len(missing_enrichments) > 0: print( diff --git a/metadata_mapper/map_registry_collections.py b/metadata_mapper/map_registry_collections.py index 3cb241db1..584830b86 100644 --- a/metadata_mapper/map_registry_collections.py +++ b/metadata_mapper/map_registry_collections.py @@ -51,8 +51,7 @@ def map_endpoint(url, limit=None): f"{collection_id:<6}: call lambda with collection_id: {collection_id}") try: - map_result = lambda_shepherd.map_collection( - collection_id) + map_result = lambda_shepherd.map_collection(collection_id) except FileNotFoundError: print(f"{collection_id:<6}: not fetched yet", file=sys.stderr) continue From 6bee4c318852651ea65a57753c8ef251f93666a8 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Thu, 9 Nov 2023 15:07:06 -0800 Subject: [PATCH 20/42] maintain Fetcher.__init__ function signature --- dags/shared_tasks.py | 3 +-- metadata_fetcher/fetchers/Fetcher.py | 6 +++--- metadata_fetcher/lambda_function.py | 3 ++- 
3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dags/shared_tasks.py b/dags/shared_tasks.py index 5975e5027..c7b9cd463 100644 --- a/dags/shared_tasks.py +++ b/dags/shared_tasks.py @@ -40,8 +40,7 @@ def get_collection_fetchdata_task(params=None): @task() def create_vernacular_version_task(collection): - vernacular_version = create_vernacular_version(collection.get('id')) - return vernacular_version + return create_vernacular_version(collection.get('collection_id')) @task() diff --git a/metadata_fetcher/fetchers/Fetcher.py b/metadata_fetcher/fetchers/Fetcher.py index 17f53490a..02085d18c 100644 --- a/metadata_fetcher/fetchers/Fetcher.py +++ b/metadata_fetcher/fetchers/Fetcher.py @@ -22,11 +22,12 @@ class FetchError(Exception): class Fetcher(object): - def __init__(self, params, vernacular_data_version): + def __init__(self, params): self.harvest_type = params.get('harvest_type') self.collection_id = params.get('collection_id') self.write_page = params.get('write_page', 0) - self.data_destination = vernacular_data_version + self.data_destination = params.get('vernacular_version') + if not self.collection_id: raise CollectionIdRequired("collection_id is required") @@ -51,7 +52,6 @@ def fetch_page(self): try: filepath = put_page_content( content, f"{self.data_destination}data/{self.write_page}") - print(filepath) except Exception as e: print(f"Metadata Fetcher: {e}") raise(e) diff --git a/metadata_fetcher/lambda_function.py b/metadata_fetcher/lambda_function.py index 5ebd2e1b3..d4f78b68f 100644 --- a/metadata_fetcher/lambda_function.py +++ b/metadata_fetcher/lambda_function.py @@ -28,10 +28,11 @@ def fetch_collection(payload, vernacular_version, context): logger.debug(f"fetch_collection payload: {payload}") fetcher_class = import_fetcher(payload.get('harvest_type')) + payload.update({'vernacular_version': vernacular_version}) fetch_status = [] try: - fetcher = fetcher_class(payload, vernacular_version) + fetcher = fetcher_class(payload) fetch_status.append(fetcher.fetch_page()) except InvalidHarvestEndpoint as e: logger.error(e) From 9dc2e17f754b276afb00e4a74a4da3409ed42bf2 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Thu, 9 Nov 2023 16:07:17 -0800 Subject: [PATCH 21/42] add full filepaths to map_page output --- dags/harvest_dag.py | 6 ++-- dags/shared_tasks.py | 3 +- dags/utils_by_mapper_type.py | 2 +- metadata_mapper/lambda_function.py | 11 +++---- metadata_mapper/lambda_shepherd.py | 5 +++- metadata_mapper/validate_mapping.py | 20 ++++++------- metadata_mapper/validator/validation_log.py | 9 ++---- utils/rikolti_storage.py | 33 +++++++++++++++++++++ 8 files changed, 61 insertions(+), 28 deletions(-) diff --git a/dags/harvest_dag.py b/dags/harvest_dag.py index 49a3f420a..eeac13dd8 100644 --- a/dags/harvest_dag.py +++ b/dags/harvest_dag.py @@ -17,7 +17,7 @@ @task() def get_mapped_page_filenames_task(mapped_pages): - return [mapped['page_filename'] for mapped in mapped_pages] + return [mapped['mapped_page_path'] for mapped in mapped_pages] @dag( dag_id="harvest_collection", @@ -51,7 +51,7 @@ def harvest(): mapping_status = get_mapping_status_task(collection, mapped_pages) validate_collection_task(mapping_status) - mapped_page_filenames = get_mapped_page_filenames_task(mapped_pages) + mapped_page_paths = get_mapped_page_filenames_task(mapped_pages) content_harvest_task = ( ContentHarvestOperator @@ -60,7 +60,7 @@ def harvest(): collection_id="{{ params.collection_id }}", ) .expand( - page=mapped_page_filenames + page=mapped_page_paths ) ) content_harvest_task diff --git 
a/dags/shared_tasks.py b/dags/shared_tasks.py index c7b9cd463..792875f21 100644 --- a/dags/shared_tasks.py +++ b/dags/shared_tasks.py @@ -131,7 +131,8 @@ def validate_collection_task(collection_status: dict, params=None) -> str: if collection_status.get('status') != 'success': raise Exception(f"Collection {collection_id} not successfully mapped") - num_rows, file_location = create_collection_validation_csv(collection_id) + num_rows, file_location = create_collection_validation_csv( + collection_id, collection_status['mapped_page_paths']) print(f"Output {num_rows} rows to {file_location}") # create a link to the file in the logs diff --git a/dags/utils_by_mapper_type.py b/dags/utils_by_mapper_type.py index bda78ff83..85d1b0386 100644 --- a/dags/utils_by_mapper_type.py +++ b/dags/utils_by_mapper_type.py @@ -66,7 +66,7 @@ def validate_endpoint_task(url, params=None): for collection in registry_endpoint(url): print(f"{collection['collection_id']:<6} Validating collection") num_rows, file_location = create_collection_validation_csv( - collection['collection_id']) + collection['collection_id'], mapped_page_paths) csv_paths.append(file_location) if file_location.startswith('s3://'): s3_path = urlparse(file_location) diff --git a/metadata_mapper/lambda_function.py b/metadata_mapper/lambda_function.py index 8e92784a6..a5922700e 100644 --- a/metadata_mapper/lambda_function.py +++ b/metadata_mapper/lambda_function.py @@ -73,14 +73,14 @@ def run_enrichments(records, collection, enrichment_set, page_filename): return records -def map_page(collection_id: int, page_path: str, mapped_data_version: str, collection: Union[dict, str]): +def map_page(collection_id: int, vernacular_page_path: str, mapped_data_version: str, collection: Union[dict, str]): if isinstance(collection, str): collection = json.loads(collection) vernacular_reader = import_vernacular_reader( collection.get('rikolti_mapper_type')) - page_filename = os.path.basename(page_path) - api_resp = get_page_content(page_path) + page_filename = os.path.basename(vernacular_page_path) + api_resp = get_page_content(vernacular_page_path) source_vernacular = vernacular_reader(collection_id, page_filename) source_metadata_records = source_vernacular.parse(api_resp) @@ -118,7 +118,7 @@ def map_page(collection_id: int, page_path: str, mapped_data_version: str, colle # for record in mapped_records] mapped_metadata = [record.to_dict() for record in mapped_records] - put_page_content( + mapped_page_path = put_page_content( json.dumps(mapped_metadata), f"{mapped_data_version.rstrip('/')}/data/{page_filename}.jsonl" ) @@ -127,7 +127,7 @@ def map_page(collection_id: int, page_path: str, mapped_data_version: str, colle 'status': 'success', 'num_records_mapped': len(mapped_records), 'page_exceptions': group_page_exceptions, - 'page_filename': page_filename, + 'mapped_page_path': mapped_page_path, } @@ -144,6 +144,7 @@ def map_page(collection_id: int, page_path: str, mapped_data_version: str, colle mapped_page = map_page(args.collection_id, args.page_path, args.mapped_data_path, args.collection) print(f"{mapped_page.get('num_records_mapped')} records mapped") + print(f"mapped page at {mapped_page.get('mapped_page_path')}") for report, couch_ids in mapped_page.get('exceptions', {}).items(): print(f"{len(couch_ids)} records report enrichments errors: {report}") diff --git a/metadata_mapper/lambda_shepherd.py b/metadata_mapper/lambda_shepherd.py index 506c35385..baa5da644 100644 --- a/metadata_mapper/lambda_shepherd.py +++ b/metadata_mapper/lambda_shepherd.py @@ -69,7 
+69,8 @@ def get_mapping_status(collection, mapped_pages): 'missing_enrichments': check_for_missing_enrichments(collection), 'count': count, 'page_count': page_count, - 'group_exceptions': group_exceptions + 'group_exceptions': group_exceptions, + 'mapped_page_paths': [page['mapped_page_path'] for page in mapped_pages], } def map_collection(collection_id, vernacular_version=None, validate=False): @@ -107,12 +108,14 @@ def map_collection(collection_id, vernacular_version=None, validate=False): continue collection_stats = get_mapping_status(collection, mapped_pages) + mapped_page_paths = [page['mapped_page_path'] for page in mapped_pages] if validate: opts = validate if isinstance(validate, dict) else {} num_rows, file_location = ( validate_mapping.create_collection_validation_csv( collection_id, + mapped_page_paths, **opts ) ) diff --git a/metadata_mapper/validate_mapping.py b/metadata_mapper/validate_mapping.py index 584e53aa7..f8b606164 100644 --- a/metadata_mapper/validate_mapping.py +++ b/metadata_mapper/validate_mapping.py @@ -10,12 +10,13 @@ from .validator.validation_log import ValidationLogLevel from .validator.validation_mode import ValidationMode from .validator.validator import Validator -from rikolti.utils.rikolti_storage import list_pages, get_page_content +from rikolti.utils.rikolti_storage import get_page_content urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) def validate_collection(collection_id: int, + mapped_page_paths: list[str], validator_class: Type[Validator] = None, validator: Validator = None, validation_mode = ValidationMode.STRICT, @@ -50,12 +51,7 @@ def validate_collection(collection_id: int, log_level = log_level, verbose = verbose) - mapped_pages = list_pages( - f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata/", - recursive=False - ) - - for page_path in mapped_pages: + for page_path in mapped_page_paths: validate_page(collection_id, page_path, validator) return validator @@ -115,9 +111,11 @@ def validate_page(collection_id: int, page_path: str, return validator -def create_collection_validation_csv(collection_id: int, **options) -> tuple[int, str]: - result = validate_collection(collection_id, **options) - filename = result.log.output_csv_to_bucket(collection_id) +def create_collection_validation_csv( + collection_id: int, mapped_page_paths: list[str], **options) -> tuple[int, str]: + result = validate_collection(collection_id, mapped_page_paths, **options) + + filename = result.log.output_csv_to_bucket(collection_id, mapped_page_paths[0]) return len(result.log.log), filename ## Private-ish @@ -276,5 +274,5 @@ def get_validator_class(collection_id: int) -> Type[Validator]: print(kwargs) num_rows, file_location = create_collection_validation_csv( - args.collection_id, **kwargs) + args.collection_id, mapped_page_paths, **kwargs) print(f"Output {num_rows} rows to {file_location}") diff --git a/metadata_mapper/validator/validation_log.py b/metadata_mapper/validator/validation_log.py index 72e6eeeae..516eafaa3 100644 --- a/metadata_mapper/validator/validation_log.py +++ b/metadata_mapper/validator/validation_log.py @@ -3,7 +3,7 @@ from typing import IO, Any from .. 
import settings -from rikolti.utils.rikolti_storage import put_page_content +from rikolti.utils.rikolti_storage import put_page_content, create_validation_version class ValidationLogLevel(Enum): @@ -110,7 +110,7 @@ def output_csv_to_file(self, file: IO[str], append: bool = False, with open(file, "a" if append else "w") as f: f.write(self._csv_content_string(include_fields, append)) - def output_csv_to_bucket(self, collection_id: int, filename: str = None, + def output_csv_to_bucket(self, collection_id: int, mapped_data_path: str = None, include_fields: list[str] = None) -> str: """ Writes a CSV to the env-appropriate bucket (local or S3). @@ -124,14 +124,11 @@ def output_csv_to_bucket(self, collection_id: int, filename: str = None, include_fields: list[str] (default: None) A list of fields to include in the CSV. Defaults to all. """ - if not filename: - filename = f"{datetime.now().strftime('%m-%d-%YT%H:%M:%S')}.csv" - content = self._csv_content_string(include_fields) if isinstance(content, list) or isinstance(content, dict): content = json.dumps(content) - file_location = f"{settings.DATA_SRC_URL}/{collection_id}/validation/{filename}" + file_location = create_validation_version(collection_id, mapped_data_path) put_page_content(content, file_location) return file_location diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py index f885a3927..8e217dc00 100644 --- a/utils/rikolti_storage.py +++ b/utils/rikolti_storage.py @@ -283,5 +283,38 @@ def create_mapped_version( # return False +def create_validation_version( + collection_id: int or str, + mapped_data_path: str, + validation_suffix: Optional[str] = None +): + validation_data_dest = os.environ.get("VALIDATION_DATA_DEST") + # get path of the mapped data version, not the mapped data + validation_root = mapped_data_path.rsplit('data', 1)[0] + + if validation_data_dest: + # get path relative to collection_id + mapped_data_path = mapped_data_path.split(str(collection_id))[-1] + validation_root = ( + f"{validation_data_dest.rstrip('/')}/{collection_id}/{mapped_data_path}" + ) + + if not validation_suffix: + validation_suffix = ( + datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + validation_data_path = ( + f"{validation_root.rstrip('/')}/validation_{validation_suffix}.csv") + return validation_data_path + + validation_data_dest = os.environ.get( + "VALIDATION_DATA_DEST", "file:///tmp") + collection_path = ( + f"{validation_data_dest.rstrip('/')}/{collection_id}/") + if not validation_suffix: + validation_suffix = ( + datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) + validation_version_path = ( + f"{collection_path}validation_{validation_suffix}/") + return validation_version_path From df41fb0cef65c08b36c5e032266ba5e9e6769c50 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Wed, 15 Nov 2023 11:42:10 -0800 Subject: [PATCH 22/42] update content harvester to use versioning --- content_harvester/by_collection.py | 22 +++++---- content_harvester/by_page.py | 51 +++++++++++---------- utils/rikolti_storage.py | 71 ++++++++++-------------------- 3 files changed, 65 insertions(+), 79 deletions(-) diff --git a/content_harvester/by_collection.py b/content_harvester/by_collection.py index 9d7215b5d..dc8bd5a39 100644 --- a/content_harvester/by_collection.py +++ b/content_harvester/by_collection.py @@ -2,12 +2,12 @@ from . 
import settings from .by_page import harvest_page_content -from rikolti.utils.rikolti_storage import list_pages +from rikolti.utils.rikolti_storage import list_pages, create_content_data_version -def get_mapped_pages(collection_id): +def get_mapped_pages(mapped_data_version:str): page_list = [] page_list = list_pages( - f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata", + f"{mapped_data_version.rstrip('/')}/data/", recursive=False, aws_access_key_id=settings.AWS_ACCESS_KEY_ID, aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY, @@ -18,20 +18,25 @@ def get_mapped_pages(collection_id): # {"collection_id": 26098, "rikolti_mapper_type": "nuxeo.nuxeo"} -def harvest_collection(collection): +def harvest_collection(collection, mapped_data_version: str): if isinstance(collection, str): collection = json.loads(collection) collection_id = collection.get('collection_id') - if not collection_id: - print("ERROR ERROR ERROR\ncollection_id required") + if not collection_id or not mapped_data_version: + print("ERROR ERROR ERROR\ncollection_id and mapped_data_version required") exit() - page_list = get_mapped_pages(collection_id) + page_list = get_mapped_pages(mapped_data_version) print(f"[{collection_id}]: Harvesting content for {len(page_list)} pages") collection_stats = {} + collection.update({ + 'content_data_version': create_content_data_version( + collection_id, mapped_data_version) + }) + for page_path in page_list: collection.update({'page_path': page_path}) page_stats = harvest_page_content(**collection) @@ -56,6 +61,7 @@ def harvest_collection(collection): parser = argparse.ArgumentParser( description="Harvest content by collection using mapped metadata") parser.add_argument('collection_id', help="Collection ID") + parser.add_argument('mapped_data_version', help="URI to mapped data version: ex: s3://rikolti-data-root/3433/vernacular_data_version_1/mapped_data_version_2/") parser.add_argument('--nuxeo', action="store_true", help="Use Nuxeo auth") args = parser.parse_args() arguments = { @@ -63,4 +69,4 @@ def harvest_collection(collection): } if args.nuxeo: arguments['rikolti_mapper_type'] = 'nuxeo.nuxeo' - print(harvest_collection(arguments)) \ No newline at end of file + print(harvest_collection(arguments, args.mapped_data_version)) \ No newline at end of file diff --git a/content_harvester/by_page.py b/content_harvester/by_page.py index 4789a4cbd..0c2515638 100644 --- a/content_harvester/by_page.py +++ b/content_harvester/by_page.py @@ -14,7 +14,7 @@ from . import derivatives from . 
import settings -from rikolti.utils.rikolti_storage import list_pages, get_page_content, put_page_content +from rikolti.utils.rikolti_storage import list_pages, get_page_content, put_page_content, create_content_data_version class DownloadError(Exception): pass @@ -30,32 +30,33 @@ def get_mapped_records(page_path) -> list: return mapped_records -def write_mapped_record(collection_id, record): - put_page_content( +def write_mapped_record(content_data_version, record): + filename = put_page_content( json.dumps(record), ( - f"{settings.DATA_DEST_URL}/{collection_id}/mapped_with_content/" - f"{record.get('calisphere-id').replace(os.sep, '_')}" + f"{content_data_version.rstrip('/')}/data/" + f"{record.get('calisphere-id').replace(os.sep, '_')}.json" ) -) + ) + return filename -def write_mapped_page(collection_id, page, records): - put_page_content( +def write_mapped_page(content_data_version, page, records): + filename = put_page_content( json.dumps(records), - f"{settings.DATA_DEST_URL}/{collection_id}/mapped_with_content/{page}" + f"{content_data_version.rstrip('/')}/data/{page}" ) + return filename -def get_child_records(collection_id, parent_id) -> list: +def get_child_records(mapped_page_path, parent_id) -> list: mapped_child_records = [] children = list_pages( - f"{settings.DATA_SRC_URL}/{collection_id}/mapped_metadata/children", + f"{mapped_page_path.rsplit('/', 1)[0]}/children/", recursive=False ) - if rikolti_data.data_store == 'file': - children = [page for page in children - if os.path.basename(page).startswith(parent_id)] + children = [page for page in children + if (page.rsplit('/')[-1]).startswith(parent_id)] for child in children: mapped_child_records.extend(json.loads(get_page_content(child))) return mapped_child_records @@ -179,7 +180,8 @@ def check_mimetype(self, mimetype): class ContentHarvester(object): # context = {'collection_id': '12345', 'page_filename': '1.jsonl'} - def __init__(self, collection_id, page_filename, src_auth=None): + def __init__(self, mapped_page_path, collection_id, page_filename, src_auth=None): + self.mapped_page_path = mapped_page_path self.http = requests.Session() retry_strategy = Retry( @@ -249,7 +251,7 @@ def harvest(self, record: dict, download_cache: Optional[dict] = None) -> dict: # Recurse through the record's children (if any) child_records = get_child_records( - self.collection_id, calisphere_id) + self.mapped_page_path, calisphere_id) if child_records: print( f"[{self.collection_id}, {self.page_filename}, {calisphere_id}]: " @@ -347,9 +349,9 @@ def _upload(self, dest_prefix, dest_filename, filepath, cache: Optional[dict] = # {"collection_id": 26098, "rikolti_mapper_type": "nuxeo.nuxeo", "page_filename": "file:///rikolti_data/r-0"} -def harvest_page_content(collection_id, page_path, **kwargs): +def harvest_page_content(collection_id, mapped_page_path, content_data_version, **kwargs): rikolti_mapper_type = kwargs.get('rikolti_mapper_type') - page_filename = os.path.basename(page_path) + page_filename = os.path.basename(mapped_page_path) # Weird how we have to use username/pass to hit this endpoint # but we have to use auth token to hit API endpoint @@ -357,12 +359,13 @@ def harvest_page_content(collection_id, page_path, **kwargs): if rikolti_mapper_type == 'nuxeo.nuxeo': auth = (settings.NUXEO_USER, settings.NUXEO_PASS) harvester = ContentHarvester( + mapped_page_path, collection_id=collection_id, page_filename=page_filename, src_auth=auth ) - records = get_mapped_records(page_path) + records = get_mapped_records(mapped_page_path) print( 
f"[{collection_id}, {page_filename}]: " f"Harvesting content for {len(records)} records" @@ -377,7 +380,7 @@ def harvest_page_content(collection_id, page_path, **kwargs): try: record_with_content = harvester.harvest(record) # write_mapped_record( - # collection_id, record_with_content) + # content_data_version, record_with_content) if not record_with_content.get('thumbnail'): warn_level = "ERROR" if 'sound' in record.get('type', []): @@ -396,7 +399,7 @@ def harvest_page_content(collection_id, page_path, **kwargs): print(f"Exiting after harvesting {i} of {len(records)} items " f"in page {page_filename} of collection {collection_id}") - write_mapped_page(collection_id, page_filename, records) + write_mapped_page(content_data_version, page_filename, records) media_source = [r for r in records if r.get('media_source')] media_harvested = [r for r in records if r.get('media')] @@ -455,12 +458,14 @@ def harvest_page_content(collection_id, page_path, **kwargs): parser = argparse.ArgumentParser( description="Harvest content using a page of mapped metadata") parser.add_argument('collection_id', help="Collection ID") - parser.add_argument('page_path', help="URI-formatted path to a mapped metadata page") + parser.add_argument('mapped_page_path', help="URI-formatted path to a mapped metadata page") parser.add_argument('--nuxeo', action="store_true", help="Use Nuxeo auth") args = parser.parse_args() arguments = { 'collection_id': args.collection_id, - 'page_filename': args.page_path, + 'mapped_page_path': args.mapped_page_path, + 'content_data_version': create_content_data_version( + args.collection_id, args.mapped_page_path.rsplit('data', 1)[0]) } if args.nuxeo: arguments['rikolti_mapper_type'] = 'nuxeo.nuxeo' diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py index 8e217dc00..4043e80ae 100644 --- a/utils/rikolti_storage.py +++ b/utils/rikolti_storage.py @@ -244,45 +244,6 @@ def create_mapped_version( return mapped_data_path - # def list_fetched_content(self, recursive: bool=True, **kwargs) -> list: - # return list_pages( - # f"{self.vernacular_data}/{self.collection_id}/" - # f"vernacular_metadata{self.suffix}/", - # recursive=recursive - # ) - - # def search_page(self, search_str: str, page: str) -> bool: - # if self.data_store == 's3': - # return self.search_s3_page(search_str, page) - # elif self.data_store == 'file': - # return self.search_file_page(search_str, page) - # else: - # raise Exception(f"Unknown data store: {self.data_store}") - - # def search_s3_page(self, search_str: str, s3_key: str) -> bool: - # """ - # Check if search_str is in the body of the object located at s3_key - # Returns the s3_key of the object if so, otherwise returns None - # """ - # obj = self.s3.get_object(Bucket=self.data_bucket, Key=s3_key) - # body = obj['Body'].read().decode('utf-8') - # if search_str in body: - # return True - # else: - # return False - - # def search_file_page(self, search_str: str, file_path: str) -> bool: - # """ - # Check if search_str is in the body of the file located at file_path - # """ - # with open(file_path, 'r') as f: - # body = f.read() - # if search_str in body: - # return True - # else: - # return False - - def create_validation_version( collection_id: int or str, mapped_data_path: str, @@ -306,15 +267,29 @@ def create_validation_version( f"{validation_root.rstrip('/')}/validation_{validation_suffix}.csv") return validation_data_path - validation_data_dest = os.environ.get( - "VALIDATION_DATA_DEST", "file:///tmp") - collection_path = ( - 
f"{validation_data_dest.rstrip('/')}/{collection_id}/") - if not validation_suffix: - validation_suffix = ( + +def create_content_data_version( + collection_id: int or str, + mapped_data_version: str, + content_data_suffix: Optional[str] = None +)-> str: + mapped_with_content_dest = os.environ.get('CONTENT_DATA_DEST') + # get path of the mapped data version, not the mapped data + content_data_root = mapped_data_version + + if mapped_with_content_dest: + # get path relative to collection_id + mapped_data_path = mapped_data_version.split(str(collection_id))[-1] + content_data_root = ( + f"{mapped_with_content_dest.rstrip('/')}/{collection_id}/{mapped_data_path}" + ) + + if not content_data_suffix: + content_data_suffix = ( datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) - validation_version_path = ( - f"{collection_path}validation_{validation_suffix}/") - return validation_version_path + content_data_path = ( + f"{content_data_root.rstrip('/')}/content_data_{content_data_suffix}/") + ) + return content_data_path From 8abb2808a06535f96ddbc61c197c01af64423d4a Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Fri, 10 Nov 2023 10:50:29 -0800 Subject: [PATCH 23/42] Resolve simple content harvester versioning issues --- content_harvester/by_collection.py | 4 ++-- content_harvester/by_page.py | 29 +++++++++++++++-------------- content_harvester/settings.py | 5 +++++ dags/harvest_dag.py | 3 +++ dags/shared_content_harvester.py | 25 +++++++++++++++++++------ dags/shared_tasks.py | 8 ++++++++ utils/rikolti_storage.py | 13 +++++++------ 7 files changed, 59 insertions(+), 28 deletions(-) diff --git a/content_harvester/by_collection.py b/content_harvester/by_collection.py index dc8bd5a39..398697094 100644 --- a/content_harvester/by_collection.py +++ b/content_harvester/by_collection.py @@ -2,7 +2,7 @@ from . import settings from .by_page import harvest_page_content -from rikolti.utils.rikolti_storage import list_pages, create_content_data_version +from .rikolti_storage import list_pages, create_content_data_version def get_mapped_pages(mapped_data_version:str): page_list = [] @@ -38,7 +38,7 @@ def harvest_collection(collection, mapped_data_version: str): }) for page_path in page_list: - collection.update({'page_path': page_path}) + collection.update({'mapped_page_path': page_path}) page_stats = harvest_page_content(**collection) # in some cases, value is int and in some cases, value is Counter diff --git a/content_harvester/by_page.py b/content_harvester/by_page.py index 0c2515638..ef163d715 100644 --- a/content_harvester/by_page.py +++ b/content_harvester/by_page.py @@ -14,7 +14,7 @@ from . import derivatives from . 
import settings -from rikolti.utils.rikolti_storage import list_pages, get_page_content, put_page_content, create_content_data_version +from .rikolti_storage import list_pages, get_page_content, put_page_content, create_content_data_version class DownloadError(Exception): pass @@ -51,10 +51,13 @@ def write_mapped_page(content_data_version, page, records): def get_child_records(mapped_page_path, parent_id) -> list: mapped_child_records = [] - children = list_pages( - f"{mapped_page_path.rsplit('/', 1)[0]}/children/", - recursive=False - ) + try: + children = list_pages( + f"{mapped_page_path.rsplit('/', 1)[0]}/children/", + recursive=False + ) + except FileNotFoundError: + return mapped_child_records children = [page for page in children if (page.rsplit('/')[-1]).startswith(parent_id)] for child in children: @@ -196,10 +199,6 @@ def __init__(self, mapped_page_path, collection_id, page_filename, src_auth=None self.collection_id = collection_id self.page_filename = page_filename - if settings.CONTENT_DEST["STORE"] == 's3': - self.s3 = boto3.client('s3') - else: - self.s3 = None # returns content = {thumbnail, media, children} where children # is an array of the self-same content dictionary @@ -332,9 +331,10 @@ def _upload(self, dest_prefix, dest_filename, filepath, cache: Optional[dict] = shutil.copyfile(filepath, dest_path) if settings.CONTENT_DEST["STORE"] == 's3': + s3 = boto3.client('s3') dest_path = ( f"{settings.CONTENT_DEST['PATH']}/{dest_prefix}/{dest_filename}") - self.s3.upload_file( + s3.upload_file( filepath, settings.CONTENT_DEST["BUCKET"], dest_path) # (mime, dimensions) = image_info(filepath) @@ -398,6 +398,7 @@ def harvest_page_content(collection_id, mapped_page_path, content_data_version, ) print(f"Exiting after harvesting {i} of {len(records)} items " f"in page {page_filename} of collection {collection_id}") + raise(e) write_mapped_page(content_data_version, page_filename, records) @@ -444,9 +445,9 @@ def harvest_page_content(collection_id, mapped_page_path, content_data_version, child_contents = [len(record.get('children', [])) for record in records] return { - 'thumb_source': Counter(thumb_src_mimetypes), + 'thumb_source_mimetypes': Counter(thumb_src_mimetypes), 'thumb_mimetypes': Counter(thumb_mimetypes), - 'media_source': Counter(media_src_mimetypes), + 'media_source_mimetypes': Counter(media_src_mimetypes), 'media_mimetypes': Counter(media_mimetypes), 'children': sum(child_contents), 'records': len(records) @@ -459,13 +460,13 @@ def harvest_page_content(collection_id, mapped_page_path, content_data_version, description="Harvest content using a page of mapped metadata") parser.add_argument('collection_id', help="Collection ID") parser.add_argument('mapped_page_path', help="URI-formatted path to a mapped metadata page") + parser.add_argument('content_data_version', help="URI-formatted path to a content data version") parser.add_argument('--nuxeo', action="store_true", help="Use Nuxeo auth") args = parser.parse_args() arguments = { 'collection_id': args.collection_id, 'mapped_page_path': args.mapped_page_path, - 'content_data_version': create_content_data_version( - args.collection_id, args.mapped_page_path.rsplit('data', 1)[0]) + 'content_data_version': args.content_data_version } if args.nuxeo: arguments['rikolti_mapper_type'] = 'nuxeo.nuxeo' diff --git a/content_harvester/settings.py b/content_harvester/settings.py index f7bebe969..df924d801 100644 --- a/content_harvester/settings.py +++ b/content_harvester/settings.py @@ -9,6 +9,11 @@ DATA_SRC_URL = 
os.environ.get('CONTENT_DATA_SRC', 'file:///tmp') DATA_DEST_URL = os.environ.get('CONTENT_DATA_DEST', 'file:///tmp') CONTENT_DEST_URL = os.environ.get("CONTENT_DEST", 'file:///tmp') +CONTENT_DEST = { + "STORE": urlparse(CONTENT_DEST_URL).scheme, + "BUCKET": urlparse(CONTENT_DEST_URL).netloc, + "PATH": urlparse(CONTENT_DEST_URL).path, +} AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', False) AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', False) diff --git a/dags/harvest_dag.py b/dags/harvest_dag.py index eeac13dd8..a3181a878 100644 --- a/dags/harvest_dag.py +++ b/dags/harvest_dag.py @@ -12,6 +12,7 @@ from rikolti.dags.shared_tasks import map_page_task from rikolti.dags.shared_tasks import get_mapping_status_task from rikolti.dags.shared_tasks import validate_collection_task +from rikolti.dags.shared_tasks import create_content_data_version_task from rikolti.dags.shared_content_harvester import ContentHarvestOperator @@ -53,11 +54,13 @@ def harvest(): validate_collection_task(mapping_status) mapped_page_paths = get_mapped_page_filenames_task(mapped_pages) + content_data_version = create_content_data_version_task(collection, mapped_pages) content_harvest_task = ( ContentHarvestOperator .partial( task_id="content_harvest", collection_id="{{ params.collection_id }}", + content_data_version=content_data_version, ) .expand( page=mapped_page_paths diff --git a/dags/shared_content_harvester.py b/dags/shared_content_harvester.py index 4352d0e16..4c64943e2 100644 --- a/dags/shared_content_harvester.py +++ b/dags/shared_content_harvester.py @@ -38,7 +38,13 @@ def get_awsvpc_config(): class ContentHarvestEcsOperator(EcsRunTaskOperator): - def __init__(self, collection_id=None, page=None, **kwargs): + def __init__(self, collection_id=None, content_data_version=None, page=None, **kwargs): + container_name = "rikolti-content_harvester" + if page: + page_basename = page.split('/')[-1] + container_name = ( + f"content_harvester_{collection_id}_{page_basename.split('.')[0]}") + args = { "cluster": "rikolti-ecs-cluster", "launch_type": "FARGATE", @@ -47,8 +53,12 @@ def __init__(self, collection_id=None, page=None, **kwargs): "overrides": { "containerOverrides": [ { - "name": "rikolti-content_harvester", - "command": [f"{collection_id}", f"{page}"], + "name": container_name, + "command": [ + f"{collection_id}", + f"{page}", + f"{content_data_version}" + ], "environment": [ { "CONTENT_DATA_SRC": os.environ.get("CONTENT_DATA_SRC"), @@ -86,7 +96,7 @@ def execute(self, context): class ContentHarvestDockerOperator(DockerOperator): - def __init__(self, collection_id, page, **kwargs): + def __init__(self, collection_id, content_data_version, page, **kwargs): mounts = [] if os.environ.get("CONTENT_DATA_MOUNT"): mounts.append(Mount( @@ -109,11 +119,14 @@ def __init__(self, collection_id, page, **kwargs): ) container_version = os.environ.get( 'CONTENT_HARVEST_VERSION', 'latest') + page_basename = page.split('/')[-1] + container_name = ( + f"content_harvester_{collection_id}_{page_basename.split('.')[0]}") args = { "image": f"{container_image}:{container_version}", - "container_name": f"content_harvester_{collection_id}_{page}", - "command": [f"{collection_id}", f"{page}"], + "container_name": container_name, + "command": [f"{collection_id}", f"{page}", f"{content_data_version}"], "network_mode": "bridge", "auto_remove": 'force', "mounts": mounts, diff --git a/dags/shared_tasks.py b/dags/shared_tasks.py index 792875f21..ab49035a1 100644 --- a/dags/shared_tasks.py +++ b/dags/shared_tasks.py @@ 
-19,6 +19,7 @@ from rikolti.record_indexer.move_index_to_prod import move_index_to_prod from rikolti.utils.rikolti_storage import create_mapped_version from rikolti.utils.rikolti_storage import create_vernacular_version +from rikolti.utils.rikolti_storage import create_content_data_version # TODO: remove the rikoltifetcher registry endpoint and restructure @@ -146,6 +147,13 @@ def validate_collection_task(collection_status: dict, params=None) -> str: return file_location +@task() +def create_content_data_version_task(collection: dict, mapped_pages: list[dict]): + content_data_version = create_content_data_version( + collection['id'], mapped_pages[0]['mapped_page_path']) + return content_data_version + + @task() def create_stage_index_task(collection: dict, index_name: str): collection_id = collection.get('id') diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py index 4043e80ae..ff416d721 100644 --- a/utils/rikolti_storage.py +++ b/utils/rikolti_storage.py @@ -54,15 +54,17 @@ def list_pages(data_uri: str, recursive: bool=True, **kwargs) -> list: f"https://{data.bucket}.s3.us-west-2.amazonaws" ".com/index.html#{data.path}/" ) - raise Exception( + print( f"Error listing files at {data.uri}\n" f"Check that {data.path} exists at {url}\n{e}" - ) + ) + raise e elif data.store == 'file': try: return list_file_pages(data, recursive=recursive) except Exception as e: - raise Exception(f"Error listing files in {data.path}\n{e}") + print(f"Error listing files in {data.path}\n{e}") + raise e else: raise Exception(f"Unknown data store: {data.store}") @@ -97,14 +99,14 @@ def list_file_pages(data: DataStorage, recursive: bool=True) -> list: file_objects = [] if recursive: for root, dirs, files in os.walk(data.path): - root_uri = "file://{root}/" if root[-1] != '/' else "file://{root}" + root_uri = f"file://{root}/" if root[-1] != '/' else f"file://{root}" for file in files: file_objects.append(f"{root_uri}{file}") if not recursive: for file in os.listdir(data.path): if os.path.isfile(os.path.join(data.path, file)): - root_uri = "file://{data.path}/" if data.path[-1] != '/' else "file://{data.path}" + root_uri = f"file://{data.path}/" if data.path[-1] != '/' else f"file://{data.path}" file_objects.append(f"{root_uri}{file}") return file_objects @@ -289,7 +291,6 @@ def create_content_data_version( datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) content_data_path = ( f"{content_data_root.rstrip('/')}/content_data_{content_data_suffix}/") - ) return content_data_path From 927c6d541013e15be81f1d9b0a89642d9a29b49b Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Fri, 10 Nov 2023 13:40:21 -0800 Subject: [PATCH 24/42] detangle data root from version path for fetched data --- dags/mapper_dag.py | 7 +-- dags/shared_tasks.py | 21 ++++++- metadata_fetcher/fetchers/Fetcher.py | 9 ++- metadata_fetcher/fetchers/ucd_json_fetcher.py | 6 +- metadata_mapper/lambda_function.py | 6 +- metadata_mapper/lambda_shepherd.py | 17 +----- utils/rikolti_storage.py | 57 ++++++++++++++++--- 7 files changed, 84 insertions(+), 39 deletions(-) diff --git a/dags/mapper_dag.py b/dags/mapper_dag.py index bcf639796..dc5be8aa2 100644 --- a/dags/mapper_dag.py +++ b/dags/mapper_dag.py @@ -9,18 +9,15 @@ from rikolti.dags.shared_tasks import map_page_task from rikolti.dags.shared_tasks import get_mapping_status_task from rikolti.dags.shared_tasks import validate_collection_task -from rikolti.metadata_mapper.lambda_shepherd import get_vernacular_pages +from rikolti.utils.rikolti_storage import get_vernacular_pages from 
rikolti.utils.rikolti_storage import get_most_recent_vernacular_version @task() def get_vernacular_pages_task(collection: dict, vernacular_version: Optional[str] = None): - collection_id = collection.get('id') + collection_id = collection['id'] if not vernacular_version: vernacular_version = get_most_recent_vernacular_version(collection_id) - if not collection_id: - raise ValueError( - f"Collection ID not found in collection metadata: {collection}") pages = get_vernacular_pages(collection_id, vernacular_version) return pages diff --git a/dags/shared_tasks.py b/dags/shared_tasks.py index ab49035a1..47b98f07f 100644 --- a/dags/shared_tasks.py +++ b/dags/shared_tasks.py @@ -19,6 +19,7 @@ from rikolti.record_indexer.move_index_to_prod import move_index_to_prod from rikolti.utils.rikolti_storage import create_mapped_version from rikolti.utils.rikolti_storage import create_vernacular_version +from rikolti.utils.rikolti_storage import get_version from rikolti.utils.rikolti_storage import create_content_data_version @@ -46,6 +47,13 @@ def create_vernacular_version_task(collection): @task() def fetch_collection_task(collection: dict, vernacular_version: str): + """ + returns a list of the filepaths of the vernacular metadata relative to the + collection id, ex: [ + '3433/vernacular_metadata_2023-01-01T00:00:00/data/1', + '3433/vernacular_metadata_2023-01-01T00:00:00/data/2' + ] + """ fetch_status = fetch_collection(collection, vernacular_version, {}) success = all([page['status'] == 'success' for page in fetch_status]) total_items = sum([page['document_count'] for page in fetch_status]) @@ -116,8 +124,19 @@ def get_mapping_status_task(collection: dict, mapped_pages: list): @task() def create_mapped_version_task(collection, vernacular_pages): + """ + vernacular pages is a list of the filepaths of the vernacular metadata + relative to the collection id, ex: [ + '3433/vernacular_metadata_2023-01-01T00:00:00/data/1', + '3433/vernacular_metadata_2023-01-01T00:00:00/data/2' + ] + """ + vernacular_version = get_version(collection.get('id'), vernacular_pages[0]) + if not vernacular_version: + raise ValueError( + f"Vernacular version not found in {vernacular_pages[0]}") mapped_data_version = create_mapped_version( - collection.get('id'), vernacular_pages[0]) + collection.get('id'), vernacular_version) return mapped_data_version diff --git a/metadata_fetcher/fetchers/Fetcher.py b/metadata_fetcher/fetchers/Fetcher.py index 02085d18c..86ab6ec45 100644 --- a/metadata_fetcher/fetchers/Fetcher.py +++ b/metadata_fetcher/fetchers/Fetcher.py @@ -1,9 +1,8 @@ import logging import requests -import os from requests.adapters import HTTPAdapter, Retry -from rikolti.utils.rikolti_storage import put_page_content +from rikolti.utils.rikolti_storage import put_vernacular_content logger = logging.getLogger(__name__) @@ -26,7 +25,7 @@ def __init__(self, params): self.harvest_type = params.get('harvest_type') self.collection_id = params.get('collection_id') self.write_page = params.get('write_page', 0) - self.data_destination = params.get('vernacular_version') + self.vernacular_version = params.get('vernacular_version') if not self.collection_id: @@ -50,8 +49,8 @@ def fetch_page(self): if record_count: content = self.aggregate_vernacular_content(response.text) try: - filepath = put_page_content( - content, f"{self.data_destination}data/{self.write_page}") + filepath = put_vernacular_content( + content, self.write_page, self.vernacular_version) except Exception as e: print(f"Metadata Fetcher: {e}") raise(e) diff --git 
a/metadata_fetcher/fetchers/ucd_json_fetcher.py b/metadata_fetcher/fetchers/ucd_json_fetcher.py index 860495116..cbc15e0bc 100644 --- a/metadata_fetcher/fetchers/ucd_json_fetcher.py +++ b/metadata_fetcher/fetchers/ucd_json_fetcher.py @@ -10,7 +10,7 @@ from bs4 import BeautifulSoup from .Fetcher import Fetcher, FetchError -from rikolti.utils.rikolti_storage import put_page_content +from rikolti.utils.rikolti_storage import put_vernacular_content class UcdJsonFetcher(Fetcher): def __init__(self, params: dict[str]): @@ -69,8 +69,8 @@ def fetch_all_pages(self, response: requests.Response) -> list: records = [self.fetch_json_ld(url) for url in urls] document_count = len(records) try: - filepath = put_page_content( - json.dumps(records), f"{self.data_destination}data/{self.write_page}") + filepath = put_vernacular_content( + json.dumps(records), self.write_page, self.vernacular_version) fetch_status.append({ 'document_count': document_count, 'vernacular_filepath': filepath, diff --git a/metadata_mapper/lambda_function.py b/metadata_mapper/lambda_function.py index a5922700e..eaf7350d4 100644 --- a/metadata_mapper/lambda_function.py +++ b/metadata_mapper/lambda_function.py @@ -8,7 +8,7 @@ from . import settings from .mappers.mapper import Record, Vernacular -from rikolti.utils.rikolti_storage import get_page_content, put_page_content +from rikolti.utils.rikolti_storage import get_mapped_page, put_page_content logger = logging.getLogger(__name__) @@ -80,7 +80,7 @@ def map_page(collection_id: int, vernacular_page_path: str, mapped_data_version: vernacular_reader = import_vernacular_reader( collection.get('rikolti_mapper_type')) page_filename = os.path.basename(vernacular_page_path) - api_resp = get_page_content(vernacular_page_path) + api_resp = get_mapped_page(vernacular_page_path) source_vernacular = vernacular_reader(collection_id, page_filename) source_metadata_records = source_vernacular.parse(api_resp) @@ -136,7 +136,7 @@ def map_page(collection_id: int, vernacular_page_path: str, mapped_data_version: parser = argparse.ArgumentParser( description="Map metadata from the institution's vernacular") parser.add_argument('collection_id', help='collection id') - parser.add_argument('page_path', help='uri file path to vernauclar metadata page filename; ex: file:///rikolti_data_root/3433/vernacular_data_version_1/data/1') + parser.add_argument('page_path', help='relative file path to vernauclar metadata page filename; ex: 3433/vernacular_data_version_1/data/1') parser.add_argument('mapped_data_version', help='uri file path to mapped data version; ex: file:///rikolti_data_root/3433/vernacular_data_version_1/mapped_data_version_1/') parser.add_argument('collection', help='json collection metadata from registry') diff --git a/metadata_mapper/lambda_shepherd.py b/metadata_mapper/lambda_shepherd.py index baa5da644..a376e0c14 100644 --- a/metadata_mapper/lambda_shepherd.py +++ b/metadata_mapper/lambda_shepherd.py @@ -8,7 +8,7 @@ from . 
import validate_mapping from .lambda_function import map_page from .mappers.mapper import Record -from rikolti.utils.rikolti_storage import list_pages, create_mapped_version, get_most_recent_vernacular_version +from rikolti.utils.rikolti_storage import get_vernacular_pages, create_mapped_version, get_most_recent_vernacular_version def get_collection(collection_id): @@ -37,19 +37,6 @@ def check_for_missing_enrichments(collection): return not_yet_implemented -def get_vernacular_pages(collection_id, vernacular_version): - try: - page_list = list_pages(vernacular_version, recursive=True) - except FileNotFoundError as e: - print( - f"{e} - have you fetched {collection_id}? " - f"looked in dir {e.filename} for vernacular pages" - ) - raise(e) - - # TODO: split page_list into pages and children? - return page_list - def get_mapping_status(collection, mapped_pages): count = sum([page['num_records_mapped'] for page in mapped_pages]) @@ -131,7 +118,7 @@ def map_collection(collection_id, vernacular_version=None, validate=False): parser.add_argument('collection_id', help='collection ID from registry') parser.add_argument('--validate', help='validate mapping; may provide json opts', const=True, nargs='?') - parser.add_argument('vernacular_version', help='URI to a folder of vernacular pages to map') + parser.add_argument('vernacular_version', help='relative path describing a vernacular version, ex: 3433/vernacular_data_version_1/') args = parser.parse_args(sys.argv[1:]) mapped_collection = map_collection(args.collection_id, args.vernacular_version, args.validate) missing_enrichments = mapped_collection.get('missing_enrichments') diff --git a/utils/rikolti_storage.py b/utils/rikolti_storage.py index ff416d721..150b67551 100644 --- a/utils/rikolti_storage.py +++ b/utils/rikolti_storage.py @@ -183,6 +183,7 @@ def put_s3_content(data: DataStorage, content, **kwargs) -> str: ) return data.uri + def put_file_content(data: DataStorage, content) -> str: """ Write content to a file at data.path @@ -197,34 +198,70 @@ def put_file_content(data: DataStorage, content) -> str: return data.uri +def get_version(collection_id, uri): + """ + From an arbitrary path, try to get the version string + """ + uri = uri.rstrip('/') + if collection_id not in uri or uri.endswith(collection_id): + return None + rikolti_data_root, relative_path = uri.split(f"/{collection_id}/") + path_list = relative_path.split('/') + if 'data' in path_list: + path_list = path_list[:path_list.index('data')] + path_list.insert(0, collection_id) + version = "/".join(path_list) + return version + + def create_vernacular_version( collection_id: int or str, version_suffix: Optional[str] = None ): - fetcher_data_dest = os.environ.get( - "FETCHER_DATA_DEST", "file:///tmp") - collection_path = ( - f"{fetcher_data_dest.rstrip('/')}/{collection_id}/") if not version_suffix: version_suffix = ( datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) vernacular_version_path = ( - f"{collection_path}vernacular_metadata_{version_suffix}/") + f"{collection_id}/vernacular_metadata_{version_suffix}/") return vernacular_version_path +def put_vernacular_content(content: str, page_name: int or str, version: str): + fetcher_data_dest = os.environ.get( + "FETCHER_DATA_DEST", "file:///tmp") + path = f"{fetcher_data_dest.rstrip('/')}/{version}/data/{page_name}" + put_page_content(content, path) + return f"{version}/data/{page_name}" + + def get_most_recent_vernacular_version(collection_id: int or str): mapper_data_src = os.environ.get("MAPPED_DATA_SRC") vernacular_versions = 
list_dirs(f"{mapper_data_src}/{collection_id}/") if not vernacular_versions: raise Exception( "No vernacular metadata versions found for {collection_id}") - return sorted(vernacular_versions)[-1] + return get_version(collection_id, sorted(vernacular_versions)[-1]) + + +def get_vernacular_pages(collection_id, vernacular_version): + mapper_data_src = os.environ.get("MAPPED_DATA_SRC", "file:///tmp").rstrip('/') + vernacular_path = f"{mapper_data_src}/{vernacular_version}/data/" + try: + page_list = list_pages(vernacular_path, recursive=True) + except FileNotFoundError as e: + print( + f"{e} - have you fetched {collection_id}? " + f"looked in dir {e.filename} for vernacular pages" + ) + raise(e) + + # TODO: split page_list into pages and children? + return page_list def create_mapped_version( collection_id: int or str, - vernacular_path: str, + vernacular_version: str, mapped_data_suffix: Optional[str] = None, ): mapper_data_dest = os.environ.get("MAPPED_DATA_DEST") @@ -246,6 +283,12 @@ def create_mapped_version( return mapped_data_path +def get_mapped_page(relative_vernacular_path): + mapper_data_src = os.environ.get("MAPPER_DATA_SRC", "file:///tmp").rstrip('/') + relative_vernacular_path = relative_vernacular_path.lstrip('/') + return get_page_content(f"{mapper_data_src}/{relative_vernacular_path}") + + def create_validation_version( collection_id: int or str, mapped_data_path: str, From 3509146dbf09485aa8631949a086a127daa0d060 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Mon, 13 Nov 2023 16:26:36 -0800 Subject: [PATCH 25/42] Rename rikolti_storage to storage and add versions module --- README.md | 24 ++- content_harvester/README.md | 2 +- content_harvester/by_collection.py | 27 ++- content_harvester/by_page.py | 87 ++++------ content_harvester/docker-compose.yml | 6 +- content_harvester/settings.py | 12 +- dags/docker_content_harvest.py | 2 + dags/harvest_dag.py | 7 +- dags/mapper_dag.py | 9 +- dags/shared_content_harvester.py | 12 +- dags/shared_tasks.py | 65 +++++-- dags/utils_by_mapper_type.py | 18 +- env.example | 10 +- .../fetch_registry_collections.py | 14 +- metadata_fetcher/fetchers/Fetcher.py | 23 ++- metadata_fetcher/fetchers/ucd_json_fetcher.py | 4 +- metadata_fetcher/lambda_function.py | 11 +- metadata_fetcher/settings.py | 4 - metadata_fetcher/tests.py | 2 +- metadata_mapper/lambda_function.py | 32 +++- metadata_mapper/lambda_shepherd.py | 28 ++- metadata_mapper/settings.py | 3 - metadata_mapper/validate_mapping.py | 25 ++- metadata_mapper/validator/validation_log.py | 25 +-- utils/{rikolti_storage.py => storage.py} | 141 +-------------- utils/versions.py | 160 ++++++++++++++++++ 26 files changed, 434 insertions(+), 319 deletions(-) rename utils/{rikolti_storage.py => storage.py} (53%) create mode 100644 utils/versions.py diff --git a/README.md b/README.md index 028d07850..186e02603 100644 --- a/README.md +++ b/README.md @@ -48,20 +48,18 @@ vi env.local Currently, I only use one virtual environment, even though each folder located at the root of this repository represents an isolated component. If dependency conflicts are encountered, I'll wind up creating separate environments. -Similarly, I also only use one env.local as well. Rikolti fetches data to your local system, maps that data, and then fetches relevant content files (media files, previews, and thumbnails). Set `FETCHER_DATA_DEST` to the URI where you would like Rikolti to store fetched data - Rikolti will create a folder (or s3 prefix) `/vernacular_metadata` at this location. 
Set `MAPPER_DATA_SRC` to the URI where Rikolti can find a `/vernacular_metadata` folder that contains the fetched data you're attempting to map. Set `MAPPER_DATA_DEST` to the URI where you would like Rikolti to store mapped data - Rikolti will create a folder (or s3 prefix) `/mapped_metadata` at this location. Set `CONTENT_DATA_SRC` to the URI where Rikolti can find a `/mapped_metadata` folder that contains the mapped metadata describing where to find content. Set `CONTENT_DATA_DEST` to the URI where you would like Rikolti to store mapped data that has been updated with pointers to content files - Rikolti will create a folder (or s3 prefix) `/mapped_with_content` at this location. Set `CONTENT_DEST` to the URI where you would like Rikolti to store content files. +Similarly, I also only use one env.local as well. Rikolti fetches data to your local system, maps that data, and then fetches relevant content files (media files, previews, and thumbnails). Set `VERNACULAR_DATA` to the URI where you would like Rikolti to store and retrieve fetched data - Rikolti will create a folder (or s3 prefix) `/vernacular_metadata` at this location. Set `MAPPED_DATA` to the URI where you would like Rikolti to store and retrieve mapped data - Rikolti will create a folder (or s3 prefix) `/mapped_metadata` at this location. Set `CONTENT_DATA` to the URI where you would like Rikolti to store mapped data that has been updated with pointers to content files - Rikolti will create a folder (or s3 prefix) `/mapped_with_content` at this location. Set `CONTENT_ROOT` to the URI where you would like Rikolti to store content files. For example, one way to configure `env.local` is: ``` -FETCHER_DATA_DEST=file:///Users/awieliczka/Projects/rikolti/rikolti_data -MAPPER_DATA_SRC=$FETCHER_DATA_DEST -MAPPER_DATA_DEST=$FETCHER_DATA_DEST -CONTENT_DATA_SRC=$FETCHER_DATA_DEST -CONTENT_DATA_DEST=$FETCHER_DATA_DEST -CONTENT_DEST=file:///Users/awieliczka/Projects/rikolti/rikolti_content +VERNACULAR_DATA=file:///Users/awieliczka/Projects/rikolti/rikolti_data +MAPPED_DATA=$VERNACULAR_DATA +CONTENT_DATA=$VERNACULAR_DATA +CONTENT_ROOT=file:///Users/awieliczka/Projects/rikolti/rikolti_content ``` -Each of these can be different locations, however. For example, if you're attempting to re-run a mapper locally off of previously fetched data stored on s3, you might set `MAPPER_DATA_SRC=s3://rikolti_data`. +Each of these can be different locations, however. For example, if you're attempting to re-run a mapper locally off of previously fetched data stored on s3, you might set `VERNACULAR_DATA=s3://rikolti_data`. In env.example you'll also see `CONTENT_DATA_MOUNT` and `CONTENT_MOUNT` environment variables. These are only relevant if you are running the content harvester using airflow, and want to set and of the CONTENT_ environment variables to the local filesystem. Their usage is described below in the Airflow Development section. @@ -172,9 +170,8 @@ The docker socket will typically be at `/var/run/docker.sock`. On Mac OS Docker Next, back in the Rikolti repository, create the `startup.sh` file by running `cp env.example dags/startup.sh`. 
Update the startup.sh file with Nuxeo, Flickr, and Solr keys as available, and make sure that the following environment variables are set: ``` -export FETCHER_DATA_DEST=file:///usr/local/airflow/rikolti_data -export MAPPER_DATA_SRC=file:///usr/local/airflow/rikolti_data -export MAPPER_DATA_DEST=file:///usr/local/airflow/rikolti_data +export VERNACULAR_DATA=file:///usr/local/airflow/rikolti_data +export MAPPED_DATA=file:///usr/local/airflow/rikolti_data ``` The folder located at `RIKOLTI_DATA_HOME` (set in `aws-mwaa-local-runner/docker/.env`) is mounted to `/usr/local/airflow/rikolti_data` on the airflow docker container. @@ -184,9 +181,8 @@ Please also make sure the following `CONTENT_*` variables are set - `CONTENT_DAT ``` export CONTENT_DATA_MOUNT=/Users/awieliczka/Projects/rikolti_data export CONTENT_MOUNT=/Users/awieliczka/Projects/rikolti_content -export CONTENT_DATA_SRC=file:///rikolti_data -export CONTENT_DATA_DEST=file:///rikolti_data -export CONTENT_DEST=file:///rikolti_content +export CONTENT_DATA=file:///rikolti_data +export CONTENT_ROOT=file:///rikolti_content ``` The folder located at `CONTENT_DATA_MOUNT` is mounted to `/rikolti_data` and the folder located at `CONTENT_MOUNT` is mounted to `/rikolti_content` on the content_harvester docker container. diff --git a/content_harvester/README.md b/content_harvester/README.md index e5f810588..badf0551f 100644 --- a/content_harvester/README.md +++ b/content_harvester/README.md @@ -34,7 +34,7 @@ The above media and thumbnail fetching processes are enacted upon child metadata # Settings -You can bypass uploading to s3 by setting `settings.CONTENT_DATA_DEST = "file://"` and `settings.CONTENT_DEST = "file://"`. This is useful for local development and testing. This will, however, set the metadata records' `media['media_filepath']` and `thumbnail['thumbnail_filepath']` to a local filepath. +You can bypass uploading to s3 by setting `settings.CONTENT_DATA = "file://"` and `settings.CONTENT_ROOT = "file://"`. This is useful for local development and testing. This will, however, set the metadata records' `media['media_filepath']` and `thumbnail['thumbnail_filepath']` to a local filepath. # Local Development diff --git a/content_harvester/by_collection.py b/content_harvester/by_collection.py index 398697094..bf0bc057a 100644 --- a/content_harvester/by_collection.py +++ b/content_harvester/by_collection.py @@ -1,20 +1,7 @@ import json -from . 
import settings from .by_page import harvest_page_content -from .rikolti_storage import list_pages, create_content_data_version - -def get_mapped_pages(mapped_data_version:str): - page_list = [] - page_list = list_pages( - f"{mapped_data_version.rstrip('/')}/data/", - recursive=False, - aws_access_key_id=settings.AWS_ACCESS_KEY_ID, - aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY, - aws_session_token=settings.AWS_SESSION_TOKEN, - region_name=settings.AWS_REGION - ) - return page_list +from .versions import get_mapped_pages, create_content_data_version # {"collection_id": 26098, "rikolti_mapper_type": "nuxeo.nuxeo"} @@ -28,13 +15,19 @@ def harvest_collection(collection, mapped_data_version: str): print("ERROR ERROR ERROR\ncollection_id and mapped_data_version required") exit() - page_list = get_mapped_pages(mapped_data_version) + page_list = get_mapped_pages( + mapped_data_version, + aws_access_key_id=settings.AWS_ACCESS_KEY_ID, + aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY, + aws_session_token=settings.AWS_SESSION_TOKEN, + region_name=settings.AWS_REGION + ) print(f"[{collection_id}]: Harvesting content for {len(page_list)} pages") collection_stats = {} + collection.update({ - 'content_data_version': create_content_data_version( - collection_id, mapped_data_version) + 'content_data_version': create_content_data_version(mapped_data_version) }) for page_path in page_list: diff --git a/content_harvester/by_page.py b/content_harvester/by_page.py index ef163d715..5700d65cb 100644 --- a/content_harvester/by_page.py +++ b/content_harvester/by_page.py @@ -14,7 +14,10 @@ from . import derivatives from . import settings -from .rikolti_storage import list_pages, get_page_content, put_page_content, create_content_data_version +from .versions import ( + get_mapped_page, get_child_directories, get_child_pages, get_child_page, + get_version, put_content_data_page +) class DownloadError(Exception): pass @@ -24,44 +27,13 @@ class UnsupportedMimetype(Exception): pass -def get_mapped_records(page_path) -> list: - mapped_records = [] - mapped_records = json.loads(get_page_content(page_path)) - return mapped_records - - -def write_mapped_record(content_data_version, record): - filename = put_page_content( - json.dumps(record), - ( - f"{content_data_version.rstrip('/')}/data/" - f"{record.get('calisphere-id').replace(os.sep, '_')}.json" - ) - ) - return filename - - -def write_mapped_page(content_data_version, page, records): - filename = put_page_content( - json.dumps(records), - f"{content_data_version.rstrip('/')}/data/{page}" - ) - return filename - - def get_child_records(mapped_page_path, parent_id) -> list: mapped_child_records = [] - try: - children = list_pages( - f"{mapped_page_path.rsplit('/', 1)[0]}/children/", - recursive=False - ) - except FileNotFoundError: - return mapped_child_records + children = get_child_pages(mapped_page_path) children = [page for page in children if (page.rsplit('/')[-1]).startswith(parent_id)] for child in children: - mapped_child_records.extend(json.loads(get_page_content(child))) + mapped_child_records.extend(get_child_page(child)) return mapped_child_records @@ -235,7 +207,7 @@ def harvest(self, record: dict, download_cache: Optional[dict] = None) -> dict: dest_filename = os.path.basename(content.derivative_filepath) content_s3_filepath = self._upload( - content.dest_prefix, dest_filename, content.derivative_filepath) + f"{content.dest_prefix}/{collection_id}", dest_filename, content.derivative_filepath) content.set_s3_filepath(content_s3_filepath) # 
print( @@ -249,14 +221,19 @@ def harvest(self, record: dict, download_cache: Optional[dict] = None) -> dict: } # Recurse through the record's children (if any) - child_records = get_child_records( - self.mapped_page_path, calisphere_id) - if child_records: - print( - f"[{self.collection_id}, {self.page_filename}, {calisphere_id}]: " - f"{len(child_records)} children found." - ) - record['children'] = [self.harvest(c, download_cache=download_cache) for c in child_records] + mapped_version = get_version( + self.collection_id, self.mapped_page_path) + child_directories = get_child_directories(mapped_version) + print(f"CHILD DIRECTORIES: {child_directories}") + if child_directories: + child_records = get_child_records( + self.mapped_page_path, calisphere_id) + if child_records: + print( + f"[{self.collection_id}, {self.page_filename}, {calisphere_id}]: " + f"{len(child_records)} children found." + ) + record['children'] = [self.harvest(c, download_cache=download_cache) for c in child_records] return record @@ -312,7 +289,7 @@ def _download(self, url: str, destination_file: str, cache: Optional[dict] = Non def _upload(self, dest_prefix, dest_filename, filepath, cache: Optional[dict] = None) -> str: ''' - upload file to CONTENT_DEST + upload file to CONTENT_ROOT ''' if not cache: cache = {} @@ -322,20 +299,20 @@ def _upload(self, dest_prefix, dest_filename, filepath, cache: Optional[dict] = dest_path = '' - if settings.CONTENT_DEST["STORE"] == 'file': + if settings.CONTENT_ROOT["STORE"] == 'file': dest_path = os.path.join( - settings.CONTENT_DEST["PATH"], dest_prefix) + settings.CONTENT_ROOT["PATH"], dest_prefix) if not os.path.exists(dest_path): os.makedirs(dest_path) dest_path = os.path.join(dest_path, dest_filename) shutil.copyfile(filepath, dest_path) - if settings.CONTENT_DEST["STORE"] == 's3': + if settings.CONTENT_ROOT["STORE"] == 's3': s3 = boto3.client('s3') dest_path = ( - f"{settings.CONTENT_DEST['PATH']}/{dest_prefix}/{dest_filename}") + f"{settings.CONTENT_ROOT['PATH']}/{dest_prefix}/{dest_filename}") s3.upload_file( - filepath, settings.CONTENT_DEST["BUCKET"], dest_path) + filepath, settings.CONTENT_ROOT["BUCKET"], dest_path) # (mime, dimensions) = image_info(filepath) cache_updates = { @@ -365,7 +342,7 @@ def harvest_page_content(collection_id, mapped_page_path, content_data_version, src_auth=auth ) - records = get_mapped_records(mapped_page_path) + records = json.loads(get_mapped_page(mapped_page_path)) print( f"[{collection_id}, {page_filename}]: " f"Harvesting content for {len(records)} records" @@ -379,8 +356,11 @@ def harvest_page_content(collection_id, mapped_page_path, content_data_version, # spit out progress so far if an error has been encountered try: record_with_content = harvester.harvest(record) - # write_mapped_record( - # content_data_version, record_with_content) + # put_content_data_page( + # json.dumps(record_with_content), + # record_with_content.get('calisphere-id').replace(os.sep, '_') + ".json", + # content_data_version + # ) if not record_with_content.get('thumbnail'): warn_level = "ERROR" if 'sound' in record.get('type', []): @@ -400,7 +380,8 @@ def harvest_page_content(collection_id, mapped_page_path, content_data_version, f"in page {page_filename} of collection {collection_id}") raise(e) - write_mapped_page(content_data_version, page_filename, records) + put_content_data_page( + json.dumps(records), page_filename, content_data_version) media_source = [r for r in records if r.get('media_source')] media_harvested = [r for r in records if r.get('media')] 
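# A minimal, illustrative sketch of how the relative-path version helpers used
# above (get_version, create_content_data_version, get_mapped_page,
# put_content_data_page) are expected to compose. The collection id, example
# paths, and page name are hypothetical; data roots resolve from the
# MAPPED_DATA / CONTENT_DATA environment variables as in utils/versions.py.
import json
from rikolti.utils.versions import (
    get_version, create_content_data_version,
    get_mapped_page, put_content_data_page)

mapped_page_path = "3433/vernacular_metadata_v1/mapped_metadata_v1/data/1.jsonl"

# derive the mapped version from an arbitrary page path, then branch a new
# content_data version beneath it, e.g.
# "3433/vernacular_metadata_v1/mapped_metadata_v1/content_data_2023-11-13T17:00:00/"
mapped_version = get_version(3433, mapped_page_path)
content_data_version = create_content_data_version(mapped_version)

# read the mapped page relative to MAPPED_DATA, then write the records back
# out relative to CONTENT_DATA under the new content_data version
records = json.loads(get_mapped_page(mapped_page_path))
put_content_data_page(json.dumps(records), "1.jsonl", content_data_version)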
diff --git a/content_harvester/docker-compose.yml b/content_harvester/docker-compose.yml index af1aef4d0..7d28ead0c 100644 --- a/content_harvester/docker-compose.yml +++ b/content_harvester/docker-compose.yml @@ -17,8 +17,8 @@ services: - ../rikolti_content:/rikolti_content - ./:/content_harvester environment: - - CONTENT_DATA_SRC=file:///rikolti_data - - CONTENT_DATA_DEST=file:///rikolti_data - - CONTENT_DEST=file:///rikolti_content + - MAPPED_DATA=file:///rikolti_data + - CONTENT_DATA=file:///rikolti_data + - CONTENT_ROOT=file:///rikolti_content - NUXEO_USER=${NUXEO_USER} - NUXEO_PASS=${NUXEO_PASS} diff --git a/content_harvester/settings.py b/content_harvester/settings.py index df924d801..56aeae3fb 100644 --- a/content_harvester/settings.py +++ b/content_harvester/settings.py @@ -6,13 +6,11 @@ load_dotenv() -DATA_SRC_URL = os.environ.get('CONTENT_DATA_SRC', 'file:///tmp') -DATA_DEST_URL = os.environ.get('CONTENT_DATA_DEST', 'file:///tmp') -CONTENT_DEST_URL = os.environ.get("CONTENT_DEST", 'file:///tmp') -CONTENT_DEST = { - "STORE": urlparse(CONTENT_DEST_URL).scheme, - "BUCKET": urlparse(CONTENT_DEST_URL).netloc, - "PATH": urlparse(CONTENT_DEST_URL).path, +CONTENT_ROOT_URL = os.environ.get("CONTENT_ROOT", 'file:///tmp') +CONTENT_ROOT = { + "STORE": urlparse(CONTENT_ROOT_URL).scheme, + "BUCKET": urlparse(CONTENT_ROOT_URL).netloc, + "PATH": urlparse(CONTENT_ROOT_URL).path, } AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', False) diff --git a/dags/docker_content_harvest.py b/dags/docker_content_harvest.py index 73974c87f..19a23484d 100644 --- a/dags/docker_content_harvest.py +++ b/dags/docker_content_harvest.py @@ -22,6 +22,7 @@ def docker_content_harvest(): harvest_content_for_page_task = ContentHarvestDockerOperator( task_id="page_content_harvester_on_local_docker", collection_id="{{ params.collection_id }}", + content_data_version="{{ params.content_data_version }}", page="{{ params.page_filename }}", ) harvest_content_for_page_task @@ -31,6 +32,7 @@ def docker_content_harvest(): entrypoint="python3 -m content_harvester.by_collection", command=["{{ params.collection_id }}"], collection_id="{{ params.collection_id }}", + content_data_version="{{ params.content_data_version }}", page="all", ) harvest_content_for_collection_task diff --git a/dags/harvest_dag.py b/dags/harvest_dag.py index a3181a878..5365a730f 100644 --- a/dags/harvest_dag.py +++ b/dags/harvest_dag.py @@ -1,5 +1,5 @@ from datetime import datetime - +import os from airflow.decorators import dag, task from airflow.models.param import Param @@ -47,7 +47,7 @@ def harvest(): mapped_pages = ( map_page_task .partial(collection=collection, mapped_data_version=mapped_data_version) - .expand(page=fetched_pages) + .expand(vernacular_page=fetched_pages) ) mapping_status = get_mapping_status_task(collection, mapped_pages) @@ -55,12 +55,13 @@ def harvest(): mapped_page_paths = get_mapped_page_filenames_task(mapped_pages) content_data_version = create_content_data_version_task(collection, mapped_pages) + content_harvest_task = ( ContentHarvestOperator .partial( task_id="content_harvest", collection_id="{{ params.collection_id }}", - content_data_version=content_data_version, + content_data_version=content_data_version ) .expand( page=mapped_page_paths diff --git a/dags/mapper_dag.py b/dags/mapper_dag.py index dc5be8aa2..ece23aa63 100644 --- a/dags/mapper_dag.py +++ b/dags/mapper_dag.py @@ -9,8 +9,8 @@ from rikolti.dags.shared_tasks import map_page_task from rikolti.dags.shared_tasks import get_mapping_status_task from 
rikolti.dags.shared_tasks import validate_collection_task -from rikolti.utils.rikolti_storage import get_vernacular_pages -from rikolti.utils.rikolti_storage import get_most_recent_vernacular_version +from rikolti.utils.versions import get_most_recent_vernacular_version +from rikolti.utils.versions import get_vernacular_pages @task() @@ -18,7 +18,8 @@ def get_vernacular_pages_task(collection: dict, vernacular_version: Optional[str collection_id = collection['id'] if not vernacular_version: vernacular_version = get_most_recent_vernacular_version(collection_id) - pages = get_vernacular_pages(collection_id, vernacular_version) + pages = get_vernacular_pages(vernacular_version) + # TODO: split page_list into pages and children? return pages # This is a functional duplicate of @@ -57,7 +58,7 @@ def mapper_dag(): mapped_pages = ( map_page_task .partial(collection=collection, mapped_data_version=mapped_data_version) - .expand(page=page_list) + .expand(vernacular_page=page_list) ) mapping_status = get_mapping_status_task(collection, mapped_pages) diff --git a/dags/shared_content_harvester.py b/dags/shared_content_harvester.py index 4c64943e2..6b2f238d3 100644 --- a/dags/shared_content_harvester.py +++ b/dags/shared_content_harvester.py @@ -61,9 +61,9 @@ def __init__(self, collection_id=None, content_data_version=None, page=None, **k ], "environment": [ { - "CONTENT_DATA_SRC": os.environ.get("CONTENT_DATA_SRC"), - "CONTENT_DATA_DEST": os.environ.get("CONTENT_DATA_DEST"), - "CONTENT_DEST": os.environ.get("CONTENT_DEST"), + "MAPPED_DATA": os.environ.get("CONTENT_DATA"), + "CONTENT_DATA": os.environ.get("CONTENT_DATA"), + "CONTENT_ROOT": os.environ.get("CONTENT_ROOT"), "NUXEO_USER": os.environ.get("NUXEO_USER"), "NUXEO_PASS": os.environ.get("NUXEO_PASS") } @@ -132,9 +132,9 @@ def __init__(self, collection_id, content_data_version, page, **kwargs): "mounts": mounts, "mount_tmp_dir": False, "environment": { - "CONTENT_DATA_SRC": os.environ.get("CONTENT_DATA_SRC"), - "CONTENT_DATA_DEST": os.environ.get("CONTENT_DATA_DEST"), - "CONTENT_DEST": os.environ.get("CONTENT_DEST"), + "MAPPED_DATA": os.environ.get("CONTENT_DATA"), + "CONTENT_DATA": os.environ.get("CONTENT_DATA"), + "CONTENT_ROOT": os.environ.get("CONTENT_ROOT"), "NUXEO_USER": os.environ.get("NUXEO_USER"), "NUXEO_PASS": os.environ.get("NUXEO_PASS") }, diff --git a/dags/shared_tasks.py b/dags/shared_tasks.py index 47b98f07f..96fd2b072 100644 --- a/dags/shared_tasks.py +++ b/dags/shared_tasks.py @@ -17,10 +17,10 @@ from rikolti.record_indexer.create_collection_index import get_index_name from rikolti.record_indexer.create_collection_index import delete_index from rikolti.record_indexer.move_index_to_prod import move_index_to_prod -from rikolti.utils.rikolti_storage import create_mapped_version -from rikolti.utils.rikolti_storage import create_vernacular_version -from rikolti.utils.rikolti_storage import get_version -from rikolti.utils.rikolti_storage import create_content_data_version +from rikolti.utils.versions import create_vernacular_version +from rikolti.utils.versions import get_version +from rikolti.utils.versions import create_mapped_version +from rikolti.utils.versions import create_content_data_version # TODO: remove the rikoltifetcher registry endpoint and restructure @@ -41,7 +41,8 @@ def get_collection_fetchdata_task(params=None): @task() -def create_vernacular_version_task(collection): +def create_vernacular_version_task(collection) -> str: + # returns: '3433/vernacular_metadata_v1/' return 
create_vernacular_version(collection.get('collection_id')) @@ -108,16 +109,43 @@ def get_collection_metadata_task(params=None): # max_active_tis_per_dag - setting on the task to restrict how many # instances can be running at the same time, *across all DAG runs* @task() -def map_page_task(page: str, collection: dict, mapped_data_version: str): +def map_page_task(vernacular_page: str, collection: dict, mapped_data_version: str): + """ + vernacular_page is a filepath relative to the collection id, ex: + 3433/vernacular_metadata_2023-01-01T00:00:00/data/1 + mapped_data_version is a path relative to the collection id, ex: + 3433/vernacular_metadata_2023-01-01T00:00:00/mapped_metadata_2023-01-01T00:00:00/ + returns a dictionary with the following keys: + status: success + num_records_mapped: int + page_exceptions: TODO + mapped_page_path: str, ex: + 3433/vernacular_metadata_2023-01-01T00:00:00/mapped_metadata_2023-01-01T00:00:00/1.jsonl + """ collection_id = collection.get('id') if not collection_id or not mapped_data_version: return False - mapped_page = map_page(collection_id, page, mapped_data_version, collection) + mapped_page = map_page( + collection_id, vernacular_page, mapped_data_version, collection) return mapped_page @task() def get_mapping_status_task(collection: dict, mapped_pages: list): + """ + mapped_pages is a list of dicts with the following keys: + status: success + num_records_mapped: int + page_exceptions: TODO + mapped_page_path: str, ex: + 3433/vernacular_metadata_2023-01-01T00:00:00/mapped_metadata_2023-01-01T00:00:00/1.jsonl + returns a dict with the following keys: + mapped_page_paths: ex: [ + 3433/vernacular_metadata_2023-01-01T00:00:00/mapped_metadata_2023-01-01T00:00:00/1.jsonl, + 3433/vernacular_metadata_2023-01-01T00:00:00/mapped_metadata_2023-01-01T00:00:00/2.jsonl, + 3433/vernacular_metadata_2023-01-01T00:00:00/mapped_metadata_2023-01-01T00:00:00/3.jsonl + ] + """ mapping_status = get_mapping_status(collection, mapped_pages) return mapping_status @@ -130,18 +158,27 @@ def create_mapped_version_task(collection, vernacular_pages): '3433/vernacular_metadata_2023-01-01T00:00:00/data/1', '3433/vernacular_metadata_2023-01-01T00:00:00/data/2' ] + returns the path to a new mapped version, ex: + "3433/vernacular_metadata_2023-01-01T00:00:00/mapped_metadata_2023-01-01T00:00:00/" """ vernacular_version = get_version(collection.get('id'), vernacular_pages[0]) if not vernacular_version: raise ValueError( f"Vernacular version not found in {vernacular_pages[0]}") - mapped_data_version = create_mapped_version( - collection.get('id'), vernacular_version) + mapped_data_version = create_mapped_version(vernacular_version) return mapped_data_version @task() def validate_collection_task(collection_status: dict, params=None) -> str: + """ + collection_status is a dict containing the following keys: + mapped_page_paths: ex: [ + 3433/vernacular_metadata_2023-01-01T00:00:00/mapped_metadata_2023-01-01T00:00:00/1.jsonl, + 3433/vernacular_metadata_2023-01-01T00:00:00/mapped_metadata_2023-01-01T00:00:00/2.jsonl, + 3433/vernacular_metadata_2023-01-01T00:00:00/mapped_metadata_2023-01-01T00:00:00/3.jsonl + ] + """ if not params or not params.get('validate'): raise ValueError("Validate flag not found in params") @@ -156,8 +193,10 @@ def validate_collection_task(collection_status: dict, params=None) -> str: print(f"Output {num_rows} rows to {file_location}") # create a link to the file in the logs - if file_location.startswith('s3://'): - parsed_loc = urlparse(file_location) + mapper_data_dest = 
os.environ.get("MAPPED_DATA", "file:///tmp") + if mapper_data_dest.startswith("s3"): + parsed_loc = urlparse( + f"{mapper_data_dest.rstrip('/')}/{file_location}") file_location = ( f"https://{parsed_loc.netloc}.s3.us-west-2." f"amazonaws.com{parsed_loc.path}" @@ -168,9 +207,9 @@ def validate_collection_task(collection_status: dict, params=None) -> str: @task() def create_content_data_version_task(collection: dict, mapped_pages: list[dict]): - content_data_version = create_content_data_version( + mapped_version = get_version( collection['id'], mapped_pages[0]['mapped_page_path']) - return content_data_version + return create_content_data_version(mapped_version) @task() diff --git a/dags/utils_by_mapper_type.py b/dags/utils_by_mapper_type.py index 85d1b0386..a4781afd2 100644 --- a/dags/utils_by_mapper_type.py +++ b/dags/utils_by_mapper_type.py @@ -1,5 +1,6 @@ import requests import logging +import os from urllib.parse import urlparse @@ -29,6 +30,18 @@ def make_mapper_type_endpoint(params=None): @task() def fetch_endpoint_task(endpoint, params=None): + """ + TODO: map the output of this job to the input of the map_endpoint_task + re: versioning + 3433: [ + { + document_count: int + vernacular_filepath: path relative to collection id + ex: "3433/vernacular_version_1/data/1" + status: 'success' or 'error' + } + ] + """ limit = params.get('limit', None) if params else None fetcher_job_result = fetch_endpoint(endpoint, limit, logger) for collection_id in fetcher_job_result.keys(): @@ -68,8 +81,9 @@ def validate_endpoint_task(url, params=None): num_rows, file_location = create_collection_validation_csv( collection['collection_id'], mapped_page_paths) csv_paths.append(file_location) - if file_location.startswith('s3://'): - s3_path = urlparse(file_location) + validation_data_dest = os.environ.get("MAPPED_DATA", "file:///tmp") + if validation_data_dest.startswith("s3"): + s3_path = urlparse(f"{validation_data_dest.rstrip('/')}/{file_location}") s3_paths.append(f"https://{s3_path.netloc}.s3.amazonaws.com{s3_path.path}") print(f"Output {num_rows} rows to {file_location}") diff --git a/env.example b/env.example index 40252ca62..65a09f7d5 100644 --- a/env.example +++ b/env.example @@ -1,11 +1,10 @@ # metadata_fetcher -export FETCHER_DATA_DEST=file:///usr/local/airflow/rikolti_data +export VERNACULAR_DATA=file:///usr/local/airflow/rikolti_data export NUXEO= # ask for a key - required to run the NuxeoFetcher export FLICKR_API_KEY= # ask for a key - required to run the FlickrFetcher # metadata_mapper -export MAPPER_DATA_SRC=file:///usr/local/airflow/rikolti_data -export MAPPER_DATA_DEST=file:///usr/local/airflow/rikolti_data +export MAPPED_DATA=file:///usr/local/airflow/rikolti_data export SKIP_UNDEFINED_ENRICHMENTS=True # validator @@ -14,9 +13,8 @@ export UCLDC_SOLR_URL="https://solr.calisphere.org/solr" # this is so export UCLDC_SOLR_API_KEY= # ask for a key # content_harvester -export CONTENT_DATA_SRC=file:///rikolti_data -export CONTENT_DATA_DEST=file:///rikolti_data -export CONTENT_DEST=file:///rikolti_content +export CONTENT_DATA=file:///rikolti_data +export CONTENT_ROOT=file:///rikolti_content # content_harvester when run locally via aws_mwaa_local_runner export CONTENT_DATA_MOUNT=/Users/awieliczka/Projects/rikolti_data diff --git a/metadata_fetcher/fetch_registry_collections.py b/metadata_fetcher/fetch_registry_collections.py index 27a322b54..eb39438a5 100644 --- a/metadata_fetcher/fetch_registry_collections.py +++ b/metadata_fetcher/fetch_registry_collections.py @@ -5,7 +5,7 @@ import requests 
from . import lambda_function -from rikolti.utils.rikolti_storage import create_vernacular_version +from rikolti.utils.versions import create_vernacular_version logger = logging.getLogger(__name__) @@ -24,6 +24,18 @@ def registry_endpoint(url): def fetch_endpoint(url, limit=None, job_logger=logger): + """ + returns a dictionary of collection ids and fetch results, where + fetch results are a list of of dictionaries with the following keys: + ex: 3433: [ + { + document_count: int + vernacular_filepath: path relative to collection id + ex: "3433/vernacular_version_1/data/1" + status: 'success' or 'error' + } + ] + """ response = requests.get(url=url) response.raise_for_status() total = response.json().get('meta', {}).get('total_count', 1) diff --git a/metadata_fetcher/fetchers/Fetcher.py b/metadata_fetcher/fetchers/Fetcher.py index 86ab6ec45..13b9c55d5 100644 --- a/metadata_fetcher/fetchers/Fetcher.py +++ b/metadata_fetcher/fetchers/Fetcher.py @@ -2,7 +2,7 @@ import requests from requests.adapters import HTTPAdapter, Retry -from rikolti.utils.rikolti_storage import put_vernacular_content +from rikolti.utils.versions import put_vernacular_page logger = logging.getLogger(__name__) @@ -21,17 +21,32 @@ class FetchError(Exception): class Fetcher(object): - def __init__(self, params): + def __init__(self, params: dict): + """ + params: dict + harvest_type: str + collection_id: str or int + write_page: str or int filename of the page to write to + vernacular_version: path relative to collection id + ex: "3433/vernacular_version_1" + """ self.harvest_type = params.get('harvest_type') self.collection_id = params.get('collection_id') self.write_page = params.get('write_page', 0) - self.vernacular_version = params.get('vernacular_version') + self.vernacular_version = params['vernacular_version'] if not self.collection_id: raise CollectionIdRequired("collection_id is required") def fetch_page(self): + """ + returns a dict with the following keys: + document_count: int + vernacular_filepath: path relative to collection id + ex: "3433/vernacular_version_1/data/1" + status: 'success' or 'error' + """ page = self.build_fetch_request() logger.debug( f"[{self.collection_id}]: fetching page {self.write_page} " @@ -49,7 +64,7 @@ def fetch_page(self): if record_count: content = self.aggregate_vernacular_content(response.text) try: - filepath = put_vernacular_content( + filepath = put_vernacular_page( content, self.write_page, self.vernacular_version) except Exception as e: print(f"Metadata Fetcher: {e}") diff --git a/metadata_fetcher/fetchers/ucd_json_fetcher.py b/metadata_fetcher/fetchers/ucd_json_fetcher.py index cbc15e0bc..1c91fc4f6 100644 --- a/metadata_fetcher/fetchers/ucd_json_fetcher.py +++ b/metadata_fetcher/fetchers/ucd_json_fetcher.py @@ -10,7 +10,7 @@ from bs4 import BeautifulSoup from .Fetcher import Fetcher, FetchError -from rikolti.utils.rikolti_storage import put_vernacular_content +from rikolti.utils.versions import put_vernacular_page class UcdJsonFetcher(Fetcher): def __init__(self, params: dict[str]): @@ -69,7 +69,7 @@ def fetch_all_pages(self, response: requests.Response) -> list: records = [self.fetch_json_ld(url) for url in urls] document_count = len(records) try: - filepath = put_vernacular_content( + filepath = put_vernacular_page( json.dumps(records), self.write_page, self.vernacular_version) fetch_status.append({ 'document_count': document_count, diff --git a/metadata_fetcher/lambda_function.py b/metadata_fetcher/lambda_function.py index d4f78b68f..17489fb51 100644 --- 
a/metadata_fetcher/lambda_function.py +++ b/metadata_fetcher/lambda_function.py @@ -4,7 +4,7 @@ import sys from .fetchers.Fetcher import Fetcher, InvalidHarvestEndpoint -from rikolti.utils.rikolti_storage import create_vernacular_version +from rikolti.utils.versions import create_vernacular_version logger = logging.getLogger(__name__) @@ -21,7 +21,14 @@ def import_fetcher(harvest_type): # AWS Lambda entry point -def fetch_collection(payload, vernacular_version, context): +def fetch_collection(payload, vernacular_version, context) -> list[dict]: + """ + returns a list of dicts with the following keys: + document_count: int + vernacular_version: path relative to collection id + ex: "3433/vernacular_version_1/data/1" + status: 'success' or 'error' + """ if isinstance(payload, str): payload = json.loads(payload) diff --git a/metadata_fetcher/settings.py b/metadata_fetcher/settings.py index e18110918..3baf224a6 100644 --- a/metadata_fetcher/settings.py +++ b/metadata_fetcher/settings.py @@ -1,8 +1,6 @@ import logging import os -from urllib.parse import urlparse - from dotenv import load_dotenv logger = logging.getLogger(__name__) @@ -12,7 +10,5 @@ NUXEO_TOKEN = os.environ.get('NUXEO') FLICKR_API_KEY = os.environ.get('FLICKR_API_KEY') -DATA_DEST_URL = os.environ.get("FETCHER_DATA_DEST", "file:///tmp") - for key, value in os.environ.items(): logger.debug(f"{key}={value}") diff --git a/metadata_fetcher/tests.py b/metadata_fetcher/tests.py index 85f6cebb1..f6594fe86 100644 --- a/metadata_fetcher/tests.py +++ b/metadata_fetcher/tests.py @@ -10,7 +10,7 @@ nuxeo_nested_complex_object_harvests) from .sample_data.oac_harvests import oac_harvests from .sample_data.oai_harvests import oai_harvests -from rikolti.utils.rikolti_storage import create_vernacular_version +from rikolti.utils.versions import create_vernacular_version def main(): diff --git a/metadata_mapper/lambda_function.py b/metadata_mapper/lambda_function.py index eaf7350d4..b62271999 100644 --- a/metadata_mapper/lambda_function.py +++ b/metadata_mapper/lambda_function.py @@ -8,7 +8,7 @@ from . 
import settings from .mappers.mapper import Record, Vernacular -from rikolti.utils.rikolti_storage import get_mapped_page, put_page_content +from rikolti.utils.versions import get_vernacular_page, put_mapped_page logger = logging.getLogger(__name__) @@ -73,14 +73,32 @@ def run_enrichments(records, collection, enrichment_set, page_filename): return records -def map_page(collection_id: int, vernacular_page_path: str, mapped_data_version: str, collection: Union[dict, str]): +def map_page( + collection_id: int, + vernacular_page_path: str, + mapped_data_version: str, + collection: Union[dict, str] + ): + """ + vernacular_page_path is a filepath relative to the collection id, ex: + 3433/vernacular_metadata_v1/data/1 + mapped_data_version is a version path relative to the collection id, ex: + 3433/vernacular_metadata_v1/mapped_metadata_v1/ + + returns a dict with the following keys: + status: success + num_records_mapped: int + page_exceptions: TODO + mapped_page_path: str, ex: + 3433/vernacular_metadata_v1/mapped_metadata_v1/data/1.jsonl + """ if isinstance(collection, str): collection = json.loads(collection) vernacular_reader = import_vernacular_reader( collection.get('rikolti_mapper_type')) page_filename = os.path.basename(vernacular_page_path) - api_resp = get_mapped_page(vernacular_page_path) + api_resp = get_vernacular_page(vernacular_page_path) source_vernacular = vernacular_reader(collection_id, page_filename) source_metadata_records = source_vernacular.parse(api_resp) @@ -118,10 +136,8 @@ def map_page(collection_id: int, vernacular_page_path: str, mapped_data_version: # for record in mapped_records] mapped_metadata = [record.to_dict() for record in mapped_records] - mapped_page_path = put_page_content( - json.dumps(mapped_metadata), - f"{mapped_data_version.rstrip('/')}/data/{page_filename}.jsonl" - ) + mapped_page_path = put_mapped_page( + json.dumps(mapped_metadata), page_filename, mapped_data_version) return { 'status': 'success', @@ -144,7 +160,7 @@ def map_page(collection_id: int, vernacular_page_path: str, mapped_data_version: mapped_page = map_page(args.collection_id, args.page_path, args.mapped_data_path, args.collection) print(f"{mapped_page.get('num_records_mapped')} records mapped") - print(f"mapped page at {mapped_page.get('mapped_page_path')}") + print(f"mapped page at {os.environ.get('MAPPED_DATA')}/{mapped_page.get('mapped_page_path')}") for report, couch_ids in mapped_page.get('exceptions', {}).items(): print(f"{len(couch_ids)} records report enrichments errors: {report}") diff --git a/metadata_mapper/lambda_shepherd.py b/metadata_mapper/lambda_shepherd.py index a376e0c14..264f948f1 100644 --- a/metadata_mapper/lambda_shepherd.py +++ b/metadata_mapper/lambda_shepherd.py @@ -8,7 +8,10 @@ from . 
import validate_mapping from .lambda_function import map_page from .mappers.mapper import Record -from rikolti.utils.rikolti_storage import get_vernacular_pages, create_mapped_version, get_most_recent_vernacular_version +from rikolti.utils.versions import ( + get_most_recent_vernacular_version, get_vernacular_pages, + get_version, create_mapped_version +) def get_collection(collection_id): @@ -39,6 +42,20 @@ def check_for_missing_enrichments(collection): def get_mapping_status(collection, mapped_pages): + """ + mapped_pages is a list of dicts with the following keys: + status: success + num_records_mapped: int + page_exceptions: TODO + mapped_page_path: str, ex: + 3433/vernacular_metadata_v1/mapped_metadata_v1/data/1.jsonl + returns a dict, one of the keys is mapped_page_paths: + mapped_page_paths: ex: [ + 3433/vernacular_metadata_v1/mapped_metadata_v1/data/1.jsonl, + 3433/vernacular_metadata_v1/mapped_metadata_v1/data/2.jsonl, + 3433/vernacular_metadata_v1/mapped_metadata_v1/data/3.jsonl + ] + """ count = sum([page['num_records_mapped'] for page in mapped_pages]) page_count = len(mapped_pages) collection_exceptions = [page.get('page_exceptions', {}) for page in mapped_pages] @@ -78,9 +95,11 @@ def map_collection(collection_id, vernacular_version=None, validate=False): if not vernacular_version: vernacular_version = get_most_recent_vernacular_version(collection_id) - page_list = get_vernacular_pages(collection_id, vernacular_version) + page_list = get_vernacular_pages(vernacular_version) + # TODO: split page_list into pages and children? - mapped_data_version = create_mapped_version(collection_id, page_list[0]) + vernacular_version = get_version(collection_id, page_list[0]) + mapped_data_version = create_mapped_version(vernacular_version) mapped_pages = [] for page in page_list: try: @@ -95,14 +114,13 @@ def map_collection(collection_id, vernacular_version=None, validate=False): continue collection_stats = get_mapping_status(collection, mapped_pages) - mapped_page_paths = [page['mapped_page_path'] for page in mapped_pages] if validate: opts = validate if isinstance(validate, dict) else {} num_rows, file_location = ( validate_mapping.create_collection_validation_csv( collection_id, - mapped_page_paths, + collection_stats['mapped_page_paths'], **opts ) ) diff --git a/metadata_mapper/settings.py b/metadata_mapper/settings.py index aaecef5fc..d1dfd6cf8 100644 --- a/metadata_mapper/settings.py +++ b/metadata_mapper/settings.py @@ -4,9 +4,6 @@ load_dotenv() -DATA_SRC_URL = os.environ.get('MAPPER_DATA_SRC', 'file:///tmp') -DATA_DEST_URL = os.environ.get('MAPPER_DATA_DEST', 'file:///tmp') - SKIP_UNDEFINED_ENRICHMENTS = os.environ.get('SKIP_UNDEFINED_ENRICHMENTS', False) SOLR_URL = os.environ.get('UCLDC_SOLR_URL', False) diff --git a/metadata_mapper/validate_mapping.py b/metadata_mapper/validate_mapping.py index f8b606164..0580b2c07 100644 --- a/metadata_mapper/validate_mapping.py +++ b/metadata_mapper/validate_mapping.py @@ -10,7 +10,8 @@ from .validator.validation_log import ValidationLogLevel from .validator.validation_mode import ValidationMode from .validator.validator import Validator -from rikolti.utils.rikolti_storage import get_page_content +from rikolti.utils.versions import ( + get_mapped_page, get_version, get_mapped_pages) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) @@ -29,6 +30,13 @@ def validate_collection(collection_id: int, Parameters: collection_id: int The collection ID + mapped_page_paths: list[str] + A list of the relative paths to pages of vernacular 
metadata, ex: + [ + 3433/vernacular_metadata_v1/mapped_metadata_v1/data/1.jsonl, + 3433/vernacular_metadata_v1/mapped_metadata_v1/data/2.jsonl, + 3433/vernacular_metadata_v1/mapped_metadata_v1/data/3.jsonl + ] validator_class: Type[Validator] (default: None) The validator class to use. Can be derived if not provided. validator: Validator (default: None) @@ -66,7 +74,8 @@ def validate_page(collection_id: int, page_path: str, collection_id: int The collection ID page_path: str - The absolute path to a page within the collection + The relative path to a specific page of mapped metadata, ex: + 3433/vernacular_metadata_v1/mapped_metadata_v1/data/1.jsonl validator: Validator The validator instance to use @@ -78,7 +87,7 @@ def validate_page(collection_id: int, page_path: str, "page_path": page_path } mapped_metadata = validator.generate_keys( - get_mapped_data(page_path), + json.loads(get_mapped_page(page_path)), type="Rikolti", context=context ) @@ -115,16 +124,13 @@ def create_collection_validation_csv( collection_id: int, mapped_page_paths: list[str], **options) -> tuple[int, str]: result = validate_collection(collection_id, mapped_page_paths, **options) - filename = result.log.output_csv_to_bucket(collection_id, mapped_page_paths[0]) + mapped_version = get_version(collection_id, mapped_page_paths[0]) + filename = result.log.output_csv_to_bucket(collection_id, mapped_version) return len(result.log.log), filename ## Private-ish -def get_mapped_data(page_path: str) -> list[dict]: - return json.loads(get_page_content(page_path)) - - def get_comparison_data(collection_id: int, harvest_ids: list[str]) -> list[dict]: solr_data = get_solr_data(collection_id, harvest_ids) couch_data = get_couch_db_data(collection_id, harvest_ids) @@ -255,6 +261,7 @@ def get_validator_class(collection_id: int) -> Type[Validator]: description="Validate mapped metadata against SOLR") parser.add_argument('collection_id', help='Collection ID') + parser.add_argument('mapped_data_version', help="Mapped data version, ex: 3433/vernacular_data_1/mapped_data_1") parser.add_argument("--log-level", dest="log_level", help="Log level - can be ERROR, WARNING, INFO, or DEBUG") parser.add_argument('-v', '--verbose', action="store_true", help="Verbose mode") @@ -273,6 +280,8 @@ def get_validator_class(collection_id: int) -> Type[Validator]: print(f"Generating validations for collection {args.collection_id} with options:") print(kwargs) + mapped_page_paths = get_mapped_pages(args.mapped_data_version) + num_rows, file_location = create_collection_validation_csv( args.collection_id, mapped_page_paths, **kwargs) print(f"Output {num_rows} rows to {file_location}") diff --git a/metadata_mapper/validator/validation_log.py b/metadata_mapper/validator/validation_log.py index 516eafaa3..df455fa18 100644 --- a/metadata_mapper/validator/validation_log.py +++ b/metadata_mapper/validator/validation_log.py @@ -1,10 +1,8 @@ -from datetime import datetime from enum import Enum +import json from typing import IO, Any - -from .. 
import settings -from rikolti.utils.rikolti_storage import put_page_content, create_validation_version - +from rikolti.utils.versions import ( + create_validation_version, put_validation_report) class ValidationLogLevel(Enum): DEBUG = "DEBUG" @@ -110,7 +108,7 @@ def output_csv_to_file(self, file: IO[str], append: bool = False, with open(file, "a" if append else "w") as f: f.write(self._csv_content_string(include_fields, append)) - def output_csv_to_bucket(self, collection_id: int, mapped_data_path: str = None, + def output_csv_to_bucket(self, collection_id: int, mapped_version: str = None, include_fields: list[str] = None) -> str: """ Writes a CSV to the env-appropriate bucket (local or S3). @@ -118,19 +116,22 @@ def output_csv_to_bucket(self, collection_id: int, mapped_data_path: str = None, Parameters: collection_id: int The collection ID (for finding appropriate folder) - filename: str (default: None) - The name of the created file. If not provided, defaults to - timestamp + mapped_version: str (default: None) + the mapped_data version, ex: + 3433/vernacular_metadata_v1/mapped_metadata_v1/ include_fields: list[str] (default: None) A list of fields to include in the CSV. Defaults to all. + + Returns: str + the relative path to the created file, ex: + 3433/vernacular_metadata_v1/mapped_metadata_v1/validation_v1.csv """ content = self._csv_content_string(include_fields) if isinstance(content, list) or isinstance(content, dict): content = json.dumps(content) - file_location = create_validation_version(collection_id, mapped_data_path) - put_page_content(content, file_location) - + file_location = create_validation_version(mapped_version) + put_validation_report(content, file_location) return file_location diff --git a/utils/rikolti_storage.py b/utils/storage.py similarity index 53% rename from utils/rikolti_storage.py rename to utils/storage.py index 150b67551..467d79d49 100644 --- a/utils/rikolti_storage.py +++ b/utils/storage.py @@ -5,7 +5,7 @@ from datetime import datetime from urllib.parse import urlparse -from typing import Optional +from typing import Optional, Union from collections import namedtuple DataStorage = namedtuple( @@ -198,142 +198,3 @@ def put_file_content(data: DataStorage, content) -> str: return data.uri -def get_version(collection_id, uri): - """ - From an arbitrary path, try to get the version string - """ - uri = uri.rstrip('/') - if collection_id not in uri or uri.endswith(collection_id): - return None - rikolti_data_root, relative_path = uri.split(f"/{collection_id}/") - path_list = relative_path.split('/') - if 'data' in path_list: - path_list = path_list[:path_list.index('data')] - path_list.insert(0, collection_id) - version = "/".join(path_list) - return version - - -def create_vernacular_version( - collection_id: int or str, - version_suffix: Optional[str] = None - ): - if not version_suffix: - version_suffix = ( - datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) - vernacular_version_path = ( - f"{collection_id}/vernacular_metadata_{version_suffix}/") - return vernacular_version_path - - -def put_vernacular_content(content: str, page_name: int or str, version: str): - fetcher_data_dest = os.environ.get( - "FETCHER_DATA_DEST", "file:///tmp") - path = f"{fetcher_data_dest.rstrip('/')}/{version}/data/{page_name}" - put_page_content(content, path) - return f"{version}/data/{page_name}" - - -def get_most_recent_vernacular_version(collection_id: int or str): - mapper_data_src = os.environ.get("MAPPED_DATA_SRC") - vernacular_versions = 
list_dirs(f"{mapper_data_src}/{collection_id}/") - if not vernacular_versions: - raise Exception( - "No vernacular metadata versions found for {collection_id}") - return get_version(collection_id, sorted(vernacular_versions)[-1]) - - -def get_vernacular_pages(collection_id, vernacular_version): - mapper_data_src = os.environ.get("MAPPED_DATA_SRC", "file:///tmp").rstrip('/') - vernacular_path = f"{mapper_data_src}/{vernacular_version}/data/" - try: - page_list = list_pages(vernacular_path, recursive=True) - except FileNotFoundError as e: - print( - f"{e} - have you fetched {collection_id}? " - f"looked in dir {e.filename} for vernacular pages" - ) - raise(e) - - # TODO: split page_list into pages and children? - return page_list - - -def create_mapped_version( - collection_id: int or str, - vernacular_version: str, - mapped_data_suffix: Optional[str] = None, -): - mapper_data_dest = os.environ.get("MAPPED_DATA_DEST") - # get path of the vernacular version, not the vernacular data - mapped_root = vernacular_path.rsplit('data', 1)[0] - - if mapper_data_dest: - # get path relative to collection_id - vernacular_path = vernacular_path.split(str(collection_id))[-1] - mapped_root = ( - f"{mapper_data_dest.rstrip('/')}/{collection_id}/{vernacular_path}" - ) - - if not mapped_data_suffix: - mapped_data_suffix = ( - datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) - mapped_data_path = ( - f"{mapped_root.rstrip('/')}/mapped_metadata_{mapped_data_suffix}/") - return mapped_data_path - - -def get_mapped_page(relative_vernacular_path): - mapper_data_src = os.environ.get("MAPPER_DATA_SRC", "file:///tmp").rstrip('/') - relative_vernacular_path = relative_vernacular_path.lstrip('/') - return get_page_content(f"{mapper_data_src}/{relative_vernacular_path}") - - -def create_validation_version( - collection_id: int or str, - mapped_data_path: str, - validation_suffix: Optional[str] = None -): - validation_data_dest = os.environ.get("VALIDATION_DATA_DEST") - # get path of the mapped data version, not the mapped data - validation_root = mapped_data_path.rsplit('data', 1)[0] - - if validation_data_dest: - # get path relative to collection_id - mapped_data_path = mapped_data_path.split(str(collection_id))[-1] - validation_root = ( - f"{validation_data_dest.rstrip('/')}/{collection_id}/{mapped_data_path}" - ) - - if not validation_suffix: - validation_suffix = ( - datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) - validation_data_path = ( - f"{validation_root.rstrip('/')}/validation_{validation_suffix}.csv") - return validation_data_path - - -def create_content_data_version( - collection_id: int or str, - mapped_data_version: str, - content_data_suffix: Optional[str] = None -)-> str: - mapped_with_content_dest = os.environ.get('CONTENT_DATA_DEST') - # get path of the mapped data version, not the mapped data - content_data_root = mapped_data_version - - if mapped_with_content_dest: - # get path relative to collection_id - mapped_data_path = mapped_data_version.split(str(collection_id))[-1] - content_data_root = ( - f"{mapped_with_content_dest.rstrip('/')}/{collection_id}/{mapped_data_path}" - ) - - if not content_data_suffix: - content_data_suffix = ( - datetime.now().strftime('%Y-%m-%dT%H:%M:%S')) - content_data_path = ( - f"{content_data_root.rstrip('/')}/content_data_{content_data_suffix}/") - return content_data_path - - diff --git a/utils/versions.py b/utils/versions.py new file mode 100644 index 000000000..f53bebf78 --- /dev/null +++ b/utils/versions.py @@ -0,0 +1,160 @@ +import os +from datetime import datetime 
+import json
+from typing import Union, Optional +from . import storage + +def get_version(collection_id: Union[int, str], uri: str) -> str: + """ + From an arbitrary path, try to get the version string + """ + collection_id = str(collection_id) + uri = uri.rstrip('/') + if str(collection_id) not in uri or uri.endswith(str(collection_id)): + raise Exception("Not a valid version path") + rikolti_data_root, relative_path = uri.split(f"{collection_id}/") + path_list = relative_path.split('/') + if 'data' in path_list: + path_list = path_list[:path_list.index('data')] + path_list.insert(0, str(collection_id)) + version = "/".join(path_list) + return version + +def create_version( + base_version: str, + pipeline_step: str, + suffix: Optional[str] = None +): + """ + Given a path to a version, ex: 3433/vernacular_metadata_v1/, + compose a new version path, ex: 3433/vernacular_metadata_v1/mapped_metadata_v1/ + + base_version: str + a version path + pipeline_step: str + a name for the branch indicating metadata state, ex: mapped_metadata + branch_suffix: str + a uniquely identifying suffix for this branch + """ + if not suffix: + suffix = datetime.now().strftime('%Y-%m-%dT%H:%M:%S') + base_version = base_version.rstrip('/') + branch_version = ( + f"{base_version}/{pipeline_step}_{suffix}/" + ) + return branch_version + +def create_vernacular_version( + collection_id: Union[int, str], + suffix: Optional[str] = None + ) -> str: + version_path = f"{collection_id}/" + return create_version(version_path, 'vernacular_metadata', suffix) + +def create_mapped_version( + vernacular_version: str, suffix: Optional[str] = None) -> str: + return create_version(vernacular_version, 'mapped_metadata', suffix) + +def create_validation_version( + mapped_version: str, + suffix: Optional[str] = None +): + validation_version = create_version(mapped_version, 'validation', suffix) + return validation_version.rstrip('/') + ".csv" + +def create_content_data_version( + mapped_version: str, suffix: Optional[str] = None) -> str: + return create_version(mapped_version, 'content_data', suffix) + + +def get_most_recent_vernacular_version(collection_id: Union[int, str]): + mapper_data_src = os.environ.get("VERNACULAR_DATA") + vernacular_versions = storage.list_dirs(f"{mapper_data_src}/{collection_id}/") + if not vernacular_versions: + raise Exception( + "No vernacular metadata versions found for {collection_id}") + return get_version(collection_id, sorted(vernacular_versions)[-1]) + +def get_vernacular_pages(version): + data_root = os.environ.get("VERNACULAR_DATA", "file:///tmp") + data_path = f"{data_root.rstrip('/')}/{version.rstrip('/')}/data/" + try: + page_list = storage.list_pages(data_path, recursive=True) + except FileNotFoundError as e: + print( + f"\n\nNo vernacular pages found in {e.filename}\n\n" + ) + raise(e) + return [path[len(data_root)+1:] for path in page_list] + +def get_mapped_pages(version, **kwargs): + data_root = os.environ.get("MAPPED_DATA", "file:///tmp") + data_path = f"{data_root.rstrip('/')}/{version.rstrip('/')}/data/" + try: + page_list = storage.list_pages(data_path, recursive=True, **kwargs) + except FileNotFoundError as e: + print( + f"\n\nNo mapped pages found in {e.filename}\n\n" + ) + raise(e) + return [path[len(data_root)+1:] for path in page_list] + +def get_child_directories(version, **kwargs): + data_root = os.environ.get('MAPPED_DATA', "file:///tmp") + child_directories = storage.list_dirs( + f"{data_root.rstrip('/')}/{version.rstrip('/')}/data/", + recursive=False + ) + return child_directories + +def 
get_child_pages(version, **kwargs): + data_root = os.environ.get("MAPPED_DATA", "file:///tmp") + data_path = f"{data_root.rstrip('/')}/{version.rstrip('/')}/data/children/" + try: + page_list = storage.list_pages(data_path, recursive=False, **kwargs) + except FileNotFoundError: + return [] + except OSError: + return [] + return [path[len(data_root)+1:] for path in page_list] + +def get_vernacular_page(version_page): + data_root = os.environ.get("VERNACULAR_DATA", "file:///tmp").rstrip('/') + return storage.get_page_content(f"{data_root.rstrip('/')}/{version_page}") + +# TODO: check if this is always json.loads +def get_mapped_page(version_page): + data_root = os.environ.get("MAPPED_DATA", "file:///tmp").rstrip('/') + return storage.get_page_content(f"{data_root.rstrip('/')}/{version_page}") + +def get_child_page(version_page): + data_root = os.environ.get("MAPPED_DATA", "file:///tmp").rstrip('/') + content = storage.get_page_content(f"{data_root.rstrip('/')}/{version_page}") + return json.loads(content) + +def put_vernacular_page(content: str, page_name: Union[int, str], version: str): + data_root = os.environ.get("VERNACULAR_DATA", "file:///tmp") + path = f"{data_root.rstrip('/')}/{version.rstrip('/')}/data/{page_name}" + storage.put_page_content(content, path) + return f"{version.rstrip('/')}/data/{page_name}" + +def put_mapped_page(content, page_name, version): + data_root = os.environ.get("MAPPED_DATA", "file:///tmp") + path = f"{data_root.rstrip('/')}/{version.rstrip('/')}/data/{page_name}.jsonl" + storage.put_page_content(content, path) + return f"{version.rstrip('/')}/data/{page_name}.jsonl" + +def put_validation_report(content, version_page): + data_root = os.environ.get("MAPPED_DATA", "file:///tmp") + path = f"{data_root.rstrip('/')}/{version_page}" + storage.put_page_content(content, path) + return version_page + +def put_content_data_page(content, page_name, version): + data_root = os.environ.get("CONTENT_DATA", "file:///tmp") + path = f"{data_root.rstrip('/')}/{version.rstrip('/')}/data/{page_name}" + storage.put_page_content(content, path) + return f"{version.rstrip('/')}/data/{page_name}" + + + + From b98f63eb45953e7149b265a5165943d4b962ade3 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Mon, 13 Nov 2023 17:11:16 -0800 Subject: [PATCH 26/42] Use storage utilities in content harvest also --- content_harvester/by_page.py | 40 ++++++++++++----------------------- content_harvester/settings.py | 9 -------- utils/storage.py | 32 +++++++++++++++++++++++++++- 3 files changed, 45 insertions(+), 36 deletions(-) diff --git a/content_harvester/by_page.py b/content_harvester/by_page.py index 5700d65cb..169dc8040 100644 --- a/content_harvester/by_page.py +++ b/content_harvester/by_page.py @@ -1,11 +1,9 @@ import hashlib import json import os -import shutil from collections import Counter from typing import Optional -import boto3 import requests from requests.adapters import HTTPAdapter, Retry @@ -13,6 +11,7 @@ from . import derivatives from . 
import settings +from .storage import upload_file from .versions import ( get_mapped_page, get_child_directories, get_child_pages, get_child_page, @@ -206,8 +205,9 @@ def harvest(self, record: dict, download_cache: Optional[dict] = None) -> dict: else: dest_filename = os.path.basename(content.derivative_filepath) - content_s3_filepath = self._upload( - f"{content.dest_prefix}/{collection_id}", dest_filename, content.derivative_filepath) + dest_path = f"{content.dest_prefix}/{collection_id}/{dest_filename}" + content_s3_filepath = self._upload(dest_path, content.derivative_filepath) + content.set_s3_filepath(content_s3_filepath) # print( @@ -287,42 +287,30 @@ def _download(self, url: str, destination_file: str, cache: Optional[dict] = Non return md5 - def _upload(self, dest_prefix, dest_filename, filepath, cache: Optional[dict] = None) -> str: + def _upload(self, dest_filepath, src_filepath, cache: Optional[dict] = None) -> str: ''' upload file to CONTENT_ROOT ''' if not cache: cache = {} - if cache.get(dest_filename, {}).get('path'): - return cache[dest_filename]['path'] - - dest_path = '' - - if settings.CONTENT_ROOT["STORE"] == 'file': - dest_path = os.path.join( - settings.CONTENT_ROOT["PATH"], dest_prefix) - if not os.path.exists(dest_path): - os.makedirs(dest_path) - dest_path = os.path.join(dest_path, dest_filename) - shutil.copyfile(filepath, dest_path) + filename = os.path.basename(dest_filepath) + if cache.get(filename, {}).get('path'): + return cache[filename]['path'] - if settings.CONTENT_ROOT["STORE"] == 's3': - s3 = boto3.client('s3') - dest_path = ( - f"{settings.CONTENT_ROOT['PATH']}/{dest_prefix}/{dest_filename}") - s3.upload_file( - filepath, settings.CONTENT_ROOT["BUCKET"], dest_path) + content_root = os.environ.get("CONTENT_ROOT", 'file:///tmp') + content_path = f"{content_root.rstrip('/')}/{dest_filepath}" + upload_file(src_filepath, content_path) # (mime, dimensions) = image_info(filepath) cache_updates = { # 'mime': mime, # 'dimensions': dimensions, - 'path': dest_path + 'path': content_path } - cache[dest_filename] = cache_updates + cache[filename] = cache_updates - return dest_path + return content_path # {"collection_id": 26098, "rikolti_mapper_type": "nuxeo.nuxeo", "page_filename": "file:///rikolti_data/r-0"} diff --git a/content_harvester/settings.py b/content_harvester/settings.py index 56aeae3fb..880d2eea7 100644 --- a/content_harvester/settings.py +++ b/content_harvester/settings.py @@ -1,18 +1,9 @@ import os -from urllib.parse import urlparse - from dotenv import load_dotenv load_dotenv() -CONTENT_ROOT_URL = os.environ.get("CONTENT_ROOT", 'file:///tmp') -CONTENT_ROOT = { - "STORE": urlparse(CONTENT_ROOT_URL).scheme, - "BUCKET": urlparse(CONTENT_ROOT_URL).netloc, - "PATH": urlparse(CONTENT_ROOT_URL).path, -} - AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', False) AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', False) AWS_SESSION_TOKEN = os.environ.get('AWS_SESSION_TOKEN', False) diff --git a/utils/storage.py b/utils/storage.py index 467d79d49..3c553f8e9 100644 --- a/utils/storage.py +++ b/utils/storage.py @@ -2,7 +2,7 @@ import re import boto3 -from datetime import datetime +import shutil from urllib.parse import urlparse from typing import Optional, Union @@ -198,3 +198,33 @@ def put_file_content(data: DataStorage, content) -> str: return data.uri +def upload_file(filepath:str, data_uri: str, **kwargs): + data = parse_data_uri(data_uri) + + if data.store == 's3': + return upload_s3_file(data, filepath, **kwargs) + elif data.store == 
'file': + return move_file(data, filepath) + else: + raise Exception(f"Unknown data store: {data.store}") + +def upload_s3_file(data: DataStorage, filepath, **kwargs): + """ + Upload a file to s3 at data.path + """ + s3 = boto3.client('s3', **kwargs) + s3.upload_file( + filepath, + data.bucket, + data.path + ) + return data.uri + +def move_file(data: DataStorage, filepath): + destination_path = os.sep.join(data.path.split('/')) + directory_path = os.path.dirname(destination_path) + if not os.path.exists(directory_path): + os.makedirs(directory_path) + + shutil.copyfile(filepath, destination_path) + return data.uri From 727bf2adbac97ba3848985992b586b1dc9aa5063 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Mon, 13 Nov 2023 17:35:31 -0800 Subject: [PATCH 27/42] mapper: accept optional vernacular version arg, defaults to most recent --- dags/mapper_dag.py | 4 +++- utils/versions.py | 10 +++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/dags/mapper_dag.py b/dags/mapper_dag.py index ece23aa63..824603607 100644 --- a/dags/mapper_dag.py +++ b/dags/mapper_dag.py @@ -14,8 +14,9 @@ @task() -def get_vernacular_pages_task(collection: dict, vernacular_version: Optional[str] = None): +def get_vernacular_pages_task(collection: dict, params: Optional[dict]=None): collection_id = collection['id'] + vernacular_version = params.get('vernacular_version') if params else None if not vernacular_version: vernacular_version = get_most_recent_vernacular_version(collection_id) pages = get_vernacular_pages(vernacular_version) @@ -45,6 +46,7 @@ def get_vernacular_pages_task(collection: dict, vernacular_version: Optional[str params={ 'collection_id': Param(None, description="Collection ID to map"), 'validate': Param(True, description="Validate mapping?"), + 'vernacular_version': Param(None, description="Vernacular version to map, ex: 3433/vernacular_metadata_v1/") }, tags=["rikolti"], ) diff --git a/utils/versions.py b/utils/versions.py index f53bebf78..90dc9ee33 100644 --- a/utils/versions.py +++ b/utils/versions.py @@ -65,14 +65,14 @@ def create_content_data_version( mapped_version: str, suffix: Optional[str] = None) -> str: return create_version(mapped_version, 'content_data', suffix) - def get_most_recent_vernacular_version(collection_id: Union[int, str]): - mapper_data_src = os.environ.get("VERNACULAR_DATA") - vernacular_versions = storage.list_dirs(f"{mapper_data_src}/{collection_id}/") - if not vernacular_versions: + data_root = os.environ.get("VERNACULAR_DATA", "file:///tmp") + versions = storage.list_dirs(f"{data_root.rstrip('/')}/{collection_id}/") + if not versions: raise Exception( "No vernacular metadata versions found for {collection_id}") - return get_version(collection_id, sorted(vernacular_versions)[-1]) + recent_version = sorted(versions)[-1] + return f"{collection_id}/{recent_version}/" def get_vernacular_pages(version): data_root = os.environ.get("VERNACULAR_DATA", "file:///tmp") From 1ccde3fb54c6b6f14f2e70a206f0d5abad03e391 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Tue, 14 Nov 2023 11:16:52 -0800 Subject: [PATCH 28/42] Update by_mapper_type jobs to use versioning --- dags/dev_validate_by_mapper_type_to_gdrive.py | 9 ++--- dags/utils_by_mapper_type.py | 35 ++++++++++++++----- dags/validate_by_mapper_type.py | 9 ++--- .../fetch_registry_collections.py | 2 +- metadata_mapper/map_registry_collections.py | 8 +++-- 5 files changed, 38 insertions(+), 25 deletions(-) diff --git a/dags/dev_validate_by_mapper_type_to_gdrive.py 
b/dags/dev_validate_by_mapper_type_to_gdrive.py index 9cade3a27..168a87de1 100644 --- a/dags/dev_validate_by_mapper_type_to_gdrive.py +++ b/dags/dev_validate_by_mapper_type_to_gdrive.py @@ -28,12 +28,9 @@ ) def dev_validate_by_mapper_type(): endpoint=make_mapper_type_endpoint() - validation_reports = validate_endpoint_task(endpoint) - ( - fetch_endpoint_task(endpoint) >> - map_endpoint_task(endpoint) >> - validation_reports - ) + fetched_versions = fetch_endpoint_task(endpoint) + mapped_versions = map_endpoint_task(endpoint, fetched_versions) + validation_reports = validate_endpoint_task(endpoint, mapped_versions) local_filepaths = s3_to_localfilesystem.expand( s3_url=validation_reports) diff --git a/dags/utils_by_mapper_type.py b/dags/utils_by_mapper_type.py index a4781afd2..0aab734c8 100644 --- a/dags/utils_by_mapper_type.py +++ b/dags/utils_by_mapper_type.py @@ -10,6 +10,7 @@ from rikolti.metadata_mapper.map_registry_collections import map_endpoint from rikolti.metadata_mapper.map_registry_collections import registry_endpoint from rikolti.metadata_mapper.validate_mapping import create_collection_validation_csv +from rikolti.utils.versions import get_version, get_mapped_pages logger = logging.getLogger("airflow.task") @@ -31,8 +32,6 @@ def make_mapper_type_endpoint(params=None): @task() def fetch_endpoint_task(endpoint, params=None): """ - TODO: map the output of this job to the input of the map_endpoint_task - re: versioning 3433: [ { document_count: int @@ -44,26 +43,41 @@ def fetch_endpoint_task(endpoint, params=None): """ limit = params.get('limit', None) if params else None fetcher_job_result = fetch_endpoint(endpoint, limit, logger) + fetched_versions = {} for collection_id in fetcher_job_result.keys(): + version = get_version( + collection_id, + fetcher_job_result[collection_id][0]['vernacular_filepath'] + ) print( "Review fetched data at: https://rikolti-data.s3.us-west-2." - f"amazonaws.com/index.html#{collection_id}/" + f"amazonaws.com/index.html#{version}" ) - return fetcher_job_result + fetched_versions[collection_id] = version + return fetched_versions @task() -def map_endpoint_task(endpoint, params=None): +def map_endpoint_task(endpoint, fetched_versions, params=None): limit = params.get('limit', None) if params else None - mapper_job_results = map_endpoint(endpoint, limit) + mapper_job_results = map_endpoint(endpoint, fetched_versions, limit) for mapper_job in mapper_job_results: print( "Review mapped data at: https://rikolti-data.s3.us-west-2." 
f"amazonaws.com/index.html#{mapper_job['collection_id']}/" ) - return mapper_job_results + mapped_versions = {} + for mapper_job_result in mapper_job_results: + print(mapper_job_result.keys()) + mapped_version = get_version( + mapper_job_result['collection_id'], + mapper_job_result['mapped_page_paths'][0] + ) + mapped_versions[mapper_job_result['collection_id']] = mapped_version + + return mapped_versions @task() -def validate_endpoint_task(url, params=None): +def validate_endpoint_task(url, mapped_versions, params=None): limit = params.get('limit', None) if params else None response = requests.get(url=url) @@ -78,8 +92,11 @@ def validate_endpoint_task(url, params=None): s3_paths = [] for collection in registry_endpoint(url): print(f"{collection['collection_id']:<6} Validating collection") + collection_id = collection['collection_id'] + mapped_version = mapped_versions.get(str(collection_id)) + mapped_pages = get_mapped_pages(mapped_version) num_rows, file_location = create_collection_validation_csv( - collection['collection_id'], mapped_page_paths) + collection_id, mapped_pages) csv_paths.append(file_location) validation_data_dest = os.environ.get("MAPPED_DATA", "file:///tmp") if validation_data_dest.startswith("s3"): diff --git a/dags/validate_by_mapper_type.py b/dags/validate_by_mapper_type.py index 560f4ea1d..54d678325 100644 --- a/dags/validate_by_mapper_type.py +++ b/dags/validate_by_mapper_type.py @@ -27,11 +27,8 @@ ) def validate_by_mapper_type(): endpoint=make_mapper_type_endpoint() - validation_reports = validate_endpoint_task(endpoint) - ( - fetch_endpoint_task(endpoint) >> - map_endpoint_task(endpoint) >> - validation_reports - ) + fetched_versions = fetch_endpoint_task(endpoint) + mapped_versions = map_endpoint_task(endpoint, fetched_versions) + validation_reports = validate_endpoint_task(endpoint, mapped_versions) validate_by_mapper_type() \ No newline at end of file diff --git a/metadata_fetcher/fetch_registry_collections.py b/metadata_fetcher/fetch_registry_collections.py index eb39438a5..7637ea3d4 100644 --- a/metadata_fetcher/fetch_registry_collections.py +++ b/metadata_fetcher/fetch_registry_collections.py @@ -71,7 +71,7 @@ def fetch_endpoint(url, limit=None, job_logger=logger): success = all([page['status'] == 'success' for page in fetch_result]) total_items = sum([page['document_count'] for page in fetch_result]) - total_pages = fetch_result[-1]['page'] + 1 + total_pages = len(fetch_result) diff_items = total_items - collection['solr_count'] diff_items_label = "" if diff_items > 0: diff --git a/metadata_mapper/map_registry_collections.py b/metadata_mapper/map_registry_collections.py index 584830b86..93fdcac20 100644 --- a/metadata_mapper/map_registry_collections.py +++ b/metadata_mapper/map_registry_collections.py @@ -23,7 +23,7 @@ def registry_endpoint(url): yield collection -def map_endpoint(url, limit=None): +def map_endpoint(url, fetched_versions, limit=None): response = requests.get(url=url) response.raise_for_status() total = response.json().get('meta', {}).get('total_count', 1) @@ -51,7 +51,9 @@ def map_endpoint(url, limit=None): f"{collection_id:<6}: call lambda with collection_id: {collection_id}") try: - map_result = lambda_shepherd.map_collection(collection_id) + vernacular_version = fetched_versions[str(collection_id)] + map_result = lambda_shepherd.map_collection( + collection_id, vernacular_version) except FileNotFoundError: print(f"{collection_id:<6}: not fetched yet", file=sys.stderr) continue @@ -112,10 +114,10 @@ def map_endpoint(url, limit=None): 
f"solr count last updated: {collection['solr_last_updated']}" ) print(map_report_row) + map_report.append(map_result) if limit and progress >= limit: break - map_report.append(map_result) return map_report From f580e47833aee92655fe94d4343b30eb74952c38 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Wed, 15 Nov 2023 10:46:48 -0800 Subject: [PATCH 29/42] Add utilities to the content harvester docker image --- .../Dockerfile => Dockerfile.content_harvester | 5 +++-- README.md | 2 +- content_harvester/README.md | 6 ++++-- content_harvester/by_collection.py | 3 ++- content_harvester/by_page.py | 4 ++-- content_harvester/docker-compose.yml | 3 ++- 6 files changed, 14 insertions(+), 9 deletions(-) rename content_harvester/Dockerfile => Dockerfile.content_harvester (86%) diff --git a/content_harvester/Dockerfile b/Dockerfile.content_harvester similarity index 86% rename from content_harvester/Dockerfile rename to Dockerfile.content_harvester index aa0ba15e4..8a485c917 100644 --- a/content_harvester/Dockerfile +++ b/Dockerfile.content_harvester @@ -9,11 +9,12 @@ RUN sed -i 's//