diff --git a/.gitignore b/.gitignore index 42b7a9285..558f1e1ba 100644 --- a/.gitignore +++ b/.gitignore @@ -20,4 +20,6 @@ samconfig.toml !/.github startup.sh -__pycache__ \ No newline at end of file +__pycache__ +.venv/ +env/ \ No newline at end of file diff --git a/README.md b/README.md index 8cb7c61bb..5ba3ded1f 100644 --- a/README.md +++ b/README.md @@ -205,6 +205,23 @@ If you would like to mount your own codebase to the content_harvester container export MOUNT_CODEBASE= ``` +In order to run the indexer code, make sure the following variables are set: + +``` +export RIKOLTI_ES_ENDPOINT= # ask for endpoint url +export RIKOLTI_HOME=/usr/local/airflow/dags/rikolti +export INDEX_RETENTION=1 +``` + +Also make sure to set your temporary AWS credentials and the region so that the mwaa-local-runner container can authenticate when talking to the OpenSearch API: + +``` +export AWS_ACCESS_KEY_ID= +export AWS_SECRET_ACCESS_KEY= +export AWS_SESSION_TOKEN= +export AWS_REGION=us-west-2 +``` + Finally, from inside the aws-mwaa-local-runner repo, run `./mwaa-local-env build-image` to build the docker image, and `./mwaa-local-env start` to start the mwaa local environment. For more information on `mwaa-local-env`, look for instructions in the [ucldc/aws-mwaa-local-runner:README](https://github.com/ucldc/aws-mwaa-local-runner/#readme) to build the docker image, run the container, and do local development. 
diff --git a/dags/index_to_prod_dag.py b/dags/index_to_prod_dag.py index 675ae29f8..2b68abb65 100644 --- a/dags/index_to_prod_dag.py +++ b/dags/index_to_prod_dag.py @@ -11,7 +11,7 @@ schedule=None, start_date=datetime(2023, 1, 1), catchup=False, - params={'collection_id': Param(None, description="Collection ID to index")}, + params={'collection_id': Param(None, description="Collection ID to move to prod")}, tags=["rikolti"], ) def index_to_prod_dag(): diff --git a/dags/requirements.txt b/dags/requirements.txt index 3f215a3d7..3ede404d1 100644 --- a/dags/requirements.txt +++ b/dags/requirements.txt @@ -1,5 +1,6 @@ --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.6.3/constraints-3.10.txt" boto3 +opensearch-py requests sickle python-dotenv diff --git a/env.example b/env.example index 824102d0c..ef3e91ce8 100644 --- a/env.example +++ b/env.example @@ -32,7 +32,11 @@ export CONTENT_ROOT=file:///usr/local/airflow/rikolti_content # indexer export RIKOLTI_ES_ENDPOINT= # ask for endpoint url -export RIKOLTI_ES_PASS= # ask for password - export RIKOLTI_HOME=/usr/local/airflow/dags/rikolti -export INDEX_RETENTION=1 # number of unaliased indices to retain during cleanup \ No newline at end of file +export INDEX_RETENTION=1 # number of unaliased indices to retain during cleanup + +# indexer when run locally via aws-mwaa-local-runner +# export AWS_ACCESS_KEY_ID= +# export AWS_SECRET_ACCESS_KEY= +# export AWS_SESSION_TOKEN= +# export AWS_REGION=us-west-2 \ No newline at end of file diff --git a/record_indexer/README.md b/record_indexer/README.md index 3efa68df0..d88d1f875 100644 --- a/record_indexer/README.md +++ b/record_indexer/README.md @@ -8,7 +8,7 @@ python index_templates/rikolti_template.py This creates a template that will be used whenever an index with name matching `rikolti*` is added to the cluster. 
-## Run indexer +## Run indexer from command line Create a new index for a collection and add it to the `rikolti-stg` alias: @@ -22,8 +22,29 @@ Add the current stage index for a collection to the `rikolti-prd` alias: python -m record_indexer.move_index_to_prod ``` +## Indexer development using aws-mwaa-local-runner +See the Rikolti README page section on [Airflow Development](https://github.com/ucldc/rikolti/#airflow-development). In particular, make sure that indexer-related env vars are set as described there. +## Index lifecycle + +The lifecycle of an index is as follows: + +#### Create new index +1. Create a new index named `rikolti-{collection_id}-{version}`, where `version` is the current datetime. +2. Remove any existing indices for the collection from the `rikolti-stg` alias. +3. Add the new index to the `rikolti-stg` alias. +4. Delete any older unaliased indices, retaining the number of unaliased indices specified by `settings.INDEX_RETENTION`. + +Note that the index creation code enforces the existence of one stage index at a time. + +#### Move staged index to production +1. Identify the current stage index for the collection. +2. Remove any existing indices for the collection from the `rikolti-prd` alias. +3. Add the current stage index to the `rikolti-prd` alias. (This means that at this stage in the lifecycle, the index will be aliased to `rikolti-stg` and `rikolti-prd` at the same time.) + +#### Delete old index +This happens during index creation (see step 4 above). 
diff --git a/record_indexer/add_page_to_index.py b/record_indexer/add_page_to_index.py index fffdfa1de..0c252ff9e 100644 --- a/record_indexer/add_page_to_index.py +++ b/record_indexer/add_page_to_index.py @@ -43,7 +43,7 @@ def build_bulk_request_body(records: list, index: str): # https://opensearch.org/docs/1.2/opensearch/rest-api/document-apis/bulk/ body = "" for record in records: - doc_id = record.get("calisphere-id") + doc_id = record.get("id") action = {"create": {"_index": index, "_id": doc_id}} diff --git a/record_indexer/requirements.txt b/record_indexer/requirements.txt index 558aee9cb..76fb42aa7 100644 --- a/record_indexer/requirements.txt +++ b/record_indexer/requirements.txt @@ -1,4 +1,5 @@ boto3 +opensearch-py python-dotenv requests requests-aws4auth diff --git a/record_indexer/settings.py b/record_indexer/settings.py index 2e1aa9132..9fa9b99be 100644 --- a/record_indexer/settings.py +++ b/record_indexer/settings.py @@ -1,11 +1,17 @@ import os +from boto3 import Session from dotenv import load_dotenv +from opensearchpy import AWSV4SignerAuth load_dotenv() +def get_auth(): + credentials = Session().get_credentials() + return AWSV4SignerAuth(credentials, os.environ.get("AWS_REGION")) + ENDPOINT = os.environ.get("RIKOLTI_ES_ENDPOINT") -AUTH = ("rikolti", os.environ.get("RIKOLTI_ES_PASS")) +AUTH = get_auth() RIKOLTI_HOME = os.environ.get("RIKOLTI_HOME", "/usr/local/airflow/dags/rikolti") RECORD_INDEX_CONFIG = os.sep.join( diff --git a/requirements_dev.txt b/requirements_dev.txt index 712efe192..796b61651 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,5 +1,6 @@ -r ./metadata_mapper/requirements.txt -r ./metadata_fetcher/requirements.txt +-r ./record_indexer/requirements.txt ipython ruff isort