From b90f585c080b22b9dcae49c4ba6fc89cca525b8c Mon Sep 17 00:00:00 2001 From: gilh Date: Wed, 25 Oct 2023 15:07:02 +0100 Subject: [PATCH] Add indexer task for flash videos. --- manage/airflow/dags/_common_.py | 2 +- manage/airflow/dags/warc_cdx.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/manage/airflow/dags/_common_.py b/manage/airflow/dags/_common_.py index dd8d856..408d4d5 100755 --- a/manage/airflow/dags/_common_.py +++ b/manage/airflow/dags/_common_.py @@ -30,7 +30,7 @@ class Config(): # Define the common parameters for running Docker tasks: w3act_task_image = 'ukwa/python-w3act:2.1.5' - ukwa_task_image = 'ukwa/ukwa-manage:2.4.1' + ukwa_task_image = 'ukwa/ukwa-manage:2.4.2' ukwa_reports_image = 'ukwa/ukwa-reports:1.0.0' hadoop_docker_image = 'ukwa/docker-hadoop:2.1.2' postgres_image = 'postgres:9.6.2' diff --git a/manage/airflow/dags/warc_cdx.py b/manage/airflow/dags/warc_cdx.py index eb0fc5d..a9486fd 100644 --- a/manage/airflow/dags/warc_cdx.py +++ b/manage/airflow/dags/warc_cdx.py @@ -112,6 +112,18 @@ def generate_cdx_dag(hadoop_service): command='windex cdx-index -v -t {{ params.trackdb_url }} -H {{ params.hadoop_service }} -S webrecorder -c {{ params.cdx_service }} -C {{ params.cdx_collection }} -B {{ params.batch_size }} --years-back {{ params.years_back }}', ) + cdx_wf = DockerOperator( + task_id=f'index_{hadoop_service}_warcit-flash-videos_cdx', + image=c.ukwa_task_image, + # Add Hadoop settings: + entrypoint=entrypoint, + environment= { + 'MRJOB_CONF': mrjob_conf, + 'PUSH_GATEWAY': c.push_gateway, + }, + command='windex cdx-index -v -t {{ params.trackdb_url }} -H {{ params.hadoop_service }} -S warcit-flash-videos -c {{ params.cdx_service }} -C {{ params.cdx_collection }} -B {{ params.batch_size }} --years-back {{ params.years_back }}', + ) + cdx_fc = DockerOperator( task_id=f'index_{hadoop_service}_frequent_cdx', image=c.ukwa_task_image,