Skip to content

Commit

Permalink
Add indexer task for flash videos.
Browse files Browse the repository at this point in the history
  • Loading branch information
GilHoggarth committed Oct 25, 2023
1 parent a878272 commit b90f585
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 1 deletion.
2 changes: 1 addition & 1 deletion manage/airflow/dags/_common_.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class Config():

# Define the common parameters for running Docker tasks:
w3act_task_image = 'ukwa/python-w3act:2.1.5'
-ukwa_task_image = 'ukwa/ukwa-manage:2.4.1'
+ukwa_task_image = 'ukwa/ukwa-manage:2.4.2'
ukwa_reports_image = 'ukwa/ukwa-reports:1.0.0'
hadoop_docker_image = 'ukwa/docker-hadoop:2.1.2'
postgres_image = 'postgres:9.6.2'
Expand Down
12 changes: 12 additions & 0 deletions manage/airflow/dags/warc_cdx.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,18 @@ def generate_cdx_dag(hadoop_service):
command='windex cdx-index -v -t {{ params.trackdb_url }} -H {{ params.hadoop_service }} -S webrecorder -c {{ params.cdx_service }} -C {{ params.cdx_collection }} -B {{ params.batch_size }} --years-back {{ params.years_back }}',
)

# CDX indexing task for the 'warcit-flash-videos' stream. Runs `windex cdx-index`
# in the image named by c.ukwa_task_image; the task_id is made unique per
# Hadoop service via the hadoop_service value interpolated into it.
cdx_wf = DockerOperator(
task_id=f'index_{hadoop_service}_warcit-flash-videos_cdx',
image=c.ukwa_task_image,
# Add Hadoop settings:
entrypoint=entrypoint,
environment= {
'MRJOB_CONF': mrjob_conf,
'PUSH_GATEWAY': c.push_gateway,
},
# Same command line as the webrecorder indexing task above; only the -S value
# differs (presumably -S selects the stream to index -- confirm against the
# windex CLI). The {{ params.* }} placeholders are template expressions, not
# Python f-string fields.
command='windex cdx-index -v -t {{ params.trackdb_url }} -H {{ params.hadoop_service }} -S warcit-flash-videos -c {{ params.cdx_service }} -C {{ params.cdx_collection }} -B {{ params.batch_size }} --years-back {{ params.years_back }}',
)

cdx_fc = DockerOperator(
task_id=f'index_{hadoop_service}_frequent_cdx',
image=c.ukwa_task_image,
Expand Down

0 comments on commit b90f585

Please sign in to comment.