diff --git a/application/Makefile b/application/Makefile
index af9ec77da..74de5cdf6 100644
--- a/application/Makefile
+++ b/application/Makefile
@@ -1,7 +1,7 @@
 .PHONY: help setup run-develop build-docker clean
 
-VERSION_APPLICATION=0.0.96
-VERSION_GRPC=0.0.96
+VERSION_APPLICATION=0.0.97
+VERSION_GRPC=0.0.97
 
 .DEFAULT: help
 help:
diff --git a/application/src/tira/endpoints/data_api.py b/application/src/tira/endpoints/data_api.py
index c8fdf6d89..4cb21d429 100644
--- a/application/src/tira/endpoints/data_api.py
+++ b/application/src/tira/endpoints/data_api.py
@@ -1,5 +1,9 @@
 import logging
 import json
+import textwrap
+
+from django.core.exceptions import BadRequest
+
 from tira.forms import *
 import tira.tira_model as model
 from tira.checks import check_permissions, check_resources_exist, check_conditional_permissions
@@ -434,36 +438,168 @@ def add_registration(request, context, task_id, vm_id):
         return JsonResponse({'status': 0, "message": f"Encountered an exception: {e}"}, status=HTTPStatus.INTERNAL_SERVER_ERROR)
 
 
+def expand_links(component):
+    """Attach the derived ir_datasets and TIREx links to ``component``.
+
+    Works on a copy of the existing ``links`` list and appends a link to
+    the ir_datasets documentation if ``ir_datasets_id`` is set and a link
+    to the submission page if ``tirex_submission_id`` is set. The
+    ``links`` key is only written when at least one link exists.
+
+    Returns the same ``component`` dict, which is modified in place.
+    """
+    links = [*component.get('links', [])]
+    ir_datasets_id = component.get('ir_datasets_id')
+    if ir_datasets_id:
+        # "a/b" lives on the page of its base dataset "a" at anchor "#a/b".
+        base, has_subset, _ = ir_datasets_id.partition('/')
+        fragment = f'#{ir_datasets_id}' if has_subset else ''
+
+        links.append({
+            'display_name': 'ir_datasets',
+            'href': f'https://ir-datasets.com/{base}.html{fragment}',
+            'target': '_blank',
+        })
+
+    tirex_submission_id = component.get('tirex_submission_id')
+    if tirex_submission_id:
+        links.append({
+            'display_name': 'Submission in TIREx',
+            'href': f'/submissions/{tirex_submission_id}',
+        })
+
+    if links:
+        component['links'] = links
+
+    return component
+
+
+def flatten_components(components):
+    """Turn the nested ``identifier -> data`` mapping into a list of dicts.
+
+    Every entry becomes ``{'identifier': identifier, **data}``; a nested
+    ``components`` mapping is converted recursively and each resulting
+    component is passed through ``expand_links``.
+    """
+    flattened_components = []
+    for identifier, data in components.items():
+        # ``**data`` already copies every key, ``tirex_submission_id``
+        # included, so nothing has to be carried over by hand.
+        component = {'identifier': identifier, **data}
+
+        if 'components' in component:
+            component['components'] = flatten_components(data['components'])
+
+        flattened_components.append(expand_links(component))
+
+    return flattened_components
+
+
 @add_context
 def tirex_components(request, context):
-    context['tirex_components'] = settings.TIREX_COMPONENTS
+    context['tirex_components'] = flatten_components(settings.TIREX_COMPONENTS)
     return JsonResponse({'status': 0, 'context': context})
 
 
-def get_snippet_to_run_components(request):
-    all_components = settings.TIREX_COMPONENTS
-    component_ids = request.GET.get('components', 'false')
+def flatten_tirex_components_to_id(obj, t=None):
+    """Map each ``tirex_submission_id`` found in ``obj`` to its component.
+
+    Walks the nested dicts recursively; values that are not dicts yield an
+    empty mapping. Every component found is tagged in place with
+    ``type``, the key of the top-level category it sits under, so the
+    dicts in ``obj`` are modified. If an id occurs more than once, the
+    entry visited last wins.
+    """
+    ret = {}
 
-    # All links with display_name == "Submission in TIREx" have the ID of the component in their link, its a bit ugly, but at the moment we need to extract the ID from there.
-    # E.g., the ID from the URL "/submissions/ir-benchmarks/ows/query-segmentation-hyp-a" would be ir-benchmarks/ows/query-segmentation-hyp-a
+    if not isinstance(obj, dict):
+        return ret
 
-    # Also Ugly: we need to determine which type of processor (query processor, document processor, etc) something is by using its top-level category, e.g., "Query Processing".
+    if 'tirex_submission_id' in obj:
+        obj['type'] = t
+        ret[obj['tirex_submission_id']] = obj
 
-    # I think it makes sense to build a small method that uses the settings.TIREX_COMPONENTS as input and produces a mapping form component ID (the thing below "Submission in TIREx") to the properties, e.g., query processor true or false, etc.
+    for k, v in obj.items():
+        # Below the top level ``t`` is already set and is passed on as is.
+        ret.update(flatten_tirex_components_to_id(v, t if t else k))
 
-    # I think we can hard code everything against ROBUST04, we can switch this later.
-    dataset_initialization = 'dataset = pt.get_dataset("irds:disks45/nocr/trec-robust-2004")\n'
+    return ret
 
-    additional_variables = ''
-    # If we have a query processor, we need to add an additional variable "topics"
-    # just for this hard coded example:
-    current_component_is_query_processor = True
-    if current_component_is_query_processor:
-        additional_variables += "topics = dataset.get_topics(variant='title')\n"
+
+TIREX_ID_TO_COMPONENT = flatten_tirex_components_to_id(settings.TIREX_COMPONENTS)
+
+
+def get_snippet_to_run_components(request):
+    """Return a JSON response with a code snippet for one component.
+
+    The component is looked up in ``TIREX_ID_TO_COMPONENT`` by the
+    ``component`` GET parameter. An unknown component or an unknown
+    component type is answered with ``status`` 1 and a message;
+    otherwise ``status`` is 0 and the snippet is in ``context``.
+    """
+    component_key = request.GET.get('component')
+
+    if component_key not in TIREX_ID_TO_COMPONENT:
+        return JsonResponse({'status': 1, 'message': f'Component "{component_key}" not found.'})
+
+    component = TIREX_ID_TO_COMPONENT[component_key]
+    component_type = component['type']
+    dataset_initialization = textwrap.dedent('''
+        # You can replace Robust04 with other datasets
+        dataset = pt.get_dataset("irds:disks45/nocr/trec-robust-2004")
+    ''').strip()
+    snippet = ''
+
+    if component_type == 'dataset':
+        # These snippets set up their own data, so drop the default dataset.
+        dataset_initialization = ''
+        ir_datasets_id = component.get('ir_datasets_id')
+        if ir_datasets_id:
+            snippet = f'''
+                dataset = pt.get_dataset('irds:{ir_datasets_id}')
+
+                indexer = pt.IterDictIndexer('./index')
+                indexref = indexer.index(dataset.get_corpus_iter())
+            '''
+        else:
+            snippet = f'''
+                def get_corpus_iter():
+                    # Iterate over the {component['display_name']} corpus
+                    corpus = ...
+                    for doc in corpus:
+                        yield {{'docno': doc.docno, 'text': doc.content}}
+
+                indexer = pt.IterDictIndexer('./index')
+                indexref = indexer.index(get_corpus_iter())
+            '''
+    elif component_type == 'document_processing':
+        tirex_submission_id = component.get('tirex_submission_id')
+        if tirex_submission_id:
+            snippet = f'''
+                transformed_docs = tira.pt.transform_documents('{tirex_submission_id}', dataset)
+            '''
+    elif component_type == 'query_processing':
+        tirex_submission_id = component.get('tirex_submission_id')
+        if tirex_submission_id:
+            snippet = f'''
+                topics = dataset.get_topics(variant='title')
+                transformed_queries = tira.pt.transform_queries('{tirex_submission_id}', topics)
+            '''
+    elif component_type in ('retrieval', 'reranking'):
+        tirex_submission_id = component.get('tirex_submission_id')
+        if tirex_submission_id:
+            snippet = f'''
+                run = tira.pt.from_retriever_submission('{tirex_submission_id}', dataset=dataset_id)
+            '''
+    else:
+        return JsonResponse({'status': 1, 'message': f'Component type "{component_type}" does not exist...'})
 
-    component_definitions = "tira.pt.transform_queries('ir-benchmarks/ows/query-segmentation-hyb-i', dataset)\n"
+    if snippet:
+        snippet = textwrap.dedent(snippet).strip()
 
-    snippet = (dataset_initialization + additional_variables + component_definitions).strip()
+    if dataset_initialization:
+        # strip() avoids a dangling newline when no snippet was generated.
+        snippet = f'{dataset_initialization}\n{snippet}'.strip()
 
     return JsonResponse({'status': 0, 'context': {'snippet': snippet}})
 
diff --git a/application/src/tira/frontend-vuetify/src/IrComponents.vue b/application/src/tira/frontend-vuetify/src/IrComponents.vue
index 7c7cb05b2..1d908f5c4 100644
--- a/application/src/tira/frontend-vuetify/src/IrComponents.vue
+++ b/application/src/tira/frontend-vuetify/src/IrComponents.vue
@@ -40,15 +40,16 @@
 {{ link.display_name }}
-
+