From 64151695be68bc91ff9d7168d440d4d4205f458d Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Mon, 27 Jan 2025 10:37:22 -0600 Subject: [PATCH 01/16] fix(recap): Replicate attachment pages from RECAP fetch API Fixes: #4828 --- cl/recap/tasks.py | 92 ++++++++++++++++++++++-- cl/recap/tests.py | 173 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 261 insertions(+), 4 deletions(-) diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py index e63a45dfa6..2901c3609c 100644 --- a/cl/recap/tasks.py +++ b/cl/recap/tasks.py @@ -171,7 +171,10 @@ def do_pacer_fetch(fq: PacerFetchQueue): mark_fq_successful.si(fq.pk), ).apply_async() elif fq.request_type == REQUEST_TYPE.ATTACHMENT_PAGE: - result = fetch_attachment_page.apply_async(args=(fq.pk,)) + result = chain( + fetch_attachment_page.si(fq.pk), + replicate_fq_att_page_to_subdocket_rds.s(), + ).apply_async() return result @@ -1970,7 +1973,9 @@ def fetch_pacer_doc_by_rd( ignore_result=True, ) @transaction.atomic -def fetch_attachment_page(self: Task, fq_pk: int) -> None: +def fetch_attachment_page( + self: Task, fq_pk: int +) -> tuple[list[RECAPDocument], int | None] | None: """Fetch a PACER attachment page by rd_pk This is very similar to process_recap_attachment, except that it manages @@ -1978,8 +1983,11 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> None: :param self: The celery task :param fq_pk: The PK of the RECAP Fetch Queue to update. - :return: None + :return: A two-tuple containing a list of RDs that require replication and + the PQ ID for the first RD to be replicated, or None if replication to + sub-dockets is not required. """ + fq = PacerFetchQueue.objects.get(pk=fq_pk) rd = fq.recap_document # Check court connectivity, if fails retry the task, hopefully, it'll be @@ -1993,24 +2001,26 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> None: raise self.retry() mark_fq_status(fq, "", PROCESSING_STATUS.IN_PROGRESS) - if not rd.pacer_doc_id: msg = ( "Unable to get attachment page: Unknown pacer_doc_id for " "RECAP Document object %s" % rd.pk ) mark_fq_status(fq, msg, PROCESSING_STATUS.NEEDS_INFO) + self.request.chain = None return if rd.pacer_doc_id.count("-") > 1: msg = "ACMS attachment pages are not currently supported" mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + self.request.chain = None return session_data = get_pacer_cookie_from_cache(fq.user_id) if not session_data: msg = "Unable to find cached cookies. Aborting request." mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + self.request.chain = None return try: @@ -2023,6 +2033,7 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> None: ]: if self.request.retries == self.max_retries: mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + self.request.chain = None return logger.info( f"Ran into HTTPError: {exc.response.status_code}. Retrying." @@ -2030,11 +2041,13 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> None: raise self.retry(exc=exc) else: mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + self.request.chain = None return except requests.RequestException as exc: if self.request.retries == self.max_retries: msg = "Failed to get attachment page from network." mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + self.request.chain = None return logger.info("Ran into a RequestException. 
Retrying.") raise self.retry(exc=exc) @@ -2064,6 +2077,7 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> None: if att_data == {}: msg = "Not a valid attachment page upload" mark_fq_status(fq, msg, PROCESSING_STATUS.INVALID_CONTENT) + self.request.chain = None return try: @@ -2082,17 +2096,87 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> None: "attachment data" ) mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + self.request.chain = None return except RECAPDocument.DoesNotExist as exc: msg = "Could not find docket to associate with attachment metadata" if self.request.retries == self.max_retries: mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + self.request.chain = None return mark_fq_status(fq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY) raise self.retry(exc=exc) msg = "Successfully completed fetch and save." mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL) + # Logic to replicate the attachment page to sub-dockets matched by RECAPDocument + court_id = rd.docket_entry.docket.court_id + sub_docket_main_rds = list( + get_main_rds(court_id, rd.pacer_doc_id).exclude( + docket_entry__docket__pacer_case_id=rd.docket_entry.docket.pacer_case_id + ) + ) + first_sub_docket_rd = ( + sub_docket_main_rds.pop() if sub_docket_main_rds else None + ) + if first_sub_docket_rd: + # Create a PQ related to the first RD matched that requires replication. + # Use it as a container for the attachment file and metadata for the + # remaining cases that require replication. + first_pq_created = ProcessingQueue.objects.create( + uploader_id=fq.user_id, + pacer_doc_id=first_sub_docket_rd.pacer_doc_id, + pacer_case_id=first_sub_docket_rd.docket_entry.docket.pacer_case_id, + court_id=court_id, + upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE, + filepath_local=ContentFile( + text.encode(), name="attachment_page.html" + ), + ) + return [sub_rd for sub_rd in sub_docket_main_rds], first_pq_created.pk + + self.request.chain = None + return None + + +@app.task( + bind=True, + ignore_result=True, +) +def replicate_fq_att_page_to_subdocket_rds( + self: Task, rds_and_pq: tuple[list[RECAPDocument], int | None] +) -> None: + """Replicate Attachment page from a FQ to subdocket RECAPDocuments. + + :param self: The celery task + :param rds_and_pq: A two-tuple containing a list of RDs that require + replication and the PQ ID for the first RD to be replicated + :return: None + """ + + main_rds, first_pq_created_id = rds_and_pq + pqs_to_process_pks = [first_pq_created_id] + first_pq_created = ProcessingQueue.objects.get(pk=first_pq_created_id) + att_content = first_pq_created.filepath_local.read() + with transaction.atomic(): + for main_rd in main_rds: + main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id + # Create additional pqs for each subdocket case found. 
+ pq_created = ProcessingQueue.objects.create( + uploader_id=first_pq_created.uploader_id, + pacer_doc_id=main_rd.pacer_doc_id, + pacer_case_id=main_pacer_case_id, + court_id=first_pq_created.court_id, + upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE, + filepath_local=ContentFile( + att_content, name=first_pq_created.filepath_local.name + ), + ) + pqs_to_process_pks.append(pq_created.pk) + + for pq_pk in pqs_to_process_pks: + async_to_sync(process_recap_attachment)(pq_pk) + def get_fq_docket_kwargs(fq): """Gather the kwargs for the Juriscraper DocketReport from the fq object diff --git a/cl/recap/tests.py b/cl/recap/tests.py index fbb745e067..d6052123bf 100644 --- a/cl/recap/tests.py +++ b/cl/recap/tests.py @@ -1436,6 +1436,179 @@ def test_processing_subdocket_case_pdf_attachment_upload( transaction.set_rollback(True) + @mock.patch( + "cl.recap.tasks.get_att_report_by_rd", + return_value=fakes.FakeAttachmentPage, + ) + @mock.patch( + "cl.recap.tasks.is_pacer_court_accessible", + side_effect=lambda a: True, + ) + @mock.patch( + "cl.recap.tasks.get_pacer_cookie_from_cache", + side_effect=lambda x: True, + ) + def test_replicate_subdocket_case_attachment_page_from_fq( + self, + mock_get_pacer_cookie_from_cache, + mock_is_pacer_court_accessible, + mock_get_att_report_by_rd, + ): + """Can we replicate an attachment page Fetch API purchase from a + subdocket case to its corresponding RD across all related dockets? + """ + + # Add the docket entry to every case. + async_to_sync(add_docket_entries)( + self.d_1, self.de_data_2["docket_entries"] + ) + async_to_sync(add_docket_entries)( + self.d_2, self.de_data_2["docket_entries"] + ) + async_to_sync(add_docket_entries)( + self.d_3, self.de_data_2["docket_entries"] + ) + + d_1_recap_document = RECAPDocument.objects.filter( + docket_entry__docket=self.d_1 + ) + d_2_recap_document = RECAPDocument.objects.filter( + docket_entry__docket=self.d_2 + ) + d_3_recap_document = RECAPDocument.objects.filter( + docket_entry__docket=self.d_3 + ) + main_d_1_rd = d_1_recap_document[0] + main_d_2_rd = d_2_recap_document[0] + main_d_3_rd = d_3_recap_document[0] + + # Create FQ. + fq = PacerFetchQueue.objects.create( + user=User.objects.get(username="recap"), + request_type=REQUEST_TYPE.ATTACHMENT_PAGE, + recap_document_id=main_d_2_rd.pk, + ) + + with mock.patch( + "cl.recap.tasks.get_data_from_att_report", + side_effect=lambda x, y: self.att_data_2, + ): + do_pacer_fetch(fq) + + # After adding attachments, it should exist 3 RD on every docket. + self.assertEqual( + d_1_recap_document.count(), + 3, + msg=f"Didn't get the expected number of RDs for the docket with PACER case ID {self.d_2.pacer_case_id}.", + ) + self.assertEqual( + d_2_recap_document.count(), + 3, + msg=f"Didn't get the expected number of RDs for the docket with PACER case ID {self.d_1.pacer_case_id}.", + ) + self.assertEqual( + d_3_recap_document.count(), + 3, + msg=f"Didn't get the expected number of RDs for the docket with PACER case ID {self.d_3.pacer_case_id}.", + ) + + main_d_1_rd.refresh_from_db() + main_d_2_rd.refresh_from_db() + main_d_3_rd.refresh_from_db() + self.assertEqual( + main_d_1_rd.pacer_doc_id, + self.de_data_2["docket_entries"][0]["pacer_doc_id"], + ) + self.assertEqual( + main_d_2_rd.pacer_doc_id, + self.de_data_2["docket_entries"][0]["pacer_doc_id"], + ) + self.assertEqual( + main_d_3_rd.pacer_doc_id, + self.de_data_2["docket_entries"][0]["pacer_doc_id"], + ) + + # Two of them should be attachments. 
+ d_1_attachments = RECAPDocument.objects.filter( + docket_entry__docket=self.d_1, + document_type=RECAPDocument.ATTACHMENT, + ) + d_2_attachments = RECAPDocument.objects.filter( + docket_entry__docket=self.d_2, + document_type=RECAPDocument.ATTACHMENT, + ) + d_3_attachments = RECAPDocument.objects.filter( + docket_entry__docket=self.d_3, + document_type=RECAPDocument.ATTACHMENT, + ) + + self.assertEqual( + d_1_attachments.count(), + 2, + msg=f"Didn't get the expected number of RDs Attachments for the docket with PACER case ID {self.d_1.pacer_case_id}.", + ) + self.assertEqual( + d_2_attachments.count(), + 2, + msg=f"Didn't get the expected number of RDs Attachments for the docket with PACER case ID {self.d_2.pacer_case_id}.", + ) + self.assertEqual( + d_3_attachments.count(), + 2, + msg=f"Didn't get the expected number of RDs Attachments for the docket with PACER case ID {self.d_3.pacer_case_id}.", + ) + + att_1_data = self.att_data_2["attachments"][0] + att_2_data = self.att_data_2["attachments"][0] + + self.assertEqual( + d_1_attachments.filter(pacer_doc_id=att_1_data["pacer_doc_id"]) + .first() + .attachment_number, + att_1_data["attachment_number"], + ) + self.assertEqual( + d_1_attachments.filter(pacer_doc_id=att_2_data["pacer_doc_id"]) + .first() + .attachment_number, + att_2_data["attachment_number"], + ) + self.assertEqual( + d_2_attachments.filter(pacer_doc_id=att_1_data["pacer_doc_id"]) + .first() + .attachment_number, + att_1_data["attachment_number"], + ) + self.assertEqual( + d_2_attachments.filter(pacer_doc_id=att_2_data["pacer_doc_id"]) + .first() + .attachment_number, + att_2_data["attachment_number"], + ) + + # Assert the number of PQs created to process the additional subdocket RDs. + pqs_created = ProcessingQueue.objects.all() + self.assertEqual(pqs_created.count(), 2) + + pqs_status = {pq.status for pq in pqs_created} + self.assertEqual(pqs_status, {PROCESSING_STATUS.SUCCESSFUL}) + + pqs_related_dockets = {pq.docket_id for pq in pqs_created} + self.assertEqual( + pqs_related_dockets, + {self.d_1.pk, self.d_3.pk}, + msg="Docket ids didn't match.", + ) + # 3 PacerHtmlFiles should have been created, one for each case. 
+ att_html_created = PacerHtmlFiles.objects.all() + self.assertEqual(att_html_created.count(), 3) + related_htmls_de = { + html.content_object.pk for html in att_html_created + } + self.assertEqual( + {de.pk for de in DocketEntry.objects.all()}, related_htmls_de + ) + @mock.patch("cl.recap.tasks.DocketReport", new=fakes.FakeDocketReport) @mock.patch( From 69e7ac68c1114c10257ae33b9a0740ed05942cdb Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Mon, 27 Jan 2025 13:16:25 -0600 Subject: [PATCH 02/16] fix(recap): Added early abort to replicate_fq_att_page_to_subdocket_rds --- cl/recap/tasks.py | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py index 2901c3609c..933847bb40 100644 --- a/cl/recap/tasks.py +++ b/cl/recap/tasks.py @@ -1975,7 +1975,7 @@ def fetch_pacer_doc_by_rd( @transaction.atomic def fetch_attachment_page( self: Task, fq_pk: int -) -> tuple[list[RECAPDocument], int | None] | None: +) -> tuple[list[RECAPDocument], int] | None: """Fetch a PACER attachment page by rd_pk This is very similar to process_recap_attachment, except that it manages @@ -2143,8 +2143,9 @@ def fetch_attachment_page( bind=True, ignore_result=True, ) +@transaction.atomic def replicate_fq_att_page_to_subdocket_rds( - self: Task, rds_and_pq: tuple[list[RECAPDocument], int | None] + self: Task, rds_and_pq: tuple[list[RECAPDocument], int] | None ) -> None: """Replicate Attachment page from a FQ to subdocket RECAPDocuments. @@ -2154,25 +2155,29 @@ def replicate_fq_att_page_to_subdocket_rds( :return: None """ + if not rds_and_pq: + # Nothing to do. Aborting the task. + # This is required in Celery eager mode, where self.request.chain has no effect. + return None + main_rds, first_pq_created_id = rds_and_pq pqs_to_process_pks = [first_pq_created_id] first_pq_created = ProcessingQueue.objects.get(pk=first_pq_created_id) att_content = first_pq_created.filepath_local.read() - with transaction.atomic(): - for main_rd in main_rds: - main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id - # Create additional pqs for each subdocket case found. - pq_created = ProcessingQueue.objects.create( - uploader_id=first_pq_created.uploader_id, - pacer_doc_id=main_rd.pacer_doc_id, - pacer_case_id=main_pacer_case_id, - court_id=first_pq_created.court_id, - upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE, - filepath_local=ContentFile( - att_content, name=first_pq_created.filepath_local.name - ), - ) - pqs_to_process_pks.append(pq_created.pk) + for main_rd in main_rds: + main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id + # Create additional pqs for each subdocket case found. 
+ pq_created = ProcessingQueue.objects.create( + uploader_id=first_pq_created.uploader_id, + pacer_doc_id=main_rd.pacer_doc_id, + pacer_case_id=main_pacer_case_id, + court_id=first_pq_created.court_id, + upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE, + filepath_local=ContentFile( + att_content, name=first_pq_created.filepath_local.name + ), + ) + pqs_to_process_pks.append(pq_created.pk) for pq_pk in pqs_to_process_pks: async_to_sync(process_recap_attachment)(pq_pk) From 0d6df0d168388e2d5abf26119d2798915378a379 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Mon, 27 Jan 2025 14:21:20 -0600 Subject: [PATCH 03/16] fix(recap): Refactor to optimize ProcessingQueue creation in bulk --- cl/recap/tasks.py | 99 +++++++++++++++++------------------------------ 1 file changed, 35 insertions(+), 64 deletions(-) diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py index 933847bb40..97e9708e7f 100644 --- a/cl/recap/tasks.py +++ b/cl/recap/tasks.py @@ -1973,9 +1973,7 @@ def fetch_pacer_doc_by_rd( ignore_result=True, ) @transaction.atomic -def fetch_attachment_page( - self: Task, fq_pk: int -) -> tuple[list[RECAPDocument], int] | None: +def fetch_attachment_page(self: Task, fq_pk: int) -> list[int]: """Fetch a PACER attachment page by rd_pk This is very similar to process_recap_attachment, except that it manages @@ -1983,9 +1981,7 @@ def fetch_attachment_page( :param self: The celery task :param fq_pk: The PK of the RECAP Fetch Queue to update. - :return: A two-tuple containing a list of RDs that require replication and - the PQ ID for the first RD to be replicated, or None if replication to - sub-dockets is not required. + :return: A list of PQ IDs that require replication to sub-dockets. """ fq = PacerFetchQueue.objects.get(pk=fq_pk) @@ -1997,7 +1993,7 @@ def fetch_attachment_page( msg = f"Blocked by court: {rd.docket_entry.docket.court_id}" mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None - return None + return [] raise self.retry() mark_fq_status(fq, "", PROCESSING_STATUS.IN_PROGRESS) @@ -2008,20 +2004,20 @@ def fetch_attachment_page( ) mark_fq_status(fq, msg, PROCESSING_STATUS.NEEDS_INFO) self.request.chain = None - return + return [] if rd.pacer_doc_id.count("-") > 1: msg = "ACMS attachment pages are not currently supported" mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None - return + return [] session_data = get_pacer_cookie_from_cache(fq.user_id) if not session_data: msg = "Unable to find cached cookies. Aborting request." mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None - return + return [] try: r = get_att_report_by_rd(rd, session_data) @@ -2034,7 +2030,7 @@ def fetch_attachment_page( if self.request.retries == self.max_retries: mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None - return + return [] logger.info( f"Ran into HTTPError: {exc.response.status_code}. Retrying." ) @@ -2042,13 +2038,13 @@ def fetch_attachment_page( else: mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None - return + return [] except requests.RequestException as exc: if self.request.retries == self.max_retries: msg = "Failed to get attachment page from network." mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None - return + return [] logger.info("Ran into a RequestException. 
Retrying.") raise self.retry(exc=exc) except PacerLoginException as exc: @@ -2057,7 +2053,7 @@ def fetch_attachment_page( mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) delete_pacer_cookie_from_cache(fq.user_id) self.request.chain = None - return None + return [] mark_fq_status( fq, f"{msg} Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY ) @@ -2078,7 +2074,7 @@ def fetch_attachment_page( msg = "Not a valid attachment page upload" mark_fq_status(fq, msg, PROCESSING_STATUS.INVALID_CONTENT) self.request.chain = None - return + return [] try: async_to_sync(merge_attachment_page_data)( @@ -2097,13 +2093,13 @@ def fetch_attachment_page( ) mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None - return + return [] except RECAPDocument.DoesNotExist as exc: msg = "Could not find docket to associate with attachment metadata" if self.request.retries == self.max_retries: mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None - return + return [] mark_fq_status(fq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY) raise self.retry(exc=exc) msg = "Successfully completed fetch and save." @@ -2116,70 +2112,45 @@ def fetch_attachment_page( docket_entry__docket__pacer_case_id=rd.docket_entry.docket.pacer_case_id ) ) - first_sub_docket_rd = ( - sub_docket_main_rds.pop() if sub_docket_main_rds else None - ) - if first_sub_docket_rd: - # Create a PQ related to the first RD matched that requires replication. - # Use it as a container for the attachment file and metadata for the - # remaining cases that require replication. - first_pq_created = ProcessingQueue.objects.create( - uploader_id=fq.user_id, - pacer_doc_id=first_sub_docket_rd.pacer_doc_id, - pacer_case_id=first_sub_docket_rd.docket_entry.docket.pacer_case_id, - court_id=court_id, - upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE, - filepath_local=ContentFile( - text.encode(), name="attachment_page.html" - ), + sub_docket_pqs = [] + for main_rd in sub_docket_main_rds: + # Create PQs related to RD that require replication. + sub_docket_pqs.append( + ProcessingQueue( + uploader_id=fq.user_id, + pacer_doc_id=main_rd.pacer_doc_id, + pacer_case_id=main_rd.docket_entry.docket.pacer_case_id, + court_id=court_id, + upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE, + filepath_local=ContentFile( + text.encode(), name="attachment_page.html" + ), + ) ) - return [sub_rd for sub_rd in sub_docket_main_rds], first_pq_created.pk + + pqs_created = ProcessingQueue.objects.bulk_create(sub_docket_pqs) + if sub_docket_pqs: + return [pq.pk for pq in pqs_created] self.request.chain = None - return None + return [] @app.task( bind=True, ignore_result=True, ) -@transaction.atomic def replicate_fq_att_page_to_subdocket_rds( - self: Task, rds_and_pq: tuple[list[RECAPDocument], int] | None + self: Task, pq_ids_to_process: list[int] ) -> None: """Replicate Attachment page from a FQ to subdocket RECAPDocuments. :param self: The celery task - :param rds_and_pq: A two-tuple containing a list of RDs that require - replication and the PQ ID for the first RD to be replicated + :param pq_ids_to_process: A list of PQ IDs that require replication to sub-dockets. :return: None """ - if not rds_and_pq: - # Nothing to do. Aborting the task. - # This is required in Celery eager mode, where self.request.chain has no effect. 
- return None - - main_rds, first_pq_created_id = rds_and_pq - pqs_to_process_pks = [first_pq_created_id] - first_pq_created = ProcessingQueue.objects.get(pk=first_pq_created_id) - att_content = first_pq_created.filepath_local.read() - for main_rd in main_rds: - main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id - # Create additional pqs for each subdocket case found. - pq_created = ProcessingQueue.objects.create( - uploader_id=first_pq_created.uploader_id, - pacer_doc_id=main_rd.pacer_doc_id, - pacer_case_id=main_pacer_case_id, - court_id=first_pq_created.court_id, - upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE, - filepath_local=ContentFile( - att_content, name=first_pq_created.filepath_local.name - ), - ) - pqs_to_process_pks.append(pq_created.pk) - - for pq_pk in pqs_to_process_pks: + for pq_pk in pq_ids_to_process: async_to_sync(process_recap_attachment)(pq_pk) From 85342d009a21f67289448c0f76f3ba610f2a9612 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Mon, 27 Jan 2025 14:45:28 -0600 Subject: [PATCH 04/16] fix(recap): Refactor of find_subdocket_att_page_rds and find_subdocket_pdf_rds to use bulk_create --- cl/recap/tasks.py | 101 ++++++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 49 deletions(-) diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py index 97e9708e7f..3cd0923396 100644 --- a/cl/recap/tasks.py +++ b/cl/recap/tasks.py @@ -6,7 +6,7 @@ from datetime import datetime from http import HTTPStatus from multiprocessing import process -from typing import List, Optional, Tuple +from typing import Optional, Tuple from zipfile import ZipFile import requests @@ -736,25 +736,27 @@ async def find_subdocket_att_page_rds( original_file_content = text.encode("utf-8") original_file_name = pq.filepath_local.name - @sync_to_async - def save_pq_instances(): - with transaction.atomic(): - for main_rd in main_rds: - main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id - # Create additional pqs for each subdocket case found. - pq_created = ProcessingQueue.objects.create( - uploader_id=pq.uploader_id, - pacer_doc_id=pacer_doc_id, - pacer_case_id=main_pacer_case_id, - court_id=pq.court_id, - upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE, - filepath_local=ContentFile( - original_file_content, name=original_file_name - ), - ) - pqs_to_process_pks.append(pq_created.pk) + pqs_to_create = [] + async for main_rd in main_rds: + main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id + # Create additional pqs for each subdocket case found. + pqs_to_create.append( + ProcessingQueue( + uploader_id=pq.uploader_id, + pacer_doc_id=pacer_doc_id, + pacer_case_id=main_pacer_case_id, + court_id=pq.court_id, + upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE, + filepath_local=ContentFile( + original_file_content, name=original_file_name + ), + ) + ) + + if pqs_to_create: + pqs_created = await ProcessingQueue.objects.abulk_create(pqs_to_create) + pqs_to_process_pks.extend([pq.pk for pq in pqs_created]) - await save_pq_instances() return pqs_to_process_pks @@ -787,37 +789,38 @@ async def find_subdocket_pdf_rds( pdf_binary_content = pq.filepath_local.read() - @sync_to_async - def save_pq_instances(): - with transaction.atomic(): - for i, main_rd in enumerate(main_rds): - if i == 0 and not pq.pacer_case_id: - # If the original PQ does not have a pacer_case_id, - # assign it a pacer_case_id from one of the matched RDs - # to ensure the RD lookup in process_recap_pdf succeeds. 
- pq.pacer_case_id = ( - main_rd.docket_entry.docket.pacer_case_id - ) - pq.save() - continue + pqs_to_create = [] + main_rds = [rd async for rd in main_rds] + for i, main_rd in enumerate(main_rds): + if i == 0 and not pq.pacer_case_id: + # If the original PQ does not have a pacer_case_id, + # assign it a pacer_case_id from one of the matched RDs + # to ensure the RD lookup in process_recap_pdf succeeds. + pq.pacer_case_id = main_rd.docket_entry.docket.pacer_case_id + await pq.asave() + continue - main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id - # Create additional pqs for each subdocket case found. - pq_created = ProcessingQueue.objects.create( - uploader_id=pq.uploader_id, - pacer_doc_id=pq.pacer_doc_id, - pacer_case_id=main_pacer_case_id, - document_number=pq.document_number, - attachment_number=pq.attachment_number, - court_id=pq.court_id, - upload_type=UPLOAD_TYPE.PDF, - filepath_local=ContentFile( - pdf_binary_content, name=pq.filepath_local.name - ), - ) - pqs_to_process_pks.append(pq_created.pk) + main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id + # Create additional pqs for each subdocket case found. + pqs_to_create.append( + ProcessingQueue( + uploader_id=pq.uploader_id, + pacer_doc_id=pq.pacer_doc_id, + pacer_case_id=main_pacer_case_id, + document_number=pq.document_number, + attachment_number=pq.attachment_number, + court_id=pq.court_id, + upload_type=UPLOAD_TYPE.PDF, + filepath_local=ContentFile( + pdf_binary_content, name=pq.filepath_local.name + ), + ) + ) + + if pqs_to_create: + pqs_created = await ProcessingQueue.objects.abulk_create(pqs_to_create) + pqs_to_process_pks.extend([pq.pk for pq in pqs_created]) - await save_pq_instances() return pqs_to_process_pks @@ -2128,8 +2131,8 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> list[int]: ) ) - pqs_created = ProcessingQueue.objects.bulk_create(sub_docket_pqs) if sub_docket_pqs: + pqs_created = ProcessingQueue.objects.bulk_create(sub_docket_pqs) return [pq.pk for pq in pqs_created] self.request.chain = None From 8368a590fdf9962549467ce94d453d609c1a229b Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Mon, 27 Jan 2025 16:02:50 -0600 Subject: [PATCH 05/16] fix(recap): Add logic to disable the replication of the FQ attachment page for appellate courts --- cl/recap/tasks.py | 5 +++ cl/recap/tests.py | 108 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py index 2566e35950..e5ca706b4a 100644 --- a/cl/recap/tasks.py +++ b/cl/recap/tasks.py @@ -2115,6 +2115,11 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> list[int]: # Logic to replicate the attachment page to sub-dockets matched by RECAPDocument court_id = rd.docket_entry.docket.court_id + if is_appellate_court(court_id): + # Subdocket replication for appellate courts is currently not supported. 
+ self.request.chain = None + return [] + sub_docket_main_rds = list( get_main_rds(court_id, rd.pacer_doc_id).exclude( docket_entry__docket__pacer_case_id=rd.docket_entry.docket.pacer_case_id diff --git a/cl/recap/tests.py b/cl/recap/tests.py index 2705f24714..030494e9dc 100644 --- a/cl/recap/tests.py +++ b/cl/recap/tests.py @@ -1059,6 +1059,9 @@ def setUpTestData(cls): cls.user = User.objects.get(username="recap") cls.f = SimpleUploadedFile("file.txt", b"file content more content") cls.court = CourtFactory.create(jurisdiction="FD", in_use=True) + cls.court_appellate = CourtFactory.create( + jurisdiction="F", in_use=True + ) cls.att_data_2 = AppellateAttachmentPageFactory( attachments=[ AppellateAttachmentFactory( @@ -1100,6 +1103,25 @@ def setUpTestData(cls): pacer_case_id="104492", ) + cls.d_1_a = DocketFactory( + source=Docket.RECAP, + docket_number="23-4568", + court=cls.court_appellate, + pacer_case_id="105490", + ) + cls.d_2_a = DocketFactory( + source=Docket.RECAP, + docket_number="23-4568", + court=cls.court_appellate, + pacer_case_id="105491", + ) + cls.d_3_a = DocketFactory( + source=Docket.RECAP, + docket_number="23-4568", + court=cls.court_appellate, + pacer_case_id="105492", + ) + def test_processing_subdocket_case_attachment_page(self): """Can we replicate an attachment page upload from a subdocket case to its corresponding RD across all related dockets? @@ -1755,6 +1777,92 @@ def test_replicate_subdocket_case_attachment_page_from_fq( {de.pk for de in DocketEntry.objects.all()}, related_htmls_de ) + @mock.patch( + "cl.recap.tasks.get_att_report_by_rd", + return_value=fakes.FakeAttachmentPage, + ) + @mock.patch( + "cl.recap.tasks.is_pacer_court_accessible", + side_effect=lambda a: True, + ) + @mock.patch( + "cl.recap.tasks.get_pacer_cookie_from_cache", + side_effect=lambda x: True, + ) + def test_avoid_appellate_replication_for_subdocket_attachment_page_fq( + self, + mock_get_pacer_cookie_from_cache, + mock_is_pacer_court_accessible, + mock_get_att_report_by_rd, + ): + """Attachment page replication is currently not supported for appellate + subdockets. + """ + # Add the docket entry to every case. + async_to_sync(add_docket_entries)( + self.d_1_a, self.de_data_2["docket_entries"] + ) + async_to_sync(add_docket_entries)( + self.d_2_a, self.de_data_2["docket_entries"] + ) + + d_1_recap_document = RECAPDocument.objects.filter( + docket_entry__docket=self.d_1_a + ) + d_2_recap_document = RECAPDocument.objects.filter( + docket_entry__docket=self.d_2_a + ) + main_d_2_rd = d_2_recap_document[0] + + # Create FQ. + fq = PacerFetchQueue.objects.create( + user=User.objects.get(username="recap"), + request_type=REQUEST_TYPE.ATTACHMENT_PAGE, + recap_document_id=main_d_2_rd.pk, + ) + + with mock.patch( + "cl.recap.tasks.get_data_from_appellate_att_report", + side_effect=lambda x, y: self.att_data_2, + ): + do_pacer_fetch(fq) + + # After adding attachments, it should exist 3 RD in main_d_2_rd and 1 RD + # in the other cases. 
+ self.assertEqual( + d_1_recap_document.count(), + 1, + msg=f"Didn't get the expected number of RDs for the docket with PACER case ID {self.d_2.pacer_case_id}.", + ) + self.assertEqual( + d_2_recap_document.count(), + 3, + msg=f"Didn't get the expected number of RDs for the docket with PACER case ID {self.d_1.pacer_case_id}.", + ) + + main_d_2_rd.refresh_from_db() + self.assertEqual( + main_d_2_rd.pacer_doc_id, + self.de_data_2["docket_entries"][0]["pacer_doc_id"], + ) + d_2_attachments = RECAPDocument.objects.filter( + docket_entry__docket=self.d_2_a, + document_type=RECAPDocument.ATTACHMENT, + ) + self.assertEqual( + d_2_attachments.count(), + 2, + msg=f"Didn't get the expected number of RDs Attachments for the docket with PACER case ID {self.d_2.pacer_case_id}.", + ) + + # No additional PQs created. + pqs_created = ProcessingQueue.objects.all() + self.assertEqual(pqs_created.count(), 0) + + # 1 PacerHtmlFiles should have been created for the FQ request. + att_html_created = PacerHtmlFiles.objects.all() + self.assertEqual(att_html_created.count(), 1) + @mock.patch("cl.recap.tasks.DocketReport", new=fakes.FakeDocketReport) @mock.patch( From 51b7011ff3393c35e45a0b20de06ce318f8e386a Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Mon, 27 Jan 2025 16:21:26 -0600 Subject: [PATCH 06/16] fix(recap): Fixed PACER cookies collision in related FQ tests --- cl/recap/tests.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/cl/recap/tests.py b/cl/recap/tests.py index 030494e9dc..ae181ef0e7 100644 --- a/cl/recap/tests.py +++ b/cl/recap/tests.py @@ -12,7 +12,7 @@ from dateutil.tz import tzutc from django.conf import settings from django.contrib.auth.hashers import make_password -from django.contrib.auth.models import User +from django.contrib.auth.models import Permission, User from django.core import mail from django.core.files.base import ContentFile from django.core.files.uploadedfile import SimpleUploadedFile @@ -1056,7 +1056,15 @@ class ReplicateRecapUploadsTest(TestCase): @classmethod def setUpTestData(cls): - cls.user = User.objects.get(username="recap") + user_profile = UserProfileWithParentsFactory.create( + user__username="pandora", + user__password=make_password("password"), + ) + cls.user = user_profile.user + permissions = Permission.objects.filter( + codename__in=["has_recap_api_access", "has_recap_upload_access"] + ) + cls.user.user_permissions.add(*permissions) cls.f = SimpleUploadedFile("file.txt", b"file content more content") cls.court = CourtFactory.create(jurisdiction="FD", in_use=True) cls.court_appellate = CourtFactory.create( From 5d023bdfc16af7be5bd1d137ae47cde20287b69b Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Wed, 29 Jan 2025 10:05:53 -0600 Subject: [PATCH 07/16] fix(recap): Moved fetch_attachment_page nested attributes access to variables --- cl/recap/tasks.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py index 8731180698..4656099053 100644 --- a/cl/recap/tasks.py +++ b/cl/recap/tasks.py @@ -1999,18 +1999,21 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> list[int]: fq = PacerFetchQueue.objects.get(pk=fq_pk) rd = fq.recap_document + court_id = rd.docket_entry.docket.court_id + pacer_case_id = rd.docket_entry.docket.pacer_case_id + pacer_doc_id = rd.pacer_doc_id # Check court connectivity, if fails retry the task, hopefully, it'll be # retried in a different not blocked node - if not is_pacer_court_accessible(rd.docket_entry.docket.court_id): + 
if not is_pacer_court_accessible(court_id): if self.request.retries == self.max_retries: - msg = f"Blocked by court: {rd.docket_entry.docket.court_id}" + msg = f"Blocked by court: {court_id}" mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return [] raise self.retry() mark_fq_status(fq, "", PROCESSING_STATUS.IN_PROGRESS) - if not rd.pacer_doc_id: + if not pacer_doc_id: msg = ( "Unable to get attachment page: Unknown pacer_doc_id for " "RECAP Document object %s" % rd.pk @@ -2073,7 +2076,7 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> list[int]: raise self.retry(exc=exc) text = r.response.text - is_appellate = is_appellate_court(rd.docket_entry.docket.court_id) + is_appellate = is_appellate_court(court_id) # Determine the appropriate parser function based on court jurisdiction # (appellate or district) att_data_parser = ( @@ -2081,7 +2084,7 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> list[int]: if is_appellate else get_data_from_att_report ) - att_data = att_data_parser(text, rd.docket_entry.docket.court_id) + att_data = att_data_parser(text, court_id) if att_data == {}: msg = "Not a valid attachment page upload" @@ -2092,7 +2095,7 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> list[int]: try: async_to_sync(merge_attachment_page_data)( rd.docket_entry.docket.court, - rd.docket_entry.docket.pacer_case_id, + pacer_case_id, att_data["pacer_doc_id"], # Appellate attachments don't contain a document_number None if is_appellate else att_data["document_number"], @@ -2119,15 +2122,14 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> list[int]: mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL) # Logic to replicate the attachment page to sub-dockets matched by RECAPDocument - court_id = rd.docket_entry.docket.court_id if is_appellate_court(court_id): # Subdocket replication for appellate courts is currently not supported. 
self.request.chain = None return [] sub_docket_main_rds = list( - get_main_rds(court_id, rd.pacer_doc_id).exclude( - docket_entry__docket__pacer_case_id=rd.docket_entry.docket.pacer_case_id + get_main_rds(court_id, pacer_doc_id).exclude( + docket_entry__docket__pacer_case_id=pacer_case_id ) ) sub_docket_pqs = [] From d96362befb4aa431146ca26b9efc4d8483ef8623 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Fri, 31 Jan 2025 12:04:51 -0600 Subject: [PATCH 08/16] fix(search): Fixed docket_number filtering for numbers with repeated values Fixes: #4942 --- cl/lib/elasticsearch_utils.py | 19 ++++++-- cl/search/documents.py | 15 +++++- cl/search/es_indices.py | 1 + cl/search/tests/tests_es_opinion.py | 43 ++++++++++++++++- cl/search/tests/tests_es_oral_arguments.py | 39 +++++++++++++++ cl/search/tests/tests_es_parenthetical.py | 3 +- cl/search/tests/tests_es_recap.py | 55 +++++++++++++++++++++- 7 files changed, 163 insertions(+), 12 deletions(-) diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index a33118310b..3b4688ab9c 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -465,7 +465,18 @@ def build_term_query( validate_query_syntax(value, QueryType.FILTER) if make_phrase: - return [Q("match_phrase", **{field: {"query": value, "slop": slop}})] + return [ + Q( + "match_phrase", + **{ + field: { + "query": value, + "slop": slop, + "analyzer": "search_analyzer_exact", + } + }, + ) + ] if isinstance(value, list): value = list(filter(None, value)) @@ -767,7 +778,7 @@ def build_es_plain_filters(cd: CleanData) -> List: # Build docket number term query queries_list.extend( build_term_query( - "docketNumber", + "docketNumber.exact", cd.get("docket_number", ""), make_phrase=True, slop=1, @@ -2374,7 +2385,7 @@ def build_join_es_filters(cd: CleanData) -> List: ), *build_text_filter("caseName.exact", cd.get("case_name", "")), *build_term_query( - "docketNumber", + "docketNumber.exact", cd.get("docket_number", ""), make_phrase=True, slop=1, @@ -2418,7 +2429,7 @@ def build_join_es_filters(cd: CleanData) -> List: cd.get("filed_after", ""), ), *build_term_query( - "docketNumber", + "docketNumber.exact", cd.get("docket_number", ""), make_phrase=True, slop=1, diff --git a/cl/search/documents.py b/cl/search/documents.py index fcae4baaf6..edf698a139 100644 --- a/cl/search/documents.py +++ b/cl/search/documents.py @@ -65,8 +65,19 @@ class ParentheticalGroupDocument(Document): attr="representative.describing_opinion.cluster.slug" ) docket_id = fields.IntegerField(attr="opinion.cluster.docket_id") - docketNumber = fields.KeywordField( - attr="opinion.cluster.docket.docket_number" + docketNumber = fields.TextField( + attr="opinion.cluster.docket.docket_number", + analyzer="text_en_splitting_cl", + term_vector="with_positions_offsets", + fields={ + "exact": fields.TextField( + attr="opinion.cluster.docket.docket_number", + analyzer="english_exact", + search_analyzer="search_analyzer_exact", + term_vector="with_positions_offsets", + ), + }, + search_analyzer="search_analyzer", ) judge = fields.TextField( attr="opinion.cluster.judges", diff --git a/cl/search/es_indices.py b/cl/search/es_indices.py index 717a6abee9..8adb416307 100644 --- a/cl/search/es_indices.py +++ b/cl/search/es_indices.py @@ -6,6 +6,7 @@ parenthetical_group_index.settings( number_of_shards=settings.ELASTICSEARCH_NUMBER_OF_SHARDS, number_of_replicas=settings.ELASTICSEARCH_NUMBER_OF_REPLICAS, + analysis=settings.ELASTICSEARCH_DSL["analysis"], ) # Define oral arguments elasticsearch index diff --git 
a/cl/search/tests/tests_es_opinion.py b/cl/search/tests/tests_es_opinion.py index 7014aaa8c8..5d39045cce 100644 --- a/cl/search/tests/tests_es_opinion.py +++ b/cl/search/tests/tests_es_opinion.py @@ -1481,16 +1481,55 @@ async def test_can_filter_using_filed_range(self) -> None: r = await self._test_article_count(search_params, 1, "filed_range") self.assertIn("Honda", r.content.decode()) - async def test_can_filter_using_a_docket_number(self) -> None: + def test_can_filter_using_a_docket_number(self) -> None: """Can we query by docket number?""" + + # Regular docket_number filtering. search_params = {"q": "*", "docket_number": "2"} # Frontend - r = await self._test_article_count(search_params, 1, "docket_number") + r = async_to_sync(self._test_article_count)( + search_params, 1, "docket_number" + ) self.assertIn( "Honda", r.content.decode(), "Result not found by docket number!" ) + # Filter by case by docket_number containing repeated numbers like: 1:21-bk-0021 + with self.captureOnCommitCallbacks(execute=True): + cluster = OpinionClusterFactory( + case_name="Strickland v. Lorem.", + docket=DocketFactory( + court=self.court_1, docket_number="1:21-bk-0021" + ), + precedential_status=PRECEDENTIAL_STATUS.PUBLISHED, + ) + + params = { + "type": SEARCH_TYPES.OPINION, + "docket_number": "1:21-bk-0021", + } + r = async_to_sync(self._test_article_count)(params, 1, "docket_number") + self.assertIn("1:21-bk-0021", r.content.decode()) + + # docket_number filter works properly combined with child document fields + with self.captureOnCommitCallbacks(execute=True): + OpinionFactory.create(cluster=cluster, plain_text="Lorem Ipsum") + + params = { + "type": SEARCH_TYPES.OPINION, + "q": "Lorem Ipsum", + "docket_number": "1:21-bk-0021", + } + r = async_to_sync(self._test_article_count)( + params, 1, "docket_number and text" + ) + self.assertIn("1:21-bk-0021", r.content.decode()) + self.assertIn("Lorem Ipsum", r.content.decode()) + + # Remove factories to prevent affecting other tests. + cluster.delete() + async def test_can_filter_by_citation_number(self) -> None: """Can we query by citation number?""" get_dicts = [{"q": "*", "citation": "33"}, {"q": "citation:33"}] diff --git a/cl/search/tests/tests_es_oral_arguments.py b/cl/search/tests/tests_es_oral_arguments.py index 1147fa92e7..2fee163971 100644 --- a/cl/search/tests/tests_es_oral_arguments.py +++ b/cl/search/tests/tests_es_oral_arguments.py @@ -1258,6 +1258,45 @@ def test_oa_docket_number_filtering(self) -> None: ) self.assertIn("SEC", r.content.decode()) + # Filter by docket_number containing repeated numbers like: 1:21-bk-0021 + with mock.patch( + "cl.lib.es_signal_processor.allow_es_audio_indexing", + side_effect=lambda x, y: True, + ), self.captureOnCommitCallbacks(execute=True): + docket = DocketFactory.create( + docket_number="1:21-bk-0021", + court_id=self.court_1.pk, + date_argued=datetime.date(2013, 8, 14), + ) + AudioFactory.create( + case_name="Lorem Ipsum", + docket_id=docket.pk, + duration=420, + local_path_original_file="test/audio/ander_v._leo.mp3", + local_path_mp3=self.filepath_local, + sha1="a49ada009774496ac01fb49818837e2296705c97", + stt_status=Audio.STT_COMPLETE, + ) + search_params = { + "type": SEARCH_TYPES.ORAL_ARGUMENT, + "docket_number": "1:21-bk-0021", + } + r = self.client.get( + reverse("show_results"), + search_params, + ) + self.assertEqual( + self.get_article_count(r), + 1, + msg="Did not get expected number of results when filtering by " + "docket number. Expected %s, but got %s." 
% (expected, actual), + ) + self.assertIn("Lorem Ipsum", r.content.decode()) + self.assertIn("1:21-bk-0021", r.content.decode()) + + # Remove factories to prevent affecting other tests. + docket.delete() + def test_oa_jurisdiction_filtering(self) -> None: """Filter by court""" search_params = { diff --git a/cl/search/tests/tests_es_parenthetical.py b/cl/search/tests/tests_es_parenthetical.py index 95e97a9f8d..163690a7fa 100644 --- a/cl/search/tests/tests_es_parenthetical.py +++ b/cl/search/tests/tests_es_parenthetical.py @@ -169,7 +169,6 @@ def setUpTestData(cls): @classmethod def setUpClass(cls): super().setUpClass() - cls.rebuild_index("search.ParentheticalGroup") def test_filter_search(self) -> None: """Test filtering and search at the same time""" @@ -465,7 +464,7 @@ async def test_pa_search_form_search_and_filtering(self) -> None: actual, expected, msg="Did not get expected number of results when filtering by " - "case name. Expected %s, but got %s." % (expected, actual), + "docket_number. Expected %s, but got %s." % (expected, actual), ) r = await self.async_client.get( reverse("show_results"), diff --git a/cl/search/tests/tests_es_recap.py b/cl/search/tests/tests_es_recap.py index 933913ea7e..a6a8bd1781 100644 --- a/cl/search/tests/tests_es_recap.py +++ b/cl/search/tests/tests_es_recap.py @@ -535,12 +535,63 @@ async def test_document_description_filter(self) -> None: # Frontend, 1 result expected since RECAPDocuments are grouped by case await self._test_article_count(params, 1, "description") - async def test_docket_number_filter(self) -> None: + def test_docket_number_filter(self) -> None: """Confirm docket_number filter works properly""" + + # Regular docket_number filtering. params = {"type": SEARCH_TYPES.RECAP, "docket_number": "1:21-bk-1234"} # Frontend, 1 result expected since RECAPDocuments are grouped by case - await self._test_article_count(params, 1, "docket_number") + async_to_sync(self._test_article_count)(params, 1, "docket_number") + + # Filter by case by docket_number containing repeated numbers like: 1:21-bk-0021 + with self.captureOnCommitCallbacks(execute=True): + entry = DocketEntryWithParentsFactory( + docket__docket_number="1:21-bk-0021", + docket__court=self.court, + docket__source=Docket.RECAP, + entry_number=1, + date_filed=datetime.date(2015, 8, 19), + description="MOTION for Leave to File Amicus Curiae Lorem", + ) + + params = {"type": SEARCH_TYPES.RECAP, "docket_number": "1:21-bk-0021"} + r = async_to_sync(self._test_article_count)(params, 1, "docket_number") + self.assertIn("1:21-bk-0021", r.content.decode()) + + # docket_number filter works properly combined with child document fields + with self.captureOnCommitCallbacks(execute=True): + RECAPDocumentFactory( + docket_entry=entry, + description="New File", + document_number="1", + is_available=False, + page_count=5, + ) + + params = { + "type": SEARCH_TYPES.RECAP, + "docket_number": "1:21-bk-0021", + "document_number": 1, + } + r = async_to_sync(self._test_article_count)( + params, 1, "docket_number and document_number" + ) + self.assertIn("1:21-bk-0021", r.content.decode()) + self.assertIn("New File", r.content.decode()) + + # Fielded query also works for numbers containing repeated numbers + params = { + "type": SEARCH_TYPES.RECAP, + "q": "docketNumber:1:21-bk-0021", + } + r = async_to_sync(self._test_article_count)( + params, 1, "docketNumber fielded query" + ) + self.assertIn("1:21-bk-0021", r.content.decode()) + + # Remove factories to prevent affecting other tests. 
+ entry.docket.delete() async def test_attachment_number_filter(self) -> None: """Confirm attachment number filter works properly""" From 631c5427b222f8762d08d8fdae4075a818f20ff7 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Fri, 31 Jan 2025 14:35:03 -0600 Subject: [PATCH 09/16] fix(search): Apply fix for repeated docket_number values in fielded docketNumber queries --- cl/lib/elasticsearch_utils.py | 12 ------------ cl/lib/utils.py | 23 ++++++++++++++++++++++- cl/search/tests/tests.py | 28 ++++++++++++++++++++++++++++ cl/search/tests/tests_es_recap.py | 10 ++++++++++ 4 files changed, 60 insertions(+), 13 deletions(-) diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index 3b4688ab9c..18c0c4f39d 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -374,18 +374,6 @@ def build_fulltext_query( """ if value: validate_query_syntax(value, QueryType.QUERY_STRING) - # In Elasticsearch, the colon (:) character is used to separate the - # field name and the field value in a query. - # To avoid parsing errors escape any colon characters in the value - # parameter with a backslash. - if "docketNumber:" in value: - docket_number_matches = re.findall("docketNumber:([^ ]+)", value) - for match in docket_number_matches: - replacement = match.replace(":", r"\:") - value = value.replace( - f"docketNumber:{match}", f"docketNumber:{replacement}" - ) - # Used for the phrase query_string, no conjunctions appended. query_value = cleanup_main_query(value) # To enable the search of each term in the query across multiple fields diff --git a/cl/lib/utils.py b/cl/lib/utils.py index 15bd965e84..508c0d4a74 100644 --- a/cl/lib/utils.py +++ b/cl/lib/utils.py @@ -345,6 +345,27 @@ def cleanup_main_query(query_string: str) -> str: is_date_str = re.match( "[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z", item ) + + if "docketNumber:" in item: + potential_docket_number = item.split("docketNumber:", 1)[1] + + if not potential_docket_number: + # The docket_number is already within a phrase or () + cleaned_items.append(item) + else: + # Improve the docket_number query by: + # If it's a known docket_number format, wrap it in quotes and + # add a ~1 slop to match slight variations like 1:21-bk-1234-ABC → 1:21-bk-1234 + # If it's not a known docket_number format, just wrap it in + # parentheses to avoid syntax errors caused by : in the number. + slop_suffix = ( + "~1" if is_docket_number(potential_docket_number) else "" + ) + cleaned_items.append( + f'docketNumber:"{potential_docket_number}"{slop_suffix}' + ) + continue + if any([not_numeric, is_date_str]): cleaned_items.append(item) continue @@ -356,7 +377,7 @@ def cleanup_main_query(query_string: str) -> str: # Some sort of number, probably a docket number or other type of number # Wrap in quotes to do a phrase search - if is_docket_number(item) and "docketNumber:" not in query_string: + if is_docket_number(item): # Confirm is a docket number and clean it. 
So docket_numbers with # suffixes can be searched: 1:21-bk-1234-ABC -> 1:21-bk-1234, item = clean_docket_number(item) diff --git a/cl/search/tests/tests.py b/cl/search/tests/tests.py index f2503ee63e..f174fc3e98 100644 --- a/cl/search/tests/tests.py +++ b/cl/search/tests/tests.py @@ -948,6 +948,34 @@ def test_query_cleanup_function(self) -> None: "b*ra*e b*rav*", "b?ra?e b?rav*", ), + ( + "Lorem docketNumber:1:21-bk-0021 test", + 'Lorem docketNumber:"1:21-bk-0021"~1 test', + ), + ( + "Lorem docketNumber:1:21-bk-0021 AND docketNumber:1:21-bk-0022", + 'Lorem docketNumber:"1:21-bk-0021"~1 AND docketNumber:"1:21-bk-0022"~1', + ), + ( + "Lorem docketNumber:1:21:0021 test", + 'Lorem docketNumber:"1:21:0021" test', + ), + ( + "docketNumber:(ASBCA No. 59126)", + 'docketNumber:(ASBCA No. "59126")', + ), + ( + 'docketNumber:"1:21-bk-0021" test', + 'docketNumber:"1:21-bk-0021" test', + ), + ( + "docketNumber:1:21-bk-0021-ABC test", + 'docketNumber:"1:21-bk-0021-ABC"~1 test', + ), + ( + "12-9238 docketNumber:1:21-bk-0021", + 'docketNumber:"12-9238"~1 docketNumber:"1:21-bk-0021"~1', + ), ) for q, a in q_a: print("Does {q} --> {a} ? ".format(**{"q": q, "a": a})) diff --git a/cl/search/tests/tests_es_recap.py b/cl/search/tests/tests_es_recap.py index a6a8bd1781..b49edd765e 100644 --- a/cl/search/tests/tests_es_recap.py +++ b/cl/search/tests/tests_es_recap.py @@ -580,6 +580,16 @@ def test_docket_number_filter(self) -> None: self.assertIn("1:21-bk-0021", r.content.decode()) self.assertIn("New File", r.content.decode()) + # docket_number text query containing repeated numbers works properly + params = { + "type": SEARCH_TYPES.RECAP, + "q": "1:21-bk-0021", + } + r = async_to_sync(self._test_article_count)( + params, 1, "docketNumber text query" + ) + self.assertIn("1:21-bk-0021", r.content.decode()) + # Fielded query also works for numbers containing repeated numbers params = { "type": SEARCH_TYPES.RECAP, From d3cfd849f295b05314a6dd945750e80d34300249 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Fri, 31 Jan 2025 15:25:56 -0600 Subject: [PATCH 10/16] fix(search): Fix cleanup_main_query for fielded queries with a value inside a phrase --- cl/lib/utils.py | 9 +++++++-- cl/search/tests/tests.py | 8 ++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/cl/lib/utils.py b/cl/lib/utils.py index 508c0d4a74..b0e9dfa5fa 100644 --- a/cl/lib/utils.py +++ b/cl/lib/utils.py @@ -330,8 +330,13 @@ def cleanup_main_query(query_string: str) -> str: if not item: continue - if item.startswith('"') or item.endswith('"'): - # Start or end of a phrase; flip whether we're inside a phrase + if ( + item.startswith('"') + or item.endswith('"') + or bool(re.match(r'\w+:"[^"]', item)) + ): + # Start or end of a phrase or a fielded query using quotes e.g: field:"test" + # flip whether we're inside a phrase inside_a_phrase = not inside_a_phrase cleaned_items.append(item) continue diff --git a/cl/search/tests/tests.py b/cl/search/tests/tests.py index f174fc3e98..b4b2199672 100644 --- a/cl/search/tests/tests.py +++ b/cl/search/tests/tests.py @@ -976,6 +976,14 @@ def test_query_cleanup_function(self) -> None: "12-9238 docketNumber:1:21-bk-0021", 'docketNumber:"12-9238"~1 docketNumber:"1:21-bk-0021"~1', ), + ( + 'test case_name_full:"Lorem ipsum 2" test', + 'test case_name_full:"Lorem ipsum 2" test', + ), + ( + 'docketNumber:"docket number 2"', + 'docketNumber:"docket number 2"', + ), ) for q, a in q_a: print("Does {q} --> {a} ? 
".format(**{"q": q, "a": a})) From d206980d6db1dc83f9297c23104450c5da511650 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Fri, 31 Jan 2025 15:41:26 -0600 Subject: [PATCH 11/16] fix(search): Fixed cleanup_main_query comments --- cl/lib/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cl/lib/utils.py b/cl/lib/utils.py index b0e9dfa5fa..5beac77507 100644 --- a/cl/lib/utils.py +++ b/cl/lib/utils.py @@ -355,14 +355,14 @@ def cleanup_main_query(query_string: str) -> str: potential_docket_number = item.split("docketNumber:", 1)[1] if not potential_docket_number: - # The docket_number is already within a phrase or () + # The docket_number is wrapped in parentheses cleaned_items.append(item) else: # Improve the docket_number query by: # If it's a known docket_number format, wrap it in quotes and # add a ~1 slop to match slight variations like 1:21-bk-1234-ABC → 1:21-bk-1234 # If it's not a known docket_number format, just wrap it in - # parentheses to avoid syntax errors caused by : in the number. + # quotes to avoid syntax errors caused by : in the number. slop_suffix = ( "~1" if is_docket_number(potential_docket_number) else "" ) From 31eac0dbde7e5424126b53f20f0fe3414e25a540 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Mon, 3 Feb 2025 13:21:13 -0600 Subject: [PATCH 12/16] fix(recap): Apply early return suggestion for consistency --- cl/recap/tasks.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py index 4656099053..90338dd0a9 100644 --- a/cl/recap/tasks.py +++ b/cl/recap/tasks.py @@ -2148,12 +2148,12 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> list[int]: ) ) - if sub_docket_pqs: - pqs_created = ProcessingQueue.objects.bulk_create(sub_docket_pqs) - return [pq.pk for pq in pqs_created] - - self.request.chain = None - return [] + if not sub_docket_pqs: + self.request.chain = None + return [] + # Return PQ IDs to process attachment page replication for sub-dockets. + pqs_created = ProcessingQueue.objects.bulk_create(sub_docket_pqs) + return [pq.pk for pq in pqs_created] @app.task( From bce2b4736ea779c2ab5092929b8f2c06ce9621b5 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Mon, 3 Feb 2025 15:27:42 -0400 Subject: [PATCH 13/16] fix(people_db): Add missing religion and position values This commit updates the person model to include four new religion values and the position model to include the "Special Trial Judge" type. 
--- ...ion_alter_personevent_religion_and_more.py | 291 ++++++++++++++++++ ...ter_personevent_religion_and_more_noop.sql | 18 ++ cl/people_db/models.py | 6 + 3 files changed, 315 insertions(+) create mode 100644 cl/people_db/migrations/0018_alter_person_religion_alter_personevent_religion_and_more.py create mode 100644 cl/people_db/migrations/0018_alter_person_religion_alter_personevent_religion_and_more_noop.sql diff --git a/cl/people_db/migrations/0018_alter_person_religion_alter_personevent_religion_and_more.py b/cl/people_db/migrations/0018_alter_person_religion_alter_personevent_religion_and_more.py new file mode 100644 index 0000000000..926d43286e --- /dev/null +++ b/cl/people_db/migrations/0018_alter_person_religion_alter_personevent_religion_and_more.py @@ -0,0 +1,291 @@ +# Generated by Django 5.1.5 on 2025-02-03 18:22 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("people_db", "0017_pghistory_v3_4_0_trigger_update"), + ] + + operations = [ + migrations.AlterField( + model_name="person", + name="religion", + field=models.CharField( + blank=True, + choices=[ + ("ca", "Catholic"), + ("pr", "Protestant"), + ("je", "Jewish"), + ("mu", "Muslim"), + ("at", "Atheist"), + ("ag", "Agnostic"), + ("mo", "Mormon"), + ("bu", "Buddhist"), + ("hi", "Hindu"), + ("ep", "Episcopalian"), + ("ro", "Roman Catholic"), + ("me", "Methodist"), + ("pr", "Presbyterian"), + ], + help_text="The religion of a person", + max_length=30, + ), + ), + migrations.AlterField( + model_name="personevent", + name="religion", + field=models.CharField( + blank=True, + choices=[ + ("ca", "Catholic"), + ("pr", "Protestant"), + ("je", "Jewish"), + ("mu", "Muslim"), + ("at", "Atheist"), + ("ag", "Agnostic"), + ("mo", "Mormon"), + ("bu", "Buddhist"), + ("hi", "Hindu"), + ("ep", "Episcopalian"), + ("ro", "Roman Catholic"), + ("me", "Methodist"), + ("pr", "Presbyterian"), + ], + help_text="The religion of a person", + max_length=30, + ), + ), + migrations.AlterField( + model_name="position", + name="position_type", + field=models.CharField( + blank=True, + choices=[ + ( + "Judge", + [ + ("jud", "Judge"), + ("jus", "Justice"), + ("ad-law-jud", "Administrative Law Judge"), + ("act-jud", "Acting Judge"), + ("act-jus", "Acting Justice"), + ("act-pres-jud", "Acting Presiding Judge"), + ( + "act-c-admin-jus", + "Acting Chief Administrative Justice", + ), + ("ass-jud", "Associate Judge"), + ("ass-jus", "Associate Justice"), + ("ass-c-jud", "Associate Chief Judge"), + ("ass-pres-jud", "Associate Presiding Judge"), + ("asst-pres-jud", "Assistant Presiding Judge"), + ("c-jud", "Chief Judge"), + ("c-jus", "Chief Justice"), + ("c-spec-m", "Chief Special Master"), + ("c-admin-jus", "Chief Administrative Justice"), + ("pres-jud", "Presiding Judge"), + ("pres-jus", "Presiding Justice"), + ("sup-jud", "Supervising Judge"), + ( + "ad-pres-jus", + "Administrative Presiding Justice", + ), + ("com", "Commissioner"), + ("com-dep", "Deputy Commissioner"), + ("jud-pt", "Judge Pro Tem"), + ("jus-pt", "Justice Pro Tem"), + ("ref-jud-tr", "Judge Trial Referee"), + ("ref-off", "Official Referee"), + ("ref-state-trial", "State Trial Referee"), + ("ret-act-jus", "Active Retired Justice"), + ("ret-ass-jud", "Retired Associate Judge"), + ("ret-c-jud", "Retired Chief Judge"), + ("ret-jus", "Retired Justice"), + ("ret-senior-jud", "Senior Judge"), + ("mag", "Magistrate"), + ("c-mag", "Chief Magistrate"), + ("pres-mag", "Presiding Magistrate"), + ("mag-pt", "Magistrate Pro Tem"), + ("mag-rc", 
"Magistrate (Recalled)"), + ("mag-part-time", "Magistrate (Part-Time)"), + ("spec-chair", "Special Chairman"), + ("spec-jud", "Special Judge"), + ("spec-m", "Special Master"), + ( + "spec-scjcbc", + "Special Superior Court Judge for Complex Business Cases", + ), + ("spec-tr-jud", "Special Trial Judge"), + ("chair", "Chairman"), + ("chan", "Chancellor"), + ("presi-jud", "President"), + ("res-jud", "Reserve Judge"), + ("trial-jud", "Trial Judge"), + ("vice-chan", "Vice Chancellor"), + ("vice-cj", "Vice Chief Judge"), + ], + ), + ( + "Attorney General", + [ + ("att-gen", "Attorney General"), + ("att-gen-ass", "Assistant Attorney General"), + ( + "att-gen-ass-spec", + "Special Assistant Attorney General", + ), + ("sen-counsel", "Senior Counsel"), + ("dep-sol-gen", "Deputy Solicitor General"), + ], + ), + ( + "Appointing Authority", + [ + ("pres", "President of the United States"), + ("gov", "Governor"), + ("mayor", "Mayor"), + ], + ), + ( + "Clerkships", + [ + ("clerk", "Clerk"), + ("clerk-chief-dep", "Chief Deputy Clerk"), + ("staff-atty", "Staff Attorney"), + ], + ), + ("prof", "Professor"), + ("adj-prof", "Adjunct Professor"), + ("prac", "Practitioner"), + ("pros", "Prosecutor"), + ("pub-def", "Public Defender"), + ("da", "District Attorney"), + ("ada", "Assistant District Attorney"), + ("legis", "Legislator"), + ("sen", "Senator"), + ("state-sen", "State Senator"), + ], + help_text="If this is a judicial position, this indicates the role the person had. This field may be blank if job_title is complete instead.", + max_length=20, + null=True, + ), + ), + migrations.AlterField( + model_name="positionevent", + name="position_type", + field=models.CharField( + blank=True, + choices=[ + ( + "Judge", + [ + ("jud", "Judge"), + ("jus", "Justice"), + ("ad-law-jud", "Administrative Law Judge"), + ("act-jud", "Acting Judge"), + ("act-jus", "Acting Justice"), + ("act-pres-jud", "Acting Presiding Judge"), + ( + "act-c-admin-jus", + "Acting Chief Administrative Justice", + ), + ("ass-jud", "Associate Judge"), + ("ass-jus", "Associate Justice"), + ("ass-c-jud", "Associate Chief Judge"), + ("ass-pres-jud", "Associate Presiding Judge"), + ("asst-pres-jud", "Assistant Presiding Judge"), + ("c-jud", "Chief Judge"), + ("c-jus", "Chief Justice"), + ("c-spec-m", "Chief Special Master"), + ("c-admin-jus", "Chief Administrative Justice"), + ("pres-jud", "Presiding Judge"), + ("pres-jus", "Presiding Justice"), + ("sup-jud", "Supervising Judge"), + ( + "ad-pres-jus", + "Administrative Presiding Justice", + ), + ("com", "Commissioner"), + ("com-dep", "Deputy Commissioner"), + ("jud-pt", "Judge Pro Tem"), + ("jus-pt", "Justice Pro Tem"), + ("ref-jud-tr", "Judge Trial Referee"), + ("ref-off", "Official Referee"), + ("ref-state-trial", "State Trial Referee"), + ("ret-act-jus", "Active Retired Justice"), + ("ret-ass-jud", "Retired Associate Judge"), + ("ret-c-jud", "Retired Chief Judge"), + ("ret-jus", "Retired Justice"), + ("ret-senior-jud", "Senior Judge"), + ("mag", "Magistrate"), + ("c-mag", "Chief Magistrate"), + ("pres-mag", "Presiding Magistrate"), + ("mag-pt", "Magistrate Pro Tem"), + ("mag-rc", "Magistrate (Recalled)"), + ("mag-part-time", "Magistrate (Part-Time)"), + ("spec-chair", "Special Chairman"), + ("spec-jud", "Special Judge"), + ("spec-m", "Special Master"), + ( + "spec-scjcbc", + "Special Superior Court Judge for Complex Business Cases", + ), + ("spec-tr-jud", "Special Trial Judge"), + ("chair", "Chairman"), + ("chan", "Chancellor"), + ("presi-jud", "President"), + ("res-jud", "Reserve Judge"), + 
("trial-jud", "Trial Judge"), + ("vice-chan", "Vice Chancellor"), + ("vice-cj", "Vice Chief Judge"), + ], + ), + ( + "Attorney General", + [ + ("att-gen", "Attorney General"), + ("att-gen-ass", "Assistant Attorney General"), + ( + "att-gen-ass-spec", + "Special Assistant Attorney General", + ), + ("sen-counsel", "Senior Counsel"), + ("dep-sol-gen", "Deputy Solicitor General"), + ], + ), + ( + "Appointing Authority", + [ + ("pres", "President of the United States"), + ("gov", "Governor"), + ("mayor", "Mayor"), + ], + ), + ( + "Clerkships", + [ + ("clerk", "Clerk"), + ("clerk-chief-dep", "Chief Deputy Clerk"), + ("staff-atty", "Staff Attorney"), + ], + ), + ("prof", "Professor"), + ("adj-prof", "Adjunct Professor"), + ("prac", "Practitioner"), + ("pros", "Prosecutor"), + ("pub-def", "Public Defender"), + ("da", "District Attorney"), + ("ada", "Assistant District Attorney"), + ("legis", "Legislator"), + ("sen", "Senator"), + ("state-sen", "State Senator"), + ], + help_text="If this is a judicial position, this indicates the role the person had. This field may be blank if job_title is complete instead.", + max_length=20, + null=True, + ), + ), + ] diff --git a/cl/people_db/migrations/0018_alter_person_religion_alter_personevent_religion_and_more_noop.sql b/cl/people_db/migrations/0018_alter_person_religion_alter_personevent_religion_and_more_noop.sql new file mode 100644 index 0000000000..c698d310cd --- /dev/null +++ b/cl/people_db/migrations/0018_alter_person_religion_alter_personevent_religion_and_more_noop.sql @@ -0,0 +1,18 @@ +BEGIN; +-- +-- Alter field religion on person +-- +-- (no-op) +-- +-- Alter field religion on personevent +-- +-- (no-op) +-- +-- Alter field position_type on position +-- +-- (no-op) +-- +-- Alter field position_type on positionevent +-- +-- (no-op) +COMMIT; diff --git a/cl/people_db/models.py b/cl/people_db/models.py index 75c8346694..2454152dbd 100644 --- a/cl/people_db/models.py +++ b/cl/people_db/models.py @@ -65,6 +65,10 @@ class Person(AbstractDateTimeModel): ("mo", "Mormon"), ("bu", "Buddhist"), ("hi", "Hindu"), + ("ep", "Episcopalian"), + ("ro", "Roman Catholic"), + ("me", "Methodist"), + ("pr", "Presbyterian"), ) race = models.ManyToManyField( "Race", @@ -370,6 +374,7 @@ class Position(AbstractDateTimeModel): SPECIAL_JUDGE = "spec-jud" SPECIAL_MASTER = "spec-m" SPECIAL_SUPERIOR_COURT_JUDGE_FOR_COMPLEX_BUSINESS_CASES = "spec-scjcbc" + SPECIAL_TRIAL_JUDGE = "spec-tr-jud" # Other CHAIRMAN = "chair" CHANCELLOR = "chan" @@ -468,6 +473,7 @@ class Position(AbstractDateTimeModel): "Special Superior Court Judge for Complex Business " "Cases", ), + (SPECIAL_TRIAL_JUDGE, "Special Trial Judge"), # Other (CHAIRMAN, "Chairman"), (CHANCELLOR, "Chancellor"), From 01b8b73d3e84ec0382e247dba73780e6276deb2a Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Tue, 4 Feb 2025 23:50:34 -0400 Subject: [PATCH 14/16] fix(people): Differentiate Protestant and Presbyterian religions This commit updates the religion list to use distinct values for Protestant and Presbyterian, resolving a collision. 
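The collision being resolved here: migration 0018 reused the two-character key "pr" for both Protestant and Presbyterian. Django stores only the key, and nothing in the schema prevents two labels from sharing one, so an overlap like this is easy to miss in review. The fix keeps "pr" for Protestant, which predates this series, and gives Presbyterian its own "pe" key. A guard test along the following lines could catch such overlaps early. This is a hypothetical sketch, not part of the patch series, and it assumes the religion choices are importable from cl.people_db.models as a RELIGIONS tuple of (key, label) pairs; adjust the import if the actual name differs.

    # Hypothetical regression test, not included in this patch series.
    from django.test import SimpleTestCase

    from cl.people_db.models import RELIGIONS  # assumed name of the choices tuple


    class ReligionChoicesTest(SimpleTestCase):
        def test_choice_keys_are_unique(self):
            keys = [key for key, _label in RELIGIONS]
            # A repeated storage key (e.g. "pr" for both Protestant and
            # Presbyterian) makes the saved value ambiguous.
            self.assertEqual(
                len(keys), len(set(keys)), f"duplicate keys: {sorted(keys)}"
            )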
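Relatedly, a note on why the .sql files committed alongside these migrations are no-ops: Django enforces choices at the application layer (model and form validation), never as a database constraint, so adding or renaming entries in the religion and position_type tuples regenerates the field definition without touching the underlying varchar column. Each AlterField therefore emits no SQL beyond the transaction wrapper, which is exactly what the *_noop.sql files record. A minimal sketch of the idea, using a hypothetical model rather than the real Person model:

    # Hypothetical sketch -- not the real cl.people_db.models.Person.
    # Define inside any installed app. Extending the choices tuple changes
    # Python-side validation only; the varchar(30) column is unchanged, so
    # makemigrations produces an AlterField whose generated SQL is empty.
    from django.db import models


    class Cleric(models.Model):
        RELIGIONS = (
            ("ca", "Catholic"),
            ("ep", "Episcopalian"),  # newly added key; no schema change
        )
        religion = models.CharField(
            max_length=30, blank=True, choices=RELIGIONS
        )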
--- ...son_religion_alter_personevent_religion.py | 64 +++++++++++++++++++ ...on_religion_alter_personevent_religion.sql | 10 +++ cl/people_db/models.py | 2 +- 3 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 cl/people_db/migrations/0019_alter_person_religion_alter_personevent_religion.py create mode 100644 cl/people_db/migrations/0019_alter_person_religion_alter_personevent_religion.sql diff --git a/cl/people_db/migrations/0019_alter_person_religion_alter_personevent_religion.py b/cl/people_db/migrations/0019_alter_person_religion_alter_personevent_religion.py new file mode 100644 index 0000000000..a3625787dd --- /dev/null +++ b/cl/people_db/migrations/0019_alter_person_religion_alter_personevent_religion.py @@ -0,0 +1,64 @@ +# Generated by Django 5.1.5 on 2025-02-05 03:47 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ( + "people_db", + "0018_alter_person_religion_alter_personevent_religion_and_more", + ), + ] + + operations = [ + migrations.AlterField( + model_name="person", + name="religion", + field=models.CharField( + blank=True, + choices=[ + ("ca", "Catholic"), + ("pr", "Protestant"), + ("je", "Jewish"), + ("mu", "Muslim"), + ("at", "Atheist"), + ("ag", "Agnostic"), + ("mo", "Mormon"), + ("bu", "Buddhist"), + ("hi", "Hindu"), + ("ep", "Episcopalian"), + ("ro", "Roman Catholic"), + ("me", "Methodist"), + ("pe", "Presbyterian"), + ], + help_text="The religion of a person", + max_length=30, + ), + ), + migrations.AlterField( + model_name="personevent", + name="religion", + field=models.CharField( + blank=True, + choices=[ + ("ca", "Catholic"), + ("pr", "Protestant"), + ("je", "Jewish"), + ("mu", "Muslim"), + ("at", "Atheist"), + ("ag", "Agnostic"), + ("mo", "Mormon"), + ("bu", "Buddhist"), + ("hi", "Hindu"), + ("ep", "Episcopalian"), + ("ro", "Roman Catholic"), + ("me", "Methodist"), + ("pe", "Presbyterian"), + ], + help_text="The religion of a person", + max_length=30, + ), + ), + ] diff --git a/cl/people_db/migrations/0019_alter_person_religion_alter_personevent_religion.sql b/cl/people_db/migrations/0019_alter_person_religion_alter_personevent_religion.sql new file mode 100644 index 0000000000..be388a96fb --- /dev/null +++ b/cl/people_db/migrations/0019_alter_person_religion_alter_personevent_religion.sql @@ -0,0 +1,10 @@ +BEGIN; +-- +-- Alter field religion on person +-- +-- (no-op) +-- +-- Alter field religion on personevent +-- +-- (no-op) +COMMIT; diff --git a/cl/people_db/models.py b/cl/people_db/models.py index 2454152dbd..94a2bd62a9 100644 --- a/cl/people_db/models.py +++ b/cl/people_db/models.py @@ -68,7 +68,7 @@ class Person(AbstractDateTimeModel): ("ep", "Episcopalian"), ("ro", "Roman Catholic"), ("me", "Methodist"), - ("pr", "Presbyterian"), + ("pe", "Presbyterian"), ) race = models.ManyToManyField( "Race", From 2bbf27c098e0c73b4a7b9a83f9267815ade398fb Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Wed, 5 Feb 2025 15:56:59 -0400 Subject: [PATCH 15/16] fix(people_db): Add Unitarian religion and position type --- ...ion_alter_personevent_religion_and_more.py | 295 ++++++++++++++++++ ...ter_personevent_religion_and_more_noop.sql | 18 ++ cl/people_db/models.py | 3 + 3 files changed, 316 insertions(+) create mode 100644 cl/people_db/migrations/0020_alter_person_religion_alter_personevent_religion_and_more.py create mode 100644 cl/people_db/migrations/0020_alter_person_religion_alter_personevent_religion_and_more_noop.sql diff --git 
a/cl/people_db/migrations/0020_alter_person_religion_alter_personevent_religion_and_more.py b/cl/people_db/migrations/0020_alter_person_religion_alter_personevent_religion_and_more.py new file mode 100644 index 0000000000..f35f473922 --- /dev/null +++ b/cl/people_db/migrations/0020_alter_person_religion_alter_personevent_religion_and_more.py @@ -0,0 +1,295 @@ +# Generated by Django 5.1.5 on 2025-02-05 19:53 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("people_db", "0019_alter_person_religion_alter_personevent_religion"), + ] + + operations = [ + migrations.AlterField( + model_name="person", + name="religion", + field=models.CharField( + blank=True, + choices=[ + ("ca", "Catholic"), + ("pr", "Protestant"), + ("je", "Jewish"), + ("mu", "Muslim"), + ("at", "Atheist"), + ("ag", "Agnostic"), + ("mo", "Mormon"), + ("bu", "Buddhist"), + ("hi", "Hindu"), + ("ep", "Episcopalian"), + ("ro", "Roman Catholic"), + ("me", "Methodist"), + ("pe", "Presbyterian"), + ("un", "Unitarian"), + ], + help_text="The religion of a person", + max_length=30, + ), + ), + migrations.AlterField( + model_name="personevent", + name="religion", + field=models.CharField( + blank=True, + choices=[ + ("ca", "Catholic"), + ("pr", "Protestant"), + ("je", "Jewish"), + ("mu", "Muslim"), + ("at", "Atheist"), + ("ag", "Agnostic"), + ("mo", "Mormon"), + ("bu", "Buddhist"), + ("hi", "Hindu"), + ("ep", "Episcopalian"), + ("ro", "Roman Catholic"), + ("me", "Methodist"), + ("pe", "Presbyterian"), + ("un", "Unitarian"), + ], + help_text="The religion of a person", + max_length=30, + ), + ), + migrations.AlterField( + model_name="position", + name="position_type", + field=models.CharField( + blank=True, + choices=[ + ( + "Judge", + [ + ("jud", "Judge"), + ("jus", "Justice"), + ("ad-law-jud", "Administrative Law Judge"), + ("act-jud", "Acting Judge"), + ("act-jus", "Acting Justice"), + ("act-pres-jud", "Acting Presiding Judge"), + ( + "act-c-admin-jus", + "Acting Chief Administrative Justice", + ), + ("ass-jud", "Associate Judge"), + ("ass-jus", "Associate Justice"), + ("ass-c-jud", "Associate Chief Judge"), + ("ass-pres-jud", "Associate Presiding Judge"), + ("asst-pres-jud", "Assistant Presiding Judge"), + ("c-jud", "Chief Judge"), + ("c-jus", "Chief Justice"), + ("c-spec-m", "Chief Special Master"), + ("c-admin-jus", "Chief Administrative Justice"), + ("c-spec-tr-jud", "Chief Special Trial Judge"), + ("pres-jud", "Presiding Judge"), + ("pres-jus", "Presiding Justice"), + ("sup-jud", "Supervising Judge"), + ( + "ad-pres-jus", + "Administrative Presiding Justice", + ), + ("com", "Commissioner"), + ("com-dep", "Deputy Commissioner"), + ("jud-pt", "Judge Pro Tem"), + ("jus-pt", "Justice Pro Tem"), + ("ref-jud-tr", "Judge Trial Referee"), + ("ref-off", "Official Referee"), + ("ref-state-trial", "State Trial Referee"), + ("ret-act-jus", "Active Retired Justice"), + ("ret-ass-jud", "Retired Associate Judge"), + ("ret-c-jud", "Retired Chief Judge"), + ("ret-jus", "Retired Justice"), + ("ret-senior-jud", "Senior Judge"), + ("mag", "Magistrate"), + ("c-mag", "Chief Magistrate"), + ("pres-mag", "Presiding Magistrate"), + ("mag-pt", "Magistrate Pro Tem"), + ("mag-rc", "Magistrate (Recalled)"), + ("mag-part-time", "Magistrate (Part-Time)"), + ("spec-chair", "Special Chairman"), + ("spec-jud", "Special Judge"), + ("spec-m", "Special Master"), + ( + "spec-scjcbc", + "Special Superior Court Judge for Complex Business Cases", + ), + ("spec-tr-jud", "Special Trial Judge"), + ("chair", 
"Chairman"), + ("chan", "Chancellor"), + ("presi-jud", "President"), + ("res-jud", "Reserve Judge"), + ("trial-jud", "Trial Judge"), + ("vice-chan", "Vice Chancellor"), + ("vice-cj", "Vice Chief Judge"), + ], + ), + ( + "Attorney General", + [ + ("att-gen", "Attorney General"), + ("att-gen-ass", "Assistant Attorney General"), + ( + "att-gen-ass-spec", + "Special Assistant Attorney General", + ), + ("sen-counsel", "Senior Counsel"), + ("dep-sol-gen", "Deputy Solicitor General"), + ], + ), + ( + "Appointing Authority", + [ + ("pres", "President of the United States"), + ("gov", "Governor"), + ("mayor", "Mayor"), + ], + ), + ( + "Clerkships", + [ + ("clerk", "Clerk"), + ("clerk-chief-dep", "Chief Deputy Clerk"), + ("staff-atty", "Staff Attorney"), + ], + ), + ("prof", "Professor"), + ("adj-prof", "Adjunct Professor"), + ("prac", "Practitioner"), + ("pros", "Prosecutor"), + ("pub-def", "Public Defender"), + ("da", "District Attorney"), + ("ada", "Assistant District Attorney"), + ("legis", "Legislator"), + ("sen", "Senator"), + ("state-sen", "State Senator"), + ], + help_text="If this is a judicial position, this indicates the role the person had. This field may be blank if job_title is complete instead.", + max_length=20, + null=True, + ), + ), + migrations.AlterField( + model_name="positionevent", + name="position_type", + field=models.CharField( + blank=True, + choices=[ + ( + "Judge", + [ + ("jud", "Judge"), + ("jus", "Justice"), + ("ad-law-jud", "Administrative Law Judge"), + ("act-jud", "Acting Judge"), + ("act-jus", "Acting Justice"), + ("act-pres-jud", "Acting Presiding Judge"), + ( + "act-c-admin-jus", + "Acting Chief Administrative Justice", + ), + ("ass-jud", "Associate Judge"), + ("ass-jus", "Associate Justice"), + ("ass-c-jud", "Associate Chief Judge"), + ("ass-pres-jud", "Associate Presiding Judge"), + ("asst-pres-jud", "Assistant Presiding Judge"), + ("c-jud", "Chief Judge"), + ("c-jus", "Chief Justice"), + ("c-spec-m", "Chief Special Master"), + ("c-admin-jus", "Chief Administrative Justice"), + ("c-spec-tr-jud", "Chief Special Trial Judge"), + ("pres-jud", "Presiding Judge"), + ("pres-jus", "Presiding Justice"), + ("sup-jud", "Supervising Judge"), + ( + "ad-pres-jus", + "Administrative Presiding Justice", + ), + ("com", "Commissioner"), + ("com-dep", "Deputy Commissioner"), + ("jud-pt", "Judge Pro Tem"), + ("jus-pt", "Justice Pro Tem"), + ("ref-jud-tr", "Judge Trial Referee"), + ("ref-off", "Official Referee"), + ("ref-state-trial", "State Trial Referee"), + ("ret-act-jus", "Active Retired Justice"), + ("ret-ass-jud", "Retired Associate Judge"), + ("ret-c-jud", "Retired Chief Judge"), + ("ret-jus", "Retired Justice"), + ("ret-senior-jud", "Senior Judge"), + ("mag", "Magistrate"), + ("c-mag", "Chief Magistrate"), + ("pres-mag", "Presiding Magistrate"), + ("mag-pt", "Magistrate Pro Tem"), + ("mag-rc", "Magistrate (Recalled)"), + ("mag-part-time", "Magistrate (Part-Time)"), + ("spec-chair", "Special Chairman"), + ("spec-jud", "Special Judge"), + ("spec-m", "Special Master"), + ( + "spec-scjcbc", + "Special Superior Court Judge for Complex Business Cases", + ), + ("spec-tr-jud", "Special Trial Judge"), + ("chair", "Chairman"), + ("chan", "Chancellor"), + ("presi-jud", "President"), + ("res-jud", "Reserve Judge"), + ("trial-jud", "Trial Judge"), + ("vice-chan", "Vice Chancellor"), + ("vice-cj", "Vice Chief Judge"), + ], + ), + ( + "Attorney General", + [ + ("att-gen", "Attorney General"), + ("att-gen-ass", "Assistant Attorney General"), + ( + "att-gen-ass-spec", + "Special Assistant 
Attorney General", + ), + ("sen-counsel", "Senior Counsel"), + ("dep-sol-gen", "Deputy Solicitor General"), + ], + ), + ( + "Appointing Authority", + [ + ("pres", "President of the United States"), + ("gov", "Governor"), + ("mayor", "Mayor"), + ], + ), + ( + "Clerkships", + [ + ("clerk", "Clerk"), + ("clerk-chief-dep", "Chief Deputy Clerk"), + ("staff-atty", "Staff Attorney"), + ], + ), + ("prof", "Professor"), + ("adj-prof", "Adjunct Professor"), + ("prac", "Practitioner"), + ("pros", "Prosecutor"), + ("pub-def", "Public Defender"), + ("da", "District Attorney"), + ("ada", "Assistant District Attorney"), + ("legis", "Legislator"), + ("sen", "Senator"), + ("state-sen", "State Senator"), + ], + help_text="If this is a judicial position, this indicates the role the person had. This field may be blank if job_title is complete instead.", + max_length=20, + null=True, + ), + ), + ] diff --git a/cl/people_db/migrations/0020_alter_person_religion_alter_personevent_religion_and_more_noop.sql b/cl/people_db/migrations/0020_alter_person_religion_alter_personevent_religion_and_more_noop.sql new file mode 100644 index 0000000000..c698d310cd --- /dev/null +++ b/cl/people_db/migrations/0020_alter_person_religion_alter_personevent_religion_and_more_noop.sql @@ -0,0 +1,18 @@ +BEGIN; +-- +-- Alter field religion on person +-- +-- (no-op) +-- +-- Alter field religion on personevent +-- +-- (no-op) +-- +-- Alter field position_type on position +-- +-- (no-op) +-- +-- Alter field position_type on positionevent +-- +-- (no-op) +COMMIT; diff --git a/cl/people_db/models.py b/cl/people_db/models.py index 94a2bd62a9..d00105d4ef 100644 --- a/cl/people_db/models.py +++ b/cl/people_db/models.py @@ -69,6 +69,7 @@ class Person(AbstractDateTimeModel): ("ro", "Roman Catholic"), ("me", "Methodist"), ("pe", "Presbyterian"), + ("un", "Unitarian"), ) race = models.ManyToManyField( "Race", @@ -343,6 +344,7 @@ class Position(AbstractDateTimeModel): CHIEF_JUSTICE = "c-jus" CHIEF_SPECIAL_MASTER = "c-spec-m" CHIEF_ADMINISTRATIVE_JUSTICE = "c-admin-jus" + CHIEF_SPECIAL_TRIAL_JUDGE = "c-spec-tr-jud" PRESIDING_JUDGE = "pres-jud" PRESIDING_JUSTICE = "pres-jus" SUPERVISING_JUDGE = "sup-jud" @@ -434,6 +436,7 @@ class Position(AbstractDateTimeModel): (CHIEF_JUSTICE, "Chief Justice"), (CHIEF_SPECIAL_MASTER, "Chief Special Master"), (CHIEF_ADMINISTRATIVE_JUSTICE, "Chief Administrative Justice"), + (CHIEF_SPECIAL_TRIAL_JUDGE, "Chief Special Trial Judge"), (PRESIDING_JUDGE, "Presiding Judge"), (PRESIDING_JUSTICE, "Presiding Justice"), (SUPERVISING_JUDGE, "Supervising Judge"), From ec2b294cfd0a1a788850ae80b34d283e83cc85fb Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Wed, 5 Feb 2025 19:25:48 -0600 Subject: [PATCH 16/16] feat(search): Introduced cl_re_index_rds_sealed command Fixes: #4926 --- .../commands/ready_mix_cases_project.py | 4 +- cl/lib/indexing_utils.py | 39 +++++ .../cl_index_parent_and_child_docs.py | 54 +------ .../commands/cl_re_index_rds_sealed.py | 148 ++++++++++++++++++ .../commands/cl_remove_content_from_es.py | 23 +-- cl/search/tests/tests.py | 2 +- cl/search/tests/tests_es_recap.py | 93 ++++++++++- 7 files changed, 293 insertions(+), 70 deletions(-) create mode 100644 cl/lib/indexing_utils.py create mode 100644 cl/search/management/commands/cl_re_index_rds_sealed.py diff --git a/cl/corpus_importer/management/commands/ready_mix_cases_project.py b/cl/corpus_importer/management/commands/ready_mix_cases_project.py index 32c9db7ae3..0f30c32452 100644 --- a/cl/corpus_importer/management/commands/ready_mix_cases_project.py 
+++ b/cl/corpus_importer/management/commands/ready_mix_cases_project.py @@ -14,11 +14,9 @@ from cl.lib.celery_utils import CeleryThrottle from cl.lib.command_utils import VerboseCommand, logger from cl.lib.elasticsearch_utils import build_es_base_query +from cl.lib.indexing_utils import log_last_document_indexed from cl.lib.redis_utils import get_redis_interface from cl.search.documents import DocketDocument -from cl.search.management.commands.cl_index_parent_and_child_docs import ( - log_last_document_indexed, -) from cl.search.models import SEARCH_TYPES, Court, Docket from cl.search.tasks import index_dockets_in_bulk diff --git a/cl/lib/indexing_utils.py b/cl/lib/indexing_utils.py new file mode 100644 index 0000000000..e77ec9cb7b --- /dev/null +++ b/cl/lib/indexing_utils.py @@ -0,0 +1,39 @@ +from datetime import datetime +from typing import Mapping + +from cl.lib.redis_utils import get_redis_interface + + +def log_last_document_indexed( + document_pk: int, log_key: str +) -> Mapping[str | bytes, int | str]: + """Log the last document_id indexed in ES. + + :param document_pk: The last document_id processed. + :param log_key: The log key to use in redis. + :return: The data logged to redis. + """ + + r = get_redis_interface("CACHE") + pipe = r.pipeline() + pipe.hgetall(log_key) + log_info: Mapping[str | bytes, int | str] = { + "last_document_id": document_pk, + "date_time": datetime.now().isoformat(), + } + pipe.hset(log_key, mapping=log_info) + pipe.expire(log_key, 60 * 60 * 24 * 28) # 4 weeks + pipe.execute() + + return log_info + + +def get_last_parent_document_id_processed(log_key: str) -> int: + """Get the last document ID indexed in ES. + :return: The last document ID indexed. + """ + r = get_redis_interface("CACHE") + stored_values = r.hgetall(log_key) + last_document_id = int(stored_values.get("last_document_id", 0)) + + return last_document_id diff --git a/cl/search/management/commands/cl_index_parent_and_child_docs.py b/cl/search/management/commands/cl_index_parent_and_child_docs.py index 366d9fe13e..302beb1607 100644 --- a/cl/search/management/commands/cl_index_parent_and_child_docs.py +++ b/cl/search/management/commands/cl_index_parent_and_child_docs.py @@ -1,6 +1,6 @@ -from datetime import date, datetime +from datetime import date from itertools import batched -from typing import Iterable, Mapping +from typing import Iterable from django.conf import settings from django.db.models import QuerySet @@ -12,7 +12,10 @@ check_fields_that_changed, get_fields_to_update, ) -from cl.lib.redis_utils import get_redis_interface +from cl.lib.indexing_utils import ( + get_last_parent_document_id_processed, + log_last_document_indexed, +) from cl.people_db.models import Person from cl.search.documents import ( DocketDocument, @@ -58,49 +61,6 @@ def compose_redis_key( return f"es_{search_type}_indexing:log" -def log_last_document_indexed( - document_pk: int, log_key: str -) -> Mapping[str | bytes, int | str]: - """Log the last document_id indexed. - - :param document_pk: The last document_id processed. - :param log_key: The log key to use in redis. - :return: The data logged to redis. 
- """ - - r = get_redis_interface("CACHE") - pipe = r.pipeline() - pipe.hgetall(log_key) - log_info: Mapping[str | bytes, int | str] = { - "last_document_id": document_pk, - "date_time": datetime.now().isoformat(), - } - pipe.hset(log_key, mapping=log_info) - pipe.expire(log_key, 60 * 60 * 24 * 28) # 4 weeks - pipe.execute() - - return log_info - - -def get_last_parent_document_id_processed( - search_type: str, event_doc_type: EventTable | None = None -) -> int: - """Get the last document ID indexed. - - :param search_type: The search type key to get the last document ID. - :param event_doc_type: An optional EventTable enum member specifying the - document type being processed. - :return: The last document ID indexed. - """ - - r = get_redis_interface("CACHE") - log_key = compose_redis_key(search_type, event_doc_type) - stored_values = r.hgetall(log_key) - last_document_id = int(stored_values.get("last_document_id", 0)) - - return last_document_id - - def get_unique_oldest_history_rows( start_date: date, end_date: date, @@ -356,7 +316,7 @@ def handle(self, *args, **options): ) if auto_resume: pk_offset = get_last_parent_document_id_processed( - search_type, update_from_event_tables + compose_redis_key(search_type, update_from_event_tables) ) self.stdout.write( f"Auto-resume enabled starting indexing from ID: {pk_offset}." diff --git a/cl/search/management/commands/cl_re_index_rds_sealed.py b/cl/search/management/commands/cl_re_index_rds_sealed.py new file mode 100644 index 0000000000..0ce660d822 --- /dev/null +++ b/cl/search/management/commands/cl_re_index_rds_sealed.py @@ -0,0 +1,148 @@ +from datetime import date, datetime +from typing import Iterable + +from django.conf import settings + +from cl.lib.argparse_types import valid_date_time +from cl.lib.celery_utils import CeleryThrottle +from cl.lib.command_utils import VerboseCommand +from cl.lib.indexing_utils import ( + get_last_parent_document_id_processed, + log_last_document_indexed, +) +from cl.search.models import SEARCH_TYPES, RECAPDocument +from cl.search.tasks import index_parent_or_child_docs + + +def compose_redis_key() -> str: + """Compose a Redis key based on the search type for indexing log. + :return: A Redis key as a string. + """ + return f"es_re_index_rd_sealed:log" + + +class Command(VerboseCommand): + help = "Re-index RECAPDocuments sealed from a date." + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.options = {} + + def add_arguments(self, parser): + parser.add_argument( + "--queue", + type=str, + default=settings.CELERY_ETL_TASK_QUEUE, + help="The celery queue where the tasks should be processed.", + ) + parser.add_argument( + "--chunk-size", + type=int, + default="100", + help="The number of items to index in a single celery task.", + ) + parser.add_argument( + "--auto-resume", + action="store_true", + default=False, + help="Auto resume the command using the last document_id logged in Redis. 
" + "If --pk-offset is provided, it'll be ignored.", + ) + parser.add_argument( + "--testing-mode", + action="store_true", + default=False, + help="Use this flag only when running the command in tests based on TestCase", + ) + parser.add_argument( + "--start-date", + type=valid_date_time, + required=True, + help="Start date in ISO-8601 format for a range of documents to " + "update.", + ) + + def handle(self, *args, **options): + super().handle(*args, **options) + self.options = options + auto_resume = options["auto_resume"] + pk_offset = 0 + if auto_resume: + pk_offset = get_last_parent_document_id_processed( + compose_redis_key() + ) + self.stdout.write( + f"Auto-resume enabled starting indexing from ID: {pk_offset}." + ) + start_date: datetime = options["start_date"] + queryset = ( + RECAPDocument.objects.filter( + pk__gte=pk_offset, + is_sealed=True, + date_modified__gte=start_date, + ) + .order_by("pk") + .values_list("pk", flat=True) + ) + q = queryset.iterator() + count = queryset.count() + + self.process_queryset( + q, + count, + pk_offset, + ) + + def process_queryset( + self, + items: Iterable, + count: int, + pk_offset: int, + ) -> None: + """Process a queryset and execute tasks based on the specified celery + task_to_use. + + :param items: Iterable of items to process. Items can be a simple + iterable of IDs or a tuple of (ID, changed_fields) for cases requiring + field changes. + :param count: Total number of items expected to process. + :param pk_offset: + :return: None + """ + + queue = self.options["queue"] + testing_mode = self.options["testing_mode"] + chunk_size = self.options["chunk_size"] + chunk = [] + processed_count = 0 + throttle = CeleryThrottle(queue_name=queue) + # Indexing Parent and their child documents. + for item_id in items: + chunk.append(item_id) + processed_count += 1 + last_item = count == processed_count + if processed_count % chunk_size == 0 or last_item: + throttle.maybe_wait() + index_parent_or_child_docs.si( + chunk, + SEARCH_TYPES.RECAP, + "child", + testing_mode=testing_mode, + ).set(queue=queue).apply_async() + + chunk = [] + + self.stdout.write( + "\rProcessed {}/{}, ({:.0%}), last PK indexed: {},".format( + processed_count, + count, + processed_count * 1.0 / count, + item_id, + ) + ) + if not processed_count % 1000: + # Log every 1000 parent documents processed. + log_last_document_indexed(item_id, compose_redis_key()) + self.stdout.write( + f"Successfully indexed {processed_count} items from pk {pk_offset}." 
+ ) diff --git a/cl/search/management/commands/cl_remove_content_from_es.py b/cl/search/management/commands/cl_remove_content_from_es.py index 8234a099d6..e375cb1d62 100644 --- a/cl/search/management/commands/cl_remove_content_from_es.py +++ b/cl/search/management/commands/cl_remove_content_from_es.py @@ -6,11 +6,11 @@ from cl.lib.argparse_types import valid_date_time from cl.lib.celery_utils import CeleryThrottle from cl.lib.command_utils import VerboseCommand, logger -from cl.lib.redis_utils import get_redis_interface -from cl.search.documents import DocketDocument, OpinionDocument -from cl.search.management.commands.cl_index_parent_and_child_docs import ( +from cl.lib.indexing_utils import ( + get_last_parent_document_id_processed, log_last_document_indexed, ) +from cl.search.documents import DocketDocument, OpinionDocument from cl.search.models import Docket from cl.search.tasks import remove_documents_by_query @@ -22,19 +22,6 @@ def compose_redis_key_remove_content() -> str: return "es_remove_content_from_es:log" -def get_last_parent_document_id_processed() -> int: - """Get the last document ID indexed. - :return: The last document ID indexed. - """ - - r = get_redis_interface("CACHE") - log_key = compose_redis_key_remove_content() - stored_values = r.hgetall(log_key) - last_document_id = int(stored_values.get("last_document_id", 0)) - - return last_document_id - - class Command(VerboseCommand): help = "Remove content from an ES index." @@ -119,7 +106,9 @@ def handle(self, *args, **options): pk_offset = 0 if auto_resume: - pk_offset = get_last_parent_document_id_processed() + pk_offset = get_last_parent_document_id_processed( + compose_redis_key_remove_content() + ) self.stdout.write( f"Auto-resume enabled starting indexing from ID: {pk_offset}." 
) diff --git a/cl/search/tests/tests.py b/cl/search/tests/tests.py index f2503ee63e..482276def5 100644 --- a/cl/search/tests/tests.py +++ b/cl/search/tests/tests.py @@ -29,6 +29,7 @@ from cl.audio.factories import AudioFactory from cl.lib.elasticsearch_utils import simplify_estimated_count +from cl.lib.indexing_utils import log_last_document_indexed from cl.lib.redis_utils import get_redis_interface from cl.lib.storage import clobbering_get_name from cl.lib.test_helpers import AudioTestCase, CourtTestCase, PeopleTestCase @@ -64,7 +65,6 @@ ) from cl.search.management.commands.cl_index_parent_and_child_docs import ( get_unique_oldest_history_rows, - log_last_document_indexed, ) from cl.search.management.commands.cl_remove_content_from_es import ( compose_redis_key_remove_content, diff --git a/cl/search/tests/tests_es_recap.py b/cl/search/tests/tests_es_recap.py index 933913ea7e..ba981a3fe4 100644 --- a/cl/search/tests/tests_es_recap.py +++ b/cl/search/tests/tests_es_recap.py @@ -27,6 +27,7 @@ set_results_highlights, simplify_estimated_count, ) +from cl.lib.indexing_utils import log_last_document_indexed from cl.lib.redis_utils import get_redis_interface from cl.lib.test_helpers import ( RECAPSearchTestCase, @@ -65,7 +66,6 @@ from cl.search.management.commands.cl_index_parent_and_child_docs import ( compose_redis_key, get_last_parent_document_id_processed, - log_last_document_indexed, ) from cl.search.models import ( SEARCH_TYPES, @@ -5749,6 +5749,8 @@ class IndexDocketRECAPDocumentsCommandTest( """cl_index_parent_and_child_docs command tests for Elasticsearch""" def setUp(self): + self.factory = RequestFactory() + self.site = admin.site self.rebuild_index("search.Docket") self.court = CourtFactory(id="canb", jurisdiction="FB") # Non-recap Docket @@ -5862,7 +5864,7 @@ def test_log_and_get_last_document_id(self): self.assertEqual(last_values["last_document_id"], 2001) last_document_id = get_last_parent_document_id_processed( - SEARCH_TYPES.RECAP + compose_redis_key(SEARCH_TYPES.RECAP) ) self.assertEqual(last_document_id, 2001) @@ -6059,6 +6061,93 @@ def test_index_only_child_docs_when_parent_docs_are_missed(self): child_count = len(article[0].xpath(".//h4")) self.assertEqual(2, child_count) + @mock.patch("cl.search.admin.delete_from_ia") + @mock.patch("cl.search.admin.invalidate_cloudfront") + def test_re_index_recap_documents_sealed( + self, mock_delete_from_ia, mock_invalidate_cloudfront + ): + """Test cl_re_index_rds_sealed to confirm that it properly re-indexes + sealed RECAPDocuments from the provided start_date.""" + + rd_1 = RECAPDocumentFactory( + docket_entry=self.de, + document_number="1", + attachment_number=3, + document_type=RECAPDocument.ATTACHMENT, + is_sealed=False, + filepath_local="test.pdf", + plain_text="Lorem Ipsum dolor", + ) + rd_2 = RECAPDocumentFactory( + docket_entry=self.de_1, + document_number="2", + attachment_number=4, + is_sealed=False, + filepath_local="test.pdf", + document_type=RECAPDocument.ATTACHMENT, + plain_text="Lorem Ipsum dolor not sealed", + ) + + # Call cl_index_parent_and_child_docs command for RECAPDocuments. + call_command( + "cl_index_parent_and_child_docs", + search_type=SEARCH_TYPES.RECAP, + queue="celery", + pk_offset=0, + document_type="child", + ) + + # RECAPDocuments should be indexed. + s = DocketDocument.search() + s = s.query(Q("match", docket_child="recap_document")) + self.assertEqual( + s.count(), 5, msg="Wrong number of RECAPDocuments returned." 
+ ) + + es_rd_1 = ESRECAPDocument.get(id=ES_CHILD_ID(rd_1.pk).RECAP) + self.assertEqual(es_rd_1.plain_text, rd_1.plain_text) + self.assertEqual(es_rd_1.filepath_local, rd_1.filepath_local) + + es_rd_2 = ESRECAPDocument.get(id=ES_CHILD_ID(rd_2.pk).RECAP) + self.assertEqual(es_rd_2.plain_text, rd_2.plain_text) + self.assertEqual(es_rd_2.filepath_local, rd_2.filepath_local) + + # Call seal_documents action. + recap_admin = RECAPDocumentAdmin(RECAPDocument, self.site) + recap_admin.message_user = mock.Mock() + url = reverse("admin:search_recapdocument_changelist") + request = self.factory.post(url) + queryset = RECAPDocument.objects.filter(pk__in=[rd_1.pk]) + mock_date = now().replace(day=15, hour=0) + with mock.patch( + "cl.lib.es_signal_processor.update_es_documents" + ), time_machine.travel(mock_date, tick=False): + recap_admin.seal_documents(request, queryset) + + recap_admin.message_user.assert_called_once_with( + request, + "Successfully sealed and removed 1 document(s).", + ) + + # Re-index RDs sealed documents. + rd_1.refresh_from_db() + with time_machine.travel(mock_date, tick=False): + call_command( + "cl_re_index_rds_sealed", + queue="celery", + start_date=rd_1.date_modified, + testing_mode=True, + ) + + # Confirm that only the sealed document "rd_1" was cleaned in ES. + es_rd_1 = ESRECAPDocument.get(id=ES_CHILD_ID(rd_1.pk).RECAP) + self.assertEqual(es_rd_1.plain_text, "") + self.assertEqual(es_rd_1.filepath_local, None) + + es_rd_2 = ESRECAPDocument.get(id=ES_CHILD_ID(rd_2.pk).RECAP) + self.assertEqual(es_rd_2.plain_text, rd_2.plain_text) + self.assertEqual(es_rd_2.filepath_local, rd_2.filepath_local) + class RECAPIndexingTest( CountESTasksTestCase, ESIndexTestCase, TransactionTestCase