Merge branch 'main' into 4920-store-unmatched-citations

freelawproject · Feb 6, 2025 · 0c08f5b · 0c08f5b
2 parents 690a4c1 + 9816354
commit 0c08f5b
Show file tree

Hide file tree

Showing 23 changed files with 1,666 additions and 168 deletions.
diff --git a/cl/corpus_importer/management/commands/ready_mix_cases_project.py b/cl/corpus_importer/management/commands/ready_mix_cases_project.py
@@ -14,11 +14,9 @@
 from cl.lib.celery_utils import CeleryThrottle
 from cl.lib.command_utils import VerboseCommand, logger
 from cl.lib.elasticsearch_utils import build_es_base_query
+from cl.lib.indexing_utils import log_last_document_indexed
 from cl.lib.redis_utils import get_redis_interface
 from cl.search.documents import DocketDocument
-from cl.search.management.commands.cl_index_parent_and_child_docs import (
-    log_last_document_indexed,
-)
 from cl.search.models import SEARCH_TYPES, Court, Docket
 from cl.search.tasks import index_dockets_in_bulk
 

diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py
@@ -374,18 +374,6 @@ def build_fulltext_query(
     """
     if value:
         validate_query_syntax(value, QueryType.QUERY_STRING)
-        # In Elasticsearch, the colon (:) character is used to separate the
-        # field name and the field value in a query.
-        # To avoid parsing errors escape any colon characters in the value
-        # parameter with a backslash.
-        if "docketNumber:" in value:
-            docket_number_matches = re.findall("docketNumber:([^ ]+)", value)
-            for match in docket_number_matches:
-                replacement = match.replace(":", r"\:")
-                value = value.replace(
-                    f"docketNumber:{match}", f"docketNumber:{replacement}"
-                )
-
         # Used for the phrase query_string, no conjunctions appended.
         query_value = cleanup_main_query(value)
         # To enable the search of each term in the query across multiple fields
@@ -465,7 +453,18 @@ def build_term_query(
         validate_query_syntax(value, QueryType.FILTER)
 
     if make_phrase:
-        return [Q("match_phrase", **{field: {"query": value, "slop": slop}})]
+        return [
+            Q(
+                "match_phrase",
+                **{
+                    field: {
+                        "query": value,
+                        "slop": slop,
+                        "analyzer": "search_analyzer_exact",
+                    }
+                },
+            )
+        ]
 
     if isinstance(value, list):
         value = list(filter(None, value))
@@ -767,7 +766,7 @@ def build_es_plain_filters(cd: CleanData) -> List:
         # Build docket number term query
         queries_list.extend(
             build_term_query(
-                "docketNumber",
+                "docketNumber.exact",
                 cd.get("docket_number", ""),
                 make_phrase=True,
                 slop=1,
@@ -2374,7 +2373,7 @@ def build_join_es_filters(cd: CleanData) -> List:
                 ),
                 *build_text_filter("caseName.exact", cd.get("case_name", "")),
                 *build_term_query(
-                    "docketNumber",
+                    "docketNumber.exact",
                     cd.get("docket_number", ""),
                     make_phrase=True,
                     slop=1,
@@ -2418,7 +2417,7 @@ def build_join_es_filters(cd: CleanData) -> List:
                     cd.get("filed_after", ""),
                 ),
                 *build_term_query(
-                    "docketNumber",
+                    "docketNumber.exact",
                     cd.get("docket_number", ""),
                     make_phrase=True,
                     slop=1,

diff --git a/cl/lib/indexing_utils.py b/cl/lib/indexing_utils.py
@@ -0,0 +1,39 @@
+from datetime import datetime
+from typing import Mapping
+
+from cl.lib.redis_utils import get_redis_interface
+
+
+def log_last_document_indexed(
+    document_pk: int, log_key: str
+) -> Mapping[str | bytes, int | str]:
+    """Log the last document_id indexed in ES.
+
+    :param document_pk: The last document_id processed.
+    :param log_key: The log key to use in redis.
+    :return: The data logged to redis.
+    """
+
+    r = get_redis_interface("CACHE")
+    pipe = r.pipeline()
+    pipe.hgetall(log_key)
+    log_info: Mapping[str | bytes, int | str] = {
+        "last_document_id": document_pk,
+        "date_time": datetime.now().isoformat(),
+    }
+    pipe.hset(log_key, mapping=log_info)
+    pipe.expire(log_key, 60 * 60 * 24 * 28)  # 4 weeks
+    pipe.execute()
+
+    return log_info
+
+
+def get_last_parent_document_id_processed(log_key: str) -> int:
+    """Get the last document ID indexed in ES.
+    :return: The last document ID indexed, or 0 if none is stored under log_key.
+    """
+    r = get_redis_interface("CACHE")
+    stored_values = r.hgetall(log_key)
+    last_document_id = int(stored_values.get("last_document_id", 0))
+
+    return last_document_id
diff --git a/cl/lib/utils.py b/cl/lib/utils.py
@@ -330,8 +330,13 @@ def cleanup_main_query(query_string: str) -> str:
         if not item:
             continue
 
-        if item.startswith('"') or item.endswith('"'):
-            # Start or end of a phrase; flip whether we're inside a phrase
+        if (
+            item.startswith('"')
+            or item.endswith('"')
+            or bool(re.match(r'\w+:"[^"]', item))
+        ):
+            # Start or end of a phrase, or a fielded query using quotes
+            # (e.g. field:"test"): flip whether we're inside a phrase.
             inside_a_phrase = not inside_a_phrase
             cleaned_items.append(item)
             continue
@@ -345,6 +350,27 @@ def cleanup_main_query(query_string: str) -> str:
         is_date_str = re.match(
             "[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z", item
         )
+
+        if "docketNumber:" in item:
+            potential_docket_number = item.split("docketNumber:", 1)[1]
+
+            if not potential_docket_number:
+                # The docket_number is wrapped in parentheses
+                cleaned_items.append(item)
+            else:
+                # Improve the docket_number query:
+                # - If it's a known docket_number format, wrap it in quotes and
+                #   add a ~1 slop to match slight variations like 1:21-bk-1234-ABC → 1:21-bk-1234.
+                # - If it's not a known docket_number format, just wrap it in
+                #   quotes to avoid syntax errors caused by : in the number.
+                slop_suffix = (
+                    "~1" if is_docket_number(potential_docket_number) else ""
+                )
+                cleaned_items.append(
+                    f'docketNumber:"{potential_docket_number}"{slop_suffix}'
+                )
+            continue
+
         if any([not_numeric, is_date_str]):
             cleaned_items.append(item)
             continue
@@ -356,7 +382,7 @@ def cleanup_main_query(query_string: str) -> str:
 
         # Some sort of number, probably a docket number or other type of number
         # Wrap in quotes to do a phrase search
-        if is_docket_number(item) and "docketNumber:" not in query_string:
+        if is_docket_number(item):
             # Confirm is a docket number and clean it. So docket_numbers with
             # suffixes can be searched: 1:21-bk-1234-ABC -> 1:21-bk-1234,
             item = clean_docket_number(item)