Skip to content

Commit

Permalink
Merge branch 'main' into 4920-store-unmatched-citations
Browse files Browse the repository at this point in the history
  • Loading branch information
mlissner authored Feb 6, 2025
2 parents 690a4c1 + 9816354 commit 0c08f5b
Show file tree
Hide file tree
Showing 23 changed files with 1,666 additions and 168 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,9 @@
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import VerboseCommand, logger
from cl.lib.elasticsearch_utils import build_es_base_query
from cl.lib.indexing_utils import log_last_document_indexed
from cl.lib.redis_utils import get_redis_interface
from cl.search.documents import DocketDocument
from cl.search.management.commands.cl_index_parent_and_child_docs import (
log_last_document_indexed,
)
from cl.search.models import SEARCH_TYPES, Court, Docket
from cl.search.tasks import index_dockets_in_bulk

Expand Down
31 changes: 15 additions & 16 deletions cl/lib/elasticsearch_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,18 +374,6 @@ def build_fulltext_query(
"""
if value:
validate_query_syntax(value, QueryType.QUERY_STRING)
# In Elasticsearch, the colon (:) character is used to separate the
# field name and the field value in a query.
# To avoid parsing errors escape any colon characters in the value
# parameter with a backslash.
if "docketNumber:" in value:
docket_number_matches = re.findall("docketNumber:([^ ]+)", value)
for match in docket_number_matches:
replacement = match.replace(":", r"\:")
value = value.replace(
f"docketNumber:{match}", f"docketNumber:{replacement}"
)

# Used for the phrase query_string, no conjunctions appended.
query_value = cleanup_main_query(value)
# To enable the search of each term in the query across multiple fields
Expand Down Expand Up @@ -465,7 +453,18 @@ def build_term_query(
validate_query_syntax(value, QueryType.FILTER)

if make_phrase:
return [Q("match_phrase", **{field: {"query": value, "slop": slop}})]
return [
Q(
"match_phrase",
**{
field: {
"query": value,
"slop": slop,
"analyzer": "search_analyzer_exact",
}
},
)
]

if isinstance(value, list):
value = list(filter(None, value))
Expand Down Expand Up @@ -767,7 +766,7 @@ def build_es_plain_filters(cd: CleanData) -> List:
# Build docket number term query
queries_list.extend(
build_term_query(
"docketNumber",
"docketNumber.exact",
cd.get("docket_number", ""),
make_phrase=True,
slop=1,
Expand Down Expand Up @@ -2374,7 +2373,7 @@ def build_join_es_filters(cd: CleanData) -> List:
),
*build_text_filter("caseName.exact", cd.get("case_name", "")),
*build_term_query(
"docketNumber",
"docketNumber.exact",
cd.get("docket_number", ""),
make_phrase=True,
slop=1,
Expand Down Expand Up @@ -2418,7 +2417,7 @@ def build_join_es_filters(cd: CleanData) -> List:
cd.get("filed_after", ""),
),
*build_term_query(
"docketNumber",
"docketNumber.exact",
cd.get("docket_number", ""),
make_phrase=True,
slop=1,
Expand Down
39 changes: 39 additions & 0 deletions cl/lib/indexing_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from datetime import datetime
from typing import Mapping

from cl.lib.redis_utils import get_redis_interface


def log_last_document_indexed(
    document_pk: int, log_key: str
) -> Mapping[str | bytes, int | str]:
    """Log the last document_id indexed in ES.

    The progress record is written as a Redis hash under ``log_key`` and the
    key's expiration is reset to four weeks on every call.

    :param document_pk: The last document_id processed.
    :param log_key: The log key to use in redis.
    :return: The data logged to redis: the ``last_document_id`` and the
    ``date_time`` (ISO 8601 string from a naive ``datetime.now()``) at which
    it was logged.
    """

    r = get_redis_interface("CACHE")
    log_info: Mapping[str | bytes, int | str] = {
        "last_document_id": document_pk,
        "date_time": datetime.now().isoformat(),
    }
    # Queue the write and the TTL refresh together so they are sent to Redis
    # in a single round trip. No read is queued: the results of execute()
    # are not used, so reading the previous value would be wasted work.
    pipe = r.pipeline()
    pipe.hset(log_key, mapping=log_info)
    pipe.expire(log_key, 60 * 60 * 24 * 28)  # 4 weeks
    pipe.execute()

    return log_info


def get_last_parent_document_id_processed(log_key: str) -> int:
    """Get the last document ID indexed in ES.

    :param log_key: The redis key whose hash holds the indexing log.
    :return: The last document ID indexed, or 0 if the hash has no
    ``last_document_id`` field.
    """
    # NOTE(review): the lookup uses a str field name, so this presumably
    # relies on the redis client decoding responses to str -- confirm.
    logged_fields = get_redis_interface("CACHE").hgetall(log_key)
    return int(logged_fields.get("last_document_id", 0))
32 changes: 29 additions & 3 deletions cl/lib/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,8 +330,13 @@ def cleanup_main_query(query_string: str) -> str:
if not item:
continue

if item.startswith('"') or item.endswith('"'):
# Start or end of a phrase; flip whether we're inside a phrase
if (
item.startswith('"')
or item.endswith('"')
or bool(re.match(r'\w+:"[^"]', item))
):
# Start or end of a phrase or a fielded query using quotes e.g: field:"test"
# flip whether we're inside a phrase
inside_a_phrase = not inside_a_phrase
cleaned_items.append(item)
continue
Expand All @@ -345,6 +350,27 @@ def cleanup_main_query(query_string: str) -> str:
is_date_str = re.match(
"[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z", item
)

if "docketNumber:" in item:
potential_docket_number = item.split("docketNumber:", 1)[1]

if not potential_docket_number:
# The docket_number is wrapped in parentheses
cleaned_items.append(item)
else:
# Improve the docket_number query by:
# If it's a known docket_number format, wrap it in quotes and
# add a ~1 slop to match slight variations like 1:21-bk-1234-ABC → 1:21-bk-1234
# If it's not a known docket_number format, just wrap it in
# quotes to avoid syntax errors caused by : in the number.
slop_suffix = (
"~1" if is_docket_number(potential_docket_number) else ""
)
cleaned_items.append(
f'docketNumber:"{potential_docket_number}"{slop_suffix}'
)
continue

if any([not_numeric, is_date_str]):
cleaned_items.append(item)
continue
Expand All @@ -356,7 +382,7 @@ def cleanup_main_query(query_string: str) -> str:

# Some sort of number, probably a docket number or other type of number
# Wrap in quotes to do a phrase search
if is_docket_number(item) and "docketNumber:" not in query_string:
if is_docket_number(item):
# Confirm is a docket number and clean it. So docket_numbers with
# suffixes can be searched: 1:21-bk-1234-ABC -> 1:21-bk-1234,
item = clean_docket_number(item)
Expand Down
Loading

0 comments on commit 0c08f5b

Please sign in to comment.