Skip to content

Commit

Permalink
feat(citations): find and link pincites
Browse files Browse the repository at this point in the history
  • Loading branch information
quevon24 committed Feb 6, 2025
1 parent 667e33c commit de29df6
Showing 1 changed file with 62 additions and 2 deletions.
64 changes: 62 additions & 2 deletions cl/citations/match_citations_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from eyecite import get_citations
from eyecite.models import FullCaseCitation
from eyecite.tokenizers import HyperscanTokenizer
from natsort import natsort

from cl.citations.types import SupportedCitationType
from cl.citations.utils import (
Expand All @@ -21,19 +22,22 @@
HYPERSCAN_TOKENIZER = HyperscanTokenizer(cache_dir=".hyperscan")


def fetch_citations(search_query: Search) -> list[Hit]:
def fetch_citations(search_query: Search, fields=None) -> list[Hit]:
"""Fetches citation matches from Elasticsearch based on the provided
search query.
:param search_query: The Elasticsearch DSL Search object.
:param fields: The source fields to include in each returned hit. Defaults to id, caseName, absolute_url and dateFiled when None.
:return: A list of ES Hits objects.
"""

if fields is None:
fields = ["id", "caseName", "absolute_url", "dateFiled"]
citation_hits = []
search_query = search_query.sort("id")
# Only retrieve fields required for the lookup.
search_query = search_query.source(
includes=["id", "caseName", "absolute_url", "dateFiled"]
includes=fields
)
# Citation resolution aims for a single match. Setting up a size of 2 is
# enough to determine if there is more than one match.
Expand Down Expand Up @@ -215,6 +219,62 @@ def es_search_db_for_full_citation(
full_citation.citing_opinion,
)
return results, citation_found
else:
# We didn't get an exact match on the volume/reporter/page. Perhaps
# it's a pincite. Find closest citations filtering by volume and
# reporter and excluding self cites.
partial_citation_str = " ".join([full_citation.groups["volume"], full_citation.groups["reporter"]])
filters = [Q(
"match_phrase",
**{"citation.exact": partial_citation_str},
)]
query = Q("bool", must_not=must_not, filter=filters)
citations_query = search_query.query(query)
results = fetch_citations(citations_query, fields=["id", "cluster_id", "citation", "text"])
closest_opinion_clusters = []

# Create a temporary placeholder item for the cited page and add it to
# the list of (cluster_id, page) tuples
citation_item = (0, full_citation.groups["page"])
closest_opinion_clusters.append(citation_item)

for result in results:
# Get the citations from OpinionDocument that matched the partial
# citation
valid_citations = [get_citations(citation)[0] for citation in result["citation"] if partial_citation_str in citation and get_citations(citation)]

for valid_citation in valid_citations:
closest_opinion_clusters.append((result["cluster_id"], valid_citation.groups["page"]))

if len(closest_opinion_clusters) > 1:
# Order by page number
sort_possible_matches = natsort.natsorted(
closest_opinion_clusters, key=lambda item: item[1]
)
# Find the index of the temporary item
citation_item_position = sort_possible_matches.index(citation_item)

if citation_item_position > 0:
# If the position is greater than 0, the previous item in the
# sorted list is the closest citation, so take that item's
# cluster id.
possible_cluster_id_matched = sort_possible_matches[citation_item_position - 1][0]

# We filter the results list to get the possible match
# OpinionDocument
filtered_results = [hit for hit in results if hit.cluster_id == possible_cluster_id_matched]

if len(filtered_results) == 1 and f"*{full_citation.groups["page"]}" in filtered_results[0]["text"]:
# Check whether the page number appears in the opinion text.
# Currently only this format is matched: *page_number
return [filtered_results[0]], citation_found
for result in filtered_results:
# A cluster can have multiple opinions, so check whether the
# page number appears in any of them.
if f"*{full_citation.groups['page']}" in result["text"]:
# We found the page number in the opinion content
return [result], citation_found

# Give up.
return [], citation_found

Expand Down

0 comments on commit de29df6

Please sign in to comment.