diff --git a/cl/citations/match_citations_queries.py b/cl/citations/match_citations_queries.py index ecfab63f58..ac11141c2f 100644 --- a/cl/citations/match_citations_queries.py +++ b/cl/citations/match_citations_queries.py @@ -7,6 +7,7 @@ from eyecite import get_citations from eyecite.models import FullCaseCitation from eyecite.tokenizers import HyperscanTokenizer +from natsort import natsort from cl.citations.types import SupportedCitationType from cl.citations.utils import ( @@ -21,19 +22,22 @@ HYPERSCAN_TOKENIZER = HyperscanTokenizer(cache_dir=".hyperscan") -def fetch_citations(search_query: Search) -> list[Hit]: +def fetch_citations(search_query: Search, fields=None) -> list[Hit]: """Fetches citation matches from Elasticsearch based on the provided search query. :param search_query: The Elasticsearch DSL Search object. + :param fields: fields to return :return: A list of ES Hits objects. """ + if fields is None: + fields = ["id", "caseName", "absolute_url", "dateFiled"] citation_hits = [] search_query = search_query.sort("id") # Only retrieve fields required for the lookup. search_query = search_query.source( - includes=["id", "caseName", "absolute_url", "dateFiled"] + includes=fields ) # Citation resolution aims for a single match. Setting up a size of 2 is # enough to determine if there is more than one match. @@ -215,6 +219,62 @@ def es_search_db_for_full_citation( full_citation.citing_opinion, ) return results, citation_found + else: + # We didn't get an exact match on the volume/reporter/page. Perhaps + # it's a pincite. Find closest citations filtering by volume and + # reporter and excluding self cites. 
+ partial_citation_str = " ".join([full_citation.groups["volume"], full_citation.groups["reporter"]]) + filters = [Q( + "match_phrase", + **{"citation.exact": partial_citation_str}, + )] + query = Q("bool", must_not=must_not, filter=filters) + citations_query = search_query.query(query) + results = fetch_citations(citations_query, fields=["id", "cluster_id", "citation", "text"]) + closest_opinion_clusters = [] + + # Create a temporary item and add it to the values list (cluster_id, + # page) + citation_item = (0, full_citation.groups["page"]) + closest_opinion_clusters.append(citation_item) + + for result in results: + # Get the citations from OpinionDocument that matched the partial + # citation + valid_citations = [get_citations(citation)[0] for citation in result["citation"] if partial_citation_str in citation and get_citations(citation)] + + for valid_citation in valid_citations: + closest_opinion_clusters.append((result["cluster_id"], valid_citation.groups["page"])) + + if len(closest_opinion_clusters) > 1: + # Order by page number + sort_possible_matches = natsort.natsorted( + closest_opinion_clusters, key=lambda item: item[1] + ) + # Find the index of the temporary item + citation_item_position = sort_possible_matches.index(citation_item) + + if citation_item_position > 0: + # if the position is greater than 0, then the previous item in + # the list is the closest citation, we get the cluster id of the + # previous item + possible_cluster_id_matched = sort_possible_matches[citation_item_position - 1][0] + + # We filter the results list to get the possible match + # OpinionDocument + filtered_results = [hit for hit in results if hit.cluster_id == possible_cluster_id_matched] + + if len(filtered_results) == 1 and f"*{full_citation.groups["page"]}" in filtered_results[0]["text"]: + # Check if the page number is in the opinion text, currently + # we only look for this format: *page_number + return [filtered_results[0]], citation_found + for result in 
filtered_results: + # We could have clusters with multiple opinions, we need + # to check if the page number is in any of the opinions + if f"*{full_citation.groups['page']}" in result["text"]: + # We found the page number in the opinion content + return [result], citation_found + # Give up. return [], citation_found