diff --git a/app/public/cantusdata/helpers/mei_processing/bounding_box_utils.py b/app/public/cantusdata/helpers/mei_processing/bounding_box_utils.py
index f2ce32137..ee4067fe3 100644
--- a/app/public/cantusdata/helpers/mei_processing/bounding_box_utils.py
+++ b/app/public/cantusdata/helpers/mei_processing/bounding_box_utils.py
@@ -60,3 +60,36 @@ def combine_bounding_boxes(bounding_boxes: List[Tuple[Zone, int]]) -> List[Zone]
         combined_boxes.append(combine_bounding_boxes_single_system(boxes))
     return combined_boxes
+
+
+def stringify_bounding_boxes(bounding_boxes: List[Zone]) -> str:
+    """
+    Convert a list of bounding box types to a string for indexing. The
+    string encodes some JSON, an array of objects, each of which
+    represents a bounding box:
+
+    [
+        {"ulx": ,  # X-coordinate of upper left corner of box
+         "uly": ,  # Y-coordinate of upper left corner of box
+         "width": ,  # Width of the box
+         "height": },  # Height of the box
+        ...
+    ]
+
+    :param bounding_boxes: A list of bounding boxes (Zone type)
+    :return: A JSON string representation of the bounding boxes
+    """
+    # Local import so this hunk stays self-contained; hoist to the
+    # module top if preferred.
+    import json
+
+    box_dicts = []
+    for box in bounding_boxes:
+        ulx, uly, lrx, lry = box["coordinates"]
+        # width = lrx - ulx and height = lry - uly (the y-axis grows
+        # downward in Zone coordinates, per the unit test's fixtures).
+        box_dicts.append(
+            {"ulx": ulx, "uly": uly, "width": lrx - ulx, "height": lry - uly}
+        )
+    # json.dumps (not str()) so the result is valid JSON with double quotes.
+    return json.dumps(box_dicts)
diff --git a/app/public/cantusdata/helpers/mei_processing/mei_tokenizer.py b/app/public/cantusdata/helpers/mei_processing/mei_tokenizer.py
index b933311a6..232eb50a3 100644
--- a/app/public/cantusdata/helpers/mei_processing/mei_tokenizer.py
+++ b/app/public/cantusdata/helpers/mei_processing/mei_tokenizer.py
@@ -6,39 +6,49 @@
 from typing import List, Iterator, Any, TypedDict, Literal
 from .mei_parser import MEIParser
-from .mei_parsing_types import ContourType
+from .mei_parsing_types import Neume, NeumeComponent
+from .bounding_box_utils import 
combine_bounding_boxes, stringify_bounding_boxes +NgramUnitType = Literal["neume", "neume_component"] -class MusicalSequencesDict(TypedDict): + +class NgramDocument(TypedDict): """ - Type definition for a dictionary containing a sequence of musical data. + A generic type for documents containing n-grams + of information extracted from MEI files. + + ngram_unit: The unit of the n-gram + location: The location of the n-gram in the MEI file (MEI Zones + converted to JSON strings according to bounding_box_utils.stringify_bounding_boxes) """ - neume_names: List[str] - pitch_names: List[str] - intervals: List[str] - contours: List[ContourType] + ngram_unit: NgramUnitType + location: str -class NgramDocument(TypedDict): - id: str - type: str - location: str +class NeumeNgramDocument(NgramDocument): + """ + A type for documents containing n-grams of neume-level information. + neume_names: A string containing the names of the neumes in the n-gram, + separated by underscores. + """ -class NeumeNamesNgramDocument(NgramDocument): neume_names: str -class PitchNamesNgramDocument(NgramDocument): - pitch_names: str +class NeumeComponentNgramDocument(NgramDocument): + """ + A type for documents containing n-grams of neume component-level information. + pitch_names: A string containing the pitch names of the neume components in the n-gram, + in the form of two-character strings (pitch name and octave), separated by underscores. + intervals: A string containing the intervals between the neume components in the n-gram. + contours: A string containing the contours of the neume components in the n-gram. 
+ """ -class IntervalsNgramDocument(NgramDocument): + pitch_names: str intervals: str - - -class ContoursNgramDocument(NgramDocument): contours: str @@ -61,68 +71,75 @@ def generate_ngrams(sequence: List[Any], min_n: int, max_n: int) -> Iterator[Lis class MEITokenizer(MEIParser): """ - - An MEITokenizer object is initialized with an MEI file and a set of parameters that define how the MEI file should be tokenized. These parameters are: - + - min_ngram: The minimum length of n-grams to generate. + - max_ngram: The maximum length of n-grams to generate. """ - def __init__( - self, mei_file: str, min_neume_ngram: int, max_neume_ngram: int - ) -> None: + def __init__(self, mei_file: str, min_ngram: int, max_ngram: int) -> None: super().__init__(mei_file) - self.min_neume_ngram = min_neume_ngram - self.max_neume_ngram = max_neume_ngram - self.sequences = self._get_musical_sequences() + self.min_ngram = min_ngram + self.max_ngram = max_ngram - def _get_musical_sequences(self) -> MusicalSequencesDict: + def get_neume_ngram_docs(self) -> List[NeumeNgramDocument]: """ - Get sequences of musical data from the entire MEI file. - - :return: A dictionary containing the following sequences of - musical data (all sequences in the form of lists of strings): - - "neume_names": A list of neume names in the file. - - "pitch_names": A list of pitch names in the file. - - "intervals": A list of intervals in the file. - - "contours": A list of contours in the file. - - "semitones": A list of semitones in the file. + Generate neume-level documents for search, containing + n-grams of neume names. + + :return: A list of dictionaries containing the n-grams + of neume names. 
""" - neume_names = [] - pitch_names = [] - intervals = [] - contours = [] + neumes_sequence: List[Neume] = [] for syllable in self.syllables: - for neume in syllable["neumes"]: - neume_names.append(neume["neume_type"]) - contours.extend(neume["contours"]) - intervals.extend([str(interval) for interval in neume["intervals"]]) - for component in neume["neume_components"]: - pitch_names.append(component["pname"] + str(component["octave"])) - return { - "neume_names": neume_names, - "pitch_names": pitch_names, - "intervals": intervals, - "contours": contours, - } - - def _get_ngrams( - self, ngram_type: Literal["neume_names", "pitch_names", "intervals", "contours"] - ) -> List[str]: + neumes_sequence.extend(syllable["neumes"]) + neume_documents: List[NeumeNgramDocument] = [] + for ngram in generate_ngrams(neumes_sequence, self.min_ngram, self.max_ngram): + bounding_boxes = [ + (neume["bounding_box"], neume["system"]) for neume in ngram + ] + document_location = combine_bounding_boxes(bounding_boxes) + neume_names = "_".join([neume["neume_type"] for neume in ngram]) + neume_documents.append( + { + "ngram_unit": "neume", + "location": stringify_bounding_boxes(document_location), + "neume_names": neume_names, + } + ) + return neume_documents + + def get_neume_component_ngram_docs(self) -> List[NeumeComponentNgramDocument]: """ - Get n-grams of a particular type from the MEI file. + Generate neume component-level documents for search, containing + n-grams of pitch names, intervals, and contours. - :param ngram_type: The type of n-gram to generate. Must be one of - "neume_names", "pitch_names", "intervals", or "contours". - :return: A list of n-grams represented as strings of space-separated - items of the specified type. + :return: A list of dictionaries containing the n-grams + of pitch names, intervals, and contours. 
""" - return [ - " ".join(ngram) - for ngram in generate_ngrams( - self.sequences[ngram_type], - self.min_neume_ngram, - self.max_neume_ngram, + neume_components: List[NeumeComponent] = [] + for syllable in self.syllables: + for neume in syllable["neumes"]: + neume_components.extend(neume["neume_components"]) + neume_component_documents: List[NeumeComponentNgramDocument] = [] + for ngram in generate_ngrams( + neume_components, + self.min_ngram, + self.max_ngram, + ): + pitch_names = "_".join([comp["pname"] for comp in ngram]) + intervals = "".join([str(interval) for interval in neume["intervals"]]) + contours = "".join([contour for contour in neume["contours"]]) + bounding_boxes = [(comp["bounding_box"], neume["system"]) for comp in ngram] + document_location = combine_bounding_boxes(bounding_boxes) + neume_component_documents.append( + { + "ngram_unit": "neume_component", + "location": stringify_bounding_boxes(document_location), + "pitch_names": pitch_names, + "intervals": intervals, + "contours": contours, + } ) - ] + return neume_component_documents diff --git a/app/public/cantusdata/test/core/helpers/mei_processing/test_bounding_box_utils.py b/app/public/cantusdata/test/core/helpers/mei_processing/test_bounding_box_utils.py index 43824d56d..c21812ee7 100644 --- a/app/public/cantusdata/test/core/helpers/mei_processing/test_bounding_box_utils.py +++ b/app/public/cantusdata/test/core/helpers/mei_processing/test_bounding_box_utils.py @@ -3,6 +3,7 @@ from cantusdata.helpers.mei_processing.bounding_box_utils import ( combine_bounding_boxes, combine_bounding_boxes_single_system, + stringify_bounding_boxes, ) from cantusdata.helpers.mei_processing.mei_parsing_types import Zone @@ -34,3 +35,15 @@ def test_combine_bounding_boxes(self) -> None: {"coordinates": (8, 8, 9, 10), "rotate": 0.0}, ] self.assertEqual(combined_boxes, expected_combined_boxes) + + def test_stringify_bounding_boxes(self) -> None: + bounding_boxes: List[Zone] = [ + {"coordinates": (0, 0, 1, 1), 
"rotate": 0}, + {"coordinates": (2, 3, 5, 8), "rotate": 0.123}, + ] + stringified_boxes = stringify_bounding_boxes(bounding_boxes) + expected_stringified_boxes = ( + '[{"ulx": 0, "uly": 0,"width": 1,"height": 1},' + '{"ulx": 2, "uly": 3,"width": 3,"height": 5}]' + ) + self.assertEqual(stringified_boxes, expected_stringified_boxes) diff --git a/app/public/cantusdata/test/core/helpers/mei_processing/test_mei_tokenizer.py b/app/public/cantusdata/test/core/helpers/mei_processing/test_mei_tokenizer.py new file mode 100644 index 000000000..e6aa5a3d4 --- /dev/null +++ b/app/public/cantusdata/test/core/helpers/mei_processing/test_mei_tokenizer.py @@ -0,0 +1,69 @@ +from unittest import TestCase +from os import path +from cantusdata.settings import BASE_DIR +from cantusdata.helpers.mei_processing.mei_tokenizer import ( + MEITokenizer, + generate_ngrams, +) + + +class MEITokenizerTestCase(TestCase): + + def test_generate_ngrams(self) -> None: + with self.subTest("Ngrams from 2 to 3"): + sequence = [1, 2, 3, 4, 5] + min_ngram = 2 + max_ngram = 3 + ngrams = list(generate_ngrams(sequence, min_ngram, max_ngram)) + self.assertEqual( + ngrams, + [[1, 2], [2, 3], [3, 4], [4, 5], [1, 2, 3], [2, 3, 4], [3, 4, 5]], + ) + with self.subTest("Ngrams from 3 to 5"): + sequence = [1, 2, 3, 4, 5] + min_ngram = 3 + max_ngram = 5 + ngrams = list(generate_ngrams(sequence, min_ngram, max_ngram)) + self.assertEqual( + ngrams, + [ + [1, 2, 3], + [2, 3, 4], + [3, 4, 5], + [1, 2, 3, 4], + [2, 3, 4, 5], + [1, 2, 3, 4, 5], + ], + ) + + def test_mei_tokenizer(self) -> None: + with self.subTest("Ngrams from 2 to 3"): + tokenizer = MEITokenizer( + path.join( + BASE_DIR, + "cantusdata", + "test", + "core", + "helpers", + "mei_processing", + "test_mei_files", + "cdn-hsmu-m2149l4_001r.mei", + ), + min_ngram=2, + max_ngram=3, + ) + with self.subTest("Ngrams from 3 to 5"): + tokenizer = MEITokenizer( + path.join( + BASE_DIR, + "cantusdata", + "test", + "core", + "helpers", + "mei_processing", + 
"test_mei_files", + "cdn-hsmu-m2149l4_001r.mei", + ), + min_ngram=3, + max_ngram=5, + )