Skip to content

Commit

Permalink
Interim commit
Browse files Browse the repository at this point in the history
  • Loading branch information
dchiller committed Mar 25, 2024
1 parent 8f8ff68 commit a93b3ec
Show file tree
Hide file tree
Showing 4 changed files with 197 additions and 69 deletions.
29 changes: 29 additions & 0 deletions app/public/cantusdata/helpers/mei_processing/bounding_box_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,32 @@ def combine_bounding_boxes(bounding_boxes: List[Tuple[Zone, int]]) -> List[Zone]
combined_boxes.append(combine_bounding_boxes_single_system(boxes))

return combined_boxes


def stringify_bounding_boxes(bounding_boxes: List[Zone]) -> str:
    """
    Convert a list of bounding boxes to a JSON string for indexing.

    The string encodes a JSON array of objects, each of which
    represents a bounding box:
    [
        {"ulx": ,     # X-coordinate of upper left corner of box
         "uly": ,     # Y-coordinate of upper left corner of box
         "width": ,   # Width of the box
         "height": }, # Height of the box
        ...
    ]

    :param bounding_boxes: A list of bounding boxes (Zone type)
    :return: A JSON string representation of the bounding boxes
    """
    # Local import keeps this helper self-contained within the module.
    import json

    bbox_dicts = []
    for box in bounding_boxes:
        # Zone coordinates are (ulx, uly, lrx, lry); y increases downward,
        # so lry >= uly and height is lry - uly (not uly - lry, which
        # produced negative heights).
        ulx, uly, lrx, lry = box["coordinates"]
        bbox_dicts.append(
            {
                "ulx": ulx,
                "uly": uly,
                "width": lrx - ulx,
                "height": lry - uly,
            }
        )
    # json.dumps emits valid JSON (double-quoted keys); str() on dicts
    # would emit Python repr with single quotes, which is not JSON.
    return json.dumps(bbox_dicts)
155 changes: 86 additions & 69 deletions app/public/cantusdata/helpers/mei_processing/mei_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,39 +6,49 @@

from typing import List, Iterator, Any, TypedDict, Literal
from .mei_parser import MEIParser
from .mei_parsing_types import ContourType
from .mei_parsing_types import Neume, NeumeComponent
from .bounding_box_utils import combine_bounding_boxes, stringify_bounding_boxes

NgramUnitType = Literal["neume", "neume_component"]

class MusicalSequencesDict(TypedDict):

class NgramDocument(TypedDict):
"""
Type definition for a dictionary containing a sequence of musical data.
A generic type for documents containing n-grams
of information extracted from MEI files.
ngram_unit: The unit of the n-gram
location: The location of the n-gram in the MEI file (MEI Zones
converted to JSON strings according to bounding_box_utils.stringify_bounding_boxes)
"""

neume_names: List[str]
pitch_names: List[str]
intervals: List[str]
contours: List[ContourType]
ngram_unit: NgramUnitType
location: str


class NgramDocument(TypedDict):
id: str
type: str
location: str
class NeumeNgramDocument(NgramDocument):
"""
A type for documents containing n-grams of neume-level information.
neume_names: A string containing the names of the neumes in the n-gram,
separated by underscores.
"""

class NeumeNamesNgramDocument(NgramDocument):
neume_names: str


class PitchNamesNgramDocument(NgramDocument):
pitch_names: str
class NeumeComponentNgramDocument(NgramDocument):
"""
A type for documents containing n-grams of neume component-level information.
pitch_names: A string containing the pitch names of the neume components in the n-gram,
in the form of two-character strings (pitch name and octave), separated by underscores.
intervals: A string containing the intervals between the neume components in the n-gram.
contours: A string containing the contours of the neume components in the n-gram.
"""

class IntervalsNgramDocument(NgramDocument):
pitch_names: str
intervals: str


class ContoursNgramDocument(NgramDocument):
contours: str


Expand All @@ -61,68 +71,75 @@ def generate_ngrams(sequence: List[Any], min_n: int, max_n: int) -> Iterator[Lis

class MEITokenizer(MEIParser):
"""
An MEITokenizer object is initialized with an MEI file and a set of
parameters that define how the MEI file should be tokenized. These
parameters are:
- min_ngram: The minimum length of n-grams to generate.
- max_ngram: The maximum length of n-grams to generate.
"""

def __init__(
self, mei_file: str, min_neume_ngram: int, max_neume_ngram: int
) -> None:
def __init__(self, mei_file: str, min_ngram: int, max_ngram: int) -> None:
super().__init__(mei_file)
self.min_neume_ngram = min_neume_ngram
self.max_neume_ngram = max_neume_ngram
self.sequences = self._get_musical_sequences()
self.min_ngram = min_ngram
self.max_ngram = max_ngram

def _get_musical_sequences(self) -> MusicalSequencesDict:
def get_neume_ngram_docs(self) -> List[NeumeNgramDocument]:
"""
Get sequences of musical data from the entire MEI file.
:return: A dictionary containing the following sequences of
musical data (all sequences in the form of lists of strings):
- "neume_names": A list of neume names in the file.
- "pitch_names": A list of pitch names in the file.
- "intervals": A list of intervals in the file.
- "contours": A list of contours in the file.
- "semitones": A list of semitones in the file.
Generate neume-level documents for search, containing
n-grams of neume names.
:return: A list of dictionaries containing the n-grams
of neume names.
"""
neume_names = []
pitch_names = []
intervals = []
contours = []
neumes_sequence: List[Neume] = []
for syllable in self.syllables:
for neume in syllable["neumes"]:
neume_names.append(neume["neume_type"])
contours.extend(neume["contours"])
intervals.extend([str(interval) for interval in neume["intervals"]])
for component in neume["neume_components"]:
pitch_names.append(component["pname"] + str(component["octave"]))
return {
"neume_names": neume_names,
"pitch_names": pitch_names,
"intervals": intervals,
"contours": contours,
}

def _get_ngrams(
self, ngram_type: Literal["neume_names", "pitch_names", "intervals", "contours"]
) -> List[str]:
neumes_sequence.extend(syllable["neumes"])
neume_documents: List[NeumeNgramDocument] = []
for ngram in generate_ngrams(neumes_sequence, self.min_ngram, self.max_ngram):
bounding_boxes = [
(neume["bounding_box"], neume["system"]) for neume in ngram
]
document_location = combine_bounding_boxes(bounding_boxes)
neume_names = "_".join([neume["neume_type"] for neume in ngram])
neume_documents.append(
{
"ngram_unit": "neume",
"location": stringify_bounding_boxes(document_location),
"neume_names": neume_names,
}
)
return neume_documents

def get_neume_component_ngram_docs(self) -> List[NeumeComponentNgramDocument]:
"""
Get n-grams of a particular type from the MEI file.
Generate neume component-level documents for search, containing
n-grams of pitch names, intervals, and contours.
:param ngram_type: The type of n-gram to generate. Must be one of
"neume_names", "pitch_names", "intervals", or "contours".
:return: A list of n-grams represented as strings of space-separated
items of the specified type.
:return: A list of dictionaries containing the n-grams
of pitch names, intervals, and contours.
"""
return [
" ".join(ngram)
for ngram in generate_ngrams(
self.sequences[ngram_type],
self.min_neume_ngram,
self.max_neume_ngram,
neume_components: List[NeumeComponent] = []
for syllable in self.syllables:
for neume in syllable["neumes"]:
neume_components.extend(neume["neume_components"])
neume_component_documents: List[NeumeComponentNgramDocument] = []
for ngram in generate_ngrams(
neume_components,
self.min_ngram,
self.max_ngram,
):
pitch_names = "_".join([comp["pname"] for comp in ngram])
intervals = "".join([str(interval) for interval in neume["intervals"]])
contours = "".join([contour for contour in neume["contours"]])
bounding_boxes = [(comp["bounding_box"], neume["system"]) for comp in ngram]
document_location = combine_bounding_boxes(bounding_boxes)
neume_component_documents.append(
{
"ngram_unit": "neume_component",
"location": stringify_bounding_boxes(document_location),
"pitch_names": pitch_names,
"intervals": intervals,
"contours": contours,
}
)
]
return neume_component_documents
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from cantusdata.helpers.mei_processing.bounding_box_utils import (
combine_bounding_boxes,
combine_bounding_boxes_single_system,
stringify_bounding_boxes,
)
from cantusdata.helpers.mei_processing.mei_parsing_types import Zone

Expand Down Expand Up @@ -34,3 +35,15 @@ def test_combine_bounding_boxes(self) -> None:
{"coordinates": (8, 8, 9, 10), "rotate": 0.0},
]
self.assertEqual(combined_boxes, expected_combined_boxes)

def test_stringify_bounding_boxes(self) -> None:
    """Bounding boxes serialize to a JSON array of box objects
    with width = lrx - ulx and height = lry - uly."""
    bounding_boxes: List[Zone] = [
        {"coordinates": (0, 0, 1, 1), "rotate": 0},
        {"coordinates": (2, 3, 5, 8), "rotate": 0.123},
    ]
    stringified_boxes = stringify_bounding_boxes(bounding_boxes)
    # Expected string uses json.dumps default separators (a space
    # after every ":" and ","); the previous expectation mixed
    # spacings and matched no serializer's actual output.
    expected_stringified_boxes = (
        '[{"ulx": 0, "uly": 0, "width": 1, "height": 1}, '
        '{"ulx": 2, "uly": 3, "width": 3, "height": 5}]'
    )
    self.assertEqual(stringified_boxes, expected_stringified_boxes)
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from unittest import TestCase
from os import path
from cantusdata.settings import BASE_DIR
from cantusdata.helpers.mei_processing.mei_tokenizer import (
MEITokenizer,
generate_ngrams,
)


class MEITokenizerTestCase(TestCase):
    """Tests for the generate_ngrams helper and the MEITokenizer class."""

    def test_generate_ngrams(self) -> None:
        """N-grams are yielded grouped by length: every n-gram of the
        minimum length first, then each successive length up to the
        maximum, preserving sequence order within each length."""
        with self.subTest("Ngrams from 2 to 3"):
            sequence = [1, 2, 3, 4, 5]
            min_ngram = 2
            max_ngram = 3
            ngrams = list(generate_ngrams(sequence, min_ngram, max_ngram))
            self.assertEqual(
                ngrams,
                [[1, 2], [2, 3], [3, 4], [4, 5], [1, 2, 3], [2, 3, 4], [3, 4, 5]],
            )
        with self.subTest("Ngrams from 3 to 5"):
            sequence = [1, 2, 3, 4, 5]
            min_ngram = 3
            max_ngram = 5
            ngrams = list(generate_ngrams(sequence, min_ngram, max_ngram))
            self.assertEqual(
                ngrams,
                [
                    [1, 2, 3],
                    [2, 3, 4],
                    [3, 4, 5],
                    [1, 2, 3, 4],
                    [2, 3, 4, 5],
                    [1, 2, 3, 4, 5],
                ],
            )

    def test_mei_tokenizer(self) -> None:
        """Smoke test: MEITokenizer constructs from a real MEI file and
        stores its n-gram bounds.
        TODO(review): no behavior is asserted on the generated n-gram
        documents yet; extend once expected document values for
        cdn-hsmu-m2149l4_001r.mei are established."""
        # Path duplicated in the original; hoisted so both subtests
        # exercise the same fixture.
        mei_path = path.join(
            BASE_DIR,
            "cantusdata",
            "test",
            "core",
            "helpers",
            "mei_processing",
            "test_mei_files",
            "cdn-hsmu-m2149l4_001r.mei",
        )
        with self.subTest("Ngrams from 2 to 3"):
            tokenizer = MEITokenizer(mei_path, min_ngram=2, max_ngram=3)
            self.assertEqual(tokenizer.min_ngram, 2)
            self.assertEqual(tokenizer.max_ngram, 3)
        with self.subTest("Ngrams from 3 to 5"):
            tokenizer = MEITokenizer(mei_path, min_ngram=3, max_ngram=5)
            self.assertEqual(tokenizer.min_ngram, 3)
            self.assertEqual(tokenizer.max_ngram, 5)

0 comments on commit a93b3ec

Please sign in to comment.