diff --git a/app/public/cantusdata/helpers/mei_processing/mei_parser.py b/app/public/cantusdata/helpers/mei_processing/mei_parser.py
index 525b2c146..1109b4937 100644
--- a/app/public/cantusdata/helpers/mei_processing/mei_parser.py
+++ b/app/public/cantusdata/helpers/mei_processing/mei_parser.py
@@ -14,7 +14,7 @@
 """
 
 from xml.etree import ElementTree as ET
-from typing import Tuple, Dict, List, TypedDict, Literal
+from typing import Tuple, Dict, List, TypedDict, Literal, Iterator, Optional
 
 # Mapping from pitch names to integer pitch class where C = 0
 PITCH_CLASS = {"c": 0, "d": 2, "e": 4, "f": 5, "g": 7, "a": 9, "b": 11}
@@ -45,14 +45,31 @@
 
 
 class Zone(TypedDict):
-    """A type for zones (bounding boxes) in MEI files"""
+    """A type for zones (bounding boxes) in MEI files.
+
+    coordinates: The location of the bounding box as
+        defined in MEI 'zone' elements. The coordinates
+        of the box are given as four integers designating,
+        in order:
+            - the x-coordinate of the upper-left corner of the box
+            - the y-coordinate of the upper-left corner of the box
+            - the x-coordinate of the lower-right corner of the box
+            - the y-coordinate of the lower-right corner of the box
+    rotate: The rotation of the zone in degrees.
+    """
 
     coordinates: CoordinatesType
     rotate: float
 
 
 class NeumeComponent(TypedDict):
-    """A type for neume components"""
+    """A type for neume components
+
+    pname: The pitch name of the neume component (i.e. "c", "d", "e", etc.)
+    octave: The octave of the neume component (as an integer, in scientific
+        pitch notation; e.g. middle c has octave "4")
+    bounding_box: The bounding box of the neume component
+    """
 
     pname: str
     octave: int
@@ -60,7 +77,22 @@ class NeumeComponent(TypedDict):
 
 
 class Neume(TypedDict):
-    """A type for neumes"""
+    """A type for neumes
+
+    neume_type: The name of the neume (i.e. "Punctum", "Pes", "Clivis", etc.)
+    neume_components: A list of neume components (containing pitch information)
+    intervals: A list of intervals (in semitones) between neume components.
+        In most cases, the length of this list is the same as the number of neume
+        components in the neume, with the final element being the interval between
+        the final component of the current neume and the first component of the
+        following neume. When there is no following neume (at the end of the MEI
+        file), the list is one element shorter than the number of neume components
+        (this final element is omitted).
+    contours: A list of contours ("u"[p], "d"[own], or "s"[tay]) for each interval.
+        As with the "intervals" list, the length of this list usually includes a final
+        element that stores the contour between the final component of the current neume
+        and the first component of the following neume.
+    """
 
     neume_type: str
     neume_components: List[NeumeComponent]
@@ -151,54 +183,161 @@ def _get_element_zone(self, element: ET.Element) -> Zone:
                 return zone
         return {"coordinates": (-1, -1, -1, -1), "rotate": 0.0}
 
-    def parse_syllable(self, syllable: ET.Element) -> Syllable:
+    def _parse_syllable_text(self, syl_elem: Optional[ET.Element]) -> SyllableText:
         """
-        Parse a syllable element from an MEI file into a dictionary.
+        Get the text of a syllable and its associated bounding box from
+        a 'syl' element.
 
-        :param syllable: An ElementTree element of a syllable
-        :return: Dictionary of syllable data
+        :param syl_elem: A 'syl' element from an MEI file (None if absent)
+        :return: Dictionary of syllable text data
         """
-        # <syl> elements contain the text of the syllable.
-        syl = syllable.find(f"{self.MEINS}syl")
-        text_dict: SyllableText
-        if syl and syl.text:
-            text_dict = {
-                "text": syl.text.strip(),
-                "bounding_box": self._get_element_zone(syl),
+        if syl_elem is not None and syl_elem.text:
+            text_dict: SyllableText = {
+                "text": syl_elem.text.strip(),
+                "bounding_box": self._get_element_zone(syl_elem),
             }
         else:
             text_dict = {
                 "text": "",
                 "bounding_box": {"coordinates": (-1, -1, -1, -1), "rotate": 0.0},
             }
-        # <neume> elements contain the pitches of the syllable.
-        neumes_list: List[Neume] = []
-        for neume in syllable.findall(f"{self.MEINS}neume"):
-            neume_components: List[NeumeComponent] = []
-            for neume_comp in neume.findall(f"{self.MEINS}nc"):
-                pname = neume_comp.get("pname")
-                octave = neume_comp.get("oct")
-                if pname and octave:
-                    neume_components.append(
-                        {
-                            "pname": pname,
-                            "octave": int(octave),
-                            "bounding_box": self._get_element_zone(neume_comp),
-                        }
-                    )
-            neume_type, intervals, contours = analyze_neume(neume_components)
-            neume_dict: Neume = {
-                "neume_type": neume_type,
-                "neume_components": neume_components,
-                "intervals": intervals,
-                "contours": contours,
+        return text_dict
+
+    def _parse_neume_component(
+        self, neume_comp: ET.Element
+    ) -> Optional[NeumeComponent]:
+        """
+        Parses an 'nc' element into a NeumeComponent dictionary.
+
+        :param neume_comp: An 'nc' element from an MEI file
+        :return: A dictionary of neume component data (see NeumeComponent for structure)
+        """
+        pname = neume_comp.get("pname")
+        octave = neume_comp.get("oct")
+        if pname and octave:
+            return {
+                "pname": pname,
+                "octave": int(octave),
+                "bounding_box": self._get_element_zone(neume_comp),
             }
-            neumes_list.append(neume_dict)
-        syllable_dict: Syllable = {
-            "text": text_dict,
-            "neumes": neumes_list,
+        return None
+
+    def _parse_neume(
+        self,
+        neume_components: List[ET.Element],
+        next_neume_component: Optional[ET.Element],
+    ) -> Neume:
+        """
+        Gets a Neume dictionary from a series of 'nc' elements (including
+        the first neume component of the following neume, if it exists)
+
+        :param neume_components: A list of 'nc' elements in a given 'neume' element
+        :param next_neume_component: The first 'nc' element of the next neume
+        :return: A neume dictionary (see Neume for structure)
+        """
+        parsed_neume_components: List[NeumeComponent] = []
+        for neume_comp in neume_components:
+            parsed_neume_component: Optional[NeumeComponent] = (
+                self._parse_neume_component(neume_comp)
+            )
+            if parsed_neume_component:
+                parsed_neume_components.append(parsed_neume_component)
+        neume_type, intervals, contours = analyze_neume(parsed_neume_components)
+        # If the first neume component of the following neume can be parsed,
+        # add the interval and contour between the final neume component of
+        # the current neume and the first neume component of the following neume.
+        if next_neume_component is not None:
+            parsed_next_neume_comp: Optional[NeumeComponent] = (
+                self._parse_neume_component(next_neume_component)
+            )
+            # Skip when this neume has no parseable components; there is
+            # no final component to measure the interval from.
+            if parsed_next_neume_comp and parsed_neume_components:
+                last_neume_comp = parsed_neume_components[-1]
+                intervals.append(
+                    get_interval_between_neume_components(
+                        last_neume_comp, parsed_next_neume_comp
+                    )
+                )
+                contours.append(get_contour_from_interval(intervals[-1]))
+        parsed_neume: Neume = {
+            "neume_type": neume_type,
+            "neume_components": parsed_neume_components,
+            "intervals": intervals,
+            "contours": contours,
         }
-        return syllable_dict
+        return parsed_neume
+
+    def _neume_iterator(
+        self,
+        neumes: List[ET.Element],
+        next_syllable_1st_nc: Optional[ET.Element],
+    ) -> Iterator[Tuple[List[ET.Element], Optional[ET.Element]]]:
+        """
+        Convenience generator for iterating over a syllable's neumes.
+        At each iteration step, the generator provides the 'nc' elements
+        of the current neume and the first 'nc' element of the next neume
+        (if it exists) so that the interval and contour between the final
+        neume of the current syllable and the first neume of the next syllable
+        can be computed.
+
+        :param neumes: A list of 'neume' elements in a syllable
+        :param next_syllable_1st_nc: The first 'nc' element of the next syllable
+
+        The generator yields a tuple of:
+            - The 'nc' elements of the current neume
+            - The first 'nc' element of the next neume (if it exists)
+        """
+        neume_iterator = iter(neumes)
+        current_neume = next(neume_iterator, None)
+        # Compare against None explicitly: an ET.Element with no children is
+        # falsy, so a truthiness test would stop early at an empty 'neume'.
+        while current_neume is not None:
+            neume_components = current_neume.findall(f"{self.MEINS}nc")
+            next_neume = next(neume_iterator, None)
+            if next_neume is not None:
+                next_neume_component = next_neume.find(f"{self.MEINS}nc")
+            else:
+                next_neume_component = next_syllable_1st_nc
+            yield neume_components, next_neume_component
+            current_neume = next_neume
+
+    def _syllable_iterator(
+        self,
+    ) -> Iterator[Tuple[Optional[ET.Element], List[ET.Element], Optional[ET.Element]]]:
+        """
+        Convenience generator for iterating over syllables in an MEI file. At each
+        iteration step, the generator provides all data for the current syllable
+        and the first neume of the next syllable (if it exists) so that the interval
+        and contour between the final neume of the current syllable and the first
+        neume of the next syllable can be computed.
+
+        The generator yields a tuple of:
+            - The 'syl' element of the current syllable (containing text information),
+            if it exists.
+            - A list of 'neume' elements for the current syllable (containing musical
+            information), if they exist.
+            - The first 'nc' element (neume component) of the next syllable (if it exists).
+            If there is no next syllable, this value is None.
+        """
+        syllable_iterator = self.mei.iter(f"{self.MEINS}syllable")
+        current_syllable = next(syllable_iterator, None)
+        # Compare against None explicitly: an ET.Element with no children is
+        # falsy, so a truthiness test would drop every syllable after an empty one.
+        while current_syllable is not None:
+            current_syl = current_syllable.find(f"{self.MEINS}syl")
+            current_neumes = current_syllable.findall(f"{self.MEINS}neume")
+            next_syllable = next(syllable_iterator, None)
+            next_neume = (
+                next_syllable.find(f"{self.MEINS}neume")
+                if next_syllable is not None
+                else None
+            )
+            next_nc = (
+                next_neume.find(f"{self.MEINS}nc") if next_neume is not None else None
+            )
+            yield current_syl, current_neumes, next_nc
+            current_syllable = next_syllable
 
     def parse_mei(self) -> List[Syllable]:
         """
@@ -206,9 +345,18 @@ def parse_mei(self) -> List[Syllable]:
 
         :return: A list of syllables
         """
-        syllables = []
-        for syllable in self.mei.iter(f"{self.MEINS}syllable"):
-            syllable_dict = self.parse_syllable(syllable)
+        syllables: List[Syllable] = []
+        for text_elem, syllable_neumes, next_neume_comp in self._syllable_iterator():
+            syllable_text: SyllableText = self._parse_syllable_text(text_elem)
+            neumes_list: List[Neume] = []
+            for neume, next_neume_1st_nc in self._neume_iterator(
+                syllable_neumes, next_neume_comp
+            ):
+                neumes_list.append(self._parse_neume(neume, next_neume_1st_nc))
+            syllable_dict: Syllable = {
+                "text": syllable_text,
+                "neumes": neumes_list,
+            }
             syllables.append(syllable_dict)
         return syllables
 
diff --git a/app/public/cantusdata/test/core/helpers/mei_processing/test_mei_parser.py b/app/public/cantusdata/test/core/helpers/mei_processing/test_mei_parser.py
index 2cb9964ac..5cc39684d 100644
--- a/app/public/cantusdata/test/core/helpers/mei_processing/test_mei_parser.py
+++ b/app/public/cantusdata/test/core/helpers/mei_processing/test_mei_parser.py
@@ -6,33 +6,35 @@
     get_contour_from_interval,
     get_interval_between_neume_components,
     analyze_neume,
+    NeumeComponent,
+    Zone,
 )
 
 
 class MEIParserTestCase(TestCase):
-    default_bounding_box = {"coordinates": (-1, -1, -1, -1), "rotate": 0.0}
-    neume_component_g3 = {
+    default_bounding_box: Zone = {"coordinates": (-1, -1, -1, -1), "rotate": 0.0}
+    neume_component_g3: NeumeComponent = {
         "pname": "g",
         "octave": 3,
         "bounding_box": default_bounding_box,
     }
-    neume_component_d4 = {
+    neume_component_d4: NeumeComponent = {
         "pname": "d",
         "octave": 4,
         "bounding_box": default_bounding_box,
     }
-    neume_component_d3 = {
+    neume_component_d3: NeumeComponent = {
         "pname": "d",
         "octave": 3,
         "bounding_box": default_bounding_box,
     }
-    neume_component_b2 = {
+    neume_component_b2: NeumeComponent = {
         "pname": "b",
         "octave": 2,
         "bounding_box": default_bounding_box,
     }
 
-    def test_mei_parser(self):
+    def test_mei_parser(self) -> None:
         parser = MEIParser(
             path.join(
                 BASE_DIR,
@@ -47,15 +49,126 @@ def test_mei_parser(self):
         )
         zones = parser.zones
         syllables = parser.syllables
-        self.assertEqual(len(zones), 324)
-        self.assertEqual(len(syllables), 116)
+        with self.subTest("Test number of zones"):
+            self.assertEqual(len(zones), 324)
+        with self.subTest("Test number of syllables"):
+            self.assertEqual(len(syllables), 116)
+        with self.subTest("Test sample zone #1"):
+            zone_key = "#m-bd6dbd3e-46ec-4244-bfb9-b22aae69116d"
+            expected_zone = {
+                "coordinates": (1191, 4482, 5276, 4791),
+                "rotate": -0.180727,
+            }
+            self.assertIn(zone_key, zones)
+            self.assertEqual(zones[zone_key], expected_zone)
+        with self.subTest("Test sample zone #2"):
+            zone_key = "#zone-0000001876581719"
+            expected_zone = {
+                "coordinates": (4933, 7834, 5265, 8034),
+                "rotate": 0.0,
+            }
+            self.assertIn(zone_key, zones)
+            self.assertEqual(zones[zone_key], expected_zone)
+        with self.subTest("Test first syllable"):
+            # First and second syllables:
+            ##
+            ## Ec
+            ##
+            ##
+            ##
+            ##
+            ##
+            ## ce
+            ##
+            ##
+            ##
+            ##
+            ##
+            # Relevant zones (for first syllable and the single neume component in that syllable):
+            ##
+            ##
+            expected_first_syllable = {
+                "text": {
+                    "text": "Ec",
+                    "bounding_box": {
+                        "coordinates": (2426, 2451, 2639, 2651),
+                        "rotate": 0.0,
+                    },
+                },
+                "neumes": [
+                    {
+                        "neume_type": "Punctum",
+                        "neume_components": [
+                            {
+                                "pname": "d",
+                                "octave": 3,
+                                "bounding_box": {
+                                    "coordinates": (2608, 2399, 2678, 2448),
+                                    "rotate": 0.0,
+                                },
+                            }
+                        ],
+                        "intervals": [0],
+                        "contours": ["s"],
+                    }
+                ],
+            }
+            self.assertEqual(syllables[0], expected_first_syllable)
+        with self.subTest("Test last syllable"):
+            # Last syllable:
+            ##
+            ## gil
+            ##
+            ##
+            ##
+            ##
+            ##
+            # Relevant zones (for last syllable and the two neume components in that syllable):
+            ##
+            ##
+            ##
+            expected_last_syllable = {
+                "text": {
+                    "text": "gil",
+                    "bounding_box": {
+                        "coordinates": (4933, 7834, 5265, 8034),
+                        "rotate": 0.0,
+                    },
+                },
+                "neumes": [
+                    {
+                        "neume_type": "Clivis",
+                        "neume_components": [
+                            {
+                                "pname": "e",
+                                "octave": 2,
+                                "bounding_box": {
+                                    "coordinates": (5037, 7724, 5108, 7774),
+                                    "rotate": 0.0,
+                                },
+                            },
+                            {
+                                "pname": "d",
+                                "octave": 2,
+                                "bounding_box": {
+                                    "coordinates": (5104, 7774, 5175, 7824),
+                                    "rotate": 0.0,
+                                },
+                            },
+                        ],
+                        "intervals": [-2],
+                        "contours": ["d"],
+                    }
+                ],
+            }
+            self.assertEqual(syllables[-1], expected_last_syllable)
 
-    def test_get_contour_from_interval(self):
+    def test_get_contour_from_interval(self) -> None:
         self.assertEqual(get_contour_from_interval(0), "s")
         self.assertEqual(get_contour_from_interval(1), "u")
         self.assertEqual(get_contour_from_interval(-3), "d")
 
-    def test_get_interval_between_neume_components(self):
+    def test_get_interval_between_neume_components(self) -> None:
         with self.subTest("Interval test 1"):
             self.assertEqual(
                 get_interval_between_neume_components(
@@ -85,7 +198,7 @@ def test_get_interval_between_neume_components(self):
                 -8,
             )
 
-    def test_analyze_neume(self):
+    def test_analyze_neume(self) -> None:
         neume_components_1 = [self.neume_component_d3, self.neume_component_g3]
         neume_components_2 = [
             self.neume_component_d3,