Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update MEI parsing and creation of OMR search tokens #845

Merged
merged 14 commits into from
May 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 65 additions & 41 deletions app/public/cantusdata/helpers/mei_processing/mei_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,20 @@
between two neume components.
- get_contour_from_interval: Computes the contour of an interval.
- analyze_neume: Analyzes a neume (a list of neume components) to determine its
neume type, its intervals, and its contour.
neume name, its intervals, and its contour.

Defines associated types for the data structures used by the parser.
"""

from typing import Tuple, Dict, List, Iterator, Optional
from lxml import etree
from lxml import etree # pylint: disable=no-name-in-module
from .mei_parsing_types import (
Zone,
SyllableText,
NeumeComponentElementData,
NeumeComponent,
ContourType,
NeumeType,
NeumeName,
Neume,
Syllable,
)
Expand All @@ -31,24 +31,24 @@
PITCH_CLASS = {"c": 0, "d": 2, "e": 4, "f": 5, "g": 7, "a": 9, "b": 11}

# Mapping from neume contours to neume names
NEUME_GROUPS: Dict[str, NeumeType] = {
"": "Punctum",
"u": "Pes",
"d": "Clivis",
"uu": "Scandicus",
"ud": "Torculus",
"du": "Porrectus",
"s": "Distropha",
"ss": "Tristopha",
"sd": "Pressus",
"dd": "Climacus",
"ddu": "Climacus resupinus",
"udu": "Torculus resupinus",
"dud": "Porrectus flexus",
"udd": "Pes subpunctis",
"uud": "Scandicus flexus",
"uudd": "Scandicus subpunctis",
"dudd": "Porrectus subpunctis",
NEUME_GROUPS: Dict[str, NeumeName] = {
"": "punctum",
"u": "pes",
"d": "clivis",
"uu": "scandicus",
"ud": "torculus",
"du": "porrectus",
"r": "distropha",
"rr": "tristopha",
"rd": "pressus",
"dd": "climacus",
"ddu": "climacus_resupinus",
"udu": "torculus_resupinus",
"dud": "porrectus_flexus",
"udd": "pes_subpunctis",
"uud": "scandicus_flexus",
"uudd": "scandicus_subpunctis",
"dudd": "porrectus_subpunctis",
}


Expand All @@ -75,6 +75,7 @@ class MEIParser:
def __init__(self, mei_file: str):
self.mei_file = mei_file
self.mei = etree.parse(self.mei_file)
self._remove_empty_neumes_and_syllables()
self.zones = self.parse_zones()
self.syllables = self.parse_mei()

Expand Down Expand Up @@ -182,7 +183,7 @@ def _parse_neume(
)
if parsed_neume_component:
parsed_nc_elements.append(parsed_neume_component)
neume_type, intervals, contours = analyze_neume(parsed_nc_elements)
neume_name, intervals, contours = analyze_neume(parsed_nc_elements)
# If the first neume component of the next syllable can be parsed,
# add the interval and contour between the final neume component of
# the current syllable and the first neume component of the next syllable.
Expand All @@ -193,7 +194,7 @@ def _parse_neume(
if parsed_next_neume_comp:
last_neume_comp = parsed_nc_elements[-1]
intervals.append(
get_interval_between_neume_components(
get_semitones_between_neume_components(
last_neume_comp, parsed_next_neume_comp
)
)
Expand All @@ -211,12 +212,13 @@ def _parse_neume(
"pname": nc["pname"],
"octave": nc["octave"],
"bounding_box": nc["bounding_box"],
"interval": intervals[i] if i < len(intervals) else None,
"semitone_interval": intervals[i] if i < len(intervals) else None,
"contour": contours[i] if i < len(contours) else None,
"system": neume_system,
}
)
parsed_neume: Neume = {
"neume_type": neume_type,
"neume_name": neume_name,
"neume_components": parsed_neume_components,
"bounding_box": combined_bounding_box,
"system": neume_system,
Expand Down Expand Up @@ -323,6 +325,26 @@ def _syllable_iterator(
system += 1
current_elem = next(elem_iterator, None)

def _remove_empty_neumes_and_syllables(self) -> None:
    """
    Strip empty neumes and empty syllables from the parsed MEI tree.

    For a while, Rodan was creating invalid MEI files that contained
    empty neumes (i.e., neumes with no neume components) and empty
    syllables (i.e., syllables with no neumes or only empty neumes).
    This method removes those elements from ``self.mei`` in place; it is
    a separate preprocessing step so that it can be removed once the
    base MEI files are corrected.

    Neumes are pruned before syllables so that a syllable whose only
    neumes were empty is itself treated as empty and removed.
    """
    nc_tag = f"{self.MEINS}nc"
    neume_tag = f"{self.MEINS}neume"
    syllable_tag = f"{self.MEINS}syllable"
    # (element tag, tag of the child it must contain), in pruning order.
    pruning_passes = (
        (neume_tag, nc_tag),
        (syllable_tag, neume_tag),
    )
    for tag, required_child_tag in pruning_passes:
        # Snapshot the matches before removing anything, so that the tree
        # is never mutated while an iterator is still walking it.
        for element in list(self.mei.iter(tag)):
            # find() only inspects direct children and stops at the first
            # match, which is all that is needed to test for emptiness.
            if element.find(required_child_tag) is not None:
                continue
            parent = element.getparent()
            # An element without a parent is the document root and cannot
            # be detached from a parent, so it is left in place.
            if parent is not None:
                parent.remove(element)

def parse_mei(self) -> List[Syllable]:
"""
Parses the MEI file into a list of syllables.
Expand Down Expand Up @@ -351,7 +373,7 @@ def parse_mei(self) -> List[Syllable]:
return syllables


def get_interval_between_neume_components(
def get_semitones_between_neume_components(
neume_component_1: NeumeComponentElementData,
neume_component_2: NeumeComponentElementData,
) -> int:
Expand All @@ -369,8 +391,8 @@ def get_interval_between_neume_components(
try:
pc1 = PITCH_CLASS[neume_component_1["pname"]]
pc2 = PITCH_CLASS[neume_component_2["pname"]]
except KeyError:
raise ValueError("Invalid pitch name in neume component.")
except KeyError as err:
raise ValueError("Invalid pitch name in neume component.") from err
# In MIDI note numbers, C0 = 12.
pitch_1 = pc1 + (12 * (neume_component_1["octave"] + 1))
pitch_2 = pc2 + (12 * (neume_component_2["octave"] + 1))
Expand All @@ -382,34 +404,36 @@ def get_contour_from_interval(interval: int) -> ContourType:
Compute the contour of an interval.

:param interval: The size of the interval in semitones
:return: The contour of the interval ("u"[p], "d"[own], or "s"[tay])
:return: The contour of the interval ("u"[p], "d"[own], or "r"[epeat])
"""
if interval < 0:
return "d"
if interval > 0:
return "u"
return "s"
return "r"


def analyze_neume(
neume: List[NeumeComponentElementData],
) -> Tuple[NeumeType, List[int], List[ContourType]]:
) -> Tuple[NeumeName, List[int], List[ContourType]]:
"""
Analyze a neume (a list of neume components) to determine:
- Neume type
- Neume intervals
- Neume contour
- The neume type (e.g., punctum, pes, clivis, etc.)
- The intervals in the neume in semitones
- The contour of the neume

:param neume: A list of neume components (a list of NeumeComponentsType dictionaries)
:return: A tuple of information about the neume:
- Neume type (str)
- Neume intervals (list of ints)
- Neume contour (list of "u"[p], "d"[own], or "s"[tay])
- Neume intervals in semitones (list of ints)
- Neume contour (list of "u"[p], "d"[own], or "r"[epeat])
"""
intervals: List[int] = [
get_interval_between_neume_components(nc1, nc2)
semitone_intervals: List[int] = [
get_semitones_between_neume_components(nc1, nc2)
for nc1, nc2 in zip(neume[:-1], neume[1:])
]
contours: List[ContourType] = [get_contour_from_interval(i) for i in intervals]
neume_type: NeumeType = NEUME_GROUPS.get("".join(contours), "Compound")
return neume_type, intervals, contours
contours: List[ContourType] = [
get_contour_from_interval(i) for i in semitone_intervals
]
neume_type: NeumeName = NEUME_GROUPS.get("".join(contours), "compound")
return neume_type, semitone_intervals, contours
92 changes: 66 additions & 26 deletions app/public/cantusdata/helpers/mei_processing/mei_parsing_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Contains type definitions used in the MEI parsing process.
"""

from typing import Tuple, TypedDict, Literal, List, Optional
from typing import Tuple, TypedDict, Literal, List, Optional, NotRequired
from typing_extensions import TypeAlias

# A type for coordinates of bounding boxes
Expand Down Expand Up @@ -30,26 +30,26 @@ class Zone(TypedDict):
rotate: float


ContourType = Literal["u", "d", "s"]
NeumeType = Literal[
"Punctum",
"Pes",
"Clivis",
"Scandicus",
"Torculus",
"Porrectus",
"Distropha",
"Tristopha",
"Pressus",
"Climacus",
"Climacus resupinus",
"Torculus resupinus",
"Porrectus flexus",
"Pes subpunctis",
"Scandicus flexus",
"Scandicus subpunctis",
"Porrectus subpunctis",
"Compound",
ContourType = Literal["u", "d", "r"]
NeumeName = Literal[
"punctum",
"pes",
"clivis",
"scandicus",
"torculus",
"porrectus",
"distropha",
"tristopha",
"pressus",
"climacus",
"climacus_resupinus",
"torculus_resupinus",
"porrectus_flexus",
"pes_subpunctis",
"scandicus_flexus",
"scandicus_subpunctis",
"porrectus_subpunctis",
"compound",
]


Expand All @@ -74,27 +74,29 @@ class NeumeComponent(NeumeComponentElementData):
"""A type extending NeumeComponentElementData with interval and contour information.


interval: The interval (in semitones) between the neume component and the
semitone_interval: The interval in semitones between the neume component and the
following neume component. If there is no following neume component,
this is None.
contour: The contour ("u"[p], "d"[own], or "s"[tay]) of 'interval'. If there is no
contour: The contour ("u"[p], "d"[own], or "r"[epeat]) of 'semitone_interval'. If there is no
following neume component, this is None.
system: The system number that the neume component is on
"""

interval: Optional[int]
semitone_interval: Optional[int]
contour: Optional[ContourType]
system: int


class Neume(TypedDict):
"""A type for neumes

neume_type: The name of the neume (ie. "Punctum", "Pes", "Clivis", etc.)
neume_name: The name of the neume (e.g., "punctum", "pes", "clivis", etc.)
neume_components: A list of neume components (containing pitch information)
bounding_box: The bounding box of the neume
system: The system number that the neume is on
"""

neume_type: NeumeType
neume_name: NeumeName
neume_components: List[NeumeComponent]
bounding_box: Zone
system: int
Expand All @@ -112,3 +114,41 @@ class Syllable(TypedDict):

text: SyllableText
neumes: List[Neume]


class NgramDocument(TypedDict):
    """
    A generic type for documents containing n-grams
    of information extracted from MEI files.

    ngram_unit: The unit of the n-gram.
        NOTE(review): no key with this name is declared on this type;
        confirm whether the key should be added or this entry dropped.
    location: The location of the n-gram in the MEI file (MEI Zones
        converted to JSON strings according to
        bounding_box_utils.stringify_bounding_boxes)
    pitch_names: A string containing the pitch names of the neume components
        in the n-gram, separated by underscores.
    contour: A string containing the contours of the neume components in the
        n-gram, separated by underscores.
    semitone_intervals: A string containing the semitone intervals between
        the neume components in the n-gram, separated by underscores.
    neume_names: A string containing the names of the neumes in the n-gram,
        separated by underscores. This field is not required, and is only
        present when the n-gram contains complete neumes.

    The following may be part of an NgramDocument, but are optional because
    they will be added when the document is indexed:
    manuscript_id: The ID of the manuscript the n-gram belongs to.
    folio: The number of the folio on which the n-gram exists.
    id: The unique ID of the document (corresponds to solr schema's id field)
    type: The type of the document (corresponds to solr schema's type field).
        When present, its only permitted value is "omr_ngram".
    image_uri: Presumably the URI of the image associated with the n-gram's
        folio. NOTE(review): not described in the original documentation;
        confirm the meaning against the indexing code.
    """

    location: str
    pitch_names: str
    contour: str
    semitone_intervals: str
    neume_names: NotRequired[str]
    manuscript_id: NotRequired[str]
    folio: NotRequired[str]
    id: NotRequired[str]
    type: NotRequired[Literal["omr_ngram"]]
    image_uri: NotRequired[str]
Loading
Loading