Skip to content

Commit

Permalink
Interim commit
Browse files Browse the repository at this point in the history
  • Loading branch information
dchiller committed Mar 25, 2024
1 parent 8f8ff68 commit a93b3ec
Show file tree
Hide file tree
Showing 4 changed files with 197 additions and 69 deletions.
29 changes: 29 additions & 0 deletions app/public/cantusdata/helpers/mei_processing/bounding_box_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,32 @@ def combine_bounding_boxes(bounding_boxes: List[Tuple[Zone, int]]) -> List[Zone]
combined_boxes.append(combine_bounding_boxes_single_system(boxes))

return combined_boxes


def stringify_bounding_boxes(bounding_boxes: List[Zone]) -> str:
    """
    Convert a list of bounding boxes to a JSON string for indexing.

    The string encodes a JSON array of objects, each of which
    represents a bounding box:
    [
        {"ulx": ,     # X-coordinate of upper left corner of box
         "uly": ,     # Y-coordinate of upper left corner of box
         "width": ,   # Width of the box
         "height": }, # Height of the box
        ...
    ]

    :param bounding_boxes: A list of bounding boxes (Zone type)
    :return: A JSON string representation of the bounding boxes
    """
    # Local import keeps this helper self-contained within the module.
    import json

    bbox_dicts = []
    for box in bounding_boxes:
        # Zone coordinates are (ulx, uly, lrx, lry); y increases downward,
        # so lry >= uly and height is lry - uly (not uly - lry, which
        # produced negative heights).
        ulx, uly, lrx, lry = box["coordinates"]
        bbox_dicts.append(
            {
                "ulx": ulx,
                "uly": uly,
                "width": lrx - ulx,
                "height": lry - uly,
            }
        )
    # json.dumps emits valid JSON (double-quoted keys); str() on dicts
    # would emit Python repr with single quotes, which is not JSON.
    return json.dumps(bbox_dicts)
155 changes: 86 additions & 69 deletions app/public/cantusdata/helpers/mei_processing/mei_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,39 +6,49 @@

from typing import List, Iterator, Any, TypedDict, Literal
from .mei_parser import MEIParser
from .mei_parsing_types import ContourType
from .mei_parsing_types import Neume, NeumeComponent
from .bounding_box_utils import combine_bounding_boxes, stringify_bounding_boxes

NgramUnitType = Literal["neume", "neume_component"]

class MusicalSequencesDict(TypedDict):

class NgramDocument(TypedDict):
"""
Type definition for a dictionary containing a sequence of musical data.
A generic type for documents containing n-grams
of information extracted from MEI files.
ngram_unit: The unit of the n-gram
location: The location of the n-gram in the MEI file (MEI Zones
converted to JSON strings according to bounding_box_utils.stringify_bounding_boxes)
"""

neume_names: List[str]
pitch_names: List[str]
intervals: List[str]
contours: List[ContourType]
ngram_unit: NgramUnitType
location: str


class NgramDocument(TypedDict):
id: str
type: str
location: str
class NeumeNgramDocument(NgramDocument):
"""
A type for documents containing n-grams of neume-level information.
neume_names: A string containing the names of the neumes in the n-gram,
separated by underscores.
"""

class NeumeNamesNgramDocument(NgramDocument):
neume_names: str


class PitchNamesNgramDocument(NgramDocument):
pitch_names: str
class NeumeComponentNgramDocument(NgramDocument):
"""
A type for documents containing n-grams of neume component-level information.
pitch_names: A string containing the pitch names of the neume components in the n-gram,
in the form of two-character strings (pitch name and octave), separated by underscores.
intervals: A string containing the intervals between the neume components in the n-gram.
contours: A string containing the contours of the neume components in the n-gram.
"""

class IntervalsNgramDocument(NgramDocument):
pitch_names: str
intervals: str


class ContoursNgramDocument(NgramDocument):
contours: str


Expand All @@ -61,68 +71,75 @@ def generate_ngrams(sequence: List[Any], min_n: int, max_n: int) -> Iterator[Lis

class MEITokenizer(MEIParser):
"""
An MEITokenizer object is initialized with an MEI file and a set of
parameters that define how the MEI file should be tokenized. These
parameters are:
- min_ngram: The minimum length of n-grams to generate.
- max_ngram: The maximum length of n-grams to generate.
"""

def __init__(
self, mei_file: str, min_neume_ngram: int, max_neume_ngram: int
) -> None:
def __init__(self, mei_file: str, min_ngram: int, max_ngram: int) -> None:
super().__init__(mei_file)
self.min_neume_ngram = min_neume_ngram
self.max_neume_ngram = max_neume_ngram
self.sequences = self._get_musical_sequences()
self.min_ngram = min_ngram
self.max_ngram = max_ngram

def _get_musical_sequences(self) -> MusicalSequencesDict:
def get_neume_ngram_docs(self) -> List[NeumeNgramDocument]:
"""
Get sequences of musical data from the entire MEI file.
:return: A dictionary containing the following sequences of
musical data (all sequences in the form of lists of strings):
- "neume_names": A list of neume names in the file.
- "pitch_names": A list of pitch names in the file.
- "intervals": A list of intervals in the file.
- "contours": A list of contours in the file.
- "semitones": A list of semitones in the file.
Generate neume-level documents for search, containing
n-grams of neume names.
:return: A list of dictionaries containing the n-grams
of neume names.
"""
neume_names = []
pitch_names = []
intervals = []
contours = []
neumes_sequence: List[Neume] = []
for syllable in self.syllables:
for neume in syllable["neumes"]:
neume_names.append(neume["neume_type"])
contours.extend(neume["contours"])
intervals.extend([str(interval) for interval in neume["intervals"]])
for component in neume["neume_components"]:
pitch_names.append(component["pname"] + str(component["octave"]))
return {
"neume_names": neume_names,
"pitch_names": pitch_names,
"intervals": intervals,
"contours": contours,
}

def _get_ngrams(
self, ngram_type: Literal["neume_names", "pitch_names", "intervals", "contours"]
) -> List[str]:
neumes_sequence.extend(syllable["neumes"])
neume_documents: List[NeumeNgramDocument] = []
for ngram in generate_ngrams(neumes_sequence, self.min_ngram, self.max_ngram):
bounding_boxes = [
(neume["bounding_box"], neume["system"]) for neume in ngram
]
document_location = combine_bounding_boxes(bounding_boxes)
neume_names = "_".join([neume["neume_type"] for neume in ngram])
neume_documents.append(
{
"ngram_unit": "neume",
"location": stringify_bounding_boxes(document_location),
"neume_names": neume_names,
}
)
return neume_documents

def get_neume_component_ngram_docs(self) -> List[NeumeComponentNgramDocument]:
"""
Get n-grams of a particular type from the MEI file.
Generate neume component-level documents for search, containing
n-grams of pitch names, intervals, and contours.
:param ngram_type: The type of n-gram to generate. Must be one of
"neume_names", "pitch_names", "intervals", or "contours".
:return: A list of n-grams represented as strings of space-separated
items of the specified type.
:return: A list of dictionaries containing the n-grams
of pitch names, intervals, and contours.
"""
return [
" ".join(ngram)
for ngram in generate_ngrams(
self.sequences[ngram_type],
self.min_neume_ngram,
self.max_neume_ngram,
neume_components: List[NeumeComponent] = []
for syllable in self.syllables:
for neume in syllable["neumes"]:
neume_components.extend(neume["neume_components"])
neume_component_documents: List[NeumeComponentNgramDocument] = []
for ngram in generate_ngrams(
neume_components,
self.min_ngram,
self.max_ngram,
):
pitch_names = "_".join([comp["pname"] for comp in ngram])
intervals = "".join([str(interval) for interval in neume["intervals"]])
contours = "".join([contour for contour in neume["contours"]])
bounding_boxes = [(comp["bounding_box"], neume["system"]) for comp in ngram]
document_location = combine_bounding_boxes(bounding_boxes)
neume_component_documents.append(
{
"ngram_unit": "neume_component",
"location": stringify_bounding_boxes(document_location),
"pitch_names": pitch_names,
"intervals": intervals,
"contours": contours,
}
)
]
return neume_component_documents
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from cantusdata.helpers.mei_processing.bounding_box_utils import (
combine_bounding_boxes,
combine_bounding_boxes_single_system,
stringify_bounding_boxes,
)
from cantusdata.helpers.mei_processing.mei_parsing_types import Zone

Expand Down Expand Up @@ -34,3 +35,15 @@ def test_combine_bounding_boxes(self) -> None:
{"coordinates": (8, 8, 9, 10), "rotate": 0.0},
]
self.assertEqual(combined_boxes, expected_combined_boxes)

def test_stringify_bounding_boxes(self) -> None:
    """Bounding boxes serialize to a JSON array of box objects
    with width = lrx - ulx and height = lry - uly."""
    bounding_boxes: List[Zone] = [
        {"coordinates": (0, 0, 1, 1), "rotate": 0},
        {"coordinates": (2, 3, 5, 8), "rotate": 0.123},
    ]
    stringified_boxes = stringify_bounding_boxes(bounding_boxes)
    # Expected string uses json.dumps default separators (a space
    # after every ":" and ","); the previous expectation mixed
    # spacings and matched no serializer's actual output.
    expected_stringified_boxes = (
        '[{"ulx": 0, "uly": 0, "width": 1, "height": 1}, '
        '{"ulx": 2, "uly": 3, "width": 3, "height": 5}]'
    )
    self.assertEqual(stringified_boxes, expected_stringified_boxes)
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from unittest import TestCase
from os import path
from cantusdata.settings import BASE_DIR
from cantusdata.helpers.mei_processing.mei_tokenizer import (
MEITokenizer,
generate_ngrams,
)


class MEITokenizerTestCase(TestCase):
    """Tests for the generate_ngrams helper and the MEITokenizer class."""

    def test_generate_ngrams(self) -> None:
        """N-grams are yielded grouped by length: every n-gram of the
        minimum length first, then each successive length up to the
        maximum, preserving sequence order within each length."""
        with self.subTest("Ngrams from 2 to 3"):
            sequence = [1, 2, 3, 4, 5]
            min_ngram = 2
            max_ngram = 3
            ngrams = list(generate_ngrams(sequence, min_ngram, max_ngram))
            self.assertEqual(
                ngrams,
                [[1, 2], [2, 3], [3, 4], [4, 5], [1, 2, 3], [2, 3, 4], [3, 4, 5]],
            )
        with self.subTest("Ngrams from 3 to 5"):
            sequence = [1, 2, 3, 4, 5]
            min_ngram = 3
            max_ngram = 5
            ngrams = list(generate_ngrams(sequence, min_ngram, max_ngram))
            self.assertEqual(
                ngrams,
                [
                    [1, 2, 3],
                    [2, 3, 4],
                    [3, 4, 5],
                    [1, 2, 3, 4],
                    [2, 3, 4, 5],
                    [1, 2, 3, 4, 5],
                ],
            )

    def test_mei_tokenizer(self) -> None:
        """Smoke test: MEITokenizer constructs from a real MEI file and
        stores its n-gram bounds.
        TODO(review): no behavior is asserted on the generated n-gram
        documents yet; extend once expected document values for
        cdn-hsmu-m2149l4_001r.mei are established."""
        # Path duplicated in the original; hoisted so both subtests
        # exercise the same fixture.
        mei_path = path.join(
            BASE_DIR,
            "cantusdata",
            "test",
            "core",
            "helpers",
            "mei_processing",
            "test_mei_files",
            "cdn-hsmu-m2149l4_001r.mei",
        )
        with self.subTest("Ngrams from 2 to 3"):
            tokenizer = MEITokenizer(mei_path, min_ngram=2, max_ngram=3)
            self.assertEqual(tokenizer.min_ngram, 2)
            self.assertEqual(tokenizer.max_ngram, 3)
        with self.subTest("Ngrams from 3 to 5"):
            tokenizer = MEITokenizer(mei_path, min_ngram=3, max_ngram=5)
            self.assertEqual(tokenizer.min_ngram, 3)
            self.assertEqual(tokenizer.max_ngram, 5)

0 comments on commit a93b3ec

Please sign in to comment.