Skip to content

Commit

Permalink
numerous patches, account for stress and update Japanese
Browse files Browse the repository at this point in the history
  • Loading branch information
korakoe committed Jan 6, 2025
1 parent 729b262 commit 9e04f61
Show file tree
Hide file tree
Showing 7 changed files with 9,646 additions and 95,460 deletions.
6 changes: 4 additions & 2 deletions VoPho/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@ class Phonemizer:
A class for phonemizing text in multiple languages,
"""

def __init__(self, working_path=None):
def __init__(self, working_path=None, stress=False):
"""
Initialize the Phonemizer.
:param working_path: Optional path for working directory
:param stress: Optional toggle for stress, for phonemisers that support it
"""
self.working_path = working_path
self.stress = stress
self._phonemizers = {}
self.Tokenizer = Tokenizer()

Expand Down Expand Up @@ -55,7 +57,7 @@ def get_phonemizer(self, lang):
"""
if lang not in self._phonemizers:
if lang == 'en':
self._phonemizers[lang] = english.Phonemizer()
self._phonemizers[lang] = english.Phonemizer(stress=self.stress)
elif lang == 'ja':
self._phonemizers[lang] = japanese.Phonemizer()
elif lang == 'zh':
Expand Down
14 changes: 11 additions & 3 deletions VoPho/phonemizers/english.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,12 @@
"herb": "hɜːrb", # (often mispronounced as 'urb')
}

innacurate_from_phonemizer = {
"british": "ˈbrɪt.ɪʃ"
}

# Combine all manual phonemization dictionaries (later entries override earlier ones on key clashes)
manual_phonemizations = {**general, **proper_names, **common_mispronunciations}
manual_phonemizations = {**general, **proper_names, **common_mispronunciations, **innacurate_from_phonemizer}

model = SentenceTransformer('all-MiniLM-L6-v2')

Expand Down Expand Up @@ -147,7 +151,6 @@

### ^^^ PLACEHOLDER UNTIL MANUAL DICT CREATED


def get_most_similar_definition(word, query):
if word not in word_definitions:
return "", word
Expand Down Expand Up @@ -266,14 +269,15 @@ def replace_homonyms(text):


class Phonemizer:
def __init__(self, manual_fixes=None, allow_heteronyms=True): # temporarily allow heteronyms until we fill the dictionary
def __init__(self, manual_fixes=None, allow_heteronyms=True, stress=False): # temporarily allow heteronyms until we fill the dictionary
if manual_fixes is None:
manual_fixes = manual_phonemizations
self.phonemizer = OpenPhonemizer()

# Dictionary of manual phonemizations
self.manual_phonemizations = manual_fixes
self.allow_heteronyms=allow_heteronyms
self.stress = stress

# Post-processing filters
self.manual_filters = {
Expand All @@ -295,6 +299,10 @@ def preprocess(self, text):

def postprocess(self, text):
    """
    Clean up phonemized text before it is returned to the caller.

    :param text: Phonemized text, possibly containing tagged IPA spans
    :return: ``text`` with every match of ``self.phoneme_tag_pattern``
             replaced by its first capture group; when ``self.stress`` is
             falsy, the stress mark is removed beforehand
    """
    if not self.stress:
        # Drop the stress mark in both its literal and escaped (U+02C8)
        # spellings; the escaped form is the original "double check".
        for stress_mark in ("ˈ", "\u02C8"):
            text = text.replace(stress_mark, "")

    # Keep only what the pattern's first group captured, so the IPA inside
    # each tag (spaces included) survives while the tag itself is removed.
    return self.phoneme_tag_pattern.sub(r"\1", text)

def phonemize(self, text):
Expand Down
Loading

0 comments on commit 9e04f61

Please sign in to comment.