Commit
Add support for multi-code phonemisation
korakoe committed Sep 15, 2024
1 parent cba3e09 commit 4122387
Showing 11 changed files with 1,302 additions and 25 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@

*.pyc
92 changes: 83 additions & 9 deletions VoPho/engine.py
@@ -1,13 +1,87 @@
-from phonemizers import mandarin, russian
 import warnings
 
-class Phonemizer:
-    def __init__(self, working_path):
+from phonemizers import english, japanese, mandarin, russian
+from langtokenizers.multicoded import Tokenizer
+import re
+
+class Phonemizer:
+    def __init__(self, working_path=None):
         self.working_path = working_path
+        self._phonemizers = {}
+        self.Tokenizer = Tokenizer()
+
+    def get_phonemizer(self, lang):
+        if lang not in self._phonemizers:
+            if lang == 'en':
+                self._phonemizers[lang] = english.Phonemizer()
+            elif lang == 'ja':
+                self._phonemizers[lang] = japanese.Phonemizer()
+            elif lang == 'zh':
+                self._phonemizers[lang] = mandarin.Phonemizer()
+            elif lang == 'ru':
+                self._phonemizers[lang] = russian.Phonemizer(working_path=self.working_path)
+        return self._phonemizers.get(lang)
+
+    def seperate_languages(self, text):
+        text = self.Tokenizer.tokenize(text)
+
+        pattern = r'(<(\w+)>(.*?)</\2>)|([^<]+)'
+        matches = re.findall(pattern, text)
+
+        result = []
+        current_item = {"text": "", "lang": None}
+
+        for match in matches:
+            if match[1]: # Tagged content
+                lang, content = match[1], match[2]
+                if current_item["lang"] != lang:
+                    if current_item["text"]:
+                        result.append(current_item)
+                    current_item = {"text": content, "lang": lang}
+                else:
+                    current_item["text"] += content
+            else: # Untagged content (punctuation or spaces)
+                untagged_content = match[3]
+                if current_item["text"]:
+                    current_item["text"] += untagged_content
+                else:
+                    result.append({"text": untagged_content, "lang": "untagged"})
+
+        if current_item["text"]:
+            result.append(current_item)
+
+        return result
+
+    def phonemize_text_segment(self, text, lang):
+        phonemizer = self.get_phonemizer(lang)
+        if phonemizer:
+            return phonemizer.phonemize(text)
+        return f"<??>{text}</??>" # Return original text if no phonemizer available
+
+    def phonemize(self, input_text):
+        separated = self.seperate_languages(input_text)
+        result = []
+        for item in separated:
+            phonemized_text = self.phonemize_text_segment(item['text'], item['lang'])
+            checked_languages = self.Tokenizer.detect_japanese_korean_chinese(phonemized_text)
+            if checked_languages != "??":
+                segmentsCJK = self.Tokenizer.split_non_cjk_in_segment(phonemized_text)
+                for CJK in segmentsCJK:
+                    if self.Tokenizer.detect_japanese_korean_chinese(CJK) != "??":
+                        phonemized_text = phonemized_text.replace(CJK,
+                                                                  self.phonemize_text_segment(CJK, checked_languages))
+            result.append(phonemized_text)
+        fin = ''.join(result)
+
+        if "<??>" in fin:
+            warnings.warn(
+                "Your output contains unsupported languages, "
+                "<??> tags have been added to allow for manual filtering")
+        return fin
 
-        # PHONEMIZERS THAT REQUIRE A GPU ARE PLACED HERE
-        # This is so we can allocate them a class that will
-        # allow us to deallocate and prevent double loading a
-        # phonemizer, if you really need to allocate multiple
-        # simply instantiate another VoPho.engine.Phonemizer
-        self.russian_phonemiser = None
+if __name__ == "__main__":
+    input_text = "hello, 你好は中国語でこんにちはと言う意味をしています。مرحبا! Привет! नमस्ते!"
+    engine = Phonemizer()
+    output = engine.phonemize(input_text)
+    print(input_text)
+    print(output)
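To make the new flow concrete: seperate_languages relies on the Tokenizer wrapping each run of text in <lang>...</lang> tags (the same convention as the <??> fallback tag), then splits on those tags with the regex shown above. The snippet below is a standalone sketch of just that splitting step, not part of the commit; it assumes an already-tagged input string and skips the merging of adjacent same-language runs that the committed method performs.

import re

# Sketch only: the input is assumed to already carry <lang>...</lang> tags,
# i.e. what Tokenizer.tokenize is expected to produce.
tagged = "<en>hello, </en><zh>你好</zh><ja>は中国語でこんにちはと言う意味をしています。</ja>"

pattern = r'(<(\w+)>(.*?)</\2>)|([^<]+)'
segments = []
for full, lang, content, untagged in re.findall(pattern, tagged):
    if lang:  # tagged run: keep its language code
        segments.append({"text": content, "lang": lang})
    else:  # untagged run (punctuation or whitespace between tags)
        segments.append({"text": untagged, "lang": "untagged"})

print(segments)
# [{'text': 'hello, ', 'lang': 'en'}, {'text': '你好', 'lang': 'zh'},
#  {'text': 'は中国語でこんにちはと言う意味をしています。', 'lang': 'ja'}]

Each segment is then routed to the matching per-language phonemizer via get_phonemizer, and any segment without a phonemizer comes back wrapped in <??> tags so it can be filtered manually.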
Empty file.
File renamed without changes.
2 changes: 0 additions & 2 deletions VoPho/phonemizers/__init__.py
@@ -1,2 +0,0 @@
-from mandarin import phonemize
-from russian import Phonemizer
57 changes: 57 additions & 0 deletions VoPho/phonemizers/english.py
@@ -0,0 +1,57 @@
from openphonemizer import OpenPhonemizer


class Phonemizer:
    def __init__(self):
        self.phonemizer = OpenPhonemizer()

        # post-processing
        self.manual_filters = {
            " . . . ": "... ",
            " . ": ". "
        }

    def phonemize(self, text):
        result = []
        in_quotes = False
        current_segment = ""

        for char in text:
            if char == '"':
                # Process the current segment before changing quote state
                if current_segment:
                    if not in_quotes:
                        processed_segment = self.phonemizer(current_segment)
                    else:
                        processed_segment = f'{self.phonemizer(current_segment)}'
                    result.append(processed_segment)
                    current_segment = ""

                # Add the quote character and flip the state
                result.append(char)
                in_quotes = not in_quotes
            else:
                current_segment += char

        # Process any remaining text
        if current_segment:
            if not in_quotes:
                processed_segment = self.phonemizer(current_segment)
            else:
                processed_segment = f'"{self.phonemizer(current_segment)}"'
            result.append(processed_segment)

        result = ''.join(result)

        # apply manual filters
        for filter, item in self.manual_filters.items():
            result = result.replace(filter, item)

        return result


if __name__ == "__main__":
    phonem = Phonemizer()
    test_text = 'this is a test, "sometimes this is removed", and this is not. graduation is a key part of... celebration'
    print(f"Original: {test_text}")
    print(f"Phonemized: {phonem.phonemize(test_text)}")
