Add support for multi-code phonemisation
Showing 11 changed files with 1,302 additions and 25 deletions.
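For orientation, a minimal sketch of how the new mixed-language entry point is meant to be used; the VoPho.engine import path is an assumption taken from a comment in the engine diff below, and the sample string is illustrative only:

    from VoPho.engine import Phonemizer  # import path assumed, not confirmed by this diff

    engine = Phonemizer()
    # Segments with no matching phonemizer (e.g. the Arabic here) come back
    # wrapped in <??> tags and a warning is raised, so they can be filtered
    # out manually afterwards.
    print(engine.phonemize("hello, 你好, مرحبا"))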
@@ -0,0 +1,2 @@
+*.pyc
@@ -1,13 +1,87 @@
-from phonemizers import mandarin, russian
-import warnings
-
-
-class Phonemizer:
-    def __init__(self, working_path):
-        # PHONEMIZERS THAT REQUIRE A GPU ARE PLACED HERE
-        # This is so we can allocate them a class that will
-        # allow us to deallocate and prevent double loading a
-        # phonemizer, if you really need to allocate multiple
-        # simply instantiate another VoPho.engine.Phonemizer
-        self.russian_phonemiser = None
+from phonemizers import english, japanese, mandarin, russian
+from langtokenizers.multicoded import Tokenizer
+import re
+import warnings
+
+
+class Phonemizer:
+    def __init__(self, working_path=None):
+        self.working_path = working_path
+        self._phonemizers = {}
+        self.Tokenizer = Tokenizer()
+
+    def get_phonemizer(self, lang):
+        if lang not in self._phonemizers:
+            if lang == 'en':
+                self._phonemizers[lang] = english.Phonemizer()
+            elif lang == 'ja':
+                self._phonemizers[lang] = japanese.Phonemizer()
+            elif lang == 'zh':
+                self._phonemizers[lang] = mandarin.Phonemizer()
+            elif lang == 'ru':
+                self._phonemizers[lang] = russian.Phonemizer(working_path=self.working_path)
+        return self._phonemizers.get(lang)
+
+    def seperate_languages(self, text):
+        text = self.Tokenizer.tokenize(text)
+
+        pattern = r'(<(\w+)>(.*?)</\2>)|([^<]+)'
+        matches = re.findall(pattern, text)
+
+        result = []
+        current_item = {"text": "", "lang": None}
+
+        for match in matches:
+            if match[1]:  # Tagged content
+                lang, content = match[1], match[2]
+                if current_item["lang"] != lang:
+                    if current_item["text"]:
+                        result.append(current_item)
+                    current_item = {"text": content, "lang": lang}
+                else:
+                    current_item["text"] += content
+            else:  # Untagged content (punctuation or spaces)
+                untagged_content = match[3]
+                if current_item["text"]:
+                    current_item["text"] += untagged_content
+                else:
+                    result.append({"text": untagged_content, "lang": "untagged"})
+
+        if current_item["text"]:
+            result.append(current_item)
+
+        return result
+
+    def phonemize_text_segment(self, text, lang):
+        phonemizer = self.get_phonemizer(lang)
+        if phonemizer:
+            return phonemizer.phonemize(text)
+        return f"<??>{text}</??>"  # Return original text if no phonemizer available
+
+    def phonemize(self, input_text):
+        separated = self.seperate_languages(input_text)
+        result = []
+        for item in separated:
+            phonemized_text = self.phonemize_text_segment(item['text'], item['lang'])
+            checked_languages = self.Tokenizer.detect_japanese_korean_chinese(phonemized_text)
+            if checked_languages != "??":
+                segmentsCJK = self.Tokenizer.split_non_cjk_in_segment(phonemized_text)
+                for CJK in segmentsCJK:
+                    if self.Tokenizer.detect_japanese_korean_chinese(CJK) != "??":
+                        phonemized_text = phonemized_text.replace(
+                            CJK, self.phonemize_text_segment(CJK, checked_languages))
+            result.append(phonemized_text)
+        fin = ''.join(result)
+
+        if "<??>" in fin:
+            warnings.warn(
+                "Your output contains unsupported languages, "
+                "<??> tags have been added to allow for manual filtering")
+        return fin
+
+
+if __name__ == "__main__":
+    input_text = "hello, 你好は中国語でこんにちはと言う意味をしています。مرحبا! Привет! नमस्ते!"
+    engine = Phonemizer()
+    output = engine.phonemize(input_text)
+    print(input_text)
+    print(output)
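The seperate_languages method expects the Tokenizer to wrap each run of text in <lang>…</lang> tags before the regex pulls the segments apart. A small standalone sketch of that parsing step, using a hand-written tagged string in place of real Tokenizer output (an assumption, since the Tokenizer itself is not shown in this diff):

    import re

    # Same pattern as in seperate_languages: the second group is the language
    # code, the third is the tagged text, the fourth is any untagged text in between.
    pattern = r'(<(\w+)>(.*?)</\2>)|([^<]+)'
    tagged = "<en>hello, </en><zh>你好</zh>!"  # hand-written stand-in for Tokenizer output

    for _, lang, content, untagged in re.findall(pattern, tagged):
        if lang:
            print(lang, content)          # "en hello, " then "zh 你好"
        else:
            print("untagged", untagged)   # the trailing "!"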
Empty file.
File renamed without changes.
@@ -1,2 +0,0 @@
-from mandarin import phonemize
-from russian import Phonemizer
@@ -0,0 +1,57 @@
+from openphonemizer import OpenPhonemizer
+
+
+class Phonemizer:
+    def __init__(self):
+        self.phonemizer = OpenPhonemizer()
+
+        # post-processing
+        self.manual_filters = {
+            " . . . ": "... ",
+            " . ": ". "
+        }
+
+    def phonemize(self, text):
+        result = []
+        in_quotes = False
+        current_segment = ""
+
+        for char in text:
+            if char == '"':
+                # Process the current segment before changing quote state
+                if current_segment:
+                    if not in_quotes:
+                        processed_segment = self.phonemizer(current_segment)
+                    else:
+                        processed_segment = f'{self.phonemizer(current_segment)}'
+                    result.append(processed_segment)
+                    current_segment = ""
+
+                # Add the quote character and flip the state
+                result.append(char)
+                in_quotes = not in_quotes
+            else:
+                current_segment += char
+
+        # Process any remaining text
+        if current_segment:
+            if not in_quotes:
+                processed_segment = self.phonemizer(current_segment)
+            else:
+                processed_segment = f'"{self.phonemizer(current_segment)}"'
+            result.append(processed_segment)
+
+        result = ''.join(result)
+
+        # apply manual filters
+        for filter, item in self.manual_filters.items():
+            result = result.replace(filter, item)
+
+        return result
+
+
+if __name__ == "__main__":
+    phonem = Phonemizer()
+    test_text = 'this is a test, "sometimes this is removed", and this is not. graduation is a key part of... celebration'
+    print(f"Original: {test_text}")
+    print(f"Phonemized: {phonem.phonemize(test_text)}")
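The manual_filters pass is plain string replacement applied in dictionary (insertion) order, so the spaced-out ellipsis rule runs before the single " . " rule and cannot be broken up by it. A tiny standalone illustration of the post-processing step; the phoneme-like string is made up for the example:

    # Stand-alone sketch of the post-processing filters defined above.
    manual_filters = {
        " . . . ": "... ",  # collapse spaced-out ellipses first
        " . ": ". ",        # then tighten stray space before a full stop
    }

    text = "ɡɹædʒueɪʃən ɪz ɐ kiː pɑːɹt ʌv . . . sɛləbɹeɪʃən"
    for pattern, replacement in manual_filters.items():
        text = text.replace(pattern, replacement)
    print(text)  # the " . . . " run has been collapsed to "... "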