Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

handle irregular strings #3

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 40 additions & 22 deletions numerical_pinyin_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,38 +3,56 @@

# Dictionary with lists of tonal pinyin for each vowel
pinyin = {
    # One entry per vowel. Each list is indexed by (tone number - 1):
    # positions 0-3 carry the tone marks, position 4 is the unmarked vowel.
    'a': ['ā', 'á', 'ǎ', 'à', 'a'],
    'e': ['ē', 'é', 'ě', 'è', 'e'],
    'i': ['ī', 'í', 'ǐ', 'ì', 'i'],
    'o': ['ō', 'ó', 'ǒ', 'ò', 'o'],
    'u': ['ū', 'ú', 'ǔ', 'ù', 'u'],
    'ü': ['ǖ', 'ǘ', 'ǚ', 'ǜ', 'ü'],
}


# Function to enable/disable debugging print statements
def debug(*args, **kwargs):
    """Forward all arguments to ``print`` when ``DEBUG_ENABLED`` is truthy.

    Does nothing otherwise.
    """
    if not DEBUG_ENABLED:
        return
    print(*args, **kwargs)


import re
def split_words(word):
    """Split numerical pinyin text into one string per tone-numbered piece.

    The text is cut immediately after every tone digit (1-5), so syllables
    do not have to be separated by spaces: ``"ni3hao3"`` and ``"ni3 hao3"``
    both give ``["ni3", "hao3"]``.

    Args:
        word: Numerical pinyin text, e.g. ``"ni3hao3"``.

    Returns:
        A list of the pieces with surrounding whitespace stripped and empty
        pieces dropped. Text that follows the last tone digit (or text that
        contains no tone digit at all) is returned as a final piece.
    """
    # Zero-width lookbehind: split *behind* a tone digit so the digit stays
    # attached to the syllable it belongs to.
    pieces = re.split(r'(?<=[1-5])', word)
    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if piece]

# Function that converts numerical pinyin (ni3) to tone marked pinyin (nǐ)
def convert_from_numerical_pinyin(word):
    """Convert numerical pinyin (``ni3``) to tone-marked pinyin (``nǐ``).

    The input is broken into pieces by ``split_words``, each piece is passed
    to ``convert_indiv_character``, and the converted pieces are joined back
    together with no separator between them.

    Args:
        word: Numerical pinyin text.

    Returns:
        The concatenation of the converted pieces.

    Any exception raised by ``convert_indiv_character`` propagates to the
    caller.
    """
    # Convert every piece in order; split_words already strips whitespace
    # and drops empty pieces.
    finished_word = [
        convert_indiv_character(indiv_character)
        for indiv_character in split_words(word)
    ]

    # Joins the returned individual characters back into one string
    finished_string = "".join(finished_word)
    debug("Joined individual characters into finished word:", finished_string)
    return finished_string



# Converts indiv char to tone marked chars
def convert_indiv_character(indiv_character):

debug("")
debug("------")
debug("Starting loop for word:", indiv_character)
Expand All @@ -58,7 +76,7 @@ def convert_indiv_character(indiv_character):
counter = counter + 1
vowels.append(char)
debug("Found vowels:", vowels)

# If multiple vowels are found, use this logic to choose vowel for tone mark
# a, e, or o takes tone mark - a takes tone in 'ao'
# else, second vowel takes tone mark
Expand All @@ -71,13 +89,13 @@ def convert_indiv_character(indiv_character):
tone_vowel = 'o'
elif 'e' in vowels:
tone_vowel = 'e'
else:
else:
tone_vowel = vowels[1]

debug("Selected vowel:", tone_vowel)
elif counter == 0:
# try:
# try:

# If the character is r5 (儿), remove tone number and return
if letter_list == ["r", "5"]:
return "".join(letter_list[:-1])
Expand All @@ -88,15 +106,15 @@ def convert_indiv_character(indiv_character):
tone_vowel = vowels[0]
debug("Only one vowel found:", tone_vowel)

# Select tone number, which is last item in letter_list
# Select tone number, which is last item in letter_list
tone = letter_list[-1]

# Set integer to use as pinyin dict/list index
# Select tonal vowel from pinyin dict/list using tone_vowel and tone index
try:
tone_int = int(tone)-1
try:
tone_int = int(tone) - 1
tonal_pinyin = pinyin[tone_vowel][tone_int]

except Exception as e:
raise ValueError("Invalid numerical pinyin. The last letter must be an integer between 1-5.")

Expand All @@ -105,18 +123,18 @@ def convert_indiv_character(indiv_character):

# Call replace_tone_vowel to replace and reformat the string
return replace_tone_vowel(letter_list, tone_vowel, tonal_pinyin)



def replace_tone_vowel(letter_list, tone_vowel, tonal_pinyin):
    """Swap the tone vowel for its tone-marked form and rebuild the string.

    Args:
        letter_list: The items of one syllable; the last item is dropped
            from the result (callers pass the tone number there).
        tone_vowel: The plain vowel to be replaced.
        tonal_pinyin: The tone-marked vowel that replaces ``tone_vowel``.

    Returns:
        The items of ``letter_list`` except the last, joined into one string
        with every occurrence of ``tone_vowel`` replaced by ``tonal_pinyin``.
    """
    # Substitute the marked vowel in every item of the list
    marked_letters = [item.replace(tone_vowel, tonal_pinyin) for item in letter_list]
    debug("Replaced tone vowel with tone mark:", marked_letters)

    # Drop the trailing item (the tone number)
    letters_only = marked_letters[:-1]
    debug("Removed now unnecessary tone number:", letters_only)

    # Glue the remaining items back into a single string
    rebuilt = "".join(letters_only)
    debug("Made the letters list into a string:", rebuilt)
    return rebuilt