From 93daaf78672f1cceaba502b1240bac5f2304dbb4 Mon Sep 17 00:00:00 2001 From: "P. v Petersenn" <122872435+vonpetersenn@users.noreply.github.com> Date: Wed, 20 Sep 2023 16:19:17 +0200 Subject: [PATCH] handle irregular strings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit defining a split_words function to handle strings such as 'chī\\fàn'. Before, handling \\ would not have been possible. --- numerical_pinyin_converter.py | 62 ++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 22 deletions(-) diff --git a/numerical_pinyin_converter.py b/numerical_pinyin_converter.py index da0f464..00b57c4 100644 --- a/numerical_pinyin_converter.py +++ b/numerical_pinyin_converter.py @@ -3,38 +3,56 @@ # Dictionary with lists of tonal pinyin for each vowel pinyin = { - 'a': ['ā', 'á', 'ǎ', 'à', 'a'], + 'a': ['ā', 'á', 'ǎ', 'à', 'a'], 'e': ['ē', 'é', 'ě', 'è', 'e'], - 'i': ['ī', 'í', 'ǐ', 'ì', 'i'], - 'o': ['ō', 'ó', 'ǒ', 'ò', 'o'], - 'u': ['ū', 'ú', 'ǔ', 'ù', 'u'], + 'i': ['ī', 'í', 'ǐ', 'ì', 'i'], + 'o': ['ō', 'ó', 'ǒ', 'ò', 'o'], + 'u': ['ū', 'ú', 'ǔ', 'ù', 'u'], 'ü': ['ǖ', 'ǘ', 'ǚ', 'ǜ', 'ü'] } + # Function to enable/disable debugging print statements def debug(*args, **kwargs): if DEBUG_ENABLED: print(*args, **kwargs) + +import re +def split_words(word): + # Regular expression pattern to split behind a digit + pattern = r'(?<=[1-5])' + split_word = re.split(pattern, word) + + new_split_word = [] + for i in range(len(split_word)): + # Strip leading and trailing spaces from the string + split_word[i] = split_word[i].strip() + if split_word[i] != '': + new_split_word.append(split_word[i]) + split_word = new_split_word + + + return split_word + # Function that converts numerical pinyin (ni3) to tone marked pinyin (nǐ) def convert_from_numerical_pinyin(word): - finished_word = [] # Splits word into individual character strings and calls convert_indiv_character for each - split_word = word.split(' ') + split_word = split_words(word) for indiv_character in split_word: finished_char = convert_indiv_character(indiv_character) finished_word.append(finished_char) # Joins the returned indiv char back into one string - finished_string = " ".join(finished_word) + finished_string = "".join(finished_word) debug("Joined individual characters into finished word:", finished_string) return finished_string - + + # Converts indiv char to tone marked chars def convert_indiv_character(indiv_character): - debug("") debug("------") debug("Starting loop for word:", indiv_character) @@ -58,7 +76,7 @@ def convert_indiv_character(indiv_character): counter = counter + 1 vowels.append(char) debug("Found vowels:", vowels) - + # If multiple vowels are found, use this logic to choose vowel for tone mark # a, e, or o takes tone mark - a takes tone in 'ao' # else, second vowel takes tone mark @@ -71,13 +89,13 @@ def convert_indiv_character(indiv_character): tone_vowel = 'o' elif 'e' in vowels: tone_vowel = 'e' - else: + else: tone_vowel = vowels[1] debug("Selected vowel:", tone_vowel) elif counter == 0: - # try: - + # try: + # If the character is r5 (儿), remove tone number and return if letter_list == ["r", "5"]: return "".join(letter_list[:-1]) @@ -88,15 +106,15 @@ def convert_indiv_character(indiv_character): tone_vowel = vowels[0] debug("Only one vowel found:", tone_vowel) - # Select tone number, which is last item in letter_list + # Select tone number, which is last item in letter_list tone = letter_list[-1] - + # Set integer to use as pinyin dict/list index # Select tonal vowel from pinyin dict/list using tone_vowel and tone index - try: - tone_int = int(tone)-1 + try: + tone_int = int(tone) - 1 tonal_pinyin = pinyin[tone_vowel][tone_int] - + except Exception as e: raise ValueError("Invalid numerical pinyin. The last letter must be an integer between 1-5.") @@ -105,18 +123,18 @@ def convert_indiv_character(indiv_character): # Cal replace_tone_vowel to replace and reformat the string return replace_tone_vowel(letter_list, tone_vowel, tonal_pinyin) - + + def replace_tone_vowel(letter_list, tone_vowel, tonal_pinyin): - # Replace the tone vowel with tone marked vowel letter_list = [w.replace(tone_vowel, tonal_pinyin) for w in letter_list] debug("Replaced tone vowel with tone mark:", letter_list) - #Remove tone number + # Remove tone number tone_number_removed = letter_list[:-1] debug("Removed now unnecessary tone number:", tone_number_removed) - #Reform string + # Reform string finished_char = "".join(tone_number_removed) debug("Made the letters list into a string:", finished_char) return finished_char \ No newline at end of file