From 93daaf78672f1cceaba502b1240bac5f2304dbb4 Mon Sep 17 00:00:00 2001
From: "P. v Petersenn" <122872435+vonpetersenn@users.noreply.github.com>
Date: Wed, 20 Sep 2023 16:19:17 +0200
Subject: [PATCH] handle irregular strings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Define a split_words function that splits the input after each tone digit
(1-5) instead of on spaces, so that strings such as 'chī\\fàn' can be handled.
Previously, strings containing \\ could not be handled.
---
 numerical_pinyin_converter.py | 62 ++++++++++++++++++++++-------------
 1 file changed, 40 insertions(+), 22 deletions(-)

diff --git a/numerical_pinyin_converter.py b/numerical_pinyin_converter.py
index da0f464..00b57c4 100644
--- a/numerical_pinyin_converter.py
+++ b/numerical_pinyin_converter.py
@@ -3,38 +3,56 @@
 
 # Dictionary with lists of tonal pinyin for each vowel
 pinyin = {
-    'a': ['ā', 'á', 'ǎ', 'à', 'a'], 
+    'a': ['ā', 'á', 'ǎ', 'à', 'a'],
     'e': ['ē', 'é', 'ě', 'è', 'e'],
-    'i': ['ī', 'í', 'ǐ', 'ì', 'i'], 
-    'o': ['ō', 'ó', 'ǒ', 'ò', 'o'], 
-    'u': ['ū', 'ú', 'ǔ', 'ù', 'u'], 
+    'i': ['ī', 'í', 'ǐ', 'ì', 'i'],
+    'o': ['ō', 'ó', 'ǒ', 'ò', 'o'],
+    'u': ['ū', 'ú', 'ǔ', 'ù', 'u'],
     'ü': ['ǖ', 'ǘ', 'ǚ', 'ǜ', 'ü']
 }
 
+
 # Function to enable/disable debugging print statements
 def debug(*args, **kwargs):
     if DEBUG_ENABLED:
         print(*args, **kwargs)
 
+
+import re
+def split_words(word):
+    # Regular expression pattern to split behind a digit
+    pattern = r'(?<=[1-5])'
+    split_word = re.split(pattern, word)
+
+    new_split_word = []
+    for i in range(len(split_word)):
+        # Strip leading and trailing spaces from the string
+        split_word[i] = split_word[i].strip()
+        if split_word[i] != '':
+            new_split_word.append(split_word[i])
+    split_word = new_split_word
+
+
+    return split_word
+
 # Function that converts numerical pinyin (ni3) to tone marked pinyin (nǐ)
 def convert_from_numerical_pinyin(word):
-
     finished_word = []
 
     # Splits word into individual character strings and calls convert_indiv_character for each
-    split_word = word.split(' ')
+    split_word = split_words(word)
     for indiv_character in split_word:
         finished_char = convert_indiv_character(indiv_character)
         finished_word.append(finished_char)
 
     # Joins the returned indiv char back into one string
-    finished_string = " ".join(finished_word)
+    finished_string = "".join(finished_word)
     debug("Joined individual characters into finished word:", finished_string)
     return finished_string
-    
+
+
 # Converts indiv char to tone marked chars
 def convert_indiv_character(indiv_character):
-
     debug("")
     debug("------")
     debug("Starting loop for word:", indiv_character)
@@ -58,7 +76,7 @@ def convert_indiv_character(indiv_character):
             counter = counter + 1
             vowels.append(char)
     debug("Found vowels:", vowels)
-    
+
     # If multiple vowels are found, use this logic to choose vowel for tone mark
     # a, e, or o takes tone mark - a takes tone in 'ao'
     # else, second vowel takes tone mark
@@ -71,13 +89,13 @@ def convert_indiv_character(indiv_character):
             tone_vowel = 'o'
         elif 'e' in vowels:
             tone_vowel = 'e'
-        else: 
+        else:
             tone_vowel = vowels[1]
 
         debug("Selected vowel:", tone_vowel)
     elif counter == 0:
-        # try:    
-        
+        # try:
+
         # If the character is r5 (儿), remove tone number and return
         if letter_list == ["r", "5"]:
             return "".join(letter_list[:-1])
@@ -88,15 +106,15 @@ def convert_indiv_character(indiv_character):
         tone_vowel = vowels[0]
         debug("Only one vowel found:", tone_vowel)
 
-    # Select tone number, which is last item in letter_list  
+    # Select tone number, which is last item in letter_list
     tone = letter_list[-1]
-    
+
     # Set integer to use as pinyin dict/list index
     # Select tonal vowel from pinyin dict/list using tone_vowel and tone index
-    try: 
-        tone_int = int(tone)-1
+    try:
+        tone_int = int(tone) - 1
         tonal_pinyin = pinyin[tone_vowel][tone_int]
-        
+
     except Exception as e:
         raise ValueError("Invalid numerical pinyin. The last letter must be an integer between 1-5.")
 
@@ -105,18 +123,18 @@ def convert_indiv_character(indiv_character):
 
     # Cal replace_tone_vowel to replace and reformat the string
     return replace_tone_vowel(letter_list, tone_vowel, tonal_pinyin)
-    
+
+
 def replace_tone_vowel(letter_list, tone_vowel, tonal_pinyin):
-    
     # Replace the tone vowel with tone marked vowel
     letter_list = [w.replace(tone_vowel, tonal_pinyin) for w in letter_list]
     debug("Replaced tone vowel with tone mark:", letter_list)
 
-    #Remove tone number
+    # Remove tone number
     tone_number_removed = letter_list[:-1]
     debug("Removed now unnecessary tone number:", tone_number_removed)
 
-    #Reform string
+    # Reform string
     finished_char = "".join(tone_number_removed)
     debug("Made the letters list into a string:", finished_char)
     return finished_char
\ No newline at end of file