Commit
Merge branch 'main' into 8-replicate-demetr-results-for-bleu
klh5 authored Jan 10, 2025
2 parents 5c59dcc + 713f2fb commit f9ba2d0
Showing 5 changed files with 143 additions and 1 deletion.
9 changes: 9 additions & 0 deletions README.md
@@ -21,6 +21,15 @@ python -m pip install .

`typst compile notes.typ`

## CallHome Dataset

Go to [the CallHome access page](https://ca.talkbank.org/access/CallHome), select the conversation language, and create an account; you can then download the "media folder", which contains the .cha files with the transcriptions.

To load the transcriptions as a bag of sentences, use `m4st.parse.TranscriptParser.from_folder` to load all conversation lines. This class does not group lines by participant or conversation; it simply appends every utterance to a single list, with some light pre-processing.
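As an illustration of the kind of pre-processing the parser applies, the sketch below runs similar regular expressions on a hand-made .cha participant line (the sample line and the final whitespace-collapsing step are assumptions for this demo, not part of the parser):

```python
import re

# Hypothetical .cha participant line: a speaker code, a tab, then text
# containing a timestamp marker (\x15...\x15) and an "&=" event annotation.
raw = "*A:\tund dann \x1550770_51060\x15 &=laughs sind wir gegangen ."

match = re.match(r"\*(\w+):\s+(.*)", raw)
participant, text = match.groups()

# Strip timestamps and "&=" annotations, as the parser does,
# then collapse whitespace (an extra tidy-up step for this demo).
clean = re.sub(r"\x15\d+_\d+\x15", "", text)
clean = re.sub(r"&=\S+", "", clean)
clean = re.sub(r"\s+", " ", clean).strip()

print(participant)  # A
print(clean)        # und dann sind wir gegangen .
```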

## Ollama

To use the Ollama client, which is one way to randomly corrupt sentences, you need to install [Ollama](https://ollama.com) and run its server.
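For reference, a typical setup looks like the following (the model name `llama3.2` matches the default in `src/m4st/ollama/client.py`; exact installation steps vary by platform):

```shell
# Download the model the client expects, then start the server
# (the desktop app also starts the server automatically).
ollama pull llama3.2
ollama serve
```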

## License

2 changes: 1 addition & 1 deletion doc/notes.typ
@@ -24,7 +24,7 @@ Either way, the translation will be influenced by the domain shift due to filler
//We don't know the real world distribution of filler words, but we could use a LLM to sample from $bb(P)(hat(x) | x)$, where $x$ is the clean input, and $hat(x)$ is the filler-word-corrupted input.

The translation model can be defined as $cal(T): x arrow x^prime$, where $x^prime$ is the translated text.
- The metric can be defined as $cal(M): x^prime, x, {y_i}_(i=1)^N arrow bb(R)$, where $y_i$ are reference translations provided by $N$ translators.
+ The metric can be defined as $cal(M): [a], x^prime, x, {y_i}_(i=1)^N arrow bb(R)$, where $y_i$ are reference translations provided by $N$ translators, and $a$ is the source audio (denoted optional since some metrics don't accept it).
In our use case $N=1$.

We are generally not interested in benchmarking different models, so we can assume that $cal(T)$ is given.
Empty file added src/m4st/ollama/__init__.py
Empty file.
71 changes: 71 additions & 0 deletions src/m4st/ollama/client.py
@@ -0,0 +1,71 @@
import json

import requests # type: ignore[import-untyped]

model = "llama3.2"


def generate(prompt: str, context: list[int]) -> str:
    r = requests.post(
        "http://localhost:11434/api/generate",
        json={
            "model": model,
            "prompt": prompt,
            "context": context,
        },
        stream=False,
        timeout=120,
    )
r.raise_for_status()

response_text = ""
# Iterating over the returned lines would also allow streaming,
# but we don't really care about it at the moment.
for line in r.iter_lines():
body = json.loads(line)
response_part = body.get("response", "")
response_text += response_part

if "error" in body:
            raise RuntimeError(body["error"])

if body.get("done", False):
break

return response_text


# NOTE: ollama must be running for this to work, start the ollama
# app or run `ollama serve`
def corrupt_text(text: str, corruption_level: int = 2) -> str:
r"""
Get Ollama to corrupt your text to the required 'level'.
The levels are defined in the prompt to be between 0 and 5, where
0 is the original text, and 5 is 'very heavy disfluency'. This
function will modify the prompt with the text and level, and
return the LLM response.
This function requires that the Ollama server is running.
"""
    prompt_statement = """
    You will receive a short snippet of text, and your job is to \
    corrupt the text, adding disfluencies in such a way that the text appears \
    more conversational. Try to keep the same underlying meaning, and only \
    change the style and fluency.
    On a corruption scale from 0 to 5, where 0 means return the original \
    text without additional disfluency, and 5 means very heavy disfluency, please \
    corrupt the text to level {}.
    In your response include only the modified text, and nothing else.
    The text is: {}
    """
prompt = prompt_statement.format(corruption_level, text)
return generate(prompt, [])


if __name__ == "__main__":
# Test
output = corrupt_text("Hello, I'm Bob.", corruption_level=4)
print(output)
62 changes: 62 additions & 0 deletions src/m4st/parse.py
@@ -0,0 +1,62 @@
import glob
import os
import re

from tqdm import tqdm


class TranscriptParser:
r"""
Provides a bag of conversational lines.
Instantiate this by using the `from_folder` class method and
pointing it to a folder from the CallHome dataset, for example
the 'deu' folder for transcriptions in German. This class will
try its best to remove the .cha format specifics, and only
keep the UTF8 characters, thus providing text we can use for
downstream translation.
"""

def __init__(self):
self.lines = []

@classmethod
def from_folder(cls, folder_path: str):
parser = cls()
# Loop through all .cha files in the folder
for file_path in tqdm(
glob.glob(os.path.join(folder_path, "*.cha")), desc=f"Parsing {folder_path}"
):
            with open(file_path, encoding="utf-8") as file:
data = file.read()
parser.parse_transcription(data)

return parser

    def parse_line(self, line: str):
        # Match participant utterance lines, e.g. "*A:\thello ."
        # (participant codes may be longer than one character, hence \w+)
        match = re.match(r"\*(\w+):\s+(.*)", line)
if match:
participant, text = match.groups()
# Remove timestamps (e.g., •50770_51060•) from the text
# And other artefacts
clean_text = re.sub(r"\x15\d+_\d+\x15", "", text).strip()
clean_text = re.sub(r"&=\S+", "", clean_text).strip()
clean_text = re.sub(r"&+\S+", "", clean_text).strip()
clean_text = re.sub(r"\+/", "", clean_text).strip()
clean_text = re.sub(r"\+", "", clean_text).strip()
if clean_text in [".", "?", "!"]:
# Nothing but the punctuation is remaining
return

self.lines.append(clean_text)

def parse_transcription(self, data: str):
lines = data.split("\n")
for line in lines:
            if line in ["@Begin", "@UTF8", "@End"]:
                # File-level header/footer markers; nothing to parse
                pass
elif line.startswith("*"):
# Participant line
self.parse_line(line)
