-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into 8-replicate-demetr-results-for-bleu
- Loading branch information
Showing
5 changed files
with
143 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
import json | ||
|
||
import requests # type: ignore[import-untyped] | ||
|
||
model = "llama3.2" | ||
|
||
|
||
def generate(prompt, context):
    """Send a prompt to the local Ollama server and return the generated text.

    Posts to ``http://localhost:11434/api/generate`` using the module-level
    ``model`` and concatenates the ``response`` field of every JSON line in
    the reply until a line with a truthy ``done`` field is seen.

    Args:
        prompt: Text sent as the ``prompt`` field of the request.
        context: Value sent as the ``context`` field of the request.

    Returns:
        The concatenated ``response`` fragments as one string.

    Raises:
        requests.HTTPError: If the server replies with an error status.
        RuntimeError: If a reply line contains an ``error`` field.
    """
    r = requests.post(
        "http://localhost:11434/api/generate",
        json={
            "model": model,
            "prompt": prompt,
            "context": context,
        },
        stream=False,
    )
    r.raise_for_status()

    # Iterating over the returned lines would also allow streaming,
    # but we don't really care about it at the moment.
    parts = []
    for line in r.iter_lines():
        if not line:
            # iter_lines() may yield empty keep-alive lines; they are not JSON.
            continue

        body = json.loads(line)

        # Fail before using anything else from a line that reports an error.
        if "error" in body:
            raise RuntimeError(body["error"])

        parts.append(body.get("response", ""))

        if body.get("done", False):
            break

    return "".join(parts)
|
||
|
||
# NOTE: ollama must be running for this to work, start the ollama
# app or run `ollama serve`
def corrupt_text(text: str, corruption_level: int = 2) -> str:
    r"""
    Ask Ollama to rewrite `text` with added disfluencies.

    The prompt describes a corruption scale from 0 to 5, where 0 means
    'return the original text' and 5 means 'very heavy disfluency'.
    `corruption_level` and `text` are inserted into that prompt, and
    whatever the LLM answers is returned unchanged.

    The Ollama server has to be running when this is called.
    """
    template = """
You will receive a short snippet of text, and your job is to \
corrupt the text, adding disfluencies in such a way, that the text appears \
more conversational. Try to keep the same underlying meaning, and only \
change the style and fluency.
On the corruption scale 0 to 5, where 0 means return the original \
text without additional disfluency, and 5 means very heavy disfluency, please \
corrupt the text to level {}.
In your response include only the modified text, and nothing else.
The text is: {}
"""
    # First placeholder is the level, second is the text; no prior context.
    filled_prompt = template.format(corruption_level, text)
    return generate(filled_prompt, [])
|
||
|
||
if __name__ == "__main__":
    # Manual smoke test: print one corrupted sample sentence.
    print(corrupt_text("Hello, I'm Bob.", corruption_level=4))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import glob | ||
import os | ||
import re | ||
|
||
from tqdm import tqdm | ||
|
||
|
||
class TranscriptParser:
    r"""
    Provides a bag of conversational lines.

    Instantiate this by using the `from_folder` class method and
    pointing it to a folder from the CallHome dataset, for example
    the 'deu' folder for transcriptions in German. This class will
    try its best to remove the .cha format specifics, and only
    keep the UTF8 characters, thus providing text we can use for
    downstream translation.

    Only lines starting with `*` (participant utterances) contribute
    to `lines`; every other line of a transcript is ignored.
    """

    # Participant line: `*<speaker code>:`, whitespace, then the utterance.
    # The speaker code may be longer than one character.
    _UTTERANCE = re.compile(r"\*(\w+):\s+(.*)")

    # Artefacts stripped from the utterance text, applied in this order.
    _ARTEFACTS = (
        re.compile(r"\x15\d+_\d+\x15"),  # timestamps such as \x1550770_51060\x15
        re.compile(r"&=\S+"),
        re.compile(r"&+\S+"),
        re.compile(r"\+/"),
        re.compile(r"\+"),
    )

    # Utterances reduced to one of these carry no text worth keeping.
    _PUNCTUATION_ONLY = frozenset({".", "?", "!"})

    def __init__(self) -> None:
        # Cleaned utterances, in the order they were parsed.
        self.lines = []

    @classmethod
    def from_folder(cls, folder_path: str) -> "TranscriptParser":
        """Build a parser from every ``*.cha`` file directly inside a folder.

        Args:
            folder_path: Directory that contains the ``.cha`` transcripts.

        Returns:
            A parser whose ``lines`` hold the cleaned utterances of all files.
        """
        parser = cls()
        # Sorted so the order of `lines` does not depend on the file system.
        file_paths = sorted(glob.glob(os.path.join(folder_path, "*.cha")))
        for file_path in tqdm(file_paths, desc=f"Parsing {folder_path}"):
            # Decode explicitly as UTF-8 instead of the platform default,
            # which is not UTF-8 everywhere.
            with open(file_path, encoding="utf-8") as file:
                parser.parse_transcription(file.read())

        return parser

    def parse_line(self, line: str) -> None:
        """Clean one participant line and append its text to ``lines``.

        Lines that do not look like a participant utterance, and utterances
        that are empty or punctuation-only after cleaning, are skipped.

        Args:
            line: A single line of a ``.cha`` transcript.
        """
        match = self._UTTERANCE.match(line)
        if match is None:
            return

        clean_text = match.group(2)
        for artefact in self._ARTEFACTS:
            clean_text = artefact.sub("", clean_text)
        clean_text = clean_text.strip()

        # Nothing (or nothing but the punctuation) is remaining.
        if not clean_text or clean_text in self._PUNCTUATION_ONLY:
            return

        self.lines.append(clean_text)

    def parse_transcription(self, data: str) -> None:
        """Parse the full text of one transcript.

        Args:
            data: Complete contents of a ``.cha`` file.
        """
        for line in data.split("\n"):
            # Header lines (e.g. @Begin, @UTF8, @End) and any other line not
            # starting with `*` hold no utterance and are ignored.
            if line.startswith("*"):
                self.parse_line(line)