diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cb88c88 --- /dev/null +++ b/.gitignore @@ -0,0 +1,23 @@ +bk/ +venv/ +test/*.xml +__pycache__ +*.py[cod] +*.swp + +build/ +develop-eggs/ +dist/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST diff --git a/README.md b/README.md new file mode 100644 index 0000000..76a9d2a --- /dev/null +++ b/README.md @@ -0,0 +1,39 @@ +## [CMU Linguistic Annotation Backend](https://github.com/neulab/cmulab/) plugin for [ELAN](https://archive.mpi.nl/tla/elan) + +This plugin is still a work in progress. Eventually it will allow ELAN users to invoke various services (translation, transcription, POS tagging, etc.) provided by the CMU Linguistic Annotation Backend server. + + +### Setup + +#### Linux + +1. Download the latest version of ELAN from [here](https://archive.mpi.nl/tla/elan/download) and install it: +``` +wget https://www.mpi.nl/tools/elan/ELAN-XX_linux.tar.gz +tar xzf ELAN-XX_linux.tar.gz +``` + +2. [Download a copy of this repo](https://github.com/zaidsheikh/cmulab_elan_extension/archive/refs/heads/main.zip) and unzip it. Copy the `cmulab_elan_extension-main/` folder into ELAN's extensions dir (`ELAN-XX/lib/app/extensions/`). + +#### Mac + +1. If ELAN is not already installed on your Mac, [download the latest .dmg installer](https://archive.mpi.nl/tla/elan/download) and install it. It should be installed in the `/Applications/ELAN_XX` directory, where `XX` is the name of the version. +2. Download this [zip file](https://github.com/zaidsheikh/cmulab_elan_extension/archive/refs/heads/main.zip) and unzip it. You should see a folder named `cmulab_elan_extension-main` containing the contents of this repo. +3. Right-click `ELAN_XX` and click "Show Package Contents", then copy your `cmulab_elan_extension-main` folder into `ELAN_XX.app/Contents/app/extensions`. + + +#### Windows + +1. 
Download the latest version of ELAN from [here](https://archive.mpi.nl/tla/elan/download) and install it. +2. [Download a copy of this repo](https://github.com/zaidsheikh/cmulab_elan_extension/archive/refs/heads/main.zip) and unzip it. Copy the `cmulab_elan_extension-main/` folder into ELAN's extensions dir (`ELAN-XX/app/extensions/`). +3. Install [Python 3](https://www.python.org/downloads/) if it isn't already installed. + + +### Instructions + +Start ELAN with the provided test audio file: + +`ELAN_6-1/bin/ELAN allosaurus-elan/test/allosaurus.wav &` + +Switch to the "Recognizers" tab, select "CMU Linguistic Annotation Backend" from the Recognizer dropdown list at the top, and then click the "Start" button. +If this is your first time using this plugin, you will be prompted to log in to the [CMULAB backend server](https://github.com/neulab/cmulab) and get an access token (you can create an account or simply log in with an existing Google account): diff --git a/cmulab_elan_extension.bat b/cmulab_elan_extension.bat new file mode 100644 index 0000000..6b506b8 --- /dev/null +++ b/cmulab_elan_extension.bat @@ -0,0 +1,18 @@ +@echo off + +set PYTHONLEGACYWINDOWSIOENCODING=True +set PYTHONIOENCODING=:replace + +If not exist venv\ ( + echo "PROGRESS: 1% Initial setup: Creating virtual env, installing dependencies" + python3 -m venv venv + call .\venv\Scripts\activate + python3 -m pip --no-input install -r requirements.txt + echo "PROGRESS: 5% One-time initialization successfully completed!" + call deactivate +) + +echo "Activating venv..." +call .\venv\Scripts\activate +python3 .\cmulab_elan_extension.py +call deactivate diff --git a/cmulab_elan_extension.cmdi b/cmulab_elan_extension.cmdi new file mode 100644 index 0000000..252659e --- /dev/null +++ b/cmulab_elan_extension.cmdi @@ -0,0 +1,42 @@ + + + + 
+ + + + + + + + CMU Linguistic Annotation Backend + + cmulab_elan_extension.html + + + source + + input_tier + + + cmulab_service + + + + + + + output_tier + + +
diff --git a/cmulab_elan_extension.html b/cmulab_elan_extension.html new file mode 100644 index 0000000..33af1fc --- /dev/null +++ b/cmulab_elan_extension.html @@ -0,0 +1,11 @@ + + + + + CMU Linguistic Annotation Backend + + + +

CMU Linguistic Annotation Backend

+
+
diff --git a/cmulab_elan_extension.py b/cmulab_elan_extension.py
new file mode 100755
index 0000000..b7bfa78
--- /dev/null
+++ b/cmulab_elan_extension.py
@@ -0,0 +1,310 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""ELAN local recognizer that sends annotation requests to a CMULAB server.
+
+Recognizer parameters are read from standard input; status is reported by
+printing ``PROGRESS:`` and ``RESULT:`` lines to standard output.
+"""
+
+import atexit
+import os
+import os.path
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+import unicodedata
+import time
+
+import requests
+import json
+import traceback
+from utils.create_dataset import create_dataset_from_eaf
+
+import PySimpleGUI as sg
+import webbrowser
+
+
+AUTH_TOKEN_FILE = os.path.join(os.path.expanduser("~"), ".cmulab_elan")
+CMULAB_SERVER = "http://miami.lti.cs.cmu.edu:8088"
+# Seconds to wait for the server to answer the reachability check.
+PING_TIMEOUT = 10
+
+# NOTE(review): in the copy of this patch that was reviewed, everything between
+# angle brackets was missing, which left the two patterns with fewer groups
+# than the code reads and the span template with fewer placeholders than
+# arguments. The markup below is reconstructed from that usage and from the
+# ELAN recognizer file formats -- confirm it (especially the TIER attributes)
+# against the original file.
+PARAM_RE = re.compile(r'<param name="(.*?)".*?>(.*?)</param>')
+SPAN_RE = re.compile(r'<span start="(.*?)" end="(.*?)"><v>(.*?)</v>')
+TIER_HEADER = (
+    '<?xml version="1.0" encoding="UTF-8"?>\n'
+    '<TIER xmlns="http://www.mpi.nl/tools/elan/avatech-tier" '
+    'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
+    'xsi:schemaLocation="http://www.mpi.nl/tools/elan/avatech-tier '
+    'file:avatech-tier.xsd" columns="CMULAB">\n'
+)
+TIER_SPAN = '    <span start="%s" end="%s"><v>%s</v></span>\n'
+TIER_FOOTER = '</TIER>\n'
+
+
+def _abort(popup_message=None):
+    """Print ``RESULT: FAILED.`` and exit with status 1, after an optional popup."""
+    if popup_message:
+        sg.Popup(popup_message, title="ERROR")
+    print('RESULT: FAILED.', flush=True)
+    sys.exit(1)
+
+
+def _read_dialog(title, layout):
+    """Show a dialog, wait for one event and return the entered values.
+
+    Closing the window instead of pressing a button aborts the recognizer;
+    the original code went on to index the missing values and crashed.
+    """
+    window = sg.Window(title, layout)
+    event, values = window.read()
+    window.close()
+    if event == sg.WIN_CLOSED or values is None:
+        _abort()
+    return values
+
+
+def ping_server(server_url):
+    """Return the response to a status request, or None if it could not be made."""
+    try:
+        return requests.get(server_url.rstrip('/') + "/annotator",
+                            timeout=PING_TIMEOUT)
+    except (requests.RequestException, ValueError):
+        # Unreachable host, timeout or malformed URL: let the caller re-prompt.
+        traceback.print_exc()
+        return None
+
+
+def get_server_url():
+    """Return the URL of a CMULAB server that answers the status request.
+
+    CMULAB_SERVER is tried first; while the check fails the user is asked for
+    another URL.
+    """
+    server_url = CMULAB_SERVER
+    while not ping_server(server_url):
+        err_msg = "Error connecting to CMULAB server " + server_url
+        layout = [[sg.Text(err_msg + "\nPlease enter new CMULAB server URL")],
+                  [sg.Input()],
+                  [sg.Button('OK')]]
+        values = _read_dialog('CMULAB server URL', layout)
+        server_url = values[0].strip().rstrip('/')
+        if not server_url.startswith("http"):
+            server_url = "http://" + server_url
+    return server_url
+
+
+def get_params():
+    """Return the parameters ELAN passes on standard input as a dict.
+
+    The parameter names are the ones declared in the CMDI file.
+    """
+    params = {}
+    for line in sys.stdin:
+        match = PARAM_RE.search(line)
+        if match:
+            params[match.group(1)] = match.group(2).strip()
+    return params
+
+
+def browser_login(server_url):
+    """Open the server's access-token page in the default web browser."""
+    webbrowser.open(server_url + "/annotator/get_auth_token/")
+
+
+def get_auth_token(server_url):
+    """Return the user's access token, asking for it if none is saved.
+
+    A non-empty token in AUTH_TOKEN_FILE is reused. Otherwise a dialog links
+    to the token page and the entered token is saved for later runs. If the
+    dialog is closed without a token, an empty string is returned and nothing
+    is saved (the original code hit an unbound variable after creating an
+    empty token file).
+    """
+    if os.path.exists(AUTH_TOKEN_FILE):
+        with open(AUTH_TOKEN_FILE) as fin:
+            auth_token = fin.read().strip()
+        if auth_token:
+            return auth_token
+
+    token_url = server_url + "/annotator/get_auth_token/"
+    layout = [[sg.Text('Click link below to get your access token')],
+              [sg.Text(token_url, text_color='blue', enable_events=True, key='-LINK-')],
+              [sg.Text("Please enter your access token here")],
+              [sg.Input()],
+              [sg.Button('OK')]]
+    window = sg.Window('Authorization required!', layout, finalize=True)
+    window['-LINK-'].set_cursor(cursor='hand1')
+    auth_token = ""
+    while True:
+        event, values = window.read()
+        if event in (sg.WIN_CLOSED, 'Exit'):
+            break
+        if event == '-LINK-':
+            webbrowser.open(token_url)
+        auth_token = values[0].strip()
+        if auth_token:
+            break
+    window.close()
+    if auth_token:
+        with open(AUTH_TOKEN_FILE, 'w') as fout:
+            fout.write(auth_token)
+    return auth_token
+
+
+def get_input_annotations(input_tier):
+    """Read start/end/value annotations from the input tier file.
+
+    Tiers for the recognizers are in the AVATech tier format, not EAF. Times
+    are multiplied by 1000 and truncated to int. Returns an empty list if the
+    file does not exist.
+    """
+    annotations = []
+    if not os.path.exists(input_tier):
+        return annotations
+    with open(input_tier, 'r', encoding='utf-8') as input_tier_file:
+        for line in input_tier_file:
+            match = SPAN_RE.search(line)
+            if match:
+                annotations.append({
+                    'start': int(float(match.group(1)) * 1000.0),
+                    'end': int(float(match.group(2)) * 1000.0),
+                    'value': match.group(3),
+                })
+    return annotations
+
+
+def _post_annotations(server_url, auth_token, input_audio, annotations, request_params):
+    """POST the audio file, segments and parameters to the annotate endpoint.
+
+    Returns the response. A failed request or an error status is reported in
+    a popup and ends the recognizer.
+    """
+    url = server_url + "/annotator/segment/1/annotate/2/"
+    headers = {}
+    if auth_token:
+        headers["Authorization"] = auth_token
+    data = {"segments": json.dumps(annotations), "params": json.dumps(request_params)}
+    with open(input_audio, 'rb') as audio_file:
+        try:
+            r = requests.post(url, files={'file': audio_file}, data=data, headers=headers)
+        except requests.RequestException:
+            err_msg = "Error connecting to CMULAB server " + server_url
+            sys.stderr.write(err_msg + "\n")
+            traceback.print_exc()
+            _abort(err_msg)
+    print("Response from CMULAB server " + server_url + ": " + r.text)
+    if not r.ok:
+        _abort("Server error, click the report button to view logs.")
+    return r
+
+
+def phone_transcription(server_url, auth_token, input_audio, annotations):
+    """Ask for Allosaurus settings and return the server's transcriptions.
+
+    Each returned annotation gets a 'value' key holding its 'transcription'
+    with the spaces removed.
+    """
+    layout = [[sg.Text("Language code"), sg.Input(default_text="eng", key='lang_code')],
+              [sg.Text("Pretrained model"), sg.Input(default_text="eng2102", key='pretrained_model')],
+              [sg.Button('OK')]]
+    values = _read_dialog('Allosaurus parameters', layout)
+    allosaurus_params = {"lang": values["lang_code"].strip().lower(),
+                         "model": values["pretrained_model"].strip().lower()}
+    r = _post_annotations(server_url, auth_token, input_audio, annotations, allosaurus_params)
+    transcribed_annotations = json.loads(r.text)
+    for annotation in transcribed_annotations:
+        annotation["value"] = annotation["transcription"].replace(' ', '')
+    return transcribed_annotations
+
+
+def finetune_allosaurus(server_url, auth_token, input_audio, annotations):
+    """Placeholder: fine-tuning is not implemented, so fail with a clear message.
+
+    The original body referenced an undefined ``err_msg`` and returned None,
+    which main() then passed to write_output().
+    """
+    _abort("Fine-tuning Allosaurus is not supported yet")
+
+
+def speaker_diarization(server_url, auth_token, input_audio, annotations):
+    """Ask for a threshold and return the server's speaker-labelled segments.
+
+    Requires sample annotations in the input tier. Each response item is read
+    as (value, start, end).
+    """
+    if not annotations:
+        _abort("Please select an input tier containing a few sample annotations for each speaker")
+    layout = [[sg.Text("Threshold"), sg.Slider((0, 1), orientation='h', resolution=0.01, default_value=0.45)],
+              [sg.Button('OK')]]
+    values = _read_dialog('Diarization parameters', layout)
+    request_params = {"service": "diarization", "threshold": float(values[0])}
+    print("PROGRESS: 0.5 Running speaker diarization...", flush=True)
+    # Request details for the log; the headers are left out on purpose because
+    # they carry the access token.
+    print(input_audio)
+    print(json.dumps(annotations, indent=4))
+    print(json.dumps(request_params, indent=4))
+    r = _post_annotations(server_url, auth_token, input_audio, annotations, request_params)
+    return [{"start": item[1], "end": item[2], "value": item[0]}
+            for item in json.loads(r.text)]
+
+
+def write_output(output_tier_file, annotations):
+    """Write the annotations to output_tier_file as a tier XML document."""
+    with open(output_tier_file, 'w', encoding='utf-8') as output_tier:
+        output_tier.write(TIER_HEADER)
+        for annotation in annotations:
+            output_tier.write(TIER_SPAN %
+                              (annotation['start'], annotation['end'], annotation['value']))
+        output_tier.write(TIER_FOOTER)
+
+
+def main():
+    """Read ELAN's parameters, run the selected service and write the output tier."""
+    params = get_params()
+
+    input_audio = params.get('source')
+    input_tier = params.get('input_tier', 'none specified')
+    output_tier = params.get('output_tier')
+    cmulab_service = params.get('cmulab_service', 'Phone-transcription')
+    print("input_tier: " + input_tier)
+    print("cmulab_service: " + cmulab_service)
+
+    if not input_audio or not output_tier:
+        # Without these the later open() calls would raise on None.
+        print("RESULT: FAILED. Missing 'source' or 'output_tier' parameter", flush=True)
+        sys.exit(1)
+
+    services = {
+        "Phone-transcription": phone_transcription,
+        "Finetune-allosaurus": finetune_allosaurus,
+        "Speaker-diarization": speaker_diarization,
+    }
+    service = services.get(cmulab_service)
+    if service is None:
+        print("RESULT: FAILED. Not supported!", flush=True)
+        sys.exit(1)
+
+    server_url = get_server_url()
+    auth_token = get_auth_token(server_url)
+
+    print("PROGRESS: 0.1 Loading annotations from input tier", flush=True)
+    annotations = get_input_annotations(input_tier)
+    output_annotations = service(server_url, auth_token, input_audio, annotations)
+
+    print("PROGRESS: 0.95 Preparing output tier", flush=True)
+    write_output(output_tier, output_annotations)
+    print('RESULT: DONE.', flush=True)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/cmulab_elan_extension.sh b/cmulab_elan_extension.sh
new file mode 100755
index 0000000..6db3aa2
--- /dev/null
+++ b/cmulab_elan_extension.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# It seems that recognizer processes invoked by ELAN don't inherit any regular
+# environmental variables (like PATH), which makes it difficult to track down
+# where both Python and ffmpeg(1) might be. These same processes also have
+# their locale set to C. This implies a default ASCII file encoding.
+
+export LC_ALL="en_US.UTF-8"
+export PYTHONIOENCODING="utf-8"
+
+# change to cmulab_elan_extension dir
+cd "$(dirname "$0")" || exit 1
+
+if [ ! -d "venv" ]; then
+    echo "PROGRESS: 1% (Initial setup) Creating virtual env, installing dependencies"
+    python3 -m venv venv
+    source venv/bin/activate
+    python3 -m pip --no-input install -r requirements.txt
+    echo "PROGRESS: 5% One-time initialization successfully completed!"
+    deactivate
+fi
+
+source venv/bin/activate
+python3 ./cmulab_elan_extension.py
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..2fd8a96
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+requests
+pydub
+pympi-ling
+PySimpleGUI
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/utils/create_dataset.py b/utils/create_dataset.py
new file mode 100644
index 0000000..f35e1fa
--- /dev/null
+++ b/utils/create_dataset.py
@@ -0,0 +1,56 @@
+import argparse
+import re
+import pympi
+import pydub
+from pathlib import Path
+
+
+def _media_path(media_url):
+    """Return the local file path named by an EAF MEDIA_URL.
+
+    A leading ``file://`` is removed only when present (the original sliced
+    seven characters off unconditionally), and the slash that then precedes
+    a Windows drive letter, as in ``file:///C:/a.wav``, is dropped.
+    """
+    path = media_url
+    if path.startswith("file://"):
+        path = path[len("file://"):]
+    if re.match(r"/[A-Za-z]:", path):
+        path = path[1:]
+    return path
+
+
+def create_dataset_from_eaf(eaf_file, output_dir, tier_name="Allosaurus"):
+    """Write one audio clip and one text file per annotation of a tier.
+
+    For each annotation in tier_name, the slice of the first linked media file
+    between its two time slots is exported to output_dir as <id>.wav and its
+    value is written as <id>.txt. output_dir is created if needed.
+    """
+    print(eaf_file)
+    print(output_dir)
+    print(tier_name)
+    output_dir_path = Path(output_dir)
+    output_dir_path.mkdir(parents=True, exist_ok=True)
+    input_elan = pympi.Elan.Eaf(file_path=eaf_file)
+    audio_file_path = _media_path(input_elan.media_descriptors[0]["MEDIA_URL"])
+    full_audio = pydub.AudioSegment.from_file(audio_file_path, format='wav')
+    segments = input_elan.tiers[tier_name][0]
+    for segment_id, (start_id, end_id, transcription, _) in segments.items():
+        start = input_elan.timeslots[start_id]
+        end = input_elan.timeslots[end_id]
+        clip = full_audio[start:end]
+        clip.export(output_dir_path / (segment_id + ".wav"), format='wav')
+        # Transcriptions may contain non-ASCII symbols, so do not rely on the
+        # platform's default encoding.
+        (output_dir_path / (segment_id + ".txt")).write_text(transcription, encoding="utf-8")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="convert EAF file to dataset required for fine-tuning allosaurus")
+    parser.add_argument('eaf_file', type=str, help="EAF file with phone transcriptions")
+    parser.add_argument('output_dir', type=str, help="output dir")
+    parser.add_argument('--tier', type=str, default="Allosaurus", help="Tier containing phone transcriptions")
+    args = parser.parse_args()
+    create_dataset_from_eaf(args.eaf_file, args.output_dir, args.tier)
diff --git a/utils/requirements.txt b/utils/requirements.txt
new file mode 100644
index 0000000..1e982ec
--- /dev/null
+++ b/utils/requirements.txt
@@ -0,0 +1,2 @@
+pydub
+pympi-ling