diff --git a/pyproject.toml b/pyproject.toml
index ef11293..913542e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ authors = [
 ]
 description = "Evaluation of Metrics for Speech Translation (M4ST)"
 readme = "README.md"
-requires-python = ">=3.10"
+requires-python = "==3.11.*"
 classifiers = [
   "Development Status :: 1 - Planning",
   "Intended Audience :: Science/Research",
@@ -27,15 +27,27 @@ classifiers = [
   "Typing :: Typed",
 ]
 dependencies = [
-  "tqdm",
-  "requests"
+  "tqdm",
+  "requests",
+  "evaluate>=0.4.3",
+  "fairseq2>=0.2.0",
+  "filelock>=3.0.12",
+  "ipykernel>=6.29.5",
+  "nltk>=3.9.1",
+  "pandas>=2.2.3",
+  "sacrebleu>=2.4.3",
+  "seaborn>=0.13.2",
+  "sonar-space>=0.2.0",
+  "torch==2.0.1",
+  "torchvision>=0.15.2",
+  "unbabel-comet==2.2.3",
 ]
 
 [project.optional-dependencies]
 dev = [
   "pytest >=6",
   "pytest-cov >=3",
-  "pre-commit",
+  "pre-commit>=3.2.0",
 ]
 
 [project.urls]
diff --git a/scripts/demetr/process_demetr.py b/scripts/demetr/process_demetr.py
new file mode 100644
index 0000000..004a766
--- /dev/null
+++ b/scripts/demetr/process_demetr.py
@@ -0,0 +1,63 @@
+import argparse
+import os
+
+from m4st.process_demetr import ProcessDEMETR
+
+
+def main(args: dict) -> None:
+    output_dir = args["output_dir"]
+    output_file = args["output_file"]
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    demetr = ProcessDEMETR(
+        metrics_to_use=args["metrics"],
+        output_filepath=os.path.join(output_dir, output_file),
+        demetr_root=args["dataset_dir"],
+    )
+
+    print(args["cats"])
+    demetr.process_demetr(cats_to_process=args["cats"])
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--dataset-dir",
+        type=str,
+        default="../../datasets/demetr",
+        help="Root dataset directory \
+            for DEMETR containing JSON files.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="../../outputs/demetr",
+        help="Path to output directory. Will be created by script.",
+    )
+    parser.add_argument(
+        "--output-file",
+        type=str,
+        default="demetr_results.csv",
+        help="Name for output CSV file.",
+    )
+    parser.add_argument(
+        "--metrics",
+        nargs="+",
+        type=str,
+        default=["COMET_ref", "COMET_qe", "BLASER_ref", "BLASER_qe", "SacreBLEU"],
+        help="Metrics to use. Must be one or more \
+            of COMET_ref, COMET_qe, BLASER_ref, BLASER_qe, SacreBLEU. Defaults to all.",
+    )
+    parser.add_argument(
+        "--cats",
+        nargs="+",
+        type=int,
+        required=False,
+        help="Specific DEMETR disfluency \
+            categories to be processed. By default all will be processed.",
+    )
+
+    args = parser.parse_args()
+    main(vars(args))
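An illustrative invocation of the new script (assuming the package has been installed, e.g. with pip install -e ., and the command is run from scripts/demetr/ so that the default --dataset-dir and --output-dir paths resolve; both can be overridden):

    python process_demetr.py --metrics SacreBLEU COMET_ref --cats 1 2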
diff --git a/src/m4st/metrics.py b/src/m4st/metrics.py
new file mode 100644
index 0000000..5d3f455
--- /dev/null
+++ b/src/m4st/metrics.py
@@ -0,0 +1,145 @@
+import evaluate
+import numpy as np
+from pandas import Series
+from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline
+from sonar.models.blaser.loader import load_blaser_model
+
+
+class SacreBLEUScore:
+    """Applies SacreBLEU from the evaluate library."""
+
+    def __init__(self) -> None:
+        self.bleu = evaluate.load("sacrebleu")
+
+    def get_scores(self, references: Series, predictions: Series) -> list:
+        results = []
+
+        # SacreBLEU doesn't seem to support batching that isn't document-level, so
+        # each sentence must be run through separately
+        for index, ref_txt in references.items():
+            mt_txt = predictions[index]
+            score = self.bleu.compute(predictions=[mt_txt], references=[[ref_txt]])
+            results.append(score["score"])
+
+        return results
+
+
+class BLASERRefScore:
+    """Applies the BLASER 2.0 reference-based metric from the SONAR library."""
+
+    def __init__(self, ref_lang_code: str = "eng_Latn") -> None:
+        self.blaser_ref = load_blaser_model("blaser_2_0_ref").eval()
+        self.text_embedder = TextToEmbeddingModelPipeline(
+            encoder="text_sonar_basic_encoder", tokenizer="text_sonar_basic_encoder"
+        )
+        # SONAR language code for the reference and MT text
+        # Defaults to English
+        self.ref_lang_code = ref_lang_code
+
+    def get_scores(
+        self,
+        references: Series,
+        predictions: Series,
+        sources: Series,
+        source_lang_codes: Series,
+    ) -> list:
+        langs = np.unique(source_lang_codes)
+
+        # Store results for all languages so they can be returned together
+        results = []
+
+        # BLASER requires the source language, so at best we can batch by language as
+        # source_lang must be a string
+        for language in langs:
+            mask = source_lang_codes == language
+            sources_lang = np.array(sources[mask])
+            refs_lang = np.array(references[mask])
+            preds_lang = np.array(predictions[mask])
+
+            src_embs = self.text_embedder.predict(sources_lang, source_lang=language)
+            ref_embs = self.text_embedder.predict(
+                refs_lang, source_lang=self.ref_lang_code
+            )
+            mt_embs = self.text_embedder.predict(
+                preds_lang, source_lang=self.ref_lang_code
+            )
+
+            for i in range(len(src_embs)):
+                result = self.blaser_ref(
+                    src=src_embs[[i]], ref=ref_embs[[i]], mt=mt_embs[[i]]
+                ).item()
+                results.append(result)
+
+        return results
+
+
+class BLASERQEScore:
+    """Initialises and applies the BLASER 2.0 QE (reference-free) metric from the
+    SONAR library."""
+
+    def __init__(self, ref_lang_code: str = "eng_Latn") -> None:
+        self.blaser_qe = load_blaser_model("blaser_2_0_qe").eval()
+        self.text_embedder = TextToEmbeddingModelPipeline(
+            encoder="text_sonar_basic_encoder", tokenizer="text_sonar_basic_encoder"
+        )
+        # SONAR language code for the MT text
+        # Defaults to English
+        self.ref_lang_code = ref_lang_code
+
+    def get_scores(
+        self, predictions: Series, sources: Series, source_lang_codes: Series
+    ) -> list:
+        langs = np.unique(source_lang_codes)
+
+        # Store results for all languages so they can be returned together
+        results = []
+
+        # BLASER requires the source language, so at best we can batch by language as
+        # source_lang must be a string
+        for language in langs:
+            mask = source_lang_codes == language
+            sources_lang = np.array(sources[mask])
+            preds_lang = np.array(predictions[mask])
+
+            src_embs = self.text_embedder.predict(sources_lang, source_lang=language)
+            mt_embs = self.text_embedder.predict(
+                preds_lang, source_lang=self.ref_lang_code
+            )
+
+            for i in range(len(src_embs)):
+                result = self.blaser_qe(src=src_embs[[i]], mt=mt_embs[[i]]).item()
+                results.append(result)
+
+        return results
+
+
+class COMETRefScore:
+    """Applies COMET reference-based metric from the evaluate library."""
+
+    def __init__(self) -> None:
+        self.comet = evaluate.load("comet", model="wmt21-comet-mqm")
+
+    def get_scores(
+        self, references: Series, predictions: Series, sources: Series
+    ) -> list:
+        scores = self.comet.compute(
+            predictions=predictions,
+            references=references,
+            sources=sources,
+        )
+        return scores["scores"]
+
+
+class COMETQEScore:
+    """Applies COMET QE metric from the evaluate library."""
+
+    def __init__(self) -> None:
+        self.comet = evaluate.load("comet", model="wmt21-comet-qe-mqm")
+
+    def get_scores(
+        self, references: Series, predictions: Series, sources: Series
+    ) -> list:
+        scores = self.comet.compute(
+            predictions=predictions, references=references, sources=sources
+        )
+        return scores["scores"]
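A minimal usage sketch of the new metric wrappers (illustrative only: the sentences below are invented, and SacreBLEUScore is shown because it needs no model download; the BLASER and COMET classes follow the same get_scores pattern but fetch model checkpoints on first use):

    import pandas as pd

    from m4st.metrics import SacreBLEUScore

    references = pd.Series(["The cat sat on the mat.", "It is raining heavily."])
    predictions = pd.Series(["The cat sat on a mat.", "It rains a lot."])

    bleu = SacreBLEUScore()
    # One sentence-level SacreBLEU score per reference/prediction pair
    print(bleu.get_scores(references, predictions))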
diff --git a/src/m4st/process_demetr.py b/src/m4st/process_demetr.py
new file mode 100644
index 0000000..de8779d
--- /dev/null
+++ b/src/m4st/process_demetr.py
@@ -0,0 +1,162 @@
+"""
+Pipeline for running SacreBLEU, BLASER 2.0, and COMET on the DEMETR dataset.
+"""
+
+import csv
+import os
+
+import numpy as np
+import pandas as pd
+
+from m4st.metrics import (
+    BLASERQEScore,
+    BLASERRefScore,
+    COMETQEScore,
+    COMETRefScore,
+    SacreBLEUScore,
+)
+
+
+class ProcessDEMETR:
+    def __init__(
+        self,
+        output_filepath: os.PathLike | str,
+        demetr_root: os.PathLike | str,
+        metrics_to_use: list,
+    ) -> None:
+        # Conversion from DEMETR language tag to SONAR language code
+        self.language_codes = {
+            "chinese_simple": "zho_Hans",  # Hans for Simplified script
+            "czech": "ces_Latn",
+            "french": "fra_Latn",
+            "german": "deu_Latn",
+            "hindi": "hin_Deva",
+            "italian": "ita_Latn",
+            "japanese": "jpn_Jpan",
+            "polish": "pol_Latn",
+            "russian": "rus_Cyrl",
+            "spanish": "spa_Latn",
+        }
+        self.output_path = output_filepath
+        self.demetr_root = demetr_root
+        self.metrics_to_use = metrics_to_use
+
+        colnames = ["category", *self.metrics_to_use]
+
+        with open(self.output_path, "w") as output_file:
+            writer = csv.writer(output_file)
+            writer.writerow(colnames)
+
+        if "SacreBLEU" in self.metrics_to_use:
+            self.sacre_bleu = SacreBLEUScore()
+        if "BLASER_ref" in self.metrics_to_use:
+            self.blaser_ref = BLASERRefScore()
+        if "BLASER_qe" in self.metrics_to_use:
+            self.blaser_qe = BLASERQEScore()
+        if "COMET_ref" in self.metrics_to_use:
+            self.comet_ref = COMETRefScore()
+        if "COMET_qe" in self.metrics_to_use:
+            self.comet_qe = COMETQEScore()
+
+        print(f"Using metrics {self.metrics_to_use}")
+
+    def process_demetr_category(
+        self,
+        category: int,
+        cat_fp: str,
+        num_samples: int,
+        reverse_accuracy: bool = False,
+    ) -> None:
+        curr_ds_path = os.path.join(self.demetr_root, cat_fp)
+
+        # Load sentences into dataframe
+        demetr_df = pd.read_json(curr_ds_path)
+
+        ref_txts = demetr_df["eng_sent"]  # Human translation
+        mt_txts = demetr_df["mt_sent"]  # Original machine translation
+        src_txts = demetr_df["src_sent"]  # Foreign language source
+        dfluent_txts = demetr_df["pert_sent"]  # Perturbed machine translation
+        src_langs = demetr_df["lang_tag"]  # Source language
+        blaser_lang_codes = src_langs.replace(self.language_codes)
+
+        # Set up output arrays - typically (1000, n) where n is number of metrics
+        # Two sets of results for each metric, one fluent and one disfluent
+        mt_results = np.zeros((num_samples, len(self.metrics_to_use)))
+        dis_results = np.zeros((num_samples, len(self.metrics_to_use)))
+
+        for j, metric in enumerate(self.metrics_to_use):
+            if metric == "COMET_ref":
+                mt_results[:, j] = self.comet_ref.get_scores(
+                    ref_txts, mt_txts, src_txts
+                )
+                dis_results[:, j] = self.comet_ref.get_scores(
+                    ref_txts, dfluent_txts, src_txts
+                )
+            elif metric == "COMET_qe":
+                mt_results[:, j] = self.comet_qe.get_scores(ref_txts, mt_txts, src_txts)
+                dis_results[:, j] = self.comet_qe.get_scores(
+                    ref_txts, dfluent_txts, src_txts
+                )
+            elif metric == "BLASER_ref":
+                mt_results[:, j] = self.blaser_ref.get_scores(
+                    ref_txts, mt_txts, src_txts, blaser_lang_codes
+                )
+                dis_results[:, j] = self.blaser_ref.get_scores(
+                    ref_txts, dfluent_txts, src_txts, blaser_lang_codes
+                )
+            elif metric == "BLASER_qe":
+                mt_results[:, j] = self.blaser_qe.get_scores(
+                    mt_txts, src_txts, blaser_lang_codes
+                )
+                dis_results[:, j] = self.blaser_qe.get_scores(
+                    dfluent_txts, src_txts, blaser_lang_codes
+                )
+            elif metric == "SacreBLEU":
+                mt_results[:, j] = self.sacre_bleu.get_scores(ref_txts, mt_txts)
+                dis_results[:, j] = self.sacre_bleu.get_scores(ref_txts, dfluent_txts)
+            else:
+                print(f"Unknown metric {metric}")
+
+        mask = mt_results > dis_results
+        if reverse_accuracy:
+            results = np.count_nonzero(~mask, axis=0)
+        else:
+            results = np.count_nonzero(mask, axis=0)
+
+        results = results / num_samples * 100
+
+        results_str = [category, *results]
+
+        with open(self.output_path, "a") as output_file:
+            csv_writer = csv.writer(output_file)
+            csv_writer.writerow(results_str)
+
+    def process_demetr(
+        self,
+        samples_per_cat: int = 1000,
+        cats_to_process: list | None = None,
+    ) -> None:
+        if cats_to_process is None:
+            cats_to_process = []
+
+        # Get list of JSON files
+        # Each file contains sentences for a single DEMETR category
+        dataset_list = os.listdir(self.demetr_root)
+
+        for ds in dataset_list:
+            ds_cat = int(ds.split("_")[1].strip("id"))
+
+            if ds_cat in cats_to_process or not cats_to_process:
+                print(f"Processing input file {ds}")
+
+                # Accuracy metric is reversed for category 35 as in this case the
+                # reference text is passed as the disfluent translation and should
+                # therefore score more highly
+                reverse_acc = ds_cat == 35
+
+                self.process_demetr_category(
+                    ds_cat,
+                    ds,
+                    samples_per_cat,
+                    reverse_acc,
+                )
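A sketch of driving ProcessDEMETR directly rather than through the script above (the paths are hypothetical; demetr_root must contain the per-category DEMETR JSON files, whose names encode the category id, and the output directory must already exist):

    from m4st.process_demetr import ProcessDEMETR

    demetr = ProcessDEMETR(
        output_filepath="outputs/demetr/demetr_results.csv",
        demetr_root="datasets/demetr",
        metrics_to_use=["SacreBLEU"],  # cheapest metric, useful for a smoke test
    )
    # Process only category 1; omit cats_to_process to run every category found
    demetr.process_demetr(cats_to_process=[1])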
diff --git a/src/m4st/utils.py b/src/m4st/utils.py
new file mode 100644
index 0000000..6438f2c
--- /dev/null
+++ b/src/m4st/utils.py
@@ -0,0 +1,7 @@
+import json
+import os
+
+
+def load_json(json_path: os.PathLike | str) -> list | dict:
+    with open(json_path) as input_file:
+        return json.load(input_file)
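load_json is not called elsewhere in this diff; a hypothetical use for inspecting a single DEMETR file (the path is made up):

    from m4st.utils import load_json

    # Returns the parsed JSON content (list or dict)
    data = load_json("datasets/demetr/some_category.json")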