diff --git a/pyproject.toml b/pyproject.toml
index ef11293..913542e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ authors = [
 ]
 description = "Evaluation of Metrics for Speech Translation (M4ST)"
 readme = "README.md"
-requires-python = ">=3.10"
+requires-python = "==3.11.*"
 classifiers = [
   "Development Status :: 1 - Planning",
   "Intended Audience :: Science/Research",
@@ -27,15 +27,27 @@ classifiers = [
   "Typing :: Typed",
 ]
 dependencies = [
-  "tqdm",
-  "requests"
+  "tqdm",
+  "requests",
+  "evaluate>=0.4.3",
+  "fairseq2>=0.2.0",
+  "filelock>=3.0.12",
+  "ipykernel>=6.29.5",
+  "nltk>=3.9.1",
+  "pandas>=2.2.3",
+  "sacrebleu>=2.4.3",
+  "seaborn>=0.13.2",
+  "sonar-space>=0.2.0",
+  "torch==2.0.1",
+  "torchvision>=0.15.2",
+  "unbabel-comet==2.2.3",
 ]
 
 [project.optional-dependencies]
 dev = [
   "pytest >=6",
   "pytest-cov >=3",
-  "pre-commit",
+  "pre-commit>=3.2.0",
 ]
 
 [project.urls]
diff --git a/scripts/demetr/process_demetr.py b/scripts/demetr/process_demetr.py
new file mode 100644
index 0000000..004a766
--- /dev/null
+++ b/scripts/demetr/process_demetr.py
@@ -0,0 +1,63 @@
+import argparse
+import os
+
+from m4st.process_demetr import ProcessDEMETR
+
+
+def main(args: dict) -> None:
+    output_dir = args["output_dir"]
+    output_file = args["output_file"]
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    demetr = ProcessDEMETR(
+        metrics_to_use=args["metrics"],
+        output_filepath=os.path.join(output_dir, output_file),
+        demetr_root=args["dataset_dir"],
+    )
+
+    print(args["cats"])
+    demetr.process_demetr(cats_to_process=args["cats"])
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--dataset-dir",
+        type=str,
+        default="../../datasets/demetr",
+        help="Root dataset directory \
+            for DEMETR containing JSON files.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="../../outputs/demetr",
+        help="Path to output directory. Will be created by script.",
+    )
+    parser.add_argument(
+        "--output-file",
+        type=str,
+        default="demetr_results.csv",
+        help="Name for output CSV file.",
+    )
+    parser.add_argument(
+        "--metrics",
+        nargs="+",
+        type=str,
+        default=["COMET_ref", "COMET_qe", "BLASER_ref", "BLASER_qe", "SacreBLEU"],
+        help="Metrics to use. Must be one or more \
+            of COMET_ref, COMET_qe, BLASER_ref, BLASER_qe, SacreBLEU. Defaults to all.",
+    )
+    parser.add_argument(
+        "--cats",
+        nargs="+",
+        type=int,
+        required=False,
+        help="Specific DEMETR disfluency \
+            categories to be processed. By default all will be processed.",
+    )
+
+    args = parser.parse_args()
+    main(vars(args))
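An illustrative invocation of the new script (assuming the package has been installed, e.g. with pip install -e ., and the command is run from scripts/demetr/ so that the default --dataset-dir and --output-dir paths resolve; both can be overridden):

    python process_demetr.py --metrics SacreBLEU COMET_ref --cats 1 2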
diff --git a/src/m4st/metrics.py b/src/m4st/metrics.py
new file mode 100644
index 0000000..5d3f455
--- /dev/null
+++ b/src/m4st/metrics.py
@@ -0,0 +1,145 @@
+import evaluate
+import numpy as np
+from pandas import Series
+from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline
+from sonar.models.blaser.loader import load_blaser_model
+
+
+class SacreBLEUScore:
+    """Applies SacreBLEU from the evaluate library."""
+
+    def __init__(self) -> None:
+        self.bleu = evaluate.load("sacrebleu")
+
+    def get_scores(self, references: Series, predictions: Series) -> list:
+        results = []
+
+        # SacreBLEU doesn't seem to support batching that isn't document-level, so
+        # each sentence must be run through separately
+        for index, ref_txt in references.items():
+            mt_txt = predictions[index]
+            score = self.bleu.compute(predictions=[mt_txt], references=[[ref_txt]])
+            results.append(score["score"])
+
+        return results
+
+
+class BLASERRefScore:
+    """Applies the BLASER 2.0 reference-based metric from the SONAR library."""
+
+    def __init__(self, ref_lang_code: str = "eng_Latn") -> None:
+        self.blaser_ref = load_blaser_model("blaser_2_0_ref").eval()
+        self.text_embedder = TextToEmbeddingModelPipeline(
+            encoder="text_sonar_basic_encoder", tokenizer="text_sonar_basic_encoder"
+        )
+        # SONAR language code for the reference and MT text
+        # Defaults to English
+        self.ref_lang_code = ref_lang_code
+
+    def get_scores(
+        self,
+        references: Series,
+        predictions: Series,
+        sources: Series,
+        source_lang_codes: Series,
+    ) -> list:
+        langs = np.unique(source_lang_codes)
+
+        # Store results for all languages so they can be returned together
+        results = []
+
+        # BLASER requires the source language, so at best we can batch by language as
+        # source_lang must be a string
+        for language in langs:
+            mask = source_lang_codes == language
+            sources_lang = np.array(sources[mask])
+            refs_lang = np.array(references[mask])
+            preds_lang = np.array(predictions[mask])
+
+            src_embs = self.text_embedder.predict(sources_lang, source_lang=language)
+            ref_embs = self.text_embedder.predict(
+                refs_lang, source_lang=self.ref_lang_code
+            )
+            mt_embs = self.text_embedder.predict(
+                preds_lang, source_lang=self.ref_lang_code
+            )
+
+            for i in range(len(src_embs)):
+                result = self.blaser_ref(
+                    src=src_embs[[i]], ref=ref_embs[[i]], mt=mt_embs[[i]]
+                ).item()
+                results.append(result)
+
+        return results
+
+
+class BLASERQEScore:
+    """Initialises and applies the BLASER 2.0 QE (reference-free) metric from the
+    SONAR library."""
+
+    def __init__(self, ref_lang_code: str = "eng_Latn") -> None:
+        self.blaser_qe = load_blaser_model("blaser_2_0_qe").eval()
+        self.text_embedder = TextToEmbeddingModelPipeline(
+            encoder="text_sonar_basic_encoder", tokenizer="text_sonar_basic_encoder"
+        )
+        # SONAR language code for the MT text
+        # Defaults to English
+        self.ref_lang_code = ref_lang_code
+
+    def get_scores(
+        self, predictions: Series, sources: Series, source_lang_codes: Series
+    ) -> list:
+        langs = np.unique(source_lang_codes)
+
+        # Store results for all languages so they can be returned together
+        results = []
+
+        # BLASER requires the source language, so at best we can batch by language as
+        # source_lang must be a string
+        for language in langs:
+            mask = source_lang_codes == language
+            sources_lang = np.array(sources[mask])
+            preds_lang = np.array(predictions[mask])
+
+            src_embs = self.text_embedder.predict(sources_lang, source_lang=language)
+            mt_embs = self.text_embedder.predict(
+                preds_lang, source_lang=self.ref_lang_code
+            )
+
+            for i in range(len(src_embs)):
+                result = self.blaser_qe(src=src_embs[[i]], mt=mt_embs[[i]]).item()
+                results.append(result)
+
+        return results
+
+
+class COMETRefScore:
+    """Applies COMET reference-based metric from the evaluate library."""
+
+    def __init__(self) -> None:
+        self.comet = evaluate.load("comet", model="wmt21-comet-mqm")
+
+    def get_scores(
+        self, references: Series, predictions: Series, sources: Series
+    ) -> list:
+        scores = self.comet.compute(
+            predictions=predictions,
+            references=references,
+            sources=sources,
+        )
+        return scores["scores"]
+
+
+class COMETQEScore:
+    """Applies COMET QE metric from the evaluate library."""
+
+    def __init__(self) -> None:
+        self.comet = evaluate.load("comet", model="wmt21-comet-qe-mqm")
+
+    def get_scores(
+        self, references: Series, predictions: Series, sources: Series
+    ) -> list:
+        scores = self.comet.compute(
+            predictions=predictions, references=references, sources=sources
+        )
+        return scores["scores"]
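A minimal usage sketch of the new metric wrappers (illustrative only: the sentences below are invented, and SacreBLEUScore is shown because it needs no model download; the BLASER and COMET classes follow the same get_scores pattern but fetch model checkpoints on first use):

    import pandas as pd

    from m4st.metrics import SacreBLEUScore

    references = pd.Series(["The cat sat on the mat.", "It is raining heavily."])
    predictions = pd.Series(["The cat sat on a mat.", "It rains a lot."])

    bleu = SacreBLEUScore()
    # One sentence-level SacreBLEU score per reference/prediction pair
    print(bleu.get_scores(references, predictions))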
diff --git a/src/m4st/process_demetr.py b/src/m4st/process_demetr.py
new file mode 100644
index 0000000..de8779d
--- /dev/null
+++ b/src/m4st/process_demetr.py
@@ -0,0 +1,162 @@
+"""
+Pipeline for running SacreBLEU, BLASER 2.0, and COMET on the DEMETR dataset.
+"""
+
+import csv
+import os
+
+import numpy as np
+import pandas as pd
+
+from m4st.metrics import (
+    BLASERQEScore,
+    BLASERRefScore,
+    COMETQEScore,
+    COMETRefScore,
+    SacreBLEUScore,
+)
+
+
+class ProcessDEMETR:
+    def __init__(
+        self,
+        output_filepath: os.PathLike | str,
+        demetr_root: os.PathLike | str,
+        metrics_to_use: list,
+    ) -> None:
+        # Conversion from DEMETR language tag to SONAR language code
+        self.language_codes = {
+            "chinese_simple": "zho_Hans",  # Hans for Simplified script
+            "czech": "ces_Latn",
+            "french": "fra_Latn",
+            "german": "deu_Latn",
+            "hindi": "hin_Deva",
+            "italian": "ita_Latn",
+            "japanese": "jpn_Jpan",
+            "polish": "pol_Latn",
+            "russian": "rus_Cyrl",
+            "spanish": "spa_Latn",
+        }
+        self.output_path = output_filepath
+        self.demetr_root = demetr_root
+        self.metrics_to_use = metrics_to_use
+
+        colnames = ["category", *self.metrics_to_use]
+
+        with open(self.output_path, "w") as output_file:
+            writer = csv.writer(output_file)
+            writer.writerow(colnames)
+
+        if "SacreBLEU" in self.metrics_to_use:
+            self.sacre_bleu = SacreBLEUScore()
+        if "BLASER_ref" in self.metrics_to_use:
+            self.blaser_ref = BLASERRefScore()
+        if "BLASER_qe" in self.metrics_to_use:
+            self.blaser_qe = BLASERQEScore()
+        if "COMET_ref" in self.metrics_to_use:
+            self.comet_ref = COMETRefScore()
+        if "COMET_qe" in self.metrics_to_use:
+            self.comet_qe = COMETQEScore()
+
+        print(f"Using metrics {self.metrics_to_use}")
+
+    def process_demetr_category(
+        self,
+        category: int,
+        cat_fp: str,
+        num_samples: int,
+        reverse_accuracy: bool = False,
+    ) -> None:
+        curr_ds_path = os.path.join(self.demetr_root, cat_fp)
+
+        # Load sentences into dataframe
+        demetr_df = pd.read_json(curr_ds_path)
+
+        ref_txts = demetr_df["eng_sent"]  # Human translation
+        mt_txts = demetr_df["mt_sent"]  # Original machine translation
+        src_txts = demetr_df["src_sent"]  # Foreign language source
+        dfluent_txts = demetr_df["pert_sent"]  # Perturbed machine translation
+        src_langs = demetr_df["lang_tag"]  # Source language
+        blaser_lang_codes = src_langs.replace(self.language_codes)
+
+        # Set up output arrays - typically (1000, n) where n is number of metrics
+        # Two sets of results for each metric, one fluent and one disfluent
+        mt_results = np.zeros((num_samples, len(self.metrics_to_use)))
+        dis_results = np.zeros((num_samples, len(self.metrics_to_use)))
+
+        for j, metric in enumerate(self.metrics_to_use):
+            if metric == "COMET_ref":
+                mt_results[:, j] = self.comet_ref.get_scores(
+                    ref_txts, mt_txts, src_txts
+                )
+                dis_results[:, j] = self.comet_ref.get_scores(
+                    ref_txts, dfluent_txts, src_txts
+                )
+            elif metric == "COMET_qe":
+                mt_results[:, j] = self.comet_qe.get_scores(ref_txts, mt_txts, src_txts)
+                dis_results[:, j] = self.comet_qe.get_scores(
+                    ref_txts, dfluent_txts, src_txts
+                )
+            elif metric == "BLASER_ref":
+                mt_results[:, j] = self.blaser_ref.get_scores(
+                    ref_txts, mt_txts, src_txts, blaser_lang_codes
+                )
+                dis_results[:, j] = self.blaser_ref.get_scores(
+                    ref_txts, dfluent_txts, src_txts, blaser_lang_codes
+                )
+            elif metric == "BLASER_qe":
+                mt_results[:, j] = self.blaser_qe.get_scores(
+                    mt_txts, src_txts, blaser_lang_codes
+                )
+                dis_results[:, j] = self.blaser_qe.get_scores(
+                    dfluent_txts, src_txts, blaser_lang_codes
+                )
+            elif metric == "SacreBLEU":
+                mt_results[:, j] = self.sacre_bleu.get_scores(ref_txts, mt_txts)
+                dis_results[:, j] = self.sacre_bleu.get_scores(ref_txts, dfluent_txts)
+            else:
+                print(f"Unknown metric {metric}")
+
+        mask = mt_results > dis_results
+        if reverse_accuracy:
+            results = np.count_nonzero(~mask, axis=0)
+        else:
+            results = np.count_nonzero(mask, axis=0)
+
+        results = results / num_samples * 100
+
+        results_str = [category, *results]
+
+        with open(self.output_path, "a") as output_file:
+            csv_writer = csv.writer(output_file)
+            csv_writer.writerow(results_str)
+
+    def process_demetr(
+        self,
+        samples_per_cat: int = 1000,
+        cats_to_process: list | None = None,
+    ) -> None:
+        if cats_to_process is None:
+            cats_to_process = []
+
+        # Get list of JSON files
+        # Each file contains sentences for a single DEMETR category
+        dataset_list = os.listdir(self.demetr_root)
+
+        for ds in dataset_list:
+            ds_cat = int(ds.split("_")[1].strip("id"))
+
+            if ds_cat in cats_to_process or not cats_to_process:
+                print(f"Processing input file {ds}")
+
+                # Accuracy metric is reversed for category 35 as in this case the
+                # reference text is passed as the disfluent translation and should
+                # therefore score more highly
+                reverse_acc = ds_cat == 35
+
+                self.process_demetr_category(
+                    ds_cat,
+                    ds,
+                    samples_per_cat,
+                    reverse_acc,
+                )
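A sketch of driving ProcessDEMETR directly rather than through the script above (the paths are hypothetical; demetr_root must contain the per-category DEMETR JSON files, whose names encode the category id, and the output directory must already exist):

    from m4st.process_demetr import ProcessDEMETR

    demetr = ProcessDEMETR(
        output_filepath="outputs/demetr/demetr_results.csv",
        demetr_root="datasets/demetr",
        metrics_to_use=["SacreBLEU"],  # cheapest metric, useful for a smoke test
    )
    # Process only category 1; omit cats_to_process to run every category found
    demetr.process_demetr(cats_to_process=[1])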
diff --git a/src/m4st/utils.py b/src/m4st/utils.py
new file mode 100644
index 0000000..6438f2c
--- /dev/null
+++ b/src/m4st/utils.py
@@ -0,0 +1,7 @@
+import json
+import os
+
+
+def load_json(json_path: os.PathLike | str) -> list | dict:
+    with open(json_path) as input_file:
+        return json.load(input_file)
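load_json is not called elsewhere in this diff; a hypothetical use for inspecting a single DEMETR file (the path is made up):

    from m4st.utils import load_json

    # Returns the parsed JSON content (list or dict)
    data = load_json("datasets/demetr/some_category.json")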