From 5d4eeae7359f81de3077cdc4a1f9185de30526bd Mon Sep 17 00:00:00 2001
From: Anders Jess Pedersen <anders.j.pedersen@alexandra.dk>
Date: Fri, 10 Nov 2023 11:04:37 +0100
Subject: [PATCH 1/5] feat: reorder operations, and fix sentence ids being
 non-unique

---
 src/coral_models/prepare_raw_data.py | 63 +++++++++++++---------------
 1 file changed, 30 insertions(+), 33 deletions(-)

diff --git a/src/coral_models/prepare_raw_data.py b/src/coral_models/prepare_raw_data.py
index f0e6edad..a5007763 100644
--- a/src/coral_models/prepare_raw_data.py
+++ b/src/coral_models/prepare_raw_data.py
@@ -99,7 +99,7 @@ def make_speaker_metadata(raw_path: Path, metadata_path: Path) -> pd.DataFrame:
     # Specification of gender is not consistent, so we need to correct that
     # by mapping "K" to "female", and "M" to "male".
     speakers["gender"] = speakers["gender"].apply(
-        lambda x: dict(M="male", K="female").get(x, x)
+        lambda x: dict(M="male", K="female", m="male", k="female").get(x, x)
     )
 
     # Create a speaker id column.
@@ -177,9 +177,6 @@ def make_recording_metadata(
     # We need filenames for later, when we want to create recording ids.
     recording_metadata["filename"] = recording_metadata.index.astype(str)
 
-    # Make a sentence content to sentence_id dict
-    sentence_content_to_id = dict(zip(sentences["text"], sentences["sentence_id"]))
-
     # Load speaker information from read aloud data
     recording_metadata_list = [recording_metadata]
     read_aloud_paths = raw_path.glob("*_oplæst_*")
@@ -214,17 +211,24 @@ def make_recording_metadata(
         # the next available id.
         read_aloud_data["sentence_id"] = -1
         for row_i, row in read_aloud_data.iterrows():
-            if row["transcription"] in sentence_content_to_id.keys():
-                read_aloud_data.loc[row_i, "sentence_id"] = sentence_content_to_id[
-                    row["transcription"]
-                ]
-            else:
-                sentence_content_to_id[row["transcription"]] = len(
-                    sentence_content_to_id
+            if row["transcription"] not in sentences["text"].values:
+                # Append new sentence to sentences dataframe
+                sentences = pd.concat(
+                    [
+                        sentences,
+                        pd.DataFrame(
+                            {
+                                "text": [row["transcription"]],
+                                "sentence_id": len(sentences),
+                            }
+                        ),
+                    ],
+                    ignore_index=True,
                 )
-                read_aloud_data.loc[row_i, "sentence_id"] = sentence_content_to_id[
-                    row["transcription"]
-                ]
+
+            read_aloud_data.loc[row_i, "sentence_id"] = sentences[
+                sentences["text"] == row["transcription"]
+            ].index[0]
 
         # Make a recorder_id. This is not in the read aloud data, as no we have
         # no information about the recorders for the read aloud data.
@@ -255,14 +259,6 @@ def make_recording_metadata(
         all_recording_metadata["recording_id"].notna()
     ].reset_index(drop=True)
 
-    # We have updated the sentence_content_to_id dict, so we also need to update the
-    # sentences dataframe with the new sentence ids
-    sentences = (
-        pd.DataFrame.from_dict(sentence_content_to_id, orient="index")
-        .reset_index()
-        .rename(columns={"index": "text", 0: "sentence_id"})
-    )
-
     # Prepend the sentence id column with "s"
     sentences["sentence_id"] = "s" + sentences.index.astype(str)
     all_recording_metadata["sentence_id"] = "s" + all_recording_metadata[
@@ -400,7 +396,18 @@ def prepare_raw_data(
         except FileNotFoundError:
             pass
 
+    # Write a README file
+    readme = make_readme()
+    with open(output_path / "README.md", "w") as f:
+        f.write(readme)
+
+    # Save the dataframes
+    speakers.to_excel(output_path / "speakers.xlsx")
+    sentences.to_excel(output_path / "sentences.xlsx")
+    recordings.to_excel(output_path / "recordings.xlsx")
+
     # Make a dataframe with statistics about the data
+    speakers["age"] = speakers["age"].astype(int)
     data_stats = pd.DataFrame(
         {
             "Number of speakers": len(speakers),
@@ -422,17 +429,7 @@ def prepare_raw_data(
         },
         index=[0],
     )
-
-    # Write a README file
-    readme = make_readme()
-    with open(output_path / "README.md", "w") as f:
-        f.write(readme)
-
-    # Save the dataframes
-    data_stats.to_csv(output_path / "data_stats.csv", index=False)
-    speakers.to_csv(output_path / "speakers.csv", index=False)
-    sentences.to_csv(output_path / "sentences.csv", index=False)
-    recordings.to_csv(output_path / "recordings.csv", index=False)
+    data_stats.to_excel(output_path / "data_stats.xlsx")
 
 
 def correct_country(country: str) -> str:

From be6f6838a2536aef37972cf3d374e29549ffabe4 Mon Sep 17 00:00:00 2001
From: Anders Jess Pedersen <anders.j.pedersen@alexandra.dk>
Date: Tue, 14 Nov 2023 12:36:21 +0100
Subject: [PATCH 2/5] feat: tqdm

---
 src/coral_models/prepare_raw_data.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/coral_models/prepare_raw_data.py b/src/coral_models/prepare_raw_data.py
index a5007763..4bddf5d0 100644
--- a/src/coral_models/prepare_raw_data.py
+++ b/src/coral_models/prepare_raw_data.py
@@ -180,7 +180,7 @@ def make_recording_metadata(
     # Load speaker information from read aloud data
     recording_metadata_list = [recording_metadata]
     read_aloud_paths = raw_path.glob("*_oplæst_*")
-    for read_aloud_path in read_aloud_paths:
+    for read_aloud_path in tqdm(list(read_aloud_paths)):
         read_aloud_data = get_data_from_db(read_aloud_path)
 
         # Format filenames
@@ -255,6 +255,10 @@ def make_recording_metadata(
 
     # Remove rows with no recording id. Sometimes recorders did not submit their
     # all their recordings.
+
+    #### IDS ARE STILL WRONG!=!=!==!?!??!?!?!
+    # MYBE BECAUSE RESET INDEX; GO THROUGH ALL LINES BELOW
+
     all_recording_metadata = all_recording_metadata[
         all_recording_metadata["recording_id"].notna()
     ].reset_index(drop=True)

From ddd0f415f5f64144f1c772e0d3c9c96e009bf6a1 Mon Sep 17 00:00:00 2001
From: Anders Jess Pedersen <anders.j.pedersen@alexandra.dk>
Date: Tue, 14 Nov 2023 12:39:55 +0100
Subject: [PATCH 3/5] fix: remove artifact

---
 src/coral_models/prepare_raw_data.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/coral_models/prepare_raw_data.py b/src/coral_models/prepare_raw_data.py
index 4bddf5d0..9959f549 100644
--- a/src/coral_models/prepare_raw_data.py
+++ b/src/coral_models/prepare_raw_data.py
@@ -255,10 +255,6 @@ def make_recording_metadata(
 
     # Remove rows with no recording id. Sometimes recorders did not submit their
     # all their recordings.
-
-    #### IDS ARE STILL WRONG!=!=!==!?!??!?!?!
-    # MYBE BECAUSE RESET INDEX; GO THROUGH ALL LINES BELOW
-
     all_recording_metadata = all_recording_metadata[
         all_recording_metadata["recording_id"].notna()
     ].reset_index(drop=True)

From 5f53c20fc2f3080ca1d5aa736a1377fbad978e22 Mon Sep 17 00:00:00 2001
From: Anders Jess Pedersen <anders.j.pedersen@alexandra.dk>
Date: Wed, 15 Nov 2023 10:18:38 +0100
Subject: [PATCH 4/5] feat: add a hidden folder for the sensitive portion of
 the preprocessed data.

---
 config/config.yaml                   |  1 +
 src/coral_models/prepare_raw_data.py |  8 ++++++--
 src/scripts/build_coral_data.py      | 11 +++++++++--
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/config/config.yaml b/config/config.yaml
index 332cb151..7350d1bf 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -11,6 +11,7 @@ dirs:
   processed: processed
   final: final
   models: models
+  hidden: hidden
 
 seed: 4242
 
diff --git a/src/coral_models/prepare_raw_data.py b/src/coral_models/prepare_raw_data.py
index 9959f549..c6186ce2 100644
--- a/src/coral_models/prepare_raw_data.py
+++ b/src/coral_models/prepare_raw_data.py
@@ -294,7 +294,8 @@ def prepare_raw_data(
     input_path: Path | str = Path("data/raw"),
     output_path: Path | str = Path("data/processed"),
     metadata_path: Path | str = Path("data/raw/metadata.csv"),
-):
+    hidden_output_path: Path | str = Path("data/hidden"),
+) -> None:
     """Prepare the raw data.
 
     Args:
@@ -304,10 +305,13 @@ def prepare_raw_data(
             Path to the processed data. Defaults to "data/processed".
         metadata_path (Path or str, optional):
             Path to the metadata. Defaults to "data/raw/metadata.csv".
+        hidden_output_path (Path or str, optional):
+            Path to save sensitive information. Defaults to "data/hidden".
     """
     input_path = Path(input_path)
     output_path = Path(output_path)
     metadata_path = Path(metadata_path)
+    hidden_output_path = Path(hidden_output_path)
 
     # Make speaker-metadata dataframe
     speakers = make_speaker_metadata(input_path, metadata_path)
@@ -402,7 +406,7 @@ def prepare_raw_data(
         f.write(readme)
 
     # Save the dataframes
-    speakers.to_excel(output_path / "speakers.xlsx")
+    speakers.to_excel(hidden_output_path / "speakers.xlsx")
     sentences.to_excel(output_path / "sentences.xlsx")
     recordings.to_excel(output_path / "recordings.xlsx")
 
diff --git a/src/scripts/build_coral_data.py b/src/scripts/build_coral_data.py
index 47c5353c..774ce3d4 100644
--- a/src/scripts/build_coral_data.py
+++ b/src/scripts/build_coral_data.py
@@ -2,6 +2,7 @@
 
 Usage:
     python build_coral_data.py <input_path> <metadata_path> <output_path>
+        <hidden_output_path>
 """
 
 import click
@@ -22,8 +23,14 @@
     "output_path",
     type=click.Path(),
 )
-def main(input_path: str, output_path: str, metadata_path: str) -> None:
-    prepare_raw_data(input_path, output_path, metadata_path)
+@click.argument(
+    "hidden_output_path",
+    type=click.Path(),
+)
+def main(
+    input_path: str, output_path: str, metadata_path: str, hidden_output_path: str
+) -> None:
+    prepare_raw_data(input_path, output_path, metadata_path, hidden_output_path)
 
 
 if __name__ == "__main__":

From edfdf28953b115bbf7355582561ef7278700cf38 Mon Sep 17 00:00:00 2001
From: Anders Jess Pedersen <anders.j.pedersen@alexandra.dk>
Date: Mon, 27 Nov 2023 13:36:53 +0100
Subject: [PATCH 5/5] fix: kwargs and script usage

---
 src/scripts/build_coral_data.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/scripts/build_coral_data.py b/src/scripts/build_coral_data.py
index 774ce3d4..a87478a3 100644
--- a/src/scripts/build_coral_data.py
+++ b/src/scripts/build_coral_data.py
@@ -1,7 +1,10 @@
 """Script that preprocesses the raw CoRal data.
 
 Usage:
-    python build_coral_data.py <input_path> <metadata_path> <output_path>
+    python src/scripts/build_coral_data.py \
+        <input_path> \
+        <metadata_path> \
+        <output_path> \
         <hidden_output_path>
 """
 
@@ -30,7 +33,12 @@
 def main(
     input_path: str, output_path: str, metadata_path: str, hidden_output_path: str
 ) -> None:
-    prepare_raw_data(input_path, output_path, metadata_path, hidden_output_path)
+    prepare_raw_data(
+        input_path=input_path,
+        output_path=output_path,
+        metadata_path=metadata_path,
+        hidden_output_path=hidden_output_path,
+    )
 
 
 if __name__ == "__main__":