Hotfix/sentence ids #46

Merged (6 commits) on Nov 27, 2023
1 change: 1 addition & 0 deletions config/config.yaml
@@ -11,6 +11,7 @@ dirs:
processed: processed
final: final
models: models
hidden: hidden

seed: 4242

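For context, the new dirs.hidden entry can be resolved to a concrete directory once the config is loaded. A minimal sketch, assuming the config is read as plain YAML and that the directories live under a data/ root (both are assumptions; the project may load its config differently):

from pathlib import Path

import yaml  # assumes PyYAML is available

# Load the project config; the loading mechanism used by the repo may differ.
with open("config/config.yaml") as f:
    config = yaml.safe_load(f)

# Resolve the new hidden directory next to the other output directories.
hidden_dir = Path("data") / config["dirs"]["hidden"]
hidden_dir.mkdir(parents=True, exist_ok=True)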
71 changes: 36 additions & 35 deletions src/coral_models/prepare_raw_data.py
@@ -99,7 +99,7 @@ def make_speaker_metadata(raw_path: Path, metadata_path: Path) -> pd.DataFrame:
# Specification of gender is not consistent, so we need to correct that
# by mapping "K" to "female", and "M" to "male".
speakers["gender"] = speakers["gender"].apply(
lambda x: dict(M="male", K="female").get(x, x)
lambda x: dict(M="male", K="female", m="male", k="female").get(x, x)
)

# Create a speaker id column.
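As a side note, an equivalent case-insensitive formulation of this mapping (a sketch only, not part of the PR) would normalise the code before the lookup:

# Sketch: upper-case the gender code first, so "m"/"k" are covered without
# listing both casings in the dict; unknown values fall through unchanged.
speakers["gender"] = speakers["gender"].apply(
    lambda x: {"M": "male", "K": "female"}.get(str(x).upper(), x)
)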
@@ -177,13 +177,10 @@ def make_recording_metadata(
# We need filenames for later, when we want to create recording ids.
recording_metadata["filename"] = recording_metadata.index.astype(str)

# Make a sentence content to sentence_id dict
sentence_content_to_id = dict(zip(sentences["text"], sentences["sentence_id"]))

# Load speaker information from read aloud data
recording_metadata_list = [recording_metadata]
read_aloud_paths = raw_path.glob("*_oplæst_*")
for read_aloud_path in read_aloud_paths:
for read_aloud_path in tqdm(list(read_aloud_paths)):
read_aloud_data = get_data_from_db(read_aloud_path)

# Format filenames
@@ -214,17 +211,24 @@
# the next available id.
read_aloud_data["sentence_id"] = -1
for row_i, row in read_aloud_data.iterrows():
if row["transcription"] in sentence_content_to_id.keys():
read_aloud_data.loc[row_i, "sentence_id"] = sentence_content_to_id[
row["transcription"]
]
else:
sentence_content_to_id[row["transcription"]] = len(
sentence_content_to_id
if row["transcription"] not in sentences["text"].values:
# Append new sentence to sentences dataframe
sentences = pd.concat(
[
sentences,
pd.DataFrame(
{
"text": [row["transcription"]],
"sentence_id": len(sentences),
}
),
],
ignore_index=True,
)
read_aloud_data.loc[row_i, "sentence_id"] = sentence_content_to_id[
row["transcription"]
]

read_aloud_data.loc[row_i, "sentence_id"] = sentences[
sentences["text"] == row["transcription"]
].index[0]

# Make a recorder_id. This is not in the read aloud data, as we have
# no information about the recorders for the read aloud data.
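For illustration, the new id-assignment logic in isolation, as a minimal runnable sketch with toy data (the toy dataframes and values below are invented; column names follow the diff):

import pandas as pd

# Toy stand-ins for the real dataframes; values are invented for illustration.
sentences = pd.DataFrame({"text": ["hej verden"], "sentence_id": [0]})
read_aloud_data = pd.DataFrame({"transcription": ["hej verden", "en ny sætning"]})

read_aloud_data["sentence_id"] = -1
for row_i, row in read_aloud_data.iterrows():
    if row["transcription"] not in sentences["text"].values:
        # Unseen sentence: append it to the sentences dataframe with the next id.
        sentences = pd.concat(
            [
                sentences,
                pd.DataFrame(
                    {"text": [row["transcription"]], "sentence_id": len(sentences)}
                ),
            ],
            ignore_index=True,
        )
    # Known (or just appended) sentence: its row index doubles as the sentence id.
    read_aloud_data.loc[row_i, "sentence_id"] = sentences[
        sentences["text"] == row["transcription"]
    ].index[0]

print(read_aloud_data)  # sentence_id column: [0, 1]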
@@ -255,14 +259,6 @@ def make_recording_metadata(
all_recording_metadata["recording_id"].notna()
].reset_index(drop=True)

# We have updated the sentence_content_to_id dict, so we also need to update the
# sentences dataframe with the new sentence ids
sentences = (
pd.DataFrame.from_dict(sentence_content_to_id, orient="index")
.reset_index()
.rename(columns={"index": "text", 0: "sentence_id"})
)

# Prepend the sentence id column with "s"
sentences["sentence_id"] = "s" + sentences.index.astype(str)
all_recording_metadata["sentence_id"] = "s" + all_recording_metadata[
@@ -298,7 +294,8 @@ def prepare_raw_data(
input_path: Path | str = Path("data/raw"),
output_path: Path | str = Path("data/processed"),
metadata_path: Path | str = Path("data/raw/metadata.csv"),
):
hidden_output_path: Path | str = Path("data/hidden"),
) -> None:
"""Prepare the raw data.

Args:
@@ -308,10 +305,13 @@
Path to the processed data. Defaults to "data/processed".
metadata_path (Path or str, optional):
Path to the metadata. Defaults to "data/raw/metadata.csv".
hidden_output_path (Path or str, optional):
Path to save sensitive information. Defaults to "data/hidden".
"""
input_path = Path(input_path)
output_path = Path(output_path)
metadata_path = Path(metadata_path)
hidden_output_path = Path(hidden_output_path)

# Make speaker-metadata dataframe
speakers = make_speaker_metadata(input_path, metadata_path)
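For reference, a call with the new keyword argument might look as follows (a usage sketch with the documented defaults; the import path is assumed from the repository layout):

from pathlib import Path

from coral_models.prepare_raw_data import prepare_raw_data  # import path assumed

# Run the preparation step, sending sensitive speaker metadata to a separate
# hidden directory while the shareable outputs go to the processed directory.
prepare_raw_data(
    input_path=Path("data/raw"),
    output_path=Path("data/processed"),
    metadata_path=Path("data/raw/metadata.csv"),
    hidden_output_path=Path("data/hidden"),
)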
@@ -400,7 +400,18 @@ def prepare_raw_data(
except FileNotFoundError:
pass

# Write a README file
readme = make_readme()
with open(output_path / "README.md", "w") as f:
f.write(readme)

# Save the dataframes
speakers.to_excel(hidden_output_path / "speakers.xlsx")
sentences.to_excel(output_path / "sentences.xlsx")
recordings.to_excel(output_path / "recordings.xlsx")

# Make a dataframe with statistics about the data
speakers["age"] = speakers["age"].astype(int)
data_stats = pd.DataFrame(
{
"Number of speakers": len(speakers),
@@ -422,17 +433,7 @@
},
index=[0],
)

# Write a README file
readme = make_readme()
with open(output_path / "README.md", "w") as f:
f.write(readme)

# Save the dataframes
data_stats.to_csv(output_path / "data_stats.csv", index=False)
speakers.to_csv(output_path / "speakers.csv", index=False)
sentences.to_csv(output_path / "sentences.csv", index=False)
recordings.to_csv(output_path / "recordings.csv", index=False)
data_stats.to_excel(output_path / "data_stats.xlsx")


def correct_country(country: str) -> str:
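Since speaker metadata is now written to a separate hidden location, both output directories need to exist before the Excel files are saved. A minimal sketch, assuming they are not created elsewhere in the pipeline (an assumption; paths are the documented defaults):

from pathlib import Path

# Ensure both the public and the hidden output directories exist up front.
for directory in (Path("data/processed"), Path("data/hidden")):
    directory.mkdir(parents=True, exist_ok=True)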
21 changes: 18 additions & 3 deletions src/scripts/build_coral_data.py
@@ -1,7 +1,11 @@
"""Script that preprocesses the raw CoRal data.

Usage:
python build_coral_data.py <input_path> <metadata_path> <output_path>
python src/scripts/build_coral_data.py \
<input_path> \
<metadata_path> \
<output_path> \
<hidden_output_path>
"""

import click
@@ -22,8 +26,19 @@
"output_path",
type=click.Path(),
)
def main(input_path: str, output_path: str, metadata_path: str) -> None:
prepare_raw_data(input_path, output_path, metadata_path)
@click.argument(
"hidden_output_path",
type=click.Path(),
)
def main(
input_path: str, output_path: str, metadata_path: str, hidden_output_path: str
) -> None:
prepare_raw_data(
input_path=input_path,
output_path=output_path,
metadata_path=metadata_path,
hidden_output_path=hidden_output_path,
)


if __name__ == "__main__":
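With the extra argument registered, the command now takes four positional paths, as in the usage string above. A quick way to exercise it from Python (a sketch; the import path is assumed from the repository layout, and the paths are the documented defaults):

from click.testing import CliRunner

from src.scripts.build_coral_data import main  # import path is an assumption

# Invoke the CLI in-process with all four positional arguments; the final one
# is the new hidden output directory for sensitive speaker metadata.
runner = CliRunner()
result = runner.invoke(
    main, ["data/raw", "data/raw/metadata.csv", "data/processed", "data/hidden"]
)
print(result.exit_code)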