From 5d4eeae7359f81de3077cdc4a1f9185de30526bd Mon Sep 17 00:00:00 2001 From: Anders Jess Pedersen Date: Fri, 10 Nov 2023 11:04:37 +0100 Subject: [PATCH 1/5] feat: reorder operations, and fix sentence ids being non-unique --- src/coral_models/prepare_raw_data.py | 63 +++++++++++++--------------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/src/coral_models/prepare_raw_data.py b/src/coral_models/prepare_raw_data.py index f0e6edad..a5007763 100644 --- a/src/coral_models/prepare_raw_data.py +++ b/src/coral_models/prepare_raw_data.py @@ -99,7 +99,7 @@ def make_speaker_metadata(raw_path: Path, metadata_path: Path) -> pd.DataFrame: # Specification of gender is not consistent, so we need to correct that # by mapping "K" to "female", and "M" to "male". speakers["gender"] = speakers["gender"].apply( - lambda x: dict(M="male", K="female").get(x, x) + lambda x: dict(M="male", K="female", m="male", k="female").get(x, x) ) # Create a speaker id column. @@ -177,9 +177,6 @@ def make_recording_metadata( # We need filenames for later, when we want to create recording ids. recording_metadata["filename"] = recording_metadata.index.astype(str) - # Make a sentence content to sentence_id dict - sentence_content_to_id = dict(zip(sentences["text"], sentences["sentence_id"])) - # Load speaker information from read aloud data recording_metadata_list = [recording_metadata] read_aloud_paths = raw_path.glob("*_oplæst_*") @@ -214,17 +211,24 @@ def make_recording_metadata( # the next available id. 
read_aloud_data["sentence_id"] = -1 for row_i, row in read_aloud_data.iterrows(): - if row["transcription"] in sentence_content_to_id.keys(): - read_aloud_data.loc[row_i, "sentence_id"] = sentence_content_to_id[ - row["transcription"] - ] - else: - sentence_content_to_id[row["transcription"]] = len( - sentence_content_to_id + if row["transcription"] not in sentences["text"].values: + # Append new sentence to sentences dataframe + sentences = pd.concat( + [ + sentences, + pd.DataFrame( + { + "text": [row["transcription"]], + "sentence_id": len(sentences), + } + ), + ], + ignore_index=True, ) - read_aloud_data.loc[row_i, "sentence_id"] = sentence_content_to_id[ - row["transcription"] - ] + + read_aloud_data.loc[row_i, "sentence_id"] = sentences[ + sentences["text"] == row["transcription"] + ].index[0] # Make a recorder_id. This is not in the read aloud data, as no we have # no information about the recorders for the read aloud data. @@ -255,14 +259,6 @@ def make_recording_metadata( all_recording_metadata["recording_id"].notna() ].reset_index(drop=True) - # We have updated the sentence_content_to_id dict, so we also need to update the - # sentences dataframe with the new sentence ids - sentences = ( - pd.DataFrame.from_dict(sentence_content_to_id, orient="index") - .reset_index() - .rename(columns={"index": "text", 0: "sentence_id"}) - ) - # Prepend the sentence id column with "s" sentences["sentence_id"] = "s" + sentences.index.astype(str) all_recording_metadata["sentence_id"] = "s" + all_recording_metadata[ @@ -400,7 +396,18 @@ def prepare_raw_data( except FileNotFoundError: pass + # Write a README file + readme = make_readme() + with open(output_path / "README.md", "w") as f: + f.write(readme) + + # Save the dataframes + speakers.to_excel(output_path / "speakers.xlsx") + sentences.to_excel(output_path / "sentences.xlsx") + recordings.to_excel(output_path / "recordings.xlsx") + # Make a dataframe with statistics about the data + speakers["age"] = 
speakers["age"].astype(int) data_stats = pd.DataFrame( { "Number of speakers": len(speakers), @@ -422,17 +429,7 @@ def prepare_raw_data( }, index=[0], ) - - # Write a README file - readme = make_readme() - with open(output_path / "README.md", "w") as f: - f.write(readme) - - # Save the dataframes - data_stats.to_csv(output_path / "data_stats.csv", index=False) - speakers.to_csv(output_path / "speakers.csv", index=False) - sentences.to_csv(output_path / "sentences.csv", index=False) - recordings.to_csv(output_path / "recordings.csv", index=False) + data_stats.to_excel(output_path / "data_stats.xlsx") def correct_country(country: str) -> str: From be6f6838a2536aef37972cf3d374e29549ffabe4 Mon Sep 17 00:00:00 2001 From: Anders Jess Pedersen Date: Tue, 14 Nov 2023 12:36:21 +0100 Subject: [PATCH 2/5] feat: tqdm --- src/coral_models/prepare_raw_data.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/coral_models/prepare_raw_data.py b/src/coral_models/prepare_raw_data.py index a5007763..4bddf5d0 100644 --- a/src/coral_models/prepare_raw_data.py +++ b/src/coral_models/prepare_raw_data.py @@ -180,7 +180,7 @@ def make_recording_metadata( # Load speaker information from read aloud data recording_metadata_list = [recording_metadata] read_aloud_paths = raw_path.glob("*_oplæst_*") - for read_aloud_path in read_aloud_paths: + for read_aloud_path in tqdm(list(read_aloud_paths)): read_aloud_data = get_data_from_db(read_aloud_path) # Format filenames @@ -255,6 +255,10 @@ def make_recording_metadata( # Remove rows with no recording id. Sometimes recorders did not submit their # all their recordings. + + #### IDS ARE STILL WRONG!=!=!==!?!??!?!?! 
+ # MYBE BECAUSE RESET INDEX; GO THROUGH ALL LINES BELOW + all_recording_metadata = all_recording_metadata[ all_recording_metadata["recording_id"].notna() ].reset_index(drop=True) From ddd0f415f5f64144f1c772e0d3c9c96e009bf6a1 Mon Sep 17 00:00:00 2001 From: Anders Jess Pedersen Date: Tue, 14 Nov 2023 12:39:55 +0100 Subject: [PATCH 3/5] fix: remove artifact --- src/coral_models/prepare_raw_data.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/coral_models/prepare_raw_data.py b/src/coral_models/prepare_raw_data.py index 4bddf5d0..9959f549 100644 --- a/src/coral_models/prepare_raw_data.py +++ b/src/coral_models/prepare_raw_data.py @@ -255,10 +255,6 @@ def make_recording_metadata( # Remove rows with no recording id. Sometimes recorders did not submit their # all their recordings. - - #### IDS ARE STILL WRONG!=!=!==!?!??!?!?! - # MYBE BECAUSE RESET INDEX; GO THROUGH ALL LINES BELOW - all_recording_metadata = all_recording_metadata[ all_recording_metadata["recording_id"].notna() ].reset_index(drop=True) From 5f53c20fc2f3080ca1d5aa736a1377fbad978e22 Mon Sep 17 00:00:00 2001 From: Anders Jess Pedersen Date: Wed, 15 Nov 2023 10:18:38 +0100 Subject: [PATCH 4/5] feat: add a hidden folder for the sensitive portion of the preprocessed data. 
--- config/config.yaml | 1 + src/coral_models/prepare_raw_data.py | 8 ++++++-- src/scripts/build_coral_data.py | 11 +++++++++-- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 332cb151..7350d1bf 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -11,6 +11,7 @@ dirs: processed: processed final: final models: models + hidden: hidden seed: 4242 diff --git a/src/coral_models/prepare_raw_data.py b/src/coral_models/prepare_raw_data.py index 9959f549..c6186ce2 100644 --- a/src/coral_models/prepare_raw_data.py +++ b/src/coral_models/prepare_raw_data.py @@ -294,7 +294,8 @@ def prepare_raw_data( input_path: Path | str = Path("data/raw"), output_path: Path | str = Path("data/processed"), metadata_path: Path | str = Path("data/raw/metadata.csv"), -): + hidden_output_path: Path | str = Path("data/hidden"), +) -> None: """Prepare the raw data. Args: @@ -304,10 +305,13 @@ def prepare_raw_data( Path to the processed data. Defaults to "data/processed". metadata_path (Path or str, optional): Path to the metadata. Defaults to "data/raw/metadata.csv". + hidden_output_path (Path or str, optional): + Path to save sensitive information. Defaults to "data/hidden". 
""" input_path = Path(input_path) output_path = Path(output_path) metadata_path = Path(metadata_path) + hidden_output_path = Path(hidden_output_path) # Make speaker-metadata dataframe speakers = make_speaker_metadata(input_path, metadata_path) @@ -402,7 +406,7 @@ def prepare_raw_data( f.write(readme) # Save the dataframes - speakers.to_excel(output_path / "speakers.xlsx") + speakers.to_excel(hidden_output_path / "speakers.xlsx") sentences.to_excel(output_path / "sentences.xlsx") recordings.to_excel(output_path / "recordings.xlsx") diff --git a/src/scripts/build_coral_data.py b/src/scripts/build_coral_data.py index 47c5353c..774ce3d4 100644 --- a/src/scripts/build_coral_data.py +++ b/src/scripts/build_coral_data.py @@ -2,6 +2,7 @@ Usage: python build_coral_data.py + """ import click @@ -22,8 +23,14 @@ "output_path", type=click.Path(), ) -def main(input_path: str, output_path: str, metadata_path: str) -> None: - prepare_raw_data(input_path, output_path, metadata_path) +@click.argument( + "hidden_output_path", + type=click.Path(), +) +def main( + input_path: str, output_path: str, metadata_path: str, hidden_output_path: str +) -> None: + prepare_raw_data(input_path, output_path, metadata_path, hidden_output_path) if __name__ == "__main__": From edfdf28953b115bbf7355582561ef7278700cf38 Mon Sep 17 00:00:00 2001 From: Anders Jess Pedersen Date: Mon, 27 Nov 2023 13:36:53 +0100 Subject: [PATCH 5/5] fix: kwargs and script usage --- src/scripts/build_coral_data.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/scripts/build_coral_data.py b/src/scripts/build_coral_data.py index 774ce3d4..a87478a3 100644 --- a/src/scripts/build_coral_data.py +++ b/src/scripts/build_coral_data.py @@ -1,7 +1,10 @@ """Script that preprocesses the raw CoRal data. 
Usage: - python build_coral_data.py + python src/scripts/build_coral_data.py \ + \ + \ + \ """ @@ -30,7 +33,12 @@ def main( input_path: str, output_path: str, metadata_path: str, hidden_output_path: str ) -> None: - prepare_raw_data(input_path, output_path, metadata_path, hidden_output_path) + prepare_raw_data( + input_path=input_path, + output_path=output_path, + metadata_path=metadata_path, + hidden_output_path=hidden_output_path, + ) if __name__ == "__main__":