diff --git a/config/config.yaml b/config/config.yaml index 332cb151..7350d1bf 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -11,6 +11,7 @@ dirs: processed: processed final: final models: models + hidden: hidden seed: 4242 diff --git a/src/coral_models/prepare_raw_data.py b/src/coral_models/prepare_raw_data.py index f0e6edad..c6186ce2 100644 --- a/src/coral_models/prepare_raw_data.py +++ b/src/coral_models/prepare_raw_data.py @@ -99,7 +99,7 @@ def make_speaker_metadata(raw_path: Path, metadata_path: Path) -> pd.DataFrame: # Specification of gender is not consistent, so we need to correct that # by mapping "K" to "female", and "M" to "male". speakers["gender"] = speakers["gender"].apply( - lambda x: dict(M="male", K="female").get(x, x) + lambda x: dict(M="male", K="female", m="male", k="female").get(x, x) ) # Create a speaker id column. @@ -177,13 +177,10 @@ def make_recording_metadata( # We need filenames for later, when we want to create recording ids. recording_metadata["filename"] = recording_metadata.index.astype(str) - # Make a sentence content to sentence_id dict - sentence_content_to_id = dict(zip(sentences["text"], sentences["sentence_id"])) - # Load speaker information from read aloud data recording_metadata_list = [recording_metadata] read_aloud_paths = raw_path.glob("*_oplæst_*") - for read_aloud_path in read_aloud_paths: + for read_aloud_path in tqdm(list(read_aloud_paths)): read_aloud_data = get_data_from_db(read_aloud_path) # Format filenames @@ -214,17 +211,24 @@ def make_recording_metadata( # the next available id. read_aloud_data["sentence_id"] = -1 for row_i, row in read_aloud_data.iterrows(): - if row["transcription"] in sentence_content_to_id.keys(): - read_aloud_data.loc[row_i, "sentence_id"] = sentence_content_to_id[ - row["transcription"] - ] - else: - sentence_content_to_id[row["transcription"]] = len( - sentence_content_to_id + if row["transcription"] not in sentences["text"].values: + # Append new sentence to sentences dataframe + sentences = pd.concat( + [ + sentences, + pd.DataFrame( + { + "text": [row["transcription"]], + "sentence_id": len(sentences), + } + ), + ], + ignore_index=True, ) - read_aloud_data.loc[row_i, "sentence_id"] = sentence_content_to_id[ - row["transcription"] - ] + + read_aloud_data.loc[row_i, "sentence_id"] = sentences[ + sentences["text"] == row["transcription"] + ].index[0] # Make a recorder_id. This is not in the read aloud data, as no we have # no information about the recorders for the read aloud data. @@ -255,14 +259,6 @@ def make_recording_metadata( all_recording_metadata["recording_id"].notna() ].reset_index(drop=True) - # We have updated the sentence_content_to_id dict, so we also need to update the - # sentences dataframe with the new sentence ids - sentences = ( - pd.DataFrame.from_dict(sentence_content_to_id, orient="index") - .reset_index() - .rename(columns={"index": "text", 0: "sentence_id"}) - ) - # Prepend the sentence id column with "s" sentences["sentence_id"] = "s" + sentences.index.astype(str) all_recording_metadata["sentence_id"] = "s" + all_recording_metadata[ @@ -298,7 +294,8 @@ def prepare_raw_data( input_path: Path | str = Path("data/raw"), output_path: Path | str = Path("data/processed"), metadata_path: Path | str = Path("data/raw/metadata.csv"), -): + hidden_output_path: Path | str = Path("data/hidden"), +) -> None: """Prepare the raw data. Args: @@ -308,10 +305,13 @@ def prepare_raw_data( Path to the processed data. Defaults to "data/processed". metadata_path (Path or str, optional): Path to the metadata. Defaults to "data/raw/metadata.csv". + hidden_input_path (Path or str, optional): + Path to save sensitive information. Defaults to "data/hidden". """ input_path = Path(input_path) output_path = Path(output_path) metadata_path = Path(metadata_path) + hidden_output_path = Path(hidden_output_path) # Make speaker-metadata dataframe speakers = make_speaker_metadata(input_path, metadata_path) @@ -400,7 +400,18 @@ def prepare_raw_data( except FileNotFoundError: pass + # Write a README file + readme = make_readme() + with open(output_path / "README.md", "w") as f: + f.write(readme) + + # Save the dataframes + speakers.to_excel(hidden_output_path / "speakers.xlsx") + sentences.to_excel(output_path / "sentences.xlsx") + recordings.to_excel(output_path / "recordings.xlsx") + # Make a dataframe with statistics about the data + speakers["age"] = speakers["age"].astype(int) data_stats = pd.DataFrame( { "Number of speakers": len(speakers), @@ -422,17 +433,7 @@ def prepare_raw_data( }, index=[0], ) - - # Write a README file - readme = make_readme() - with open(output_path / "README.md", "w") as f: - f.write(readme) - - # Save the dataframes - data_stats.to_csv(output_path / "data_stats.csv", index=False) - speakers.to_csv(output_path / "speakers.csv", index=False) - sentences.to_csv(output_path / "sentences.csv", index=False) - recordings.to_csv(output_path / "recordings.csv", index=False) + data_stats.to_excel(output_path / "data_stats.xlsx") def correct_country(country: str) -> str: diff --git a/src/scripts/build_coral_data.py b/src/scripts/build_coral_data.py index 47c5353c..a87478a3 100644 --- a/src/scripts/build_coral_data.py +++ b/src/scripts/build_coral_data.py @@ -1,7 +1,11 @@ """Script that preprocesses the raw CoRal data. Usage: - python build_coral_data.py + python src/scripts/build_coral_data.py \ + \ + \ + \ + """ import click @@ -22,8 +26,19 @@ "output_path", type=click.Path(), ) -def main(input_path: str, output_path: str, metadata_path: str) -> None: - prepare_raw_data(input_path, output_path, metadata_path) +@click.argument( + "hidden_output_path", + type=click.Path(), +) +def main( + input_path: str, output_path: str, metadata_path: str, hidden_output_path: str +) -> None: + prepare_raw_data( + input_path=input_path, + output_path=output_path, + metadata_path=metadata_path, + hidden_output_path=hidden_output_path, + ) if __name__ == "__main__":