Hotfix/sentence ids #46

Merged (6 commits) on Nov 27, 2023
1 change: 1 addition & 0 deletions config/config.yaml
@@ -11,6 +11,7 @@ dirs:
processed: processed
final: final
models: models
hidden: hidden

seed: 4242

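For context, the new dirs.hidden entry can be resolved to a concrete directory once the config is loaded. A minimal sketch, assuming the config is read as plain YAML and that the directories live under a data/ root (both are assumptions; the project may load its config differently):

from pathlib import Path

import yaml  # assumes PyYAML is available

# Load the project config; the loading mechanism used by the repo may differ.
with open("config/config.yaml") as f:
    config = yaml.safe_load(f)

# Resolve the new hidden directory next to the other output directories.
hidden_dir = Path("data") / config["dirs"]["hidden"]
hidden_dir.mkdir(parents=True, exist_ok=True)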
71 changes: 36 additions & 35 deletions src/coral_models/prepare_raw_data.py
@@ -99,7 +99,7 @@ def make_speaker_metadata(raw_path: Path, metadata_path: Path) -> pd.DataFrame:
# Specification of gender is not consistent, so we need to correct that
# by mapping "K" to "female", and "M" to "male".
speakers["gender"] = speakers["gender"].apply(
lambda x: dict(M="male", K="female").get(x, x)
lambda x: dict(M="male", K="female", m="male", k="female").get(x, x)
)

# Create a speaker id column.
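As a side note, an equivalent case-insensitive formulation of this mapping (a sketch only, not part of the PR) would normalise the code before the lookup:

# Sketch: upper-case the gender code first, so "m"/"k" are covered without
# listing both casings in the dict; unknown values fall through unchanged.
speakers["gender"] = speakers["gender"].apply(
    lambda x: {"M": "male", "K": "female"}.get(str(x).upper(), x)
)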
@@ -177,13 +177,10 @@ def make_recording_metadata(
# We need filenames for later, when we want to create recording ids.
recording_metadata["filename"] = recording_metadata.index.astype(str)

# Make a sentence content to sentence_id dict
sentence_content_to_id = dict(zip(sentences["text"], sentences["sentence_id"]))

# Load speaker information from read aloud data
recording_metadata_list = [recording_metadata]
read_aloud_paths = raw_path.glob("*_oplæst_*")
for read_aloud_path in read_aloud_paths:
for read_aloud_path in tqdm(list(read_aloud_paths)):
read_aloud_data = get_data_from_db(read_aloud_path)

# Format filenames
@@ -214,17 +211,24 @@
# the next available id.
read_aloud_data["sentence_id"] = -1
for row_i, row in read_aloud_data.iterrows():
if row["transcription"] in sentence_content_to_id.keys():
read_aloud_data.loc[row_i, "sentence_id"] = sentence_content_to_id[
row["transcription"]
]
else:
sentence_content_to_id[row["transcription"]] = len(
sentence_content_to_id
if row["transcription"] not in sentences["text"].values:
# Append new sentence to sentences dataframe
sentences = pd.concat(
[
sentences,
pd.DataFrame(
{
"text": [row["transcription"]],
"sentence_id": len(sentences),
}
),
],
ignore_index=True,
)
read_aloud_data.loc[row_i, "sentence_id"] = sentence_content_to_id[
row["transcription"]
]

read_aloud_data.loc[row_i, "sentence_id"] = sentences[
sentences["text"] == row["transcription"]
].index[0]

# Make a recorder_id. This is not in the read aloud data, as we have
# no information about the recorders for the read aloud data.
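For illustration, the new id-assignment logic in isolation, as a minimal runnable sketch with toy data (the toy dataframes and values below are invented; column names follow the diff):

import pandas as pd

# Toy stand-ins for the real dataframes; values are invented for illustration.
sentences = pd.DataFrame({"text": ["hej verden"], "sentence_id": [0]})
read_aloud_data = pd.DataFrame({"transcription": ["hej verden", "en ny sætning"]})

read_aloud_data["sentence_id"] = -1
for row_i, row in read_aloud_data.iterrows():
    if row["transcription"] not in sentences["text"].values:
        # Unseen sentence: append it to the sentences dataframe with the next id.
        sentences = pd.concat(
            [
                sentences,
                pd.DataFrame(
                    {"text": [row["transcription"]], "sentence_id": len(sentences)}
                ),
            ],
            ignore_index=True,
        )
    # Known (or just appended) sentence: its row index doubles as the sentence id.
    read_aloud_data.loc[row_i, "sentence_id"] = sentences[
        sentences["text"] == row["transcription"]
    ].index[0]

print(read_aloud_data)  # sentence_id column: [0, 1]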
@@ -255,14 +259,6 @@ def make_recording_metadata(
all_recording_metadata["recording_id"].notna()
].reset_index(drop=True)

# We have updated the sentence_content_to_id dict, so we also need to update the
# sentences dataframe with the new sentence ids
sentences = (
pd.DataFrame.from_dict(sentence_content_to_id, orient="index")
.reset_index()
.rename(columns={"index": "text", 0: "sentence_id"})
)

# Prepend the sentence id column with "s"
sentences["sentence_id"] = "s" + sentences.index.astype(str)
all_recording_metadata["sentence_id"] = "s" + all_recording_metadata[
@@ -298,7 +294,8 @@ def prepare_raw_data(
input_path: Path | str = Path("data/raw"),
output_path: Path | str = Path("data/processed"),
metadata_path: Path | str = Path("data/raw/metadata.csv"),
):
hidden_output_path: Path | str = Path("data/hidden"),
) -> None:
"""Prepare the raw data.

Args:
@@ -308,10 +305,13 @@
Path to the processed data. Defaults to "data/processed".
metadata_path (Path or str, optional):
Path to the metadata. Defaults to "data/raw/metadata.csv".
hidden_output_path (Path or str, optional):
Path to save sensitive information. Defaults to "data/hidden".
"""
input_path = Path(input_path)
output_path = Path(output_path)
metadata_path = Path(metadata_path)
hidden_output_path = Path(hidden_output_path)

# Make speaker-metadata dataframe
speakers = make_speaker_metadata(input_path, metadata_path)
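For reference, a call with the new keyword argument might look as follows (a usage sketch with the documented defaults; the import path is assumed from the repository layout):

from pathlib import Path

from coral_models.prepare_raw_data import prepare_raw_data  # import path assumed

# Run the preparation step, sending sensitive speaker metadata to a separate
# hidden directory while the shareable outputs go to the processed directory.
prepare_raw_data(
    input_path=Path("data/raw"),
    output_path=Path("data/processed"),
    metadata_path=Path("data/raw/metadata.csv"),
    hidden_output_path=Path("data/hidden"),
)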
@@ -400,7 +400,18 @@ def prepare_raw_data(
except FileNotFoundError:
pass

# Write a README file
readme = make_readme()
with open(output_path / "README.md", "w") as f:
f.write(readme)

# Save the dataframes
speakers.to_excel(hidden_output_path / "speakers.xlsx")
sentences.to_excel(output_path / "sentences.xlsx")
recordings.to_excel(output_path / "recordings.xlsx")

# Make a dataframe with statistics about the data
speakers["age"] = speakers["age"].astype(int)
data_stats = pd.DataFrame(
{
"Number of speakers": len(speakers),
@@ -422,17 +433,7 @@
},
index=[0],
)

# Write a README file
readme = make_readme()
with open(output_path / "README.md", "w") as f:
f.write(readme)

# Save the dataframes
data_stats.to_csv(output_path / "data_stats.csv", index=False)
speakers.to_csv(output_path / "speakers.csv", index=False)
sentences.to_csv(output_path / "sentences.csv", index=False)
recordings.to_csv(output_path / "recordings.csv", index=False)
data_stats.to_excel(output_path / "data_stats.xlsx")


def correct_country(country: str) -> str:
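Since speaker metadata is now written to a separate hidden location, both output directories need to exist before the Excel files are saved. A minimal sketch, assuming they are not created elsewhere in the pipeline (an assumption; paths are the documented defaults):

from pathlib import Path

# Ensure both the public and the hidden output directories exist up front.
for directory in (Path("data/processed"), Path("data/hidden")):
    directory.mkdir(parents=True, exist_ok=True)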
21 changes: 18 additions & 3 deletions src/scripts/build_coral_data.py
@@ -1,7 +1,11 @@
"""Script that preprocesses the raw CoRal data.

Usage:
python build_coral_data.py <input_path> <metadata_path> <output_path>
python src/scripts/build_coral_data.py \
<input_path> \
<metadata_path> \
<output_path> \
<hidden_output_path>
"""

import click
@@ -22,8 +26,19 @@
"output_path",
type=click.Path(),
)
def main(input_path: str, output_path: str, metadata_path: str) -> None:
prepare_raw_data(input_path, output_path, metadata_path)
@click.argument(
"hidden_output_path",
type=click.Path(),
)
def main(
input_path: str, output_path: str, metadata_path: str, hidden_output_path: str
) -> None:
prepare_raw_data(
input_path=input_path,
output_path=output_path,
metadata_path=metadata_path,
hidden_output_path=hidden_output_path,
)


if __name__ == "__main__":
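With the extra argument registered, the command now takes four positional paths, as in the usage string above. A quick way to exercise it from Python (a sketch; the import path is assumed from the repository layout, and the paths are the documented defaults):

from click.testing import CliRunner

from src.scripts.build_coral_data import main  # import path is an assumption

# Invoke the CLI in-process with all four positional arguments; the final one
# is the new hidden output directory for sensitive speaker metadata.
runner = CliRunner()
result = runner.invoke(
    main, ["data/raw", "data/raw/metadata.csv", "data/processed", "data/hidden"]
)
print(result.exit_code)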