# clean_br_data.py
import numpy as np
import pandas as pd
import re
from matplotlib import pyplot as plt
from rapidfuzz import fuzz
from time import perf_counter
# TODO - remove "Unreleased" from names and artists
def identify_remix_or_edit(track_name):
"""Superseded by pandas str functions"""
    regex_get_remix = re.compile(r"\(.*\)")
# regex_remove_edit = re.compile("remix|\(|\)|edit|Remix|Edit|bootleg|Bootleg")
return regex_get_remix.findall(track_name)
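# Illustrative only (made-up input): identify_remix_or_edit("Artist - Track (Someone Remix)")
# returns ['(Someone Remix)'], since the pattern grabs the bracketed suffix.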
def regression_check_cleaning(df_start: pd.DataFrame, df_end: pd.DataFrame):
"""Finds values that were wiped from the dataframe for human assessment"""
df_end = df_end[df_start.columns] # only check matching columns
condition = df_start.notnull() & df_end.isnull()
return df_start.loc[condition.any(axis=1), :]
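# Minimal usage sketch, with hypothetical frames:
#   before = pd.DataFrame({"Artist": ["?ME", "ID"]})
#   after = pd.DataFrame({"Artist": ["?ME", None]})
#   regression_check_cleaning(before, after)  # -> just the "ID" row, whose value was wiped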
def clean_data(df: pd.DataFrame, save_visualisation: bool = True) -> pd.DataFrame:
    df = df.copy()  # work on a copy so the caller's DataFrame is not mutated in place
    df_start = df.copy()  # snapshot of the pre-cleaning state for the regression check below
    # Remove the artist from the track name - the greedy regex removes everything up to the last "- "
    df["TrackName"] = df["TrackName"].str.replace(r".*-\s", "", regex=True)
df["TrackName"] = df["TrackName"].str.strip()
df["TrackName"] = df["TrackName"].str.replace(r'\s+', ' ', regex=True)
    # From here we don't really care about the track name except to search Beatport.
    # String processing of artists
    # Some are clearly encoding errors - I have caught a few and manually adjusted them below:
    # ?-Ziq --> Mu-Ziq; ?Ztek --> AZtek; ?kkord --> Akkord; Ã˜ [Phase] --> Ø [Phase]; RødhÃ¥d --> Rødhåd;
    # Âme --> Ame; Ã…re:gone --> Åre:gone; Ã…MRTÃœM --> ÅMRTÜM; Ã…NTÆGØNIST --> ÅNTÆGØNIST
# May not be necessary
print(pd.unique(df["Artist"].sort_values()))
    manual_artists_to_clean = [("?-Ziq", "Mu-Ziq"), ("?Ztek", "AZtek"), ("?kkord", "Akkord"), ("Ã˜ [Phase]", "Ø [Phase]"),
                               ("RødhÃ¥d", "Rødhåd"), ("Âme", "Ame"), ("Ã…re:gone", "Åre:gone"),
                               ("Ã…MRTÃœM", "ÅMRTÜM"), ("Ã…NTÆGØNIST", "ÅNTÆGØNIST"), ("μ-Ziq", "Mu-Ziq")]
    for artist_tup in manual_artists_to_clean:
        # literal replacement - several of these names contain regex metacharacters like "?"
        df["Artist"] = df["Artist"].str.replace(artist_tup[0], artist_tup[1], regex=False)
        df["DJ"] = df["DJ"].str.replace(artist_tup[0], artist_tup[1], regex=False)
    # Split out the artist column for collaborations
    tokenized_artists = df["Artist"].str.split(r"\s&\s", expand=True, regex=True)
tokenized_artists.columns = [f"Artist{i}" for i in range(len(tokenized_artists.columns))]
for col in tokenized_artists.columns:
tokenized_artists[col] = tokenized_artists[col].str.strip()
tokenized_artists[col] = tokenized_artists[col].str.replace(r"\s+", " ", regex=True)
df = df.merge(tokenized_artists, left_index=True, right_index=True)
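    # e.g. an Artist value of "Ben Klock & Marcel Dettmann" becomes Artist0 = "Ben Klock"
    # and Artist1 = "Marcel Dettmann"; solo acts leave the higher-numbered columns as NaN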
    # Split B2B sets in the DJ column
    b2b = df["DJ"].str.split(r"\s&\s|\sand\s|\sAnd\s|\sB2B\s|\sb2b\s", expand=True, regex=True)
b2b.columns = [f"DJ{i}" for i in range(len(b2b.columns))]
for col in b2b.columns:
b2b[col] = b2b[col].str.strip()
b2b[col] = b2b[col].str.replace(r"\s+", " ", regex=True)
df = df.merge(b2b, left_index=True, right_index=True)
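    # e.g. a DJ value of "Four Tet b2b Floating Points" becomes DJ0 = "Four Tet"
    # and DJ1 = "Floating Points"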
df["RemixOrEdit"] = df["TrackName"].str.extract(r"\((.*?)\)", expand=False)
df["RemixOrEdit"] = df["RemixOrEdit"].str.replace(
"(Remix)|(\()|(\))|(Edit)|(bootleg)|(Version)|(Vocal Mix)|(Mix)|(Instrumental Mix)|(Instrumental)|(Dub)",
"",
case=False,
regex=True
)
df["RemixOrEdit"] = df["RemixOrEdit"].str.strip()
multi_remix = df["RemixOrEdit"].str.split(r"\s\&\s|\sand\s|\sAnd\s", expand=True, regex=True)
multi_remix.columns = [f"RemixOrEdit{i}" for i in range(len(multi_remix.columns))]
for col in multi_remix.columns:
multi_remix[col] = multi_remix[col].str.strip()
multi_remix[col] = multi_remix[col].str.replace(r"\s+", " ", regex=True)
df = df.merge(multi_remix, left_index=True, right_index=True)
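    # e.g. a TrackName of "Track Title (KiNK Remix)" (made-up example) yields RemixOrEdit0 = "KiNK"
    # once the bracketed text is extracted and the boilerplate words are stripped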
for col in df:
if col == "Number":
continue
        df[col] = df[col].str.replace(r"\?(?!ME)|(^ID$)|(^Id$)", "", regex=True)  # Drop all "?" and bare ID/Id placeholders, except "?ME" - that's a legit artist
    # Show rows whose values were wiped during cleaning, for human assessment
    print(regression_check_cleaning(df_start=df_start, df_end=df))
    df.to_csv(r"Data\preprocessed_br_data.csv", encoding="utf-8")
# TODO
artist_cols = df.filter(regex=r"(DJ\d+)|(RemixOrEdit\d+)|(Artist\d+)")
    all_artist_cols = df.filter(regex=r"(DJ\d*)|(RemixOrEdit\d*)|(Artist\d*)").columns  # currently unused
start = perf_counter()
list_of_artists: np.ndarray = pd.unique(artist_cols.stack())
    # Approach for cleaning artist names:
    # This is a static, point-in-time database, so it is enough to compute a similarity score for every pair
    # of artists, choose a cutoff score by inspection, and merge all pairs above that threshold.
    # Manually choosing the score prevents false positives/negatives, but means the step cannot be automated.
    # If the data download is ever repeated, this process can simply be re-run, and because the data appears
    # to come from Beatport (what I currently suspect) there shouldn't be too many issues (except maybe remixes...)
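    # For a sense of scale (illustrative names, not from the data): fuzz.ratio("Amelie Lens", "Amélie Lens")
    # scores ~90.9, clearing the threshold below, while unrelated names score far lower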
# Numpy Cartesian product of artists
cartesian_sorted = np.sort(np.array(np.meshgrid(list_of_artists, list_of_artists)).T.reshape(-1, 2), axis=1)
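    # e.g. for two artists ["a", "b"] this produces the sorted pairs
    # [a, a], [a, b], [a, b], [b, b] ahead of the de-duplication below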
intermediary_df = pd.DataFrame(
cartesian_sorted,
columns=["Artist1", "Artist2"]
)
    # Combinations column holds sorted (Artist1, Artist2) tuples - zip is much faster than a row-wise apply
    intermediary_df["Combinations"] = list(zip(intermediary_df["Artist1"], intermediary_df["Artist2"]))
# remove duplicates (halves dataframe size)
product_of_artists = pd.DataFrame(pd.unique(intermediary_df["Combinations"]).tolist(), columns=["Artist1", "Artist2"])
    # remove self-pairs (Artist1 == Artist2)
    product_of_artists = product_of_artists.loc[product_of_artists["Artist1"] != product_of_artists["Artist2"], :]
# calc string similarity of all pairs of artists
product_of_artists["StringSimilarity"] = product_of_artists.apply(lambda x: fuzz.ratio(x["Artist1"], x["Artist2"]), axis=1)
threshold = 80
merge_artist = product_of_artists.loc[product_of_artists["StringSimilarity"] > threshold, :]
print(f"Took {(perf_counter() - start)/60}mins to do string similarity.")
    fig, ax = plt.subplots(figsize=(40, 40))
    ax.axvline(threshold, color='red', linestyle='--', linewidth=2)  # mark the merge threshold
    ax.set_title("Histogram of String Similarity Scores")
    ax.set_yscale('log')  # log scale to try to elucidate the tail
    ax.set_xlabel("Similarity score")
    ax.set_ylabel("Frequency")
    product_of_artists["StringSimilarity"].hist(ax=ax, bins=30)
    plt.show()
    if save_visualisation:
        fig.savefig(r"Images\histogram_of_similarity_values.png")
    fig_tail, ax_tail = plt.subplots(figsize=(40, 40))
    ax_tail.axvline(threshold, color='red', linestyle='--', linewidth=2)  # mark the merge threshold
    ax_tail.set_title("Tail of Histogram of String Similarity Scores")
    ax_tail.set_yscale('log')  # log scale to try to elucidate the tail
    ax_tail.set_xlabel("Similarity score")
    ax_tail.set_ylabel("Frequency")
    product_of_artists.loc[product_of_artists["StringSimilarity"] > 65, "StringSimilarity"].hist(ax=ax_tail, bins=30)
    plt.show()
    if save_visualisation:
        fig_tail.savefig(r"Images\tail_of_histogram.png")
    product_of_artists.loc[product_of_artists["StringSimilarity"] > 65, :].to_csv(r"Data\FuzzyMatching.csv")
# Then apply merges onto datasets
final_df = df.copy()
    # Map the lexicographically later name in each matched pair onto the earlier one
    dict_of_pairs = dict(zip(merge_artist["Artist2"], merge_artist["Artist1"]))
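    # e.g. if the pair ("Amelie Lens", "Amélie Lens") cleared the threshold (illustrative names),
    # the dict maps "Amélie Lens" -> "Amelie Lens", collapsing the variant spelling everywhere below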
for col in artist_cols.columns:
        final_df[col] = final_df[col].map(lambda x: dict_of_pairs.get(x, x))
print(f"Took {(perf_counter() - start)/60}mins to do all of artist fuzzy matching.")
return final_df
if __name__ == "__main__":
df = pd.read_csv(r"Data\1001_tracklist_set_lists.csv")
raw_df = df.copy()
final_df = clean_data(raw_df, save_visualisation=True)
final_df.to_csv(r"Data\cleaned_boiler_room_data.csv", encoding="utf-8", index=False)
final_df.to_parquet(r"Data\cleaned_boiler_room_data.parquet", index=False)
    # Count missing values per column - e.g. the number of missing genres, to judge whether it is worth getting these from the Beatport API
print(df.isnull().sum())
# TODO at the end regression check anything that had data and now is empty