Skip to content

Commit

Permalink
removed yt interfacing, removed dependencies
Browse files Browse the repository at this point in the history
  • Loading branch information
cvqluu committed Nov 9, 2022
1 parent 2c589bb commit a686c69
Show file tree
Hide file tree
Showing 5 changed files with 229 additions and 338 deletions.
6 changes: 1 addition & 5 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
beautifulsoup4>=4.10.0
matplotlib>=3.5.1
pandas>=1.3.5
pytube>=11.0.2
scikit-learn>=1.0.2
speechbrain>=0.5.11
torchaudio>=0.10.1
validators>=0.18.2
youtube-dl>=2021.12.17
torchaudio>=0.10.1
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@
version=__version__,
install_requires=install_requires,
long_description=long_description,
long_description_content_type="text/markdown"
long_description_content_type="text/markdown",
)
82 changes: 44 additions & 38 deletions simple_diarizer/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,43 +6,46 @@
from sklearn.cluster import AgglomerativeClustering, KMeans, SpectralClustering
from sklearn.metrics import pairwise_distances

def similarity_matrix(embeds, metric="cosine"):
    """
    Compute the pairwise matrix used as clustering input.

    Note: despite the name, this returns the output of sklearn's
    pairwise_distances, i.e. *distances* under `metric`, not similarities.

    Args:
        embeds: embeddings, forwarded unchanged to pairwise_distances
        metric: metric name forwarded to pairwise_distances

    Returns:
        The matrix returned by pairwise_distances(embeds, metric=metric)
    """
    return pairwise_distances(embeds, metric=metric)

def cluster_AHC(embeds, n_clusters=None, threshold=None, metric="cosine", **kwargs):
    """
    Cluster embeds using Agglomerative Hierarchical Clustering

    Args:
        embeds: embeddings to cluster, forwarded to similarity_matrix
        n_clusters: number of clusters to produce; if None the tree is cut
            at `threshold` instead
        threshold: distance threshold, only used when n_clusters is None
        metric: metric name forwarded to similarity_matrix
        **kwargs: accepted for signature compatibility and ignored

    Returns:
        The labels returned by AgglomerativeClustering.fit_predict

    Raises:
        ValueError: if both n_clusters and threshold are None
    """
    # Explicit raise instead of `assert`: asserts vanish under `python -O`,
    # and a truthiness check would wrongly reject threshold=0.
    if n_clusters is None and threshold is None:
        raise ValueError("If n_clusters is not defined, threshold must be defined")

    S = similarity_matrix(embeds, metric=metric)

    model_kwargs = {"n_clusters": n_clusters, "linkage": "average"}
    if n_clusters is None:
        # Cutting by distance needs the full tree.
        model_kwargs["compute_full_tree"] = True
        model_kwargs["distance_threshold"] = threshold

    # scikit-learn 1.2 renamed `affinity` to `metric` and 1.4 removed
    # `affinity`; older releases only know `affinity`. An unknown keyword
    # raises TypeError at construction time, so try the new name first.
    try:
        cluster_model = AgglomerativeClustering(metric="precomputed", **model_kwargs)
    except TypeError:
        cluster_model = AgglomerativeClustering(
            affinity="precomputed", **model_kwargs
        )

    return cluster_model.fit_predict(S)


##########################################
# Spectral clustering
# A lot of these methods are lifted from
# A lot of these methods are lifted from
# https://github.com/wq2012/SpectralCluster
##########################################

def cluster_SC(embeds, n_clusters=None, threshold=None,
enhance_sim=True, **kwargs):

def cluster_SC(embeds, n_clusters=None, threshold=None, enhance_sim=True, **kwargs):
"""
Cluster embeds using Spectral Clustering
"""
Expand All @@ -52,7 +55,7 @@ def cluster_SC(embeds, n_clusters=None, threshold=None,
S = compute_affinity_matrix(embeds)
if enhance_sim:
S = sim_enhancement(S)

if n_clusters is None:
(eigenvalues, eigenvectors) = compute_sorted_eigenvectors(S)
# Get number of clusters.
Expand All @@ -67,18 +70,18 @@ def cluster_SC(embeds, n_clusters=None, threshold=None,
# This implementation from scikit-learn does NOT, which is inconsistent
# with the paper.
kmeans_clusterer = KMeans(
n_clusters=k,
init="k-means++",
max_iter=300,
random_state=0)
n_clusters=k, init="k-means++", max_iter=300, random_state=0
)
labels = kmeans_clusterer.fit_predict(spectral_embeddings)
return labels
else:
cluster_model = SpectralClustering(n_clusters=n_clusters,
affinity='precomputed')
cluster_model = SpectralClustering(
n_clusters=n_clusters, affinity="precomputed"
)

return cluster_model.fit_predict(S)


def diagonal_fill(A):
"""
Sets the diagonal elements of the matrix to the max of each row
Expand All @@ -87,54 +90,61 @@ def diagonal_fill(A):
A[np.diag_indices(A.shape[0])] = np.max(A, axis=1)
return A


def gaussian_blur(A, sigma=1.0):
    """
    Does a gaussian blur on the affinity matrix

    Args:
        A: matrix to blur
        sigma: standard deviation forwarded to gaussian_filter

    Returns:
        The result of gaussian_filter(A, sigma=sigma)
    """
    return gaussian_filter(A, sigma=sigma)


def row_threshold_mult(A, p=0.95, mult=0.01):
    """
    For each row multiply elements smaller than the row's p'th percentile by mult

    Args:
        A: 2-D matrix
        p: percentile expressed as a fraction (0.95 means the 95th percentile)
        mult: factor applied to the elements below the row percentile

    Returns:
        A new matrix; elements at or above their row's percentile are unchanged
    """
    # keepdims gives a column vector so each row is compared to its own cutoff.
    cutoffs = np.percentile(A, p * 100, axis=1, keepdims=True)
    below = A < cutoffs
    # np.where selects rather than multiplying by a 0/1 mask, so non-finite
    # entries are not turned into NaN by `0 * inf`.
    return np.where(below, A * mult, A)


def symmetrization(A):
    """
    Symmetrization: Y_{i,j} = max(S_{ij}, S_{ji})

    Returns the element-wise maximum of A and its transpose.
    """
    return np.maximum(A, A.T)


def diffusion(A):
    """
    Diffusion: Y <- YY^T

    Returns the matrix product of A with its own transpose.
    """
    return np.dot(A, A.T)


def row_max_norm(A):
    """
    Row-wise max normalization: S_{ij} = Y_{ij} / max_k(Y_{ik})

    Args:
        A: 2-D matrix

    Returns:
        A new matrix in which every row has been divided by that row's maximum
    """
    # keepdims keeps the maxima as an (n, 1) column so the division broadcasts
    # along each row. A flat (n,) vector would instead divide column j by the
    # maximum of row j, which is not the formula above.
    maxes = np.amax(A, axis=1, keepdims=True)
    return A / maxes


def sim_enhancement(A):
    """
    Run the affinity-matrix refinement steps in a fixed order.

    Each step receives the output of the previous one:
    diagonal_fill -> gaussian_blur -> row_threshold_mult -> symmetrization
    -> diffusion -> row_max_norm.

    Args:
        A: affinity matrix

    Returns:
        The matrix produced by the last step
    """
    func_order = (
        diagonal_fill,
        gaussian_blur,
        row_threshold_mult,
        symmetrization,
        diffusion,
        row_max_norm,
    )
    for f in func_order:
        A = f(A)
    return A


def compute_affinity_matrix(X):
"""Compute the affinity matrix from data.
Note that the range of affinity is [0,1].
Expand Down Expand Up @@ -175,8 +185,7 @@ def compute_sorted_eigenvectors(A):
return w, v


def compute_number_of_clusters(
eigenvalues, max_clusters=None, stop_eigenvalue=1e-2):
def compute_number_of_clusters(eigenvalues, max_clusters=None, stop_eigenvalue=1e-2):
"""Compute number of clusters using EigenGap principle.
Args:
eigenvalues: sorted eigenvalues of the affinity matrix
Expand All @@ -198,6 +207,3 @@ def compute_number_of_clusters(
max_delta = delta
max_delta_index = i
return max_delta_index



Loading

0 comments on commit a686c69

Please sign in to comment.