From acb708de415e9e4366fb094e120a68414f5619bd Mon Sep 17 00:00:00 2001
From: wghezaiel
Date: Tue, 19 Mar 2024 17:23:34 +0100
Subject: [PATCH 1/4] new branch with USE_GPU

---
 main.py                     | 10 ++++++++--
 simple_diarizer/diarizer.py | 22 +++++++++++++++++-----
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/main.py b/main.py
index 18befae..4b03382 100644
--- a/main.py
+++ b/main.py
@@ -14,11 +14,13 @@ parser.add_argument("--max_speakers", dest='max_speakers', default=25, type=int, help="Maximum number of speakers (if number of speaker is unknown)")
 parser.add_argument("--embed_model", dest='embed_model', default="ecapa", type=str, help="Name of embedding")
 parser.add_argument("--cluster_method", dest='cluster_method', default="nme-sc", type=str, help="Clustering method")
+parser.add_argument("--gpu", dest='gpu', default=False, type=bool, help="use of GPU")
 args = parser.parse_args()
 
 diar = Diarizer(
     embed_model=args.embed_model,  # 'xvec' and 'ecapa' supported
-    cluster_method=args.cluster_method  # 'ahc' 'sc' and 'nme-sc' supported
+    cluster_method=args.cluster_method,  # 'ahc' 'sc' and 'nme-sc' supported
+    USE_GPU=args.gpu
 )
 
 WAV_FILE=args.audio_name
@@ -26,6 +28,10 @@ WAV_FILE=args.audio_name
 max_spk= args.max_speakers
 output_file=args.outputfile
 
+
+
+
+
 t0 = time.time()
 segments = diar.diarize(WAV_FILE, num_speakers=num_speakers,max_speakers=max_spk,outfile=output_file)
 
@@ -73,5 +79,5 @@ json["speakers"] = list(_speakers.values())
 json["segments"] = _segments
 
-pprint.pprint(json)
+#pprint.pprint(json)

diff --git a/simple_diarizer/diarizer.py b/simple_diarizer/diarizer.py
index 400e991..825ccc1 100644
--- a/simple_diarizer/diarizer.py
+++ b/simple_diarizer/diarizer.py
@@ -15,7 +15,7 @@ class Diarizer:
     def __init__(
-        self, embed_model="xvec", cluster_method="sc", window=1.5, period=0.75
+        self, embed_model="xvec", cluster_method="sc", window=1.5, period=0.75, USE_GPU=False
     ):
 
         assert embed_model in [
@@ -38,9 +38,16 @@ def __init__(
 
         self.vad_model, self.get_speech_ts = self.setup_VAD()
 
-        self.run_opts = (
-            {"device": "cuda:0"} if torch.cuda.is_available() else {"device": "cpu"}
-        )
+        self.USE_GPU=USE_GPU
+        if self.USE_GPU==True:
+            self.run_opts = ({"device": "cuda:0"} if torch.cuda.is_available() else {"device": "cpu"})
+        elif self.USE_GPU==False:
+            self.run_opts = ({"device": "cpu"})
+
+
+        #self.run_opts = (
+        #    {"device": "cuda:0"} if torch.cuda.is_available() else {"device": "cpu"}
+        #)
 
         if embed_model == "xvec":
             self.embed_model = EncoderClassifier.from_hparams(
@@ -57,6 +64,7 @@ def __init__(
         self.window = window
         self.period = period
 
+
     def setup_VAD(self):
 
         model, utils = torch.hub.load(
@@ -95,7 +103,7 @@ def windowed_embeds(self, signal, fs, window=1.5, period=0.75):
             segments.append([start, len_signal - 1])
 
         embeds = []
-
+        
         with torch.no_grad():
             for i, j in segments:
                 signal_seg = signal[:, i:j]
@@ -192,7 +200,11 @@ def diarize(
         enhance_sim=True,
         extra_info=False,
         outfile=None,
+
     ):
+
+
+
         """
         Diarize a 16khz mono wav file, produces list of segments

From 53394543e1fc6a214994cc4d4dfcd7716f662562 Mon Sep 17 00:00:00 2001
From: wghezaiel
Date: Wed, 20 Mar 2024 11:18:06 +0100
Subject: [PATCH 2/4] install onnxruntime-gpu

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 02f3975..c396415 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,5 +4,5 @@ pandas>=1.3.5
 scikit-learn>=1.0.2
 speechbrain>=0.5.11
 torchaudio>=0.10.1
-onnxruntime>=1.14.0
+onnxruntime-gpu>=1.14.0
 scipy<=1.8.1 # newer version can provoke segmentation faults
\ No newline at end of file
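Note on PATCH 1: the "--gpu" option uses argparse's type=bool, which is a known pitfall. argparse applies bool() to the raw string, and any non-empty string (including "False") is truthy, so "--gpu False" still enables the GPU path. PATCH 3 below replaces the flag with a string "--device" option; here is a minimal sketch of the two usual alternatives (only the flag names are taken from the patches, the rest is illustrative):

    import argparse

    parser = argparse.ArgumentParser()
    # A boolean switch: present means True, absent means False.
    parser.add_argument("--gpu", action="store_true", help="use GPU if available")
    # An explicit device choice, the approach PATCH 3 adopts:
    parser.add_argument("--device", default=None, choices=["cpu", "cuda"],
                        help="computation device (default: auto-detect)")
    args = parser.parse_args()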
From 7cc1c07f0973b1dd4f2870980e8d755b4d9cd692 Mon Sep 17 00:00:00 2001
From: wghezaiel
Date: Wed, 20 Mar 2024 14:27:49 +0100
Subject: [PATCH 3/4] make a choice of device: None, cpu and cuda

---
 main.py                     |  4 ++--
 requirements.txt            |  5 +++--
 simple_diarizer/diarizer.py | 28 ++++++++++++++--------------
 3 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/main.py b/main.py
index 4b03382..509178c 100644
--- a/main.py
+++ b/main.py
@@ -14,13 +14,13 @@ parser.add_argument("--max_speakers", dest='max_speakers', default=25, type=int, help="Maximum number of speakers (if number of speaker is unknown)")
 parser.add_argument("--embed_model", dest='embed_model', default="ecapa", type=str, help="Name of embedding")
 parser.add_argument("--cluster_method", dest='cluster_method', default="nme-sc", type=str, help="Clustering method")
-parser.add_argument("--gpu", dest='gpu', default=False, type=bool, help="use of GPU")
+parser.add_argument("--device", dest='device', default=None, type=str, help="choice of cpu or cuda")
 args = parser.parse_args()
 
 diar = Diarizer(
     embed_model=args.embed_model,  # 'xvec' and 'ecapa' supported
     cluster_method=args.cluster_method,  # 'ahc' 'sc' and 'nme-sc' supported
-    USE_GPU=args.gpu
+    device=args.device
 )
 
 WAV_FILE=args.audio_name

diff --git a/requirements.txt b/requirements.txt
index c396415..35236b1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,9 @@
 # ipython>=7.9.0
 # matplotlib>=3.5.1
-pandas>=1.3.5
+# pandas>=1.3.5
 scikit-learn>=1.0.2
 speechbrain>=0.5.11
 torchaudio>=0.10.1
 onnxruntime-gpu>=1.14.0
-scipy<=1.8.1 # newer version can provoke segmentation faults
\ No newline at end of file
+scipy<=1.8.1 # newer version can provoke segmentation faults
+

diff --git a/simple_diarizer/diarizer.py b/simple_diarizer/diarizer.py
index 825ccc1..615290e 100644
--- a/simple_diarizer/diarizer.py
+++ b/simple_diarizer/diarizer.py
@@ -15,7 +15,7 @@ class Diarizer:
     def __init__(
-        self, embed_model="xvec", cluster_method="sc", window=1.5, period=0.75, USE_GPU=False
+        self, embed_model="xvec", cluster_method="sc", window=1.5, period=0.75, device=None
     ):
 
         assert embed_model in [
@@ -38,33 +38,33 @@ def __init__(
 
         self.vad_model, self.get_speech_ts = self.setup_VAD()
 
-        self.USE_GPU=USE_GPU
-        if self.USE_GPU==True:
-            self.run_opts = ({"device": "cuda:0"} if torch.cuda.is_available() else {"device": "cpu"})
-        elif self.USE_GPU==False:
-            self.run_opts = ({"device": "cpu"})
-
+        if device is None:
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        elif device=="cpu":
+            os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # force torch.cuda.is_available() = False
+            self.device = "cpu"
+        elif device=="cuda":
+            os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+            device_num=torch.cuda.current_device()
+            to_cuda = f'cuda:{device_num}'
+            self.device = to_cuda
 
-        #self.run_opts = (
-        #    {"device": "cuda:0"} if torch.cuda.is_available() else {"device": "cpu"}
-        #)
 
         if embed_model == "xvec":
             self.embed_model = EncoderClassifier.from_hparams(
                 source="speechbrain/spkrec-xvect-voxceleb",
                 savedir="pretrained_models/spkrec-xvect-voxceleb",
-                run_opts=self.run_opts,
+                run_opts={"device": self.device},
             )
         if embed_model == "ecapa":
             self.embed_model = EncoderClassifier.from_hparams(
                 source="speechbrain/spkrec-ecapa-voxceleb",
                 savedir="pretrained_models/spkrec-ecapa-voxceleb",
-                run_opts=self.run_opts,
+                run_opts={"device": self.device},
             )
 
         self.window = window
         self.period = period
-
+
     def setup_VAD(self):
 
         model, utils = torch.hub.load(
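Note on PATCH 3: selecting the device through CUDA_VISIBLE_DEVICES only works if the variable is set before the CUDA runtime is initialized; once torch has queried the GPU, changing it has no effect, and it also hides the GPU for the entire process. PATCH 4 below drops the environment variable in favor of an explicit device string. A condensed sketch of that final selection logic (the helper name resolve_device is illustrative, not from the patch):

    import torch

    def resolve_device(device=None):
        # Auto-detect when the caller does not specify a device.
        if device is None:
            return "cuda" if torch.cuda.is_available() else "cpu"
        if device == "cuda":
            # Pin to the current CUDA device index, e.g. "cuda:0".
            return f"cuda:{torch.cuda.current_device()}"
        return "cpu"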
From eab2e310df665de28f8cf3078ceba828457078c7 Mon Sep 17 00:00:00 2001
From: wghezaiel
Date: Thu, 21 Mar 2024 18:09:01 +0100
Subject: [PATCH 4/4] NME clustering with GPU

---
 simple_diarizer/diarizer.py            | 161 +++++++++----------------
 simple_diarizer/spectral_clustering.py |  43 ++++---
 2 files changed, 80 insertions(+), 124 deletions(-)

diff --git a/simple_diarizer/diarizer.py b/simple_diarizer/diarizer.py
index 615290e..7de4ec5 100644
--- a/simple_diarizer/diarizer.py
+++ b/simple_diarizer/diarizer.py
@@ -3,7 +3,6 @@
 from copy import deepcopy
 
 import numpy as np
-import pandas as pd
 import torch
 import torchaudio
 from speechbrain.inference.speaker import EncoderClassifier
@@ -15,7 +14,13 @@ class Diarizer:
     def __init__(
-        self, embed_model="xvec", cluster_method="sc", window=1.5, period=0.75, device=None
+        self,
+        embed_model="xvec",
+        cluster_method="sc",
+        window=1.5,
+        period=0.75,
+        device=None,
+        device_vad="cpu",
     ):
 
         assert embed_model in [
@@ -35,41 +40,44 @@ def __init__(
         if cluster_method == "nme-sc":
             self.cluster = cluster_NME_SC
 
+        default_device = "cuda" if torch.cuda.is_available() else "cpu"
+        if device_vad is None:
+            device_vad = default_device
 
-        self.vad_model, self.get_speech_ts = self.setup_VAD()
+        self.vad_model, self.get_speech_ts = self.setup_VAD(device_vad)
 
         if device is None:
-            self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        elif device=="cpu":
-            os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # force torch.cuda.is_available() = False
-            self.device = "cpu"
-        elif device=="cuda":
-            os.environ['CUDA_VISIBLE_DEVICES'] = '0'
-            device_num=torch.cuda.current_device()
-            to_cuda = f'cuda:{device_num}'
-            self.device = to_cuda
+            device = default_device
+
+        print(f"Devices: VAD={device_vad}, embedding={device}")
 
         if embed_model == "xvec":
             self.embed_model = EncoderClassifier.from_hparams(
                 source="speechbrain/spkrec-xvect-voxceleb",
                 savedir="pretrained_models/spkrec-xvect-voxceleb",
-                run_opts={"device": self.device},
+                run_opts={"device": device},
             )
-        if embed_model == "ecapa":
+        elif embed_model == "ecapa":
             self.embed_model = EncoderClassifier.from_hparams(
                 source="speechbrain/spkrec-ecapa-voxceleb",
                 savedir="pretrained_models/spkrec-ecapa-voxceleb",
-                run_opts={"device": self.device},
+                run_opts={"device": device},
             )
 
         self.window = window
         self.period = period
 
-    def setup_VAD(self):
+    def setup_VAD(self, device):
+        self.device_vad = device
+        use_gpu = device != "cpu"
 
         model, utils = torch.hub.load(
-            repo_or_dir="snakers4/silero-vad", model="silero_vad", onnx=True
+            repo_or_dir="snakers4/silero-vad",
+            model="silero_vad",
+            onnx=not use_gpu,
+            # map_location=device
         )
+        if use_gpu:
+            model = model.to(device)
         # force_reload=True)
 
         get_speech_ts = utils[0]
@@ -79,7 +87,7 @@ def setup_VAD(self, device):
         """
         Runs the VAD model on the signal
         """
-        return self.get_speech_ts(signal, self.vad_model)
+        return self.get_speech_ts(signal.to(self.device_vad), self.vad_model)
 
     def windowed_embeds(self, signal, fs, window=1.5, period=0.75):
         """
@@ -103,7 +111,7 @@ def windowed_embeds(self, signal, fs, window=1.5, period=0.75):
             segments.append([start, len_signal - 1])
 
         embeds = []
-        
+
         with torch.no_grad():
             for i, j in segments:
                 signal_seg = signal[:, i:j]
@@ -200,11 +208,7 @@ def diarize(
         enhance_sim=True,
         extra_info=False,
         outfile=None,
-
     ):
-
-
-
         """
         Diarize a 16khz mono wav file, produces list of segments
@@ -250,7 +254,7 @@ def diarize(
         Uses AHC/SC/NME-SC to cluster
         """
         recname = os.path.splitext(os.path.basename(wav_file))[0]
-        
+
         if check_wav_16khz_mono(wav_file):
             signal, fs = torchaudio.load(wav_file)
         else:
@@ -267,34 +271,37 @@ def diarize(
         print("Running VAD...")
         speech_ts = self.vad(signal[0])
         print("Splitting by silence found {} utterances".format(len(speech_ts)))
-        #assert len(speech_ts) >= 1, "Couldn't find any speech during VAD"
+        # assert len(speech_ts) >= 1, "Couldn't find any speech during VAD"
 
         if len(speech_ts) >= 1:
             print("Extracting embeddings...")
             embeds, segments = self.recording_embeds(signal, fs, speech_ts)
-            [w,k]=embeds.shape
-            if w >= 2:
-                print('Clustering to {} speakers...'.format(num_speakers))
-                cluster_labels = self.cluster(embeds, n_clusters=num_speakers,max_speakers=max_speakers,
-                                              threshold=threshold, enhance_sim=enhance_sim)
+            [w, k] = embeds.shape
+            if w >= 2:
+                print("Clustering to {} speakers...".format(num_speakers))
+                cluster_labels = self.cluster(
+                    embeds,
+                    n_clusters=num_speakers,
+                    max_speakers=max_speakers,
+                    threshold=threshold,
+                    enhance_sim=enhance_sim,
+                )
 
-
-
                 cleaned_segments = self.join_segments(cluster_labels, segments)
                 cleaned_segments = self.make_output_seconds(cleaned_segments, fs)
-                cleaned_segments = self.join_samespeaker_segments(cleaned_segments,
-                                                                  silence_tolerance=silence_tolerance)
-
-
+                cleaned_segments = self.join_samespeaker_segments(
+                    cleaned_segments, silence_tolerance=silence_tolerance
+                )
 
             else:
-                cluster_labels =[ 1]
+                cluster_labels = [1]
                 cleaned_segments = self.join_segments(cluster_labels, segments)
                 cleaned_segments = self.make_output_seconds(cleaned_segments, fs)
-
+
         else:
             cleaned_segments = []
-
+
         print("Done!")
         if outfile:
             self.rttm_output(cleaned_segments, recname, outfile=outfile)
@@ -302,15 +309,17 @@ def diarize(
         if not extra_info:
             return cleaned_segments
         else:
-            return {"clean_segments": cleaned_segments,
-                    "embeds": embeds,
-                    "segments": segments,
-                    "cluster_labels": cluster_labels}
+            return {
+                "clean_segments": cleaned_segments,
+                "embeds": embeds,
+                "segments": segments,
+                "cluster_labels": cluster_labels,
+            }
 
     @staticmethod
     def rttm_output(segments, recname, outfile=None, channel=0):
         assert outfile, "Please specify an outfile"
         rttm_line = "SPEAKER {} "+str(channel)+" {} {} {} \n"
         with open(outfile, "w") as fp:
             for seg in segments:
                 start = seg["start"]
@@ -384,66 +393,6 @@ def match_diarization_to_transcript(segments, text_segments):
 
         return final_segments
 
-    def match_diarization_to_transcript_ctm(self, segments, ctm_file):
-        """
-        Match the output of .diarize to a ctm file produced by asr
-        """
-        ctm_df = pd.read_csv(
-            ctm_file,
-            delimiter=" ",
-            names=["utt", "channel", "start", "offset", "word", "confidence"],
-        )
-        ctm_df["end"] = ctm_df["start"] + ctm_df["offset"]
-
-        starts = ctm_df["start"].values
-        ends = ctm_df["end"].values
-        words = ctm_df["word"].values
-
-        # Get the earliest start from either diar output or asr output
-        earliest_start = np.min([ctm_df["start"].values[0], segments[0]["start"]])
-
-        worded_segments = self.join_samespeaker_segments(segments)
-        worded_segments[0]["start"] = earliest_start
-        cutoffs = []
-
-        for seg in worded_segments:
-            end_idx = np.searchsorted(ctm_df["end"].values, seg["end"], side="left") - 1
-            cutoffs.append(end_idx)
-
-        indexes = [[0, cutoffs[0]]]
-        for c in cutoffs[1:]:
-            indexes.append([indexes[-1][-1], c])
-
-        indexes[-1][-1] = len(words)
-
-        final_segments = []
-
-        for i, seg in enumerate(worded_segments):
-            s_idx, e_idx = indexes[i]
-            words = ctm_df["word"].values[s_idx:e_idx]
-            seg["words"] = " ".join(words)
-            if len(words) >= 1:
-                final_segments.append(seg)
-            else:
-                print(
-                    "Removed segment between {} and {} as no words were matched".format(
-                        seg["start"], seg["end"]
-                    )
-                )
-
-        return final_segments
-
-    @staticmethod
-    def nice_text_output(worded_segments, outfile):
-        with open(outfile, "w") as fp:
-            for seg in worded_segments:
-                fp.write(
-                    "[{} to {}] Speaker {}: \n".format(
-                        round(seg["start"], 2), round(seg["end"], 2), seg["label"]
-                    )
-                )
-                fp.write("{}\n\n".format(seg["words"]))
-
 
 if __name__ == "__main__":
     wavfile = sys.argv[1]

diff --git a/simple_diarizer/spectral_clustering.py b/simple_diarizer/spectral_clustering.py
index 8b3ac01..05a1f4a 100644
--- a/simple_diarizer/spectral_clustering.py
+++ b/simple_diarizer/spectral_clustering.py
@@ -1,6 +1,7 @@
 import numpy as np
 import scipy
 from sklearn.cluster import SpectralClustering
+import torch
 
 # NME low-level operations
 # These functions are taken from the Kaldi scripts.
@@ -38,6 +39,9 @@ def Eigengap(S):
     S = sorted(S)
     return np.diff(S)
 
+def getLamdaGaplist(lambdas):
+    lambdas = np.real(lambdas)
+    return list(lambdas[1:] - lambdas[:-1])
 
 # Computes parameters of normalized eigenmaps for automatic thresholding selection
 def ComputeNMEParameters(A, p, max_num_clusters):
@@ -48,23 +52,27 @@ def ComputeNMEParameters(A, p, max_num_clusters):
     # Laplacian matrix computation
     Lp = Laplacian(Ap)
     # Get max_num_clusters+1 smallest eigenvalues
-    S = scipy.sparse.linalg.eigsh(
-        Lp,
-        k=max_num_clusters + 1,
-        which="SA",
-        tol=1e-6,
-        return_eigenvectors=False,
-        mode="buckling",
-    )
-    # Get largest eigenvalue
-    Smax = scipy.sparse.linalg.eigsh(
-        Lp, k=1, which="LA", tol=1e-6, return_eigenvectors=False, mode="buckling"
-    )
+    from torch.linalg import eigh
+
+    if torch.cuda.is_available():
+        laplacian = torch.from_numpy(Lp).float().to('cuda')
+    else:
+        laplacian = torch.from_numpy(Lp).float()
+    lambdas, _ = eigh(laplacian)
+    S = lambdas.cpu().numpy()
 
     # Eigengap computation
-    e = Eigengap(S)
-    g = np.max(e[:max_num_clusters]) / (Smax + 1e-10)
-    r = p / g
-    k = np.argmax(e[:max_num_clusters])
+    e = np.sort(S)
+    g = getLamdaGaplist(e)
+    k = np.argmax(g[: min(max_num_clusters, len(g))])
+    arg_sorted_idx = np.argsort(g[:max_num_clusters])[::-1]
+    max_key = arg_sorted_idx[0]
+    max_eig_gap = g[max_key] / (max(e) + 1e-10)
+    r = (p / A.shape[0]) / (max_eig_gap + 1e-10)
+
+
     return (e, g, k, r)
@@ -96,8 +104,7 @@ def NME_SpectralClustering(
         if rbest is None or rbest > r:
             rbest = r
             pbest = p
-            kbest = k
-
+            kbest = k
     num_clusters = num_clusters if num_clusters is not None else (kbest + 1)
     return NME_SpectralClustering_sklearn(
         A, num_clusters, pbest
     )
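Note on PATCH 4: the two sparse scipy.sparse.linalg.eigsh calls are replaced by a single dense torch.linalg.eigh, which runs on the GPU when one is available, at the cost of computing the full spectrum instead of only the extreme eigenvalues. A condensed, self-contained sketch of the device-aware eigengap selection (the helper name eigengap_num_clusters is illustrative, not from the patch):

    import numpy as np
    import torch

    def eigengap_num_clusters(laplacian_np, max_num_clusters):
        # Dense symmetric eigendecomposition, on GPU when one is available.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        laplacian = torch.from_numpy(laplacian_np).float().to(device)
        lambdas, _ = torch.linalg.eigh(laplacian)
        e = np.sort(lambdas.cpu().numpy())
        # Gaps between consecutive sorted eigenvalues; the largest gap among
        # the first max_num_clusters candidates suggests the cluster count.
        g = e[1:] - e[:-1]
        return int(np.argmax(g[:max_num_clusters])) + 1

With the whole series applied, a typical invocation of the patched API looks roughly like this (argument names are taken from the patches; the file names are placeholders):

    from simple_diarizer.diarizer import Diarizer

    diar = Diarizer(embed_model="ecapa", cluster_method="nme-sc",
                    device="cuda", device_vad="cpu")
    segments = diar.diarize("audio.wav", num_speakers=None,
                            max_speakers=25, outfile="audio.rttm")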