Skip to content

Commit

Permalink
code optimize, model update, scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
R1ckShi committed Jan 15, 2024
1 parent a035d68 commit 97d648c
Show file tree
Hide file tree
Showing 44 changed files with 447 additions and 462 deletions.
22 changes: 4 additions & 18 deletions examples/industrial_data_pretraining/bicif_paraformer/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,14 @@
from funasr import AutoModel

model = AutoModel(model="damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
model_revision="v2.0.0",
model_revision="v2.0.2",
vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
vad_model_revision="v2.0.2",
punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
punc_model_revision="v2.0.1",
spk_model="/Users/shixian/code/modelscope_models/speech_campplus_sv_zh-cn_16k-common",
punc_model_revision="v2.0.2",
spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
spk_model_revision="v2.0.2",
)

res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_vad_punc_example.wav", batch_size_s=300, batch_size_threshold_s=60)
print(res)

'''try asr with speaker label with
model = AutoModel(model="damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
model_revision="v2.0.0",
vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
vad_model_revision="v2.0.2",
punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
punc_model_revision="v2.0.1",
spk_model="/Users/shixian/code/modelscope_models/speech_campplus_sv_zh-cn_16k-common",
spk_mode='punc_segment',
)
res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_speaker_demo.wav", batch_size_s=300, batch_size_threshold_s=60)
print(res)
'''
10 changes: 7 additions & 3 deletions examples/industrial_data_pretraining/bicif_paraformer/infer.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@

model="damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model_revision="v2.0.0"
model_revision="v2.0.2"
vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
vad_model_revision="v2.0.0"
vad_model_revision="v2.0.2"
punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
punc_model_revision="v2.0.1"
punc_model_revision="v2.0.2"
spk_model="damo/speech_campplus_sv_zh-cn_16k-common"
spk_model_revision="v2.0.2"

python funasr/bin/inference.py \
+model=${model} \
Expand All @@ -13,6 +15,8 @@ python funasr/bin/inference.py \
+vad_model_revision=${vad_model_revision} \
+punc_model=${punc_model} \
+punc_model_revision=${punc_model_revision} \
+spk_model=${spk_model} \
+spk_model_revision=${spk_model_revision} \
+input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_vad_punc_example.wav" \
+output_dir="./outputs/debug" \
+device="cpu" \
Expand Down
4 changes: 3 additions & 1 deletion examples/industrial_data_pretraining/campplus_sv/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@

from funasr import AutoModel

model = AutoModel(model="/Users/shixian/code/modelscope_models/speech_campplus_sv_zh-cn_16k-common")
model = AutoModel(model="damo/speech_campplus_sv_zh-cn_16k-common",
model_revision="v2.0.2",
)

res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav")
print(res)
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from funasr import AutoModel

model = AutoModel(model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404", model_revision="v2.0.0")
model = AutoModel(model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404", model_revision="v2.0.2")

res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
hotword='达摩院 魔搭')
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404"
model_revision="v2.0.0"
model_revision="v2.0.2"

python funasr/bin/inference.py \
+model=${model} \
Expand Down
4 changes: 2 additions & 2 deletions examples/industrial_data_pretraining/ct_transformer/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@

from funasr import AutoModel

model = AutoModel(model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", model_revision="v2.0.1")
model = AutoModel(model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", model_revision="v2.0.2")

res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt")
print(res)


from funasr import AutoModel

model = AutoModel(model="damo/punc_ct-transformer_cn-en-common-vocab471067-large", model_revision="v2.0.1")
model = AutoModel(model="damo/punc_ct-transformer_cn-en-common-vocab471067-large", model_revision="v2.0.2")

res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt")
print(res)
4 changes: 2 additions & 2 deletions examples/industrial_data_pretraining/ct_transformer/infer.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@

model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
model_revision="v2.0.1"
model_revision="v2.0.2"

model="damo/punc_ct-transformer_cn-en-common-vocab471067-large"
model_revision="v2.0.1"
model_revision="v2.0.2"

python funasr/bin/inference.py \
+model=${model} \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from funasr import AutoModel

model = AutoModel(model="damo/speech_timestamp_prediction-v1-16k-offline", model_revision="v2.0.0")
model = AutoModel(model="damo/speech_timestamp_prediction-v1-16k-offline", model_revision="v2.0.2")

res = model(input=("https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
"欢迎大家来到魔搭社区进行体验"),
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

model="damo/speech_timestamp_prediction-v1-16k-offline"
model_revision="v2.0.0"
model_revision="v2.0.2"

python funasr/bin/inference.py \
+model=${model} \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
from funasr import AutoModel

model = AutoModel(model="damo/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
model_revision="v2.0.0",
model_revision="v2.0.2",
vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
vad_model_revision="v2.0.2",
punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
punc_model_revision="v2.0.1",
punc_model_revision="v2.0.2",
spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
spk_model_revision="v2.0.0"
spk_model_revision="v2.0.2"
)

res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@

model="damo/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model_revision="v2.0.0"
model_revision="v2.0.2"
vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
vad_model_revision="v2.0.2"
punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
punc_model_revision="v2.0.1"
punc_model_revision="v2.0.2"
spk_model="damo/speech_campplus_sv_zh-cn_16k-common"
spk_model_revision="v2.0.0"
spk_model_revision="v2.0.2"

python funasr/bin/inference.py \
+model=${model} \
Expand Down
4 changes: 2 additions & 2 deletions examples/industrial_data_pretraining/paraformer/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@

from funasr import AutoModel

model = AutoModel(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revision="v2.0.0")
model = AutoModel(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revision="v2.0.2")

res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav")
print(res)


from funasr import AutoFrontend

frontend = AutoFrontend(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revision="v2.0.0")
frontend = AutoFrontend(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revision="v2.0.2")

fbanks = frontend(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", batch_size=2)

Expand Down
2 changes: 1 addition & 1 deletion examples/industrial_data_pretraining/paraformer/infer.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model_revision="v2.0.0"
model_revision="v2.0.2"

python funasr/bin/inference.py \
+model=${model} \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
encoder_chunk_look_back = 4 # number of chunks to look back for encoder self-attention
decoder_chunk_look_back = 1 # number of encoder chunks to look back for decoder cross-attention

model = AutoModel(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online", model_revision="v2.0.0")
model = AutoModel(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online", model_revision="v2.0.2")
cache = {}
res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
chunk_size=chunk_size,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online"
model_revision="v2.0.0"
model_revision="v2.0.2"

python funasr/bin/inference.py \
+model=${model} \
Expand Down
4 changes: 2 additions & 2 deletions examples/industrial_data_pretraining/seaco_paraformer/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
from funasr import AutoModel

model = AutoModel(model="damo/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
model_revision="v2.0.0",
model_revision="v2.0.2",
vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
vad_model_revision="v2.0.2",
punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
punc_model_revision="v2.0.1",
punc_model_revision="v2.0.2",
)

res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@

model="damo/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model_revision="v2.0.0"
model_revision="v2.0.2"
vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
vad_model_revision="v2.0.2"
punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
punc_model_revision="v2.0.1"
punc_model_revision="v2.0.2"

python funasr/bin/inference.py \
+model=${model} \
Expand Down
2 changes: 1 addition & 1 deletion funasr/bin/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ def generate(self, input, input_len=None, model=None, kwargs=None, key=None, **c

time1 = time.perf_counter()
with torch.no_grad():
results, meta_data = model.generate(**batch, **kwargs)
results, meta_data = model.inference(**batch, **kwargs)
time2 = time.perf_counter()

asr_result_list.extend(results)
Expand Down
18 changes: 11 additions & 7 deletions funasr/models/bat/model.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,27 @@
"""Boundary Aware Transducer (BAT) model."""
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)

import logging
from contextlib import contextmanager
from typing import Dict, List, Optional, Tuple, Union

import torch
import logging
import torch.nn as nn
from packaging.version import parse as V

from typing import Dict, List, Optional, Tuple, Union


from torch.cuda.amp import autocast
from funasr.losses.label_smoothing_loss import (
LabelSmoothingLoss, # noqa: H301
)

from funasr.models.transformer.utils.nets_utils import get_transducer_task_io
from funasr.metrics.compute_acc import th_accuracy
from funasr.models.transformer.utils.nets_utils import make_pad_mask
from funasr.models.transformer.utils.add_sos_eos import add_sos_eos
from funasr.train_utils.device_funcs import force_gatherable

from torch.cuda.amp import autocast




Expand Down
38 changes: 18 additions & 20 deletions funasr/models/bicif_paraformer/cif_predictor.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)

import torch
from torch import nn
from torch import Tensor
import logging
import numpy as np
from funasr.train_utils.device_funcs import to_device
from funasr.models.transformer.utils.nets_utils import make_pad_mask
from funasr.models.scama.utils import sequence_mask
from typing import Optional, Tuple

from funasr.register import tables
from funasr.models.transformer.utils.nets_utils import make_pad_mask


class mae_loss(nn.Module):
class mae_loss(torch.nn.Module):

def __init__(self, normalize_length=False):
super(mae_loss, self).__init__()
Expand Down Expand Up @@ -95,7 +93,7 @@ def cif_wo_hidden(alphas, threshold):
return fires

@tables.register("predictor_classes", "CifPredictorV3")
class CifPredictorV3(nn.Module):
class CifPredictorV3(torch.nn.Module):
def __init__(self,
idim,
l_order,
Expand All @@ -116,9 +114,9 @@ def __init__(self,
):
super(CifPredictorV3, self).__init__()

self.pad = nn.ConstantPad1d((l_order, r_order), 0)
self.cif_conv1d = nn.Conv1d(idim, idim, l_order + r_order + 1)
self.cif_output = nn.Linear(idim, 1)
self.pad = torch.nn.ConstantPad1d((l_order, r_order), 0)
self.cif_conv1d = torch.nn.Conv1d(idim, idim, l_order + r_order + 1)
self.cif_output = torch.nn.Linear(idim, 1)
self.dropout = torch.nn.Dropout(p=dropout)
self.threshold = threshold
self.smooth_factor = smooth_factor
Expand All @@ -131,14 +129,14 @@ def __init__(self,
self.upsample_type = upsample_type
self.use_cif1_cnn = use_cif1_cnn
if self.upsample_type == 'cnn':
self.upsample_cnn = nn.ConvTranspose1d(idim, idim, self.upsample_times, self.upsample_times)
self.cif_output2 = nn.Linear(idim, 1)
self.upsample_cnn = torch.nn.ConvTranspose1d(idim, idim, self.upsample_times, self.upsample_times)
self.cif_output2 = torch.nn.Linear(idim, 1)
elif self.upsample_type == 'cnn_blstm':
self.upsample_cnn = nn.ConvTranspose1d(idim, idim, self.upsample_times, self.upsample_times)
self.blstm = nn.LSTM(idim, idim, 1, bias=True, batch_first=True, dropout=0.0, bidirectional=True)
self.cif_output2 = nn.Linear(idim*2, 1)
self.upsample_cnn = torch.nn.ConvTranspose1d(idim, idim, self.upsample_times, self.upsample_times)
self.blstm = torch.nn.LSTM(idim, idim, 1, bias=True, batch_first=True, dropout=0.0, bidirectional=True)
self.cif_output2 = torch.nn.Linear(idim*2, 1)
elif self.upsample_type == 'cnn_attn':
self.upsample_cnn = nn.ConvTranspose1d(idim, idim, self.upsample_times, self.upsample_times)
self.upsample_cnn = torch.nn.ConvTranspose1d(idim, idim, self.upsample_times, self.upsample_times)
from funasr.models.transformer.encoder import EncoderLayer as TransformerEncoderLayer
from funasr.models.transformer.attention import MultiHeadedAttention
from funasr.models.transformer.positionwise_feed_forward import PositionwiseFeedForward
Expand All @@ -157,7 +155,7 @@ def __init__(self,
True, #normalize_before,
False, #concat_after,
)
self.cif_output2 = nn.Linear(idim, 1)
self.cif_output2 = torch.nn.Linear(idim, 1)
self.smooth_factor2 = smooth_factor2
self.noise_threshold2 = noise_threshold2

Expand Down
Loading

0 comments on commit 97d648c

Please sign in to comment.