diff --git a/README.md b/README.md
index 06d3a98..2c21e2c 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-## Updated on 2024.09.01
+## Updated on 2024.09.02
 > Usage instructions: [here](./docs/README.md#usage)
 > This page is modified from [here](https://github.com/Vincentqyw/cv-arxiv-daily)
@@ -563,13 +563,13 @@
 |**2023-12-21**|**Self-Supervised Adaptive AV Fusion Module for Pre-Trained ASR Models**|Christopher Simic et.al.|[2312.13873](http://arxiv.org/abs/2312.13873)|null|
 |**2024-02-03**|**kNN-CTC: Enhancing ASR via Retrieval of CTC Pseudo Labels**|Jiaming Zhou et.al.|[2312.13560](http://arxiv.org/abs/2312.13560)|**[link](https://github.com/nku-hlt/knn-ctc)**|
-
+
 ## TTS
 |Publish Date|Title|Authors|PDF|Code|
 |---|---|---|---|---|
-|**2024-08-29**|**Mini-Omni: Language Models Can Hear, Talk While Thinking in Streaming**|Zhifei Xie et.al.|[2408.16725](http://arxiv.org/abs/2408.16725)|null|
+|**2024-08-30**|**Mini-Omni: Language Models Can Hear, Talk While Thinking in Streaming**|Zhifei Xie et.al.|[2408.16725](http://arxiv.org/abs/2408.16725)|**[link](https://github.com/gpt-omni/mini-omni)**|
 |**2024-08-29**|**RAVE for Speech: Efficient Voice Conversion at High Sampling Rates**|Anders R. Bargum et.al.|[2408.16546](http://arxiv.org/abs/2408.16546)|null|
 |**2024-08-29**|**Enabling Beam Search for Language Model-Based Text-to-Speech Synthesis**|Zehai Tu et.al.|[2408.16373](http://arxiv.org/abs/2408.16373)|null|
 |**2024-08-28**|**Multi-modal Adversarial Training for Zero-Shot Voice Cloning**|John Janiczek et.al.|[2408.15916](http://arxiv.org/abs/2408.15916)|null|
@@ -698,5 +698,5 @@
 |**2024-06-11**|**Towards Expressive Zero-Shot Speech Synthesis with Hierarchical Prosody Modeling**|Yuepeng Jiang et.al.|[2406.05681](http://arxiv.org/abs/2406.05681)|null|
 |**2024-06-12**|**Text-aware and Context-aware Expressive Audiobook Speech Synthesis**|Dake Guo et.al.|[2406.05672](http://arxiv.org/abs/2406.05672)|null|
-
+
diff --git a/docs/asr-arxiv-daily-web.json b/docs/asr-arxiv-daily-web.json
index 70346a2..a35de80 100644
--- a/docs/asr-arxiv-daily-web.json
+++ b/docs/asr-arxiv-daily-web.json
@@ -1 +1 @@
-{"ASR": {"2408.00624": "|**2024-08-01**|**SynesLM: A Unified Approach for Audio-visual Speech Recognition and Translation via Language Model and Synthetic Data**|Yichen Lu et.al.|[2408.00624](http://arxiv.org/abs/2408.00624)|**[link](https://github.com/espnet/espnet)**|\n", "2408.00205": "|**2024-08-01**|**Sentence-wise Speech Summarization: Task, Datasets, and End-to-End Modeling with LM Knowledge Distillation**|Kohei Matsuura et.al.|[2408.00205](http://arxiv.org/abs/2408.00205)|null|\n", "2407.21783": "|**2024-08-15**|**The Llama 3 Herd
of Models**|Abhimanyu Dubey et.al.|[2407.21783](http://arxiv.org/abs/2407.21783)|null|\n", "2407.21476": "|**2024-07-31**|**On the Problem of Text-To-Speech Model Selection for Synthetic Data Generation in Automatic Speech Recognition**|Nick Rossenbach et.al.|[2407.21476](http://arxiv.org/abs/2407.21476)|null|\n", "2407.21414": "|**2024-07-31**|**Towards interfacing large language models with ASR systems using confidence measures and prompting**|Maryam Naderi et.al.|[2407.21414](http://arxiv.org/abs/2407.21414)|null|\n", "2407.21211": "|**2024-07-30**|**Self-Supervised Models in Automatic Whispered Speech Recognition**|Aref Farhadipour et.al.|[2407.21211](http://arxiv.org/abs/2407.21211)|null|\n", "2407.21066": "|**2024-07-28**|**ELP-Adapters: Parameter Efficient Adapter Tuning for Various Speech Processing Tasks**|Nakamasa Inoue et.al.|[2407.21066](http://arxiv.org/abs/2407.21066)|null|\n", "2407.21061": "|**2024-07-26**|**Improving noisy student training for low-resource languages in End-to-End ASR using CycleGAN and inter-domain losses**|Chia-Yu Li et.al.|[2407.21061](http://arxiv.org/abs/2407.21061)|null|\n", "2407.18581": "|**2024-08-07**|**Dynamic Language Group-Based MoE: Enhancing Code-Switching Speech Recognition with Hierarchical Routing**|Hukai Huang et.al.|[2407.18581](http://arxiv.org/abs/2407.18581)|**[link](https://github.com/kaihuhuang/language-group)**|\n", "2407.18571": "|**2024-07-29**|**Speech Bandwidth Expansion Via High Fidelity Generative Adversarial Networks**|Mahmoud Salhab et.al.|[2407.18571](http://arxiv.org/abs/2407.18571)|null|\n", "2407.18461": "|**2024-07-26**|**Enhancing Dysarthric Speech Recognition for Unseen Speakers via Prototype-Based Adaptation**|Shiyao Wang et.al.|[2407.18461](http://arxiv.org/abs/2407.18461)|**[link](https://github.com/nku-hlt/pb-dsr)**|\n", "2407.17997": "|**2024-07-25**|**On the Effect of Purely Synthetic Training Data for Different Automatic Speech Recognition Architectures**|Nick Rossenbach 
et.al.|[2407.17997](http://arxiv.org/abs/2407.17997)|null|\n", "2407.17874": "|**2024-07-25**|**Improving Domain-Specific ASR with LLM-Generated Contextual Descriptions**|Jiwon Suh et.al.|[2407.17874](http://arxiv.org/abs/2407.17874)|null|\n", "2407.17852": "|**2024-07-25**|**Scaling A Simple Approach to Zero-Shot Speech Recognition**|Jinming Zhao et.al.|[2407.17852](http://arxiv.org/abs/2407.17852)|**[link](https://github.com/facebookresearch/fairseq)**|\n", "2407.17605": "|**2024-07-24**|**Coupling Speech Encoders with Downstream Text Models**|Ciprian Chelba et.al.|[2407.17605](http://arxiv.org/abs/2407.17605)|null|\n", "2407.17160": "|**2024-07-24**|**A Comparative Analysis of Bilingual and Trilingual Wav2Vec Models for Automatic Speech Recognition in Multilingual Oral History Archives**|Jan Lehe\u010dka et.al.|[2407.17160](http://arxiv.org/abs/2407.17160)|null|\n", "2407.16537": "|**2024-07-23**|**Quantifying the Role of Textual Predictability in Automatic Speech Recognition**|Sean Robertson et.al.|[2407.16537](http://arxiv.org/abs/2407.16537)|null|\n", "2407.16447": "|**2024-07-23**|**The CHiME-8 DASR Challenge for Generalizable and Array Agnostic Distant Automatic Speech Recognition and Diarization**|Samuele Cornell et.al.|[2407.16447](http://arxiv.org/abs/2407.16447)|null|\n", "2407.16370": "|**2024-07-23**|**Evolutionary Prompt Design for LLM-Based Post-ASR Error Correction**|Rithik Sachdev et.al.|[2407.16370](http://arxiv.org/abs/2407.16370)|**[link](https://github.com/rithiksachdev/PostASR-Correction-SLT2024)**|\n", "2407.15835": "|**2024-07-22**|**dMel: Speech Tokenization made Simple**|He Bai et.al.|[2407.15835](http://arxiv.org/abs/2407.15835)|null|\n", "2407.15749": "|**2024-07-22**|**Robustness of Speech Separation Models for Similar-pitch Speakers**|Bunlong Lay et.al.|[2407.15749](http://arxiv.org/abs/2407.15749)|null|\n", "2407.15300": "|**2024-07-22**|**SELM: Enhancing Speech Emotion Recognition for Out-of-Domain Scenarios**|Hazim Bukhari 
et.al.|[2407.15300](http://arxiv.org/abs/2407.15300)|null|\n", "2407.14573": "|**2024-08-24**|**Trading Devil Final: Backdoor attack via Stock market and Bayesian Optimization**|Orson Mengara et.al.|[2407.14573](http://arxiv.org/abs/2407.14573)|null|\n", "2407.14021": "|**2024-07-19**|**GE2E-AC: Generalized End-to-End Loss Training for Accent Classification**|Chihiro Watanabe et.al.|[2407.14021](http://arxiv.org/abs/2407.14021)|null|\n", "2407.13982": "|**2024-07-19**|**Reexamining Racial Disparities in Automatic Speech Recognition Performance: The Role of Confounding by Provenance**|Changye Li et.al.|[2407.13982](http://arxiv.org/abs/2407.13982)|null|\n", "2408.00005": "|**2024-07-18**|**Framework for Curating Speech Datasets and Evaluating ASR Systems: A Case Study for Polish**|Micha\u0142 Junczyk et.al.|[2408.00005](http://arxiv.org/abs/2408.00005)|**[link](https://github.com/goodmike31/pl-asr-bigos-tools)**|\n", "2408.00004": "|**2024-07-18**|**Handling Numeric Expressions in Automatic Speech Recognition**|Christian Huber et.al.|[2408.00004](http://arxiv.org/abs/2408.00004)|null|\n", "2407.13300": "|**2024-07-18**|**Robust ASR Error Correction with Conservative Data Filtering**|Takuma Udagawa et.al.|[2407.13300](http://arxiv.org/abs/2407.13300)|null|\n", "2407.13292": "|**2024-07-18**|**Low-Resourced Speech Recognition for Iu Mien Language via Weakly-Supervised Phoneme-based Multilingual Pre-training**|Lukuan Dong et.al.|[2407.13292](http://arxiv.org/abs/2407.13292)|null|\n", "2407.13266": "|**2024-07-18**|**How Private is Low-Frequency Speech Audio in the Wild? 
An Analysis of Verbal Intelligibility by Humans and Machines**|Ailin Liu et.al.|[2407.13266](http://arxiv.org/abs/2407.13266)|null|\n", "2407.13142": "|**2024-07-18**|**A light-weight and efficient punctuation and word casing prediction model for on-device streaming ASR**|Jian You et.al.|[2407.13142](http://arxiv.org/abs/2407.13142)|null|\n", "2407.12389": "|**2024-07-17**|**Morphosyntactic Analysis for CHILDES**|Houjun Liu et.al.|[2407.12389](http://arxiv.org/abs/2407.12389)|null|\n", "2407.12240": "|**2024-07-17**|**Adaptive Cascading Network for Continual Test-Time Adaptation**|Kien X. Nguyen et.al.|[2407.12240](http://arxiv.org/abs/2407.12240)|null|\n", "2407.12094": "|**2024-07-16**|**Identifying Speakers in Dialogue Transcripts: A Text-based Approach Using Pretrained Language Models**|Minh Nguyen et.al.|[2407.12094](http://arxiv.org/abs/2407.12094)|**[link](https://github.com/adobe-research/speaker-identification)**|\n", "2407.11828": "|**2024-07-17**|**Vibravox: A Dataset of French Speech Captured with Body-conduction Audio Sensors**|Julien Hauret et.al.|[2407.11828](http://arxiv.org/abs/2407.11828)|**[link](https://github.com/jhauret/vibravox)**|\n", "2407.11641": "|**2024-07-16**|**Investigating the Effect of Label Topology and Training Criterion on ASR Performance and Alignment Quality**|Tina Raissi et.al.|[2407.11641](http://arxiv.org/abs/2407.11641)|null|\n", "2407.11516": "|**2024-07-16**|**The VoicePrivacy 2022 Challenge: Progress and Perspectives in Voice Anonymisation**|Michele Panariello et.al.|[2407.11516](http://arxiv.org/abs/2407.11516)|null|\n", "2407.11345": "|**2024-07-16**|**Beyond Binary: Multiclass Paraphasia Detection with Generative Pretrained Transformers and End-to-End Models**|Matthew Perez et.al.|[2407.11345](http://arxiv.org/abs/2407.11345)|null|\n", "2407.10603": "|**2024-07-15**|**Leave No Knowledge Behind During Knowledge Distillation: Towards Practical and Effective Knowledge Distillation for Code-Switching ASR Using Realistic 
Data**|Liang-Hsuan Tseng et.al.|[2407.10603](http://arxiv.org/abs/2407.10603)|null|\n", "2407.10303": "|**2024-07-14**|**Improving Neural Biasing for Contextual Speech Recognition by Early Context Injection and Text Perturbation**|Ruizhe Huang et.al.|[2407.10303](http://arxiv.org/abs/2407.10303)|null|\n", "2407.10255": "|**2024-07-14**|**CUSIDE-T: Chunking, Simulating Future and Decoding for Transducer based Streaming ASR**|Wenbo Zhao et.al.|[2407.10255](http://arxiv.org/abs/2407.10255)|null|\n", "2407.10118": "|**2024-07-14**|**Textless Dependency Parsing by Labeled Sequence Prediction**|Shunsuke Kando et.al.|[2407.10118](http://arxiv.org/abs/2407.10118)|**[link](https://github.com/mynlp/speechparser)**|\n", "2407.10048": "|**2024-07-14**|**Whisper-SV: Adapting Whisper for Low-data-resource Speaker Verification**|Li Zhang et.al.|[2407.10048](http://arxiv.org/abs/2407.10048)|null|\n", "2407.09849": "|**2024-07-13**|**Text-Based Detection of On-Hold Scripts in Contact Center Calls**|Dmitrii Galimzianov et.al.|[2407.09849](http://arxiv.org/abs/2407.09849)|**[link](https://github.com/gal-dmitry/HOLD_DETECTION_PUBLIC)**|\n", "2407.09817": "|**2024-08-24**|**Empowering Whisper as a Joint Multi-Talker and Target-Talker Speech Recognition System**|Lingwei Meng et.al.|[2407.09817](http://arxiv.org/abs/2407.09817)|**[link](https://github.com/LingweiMeng/Whisper-Sidecar)**|\n", "2407.09807": "|**2024-07-13**|**A Streaming Multi-Channel End-to-End Speech Recognition System with Realistic Evaluations**|Xiangzhu Kong et.al.|[2407.09807](http://arxiv.org/abs/2407.09807)|**[link](https://github.com/thu-spmi/cat)**|\n", "2407.09732": "|**2024-07-13**|**Speech Slytherin: Examining the Performance and Efficiency of Mamba for Speech Separation, Recognition, and Synthesis**|Xilin Jiang et.al.|[2407.09732](http://arxiv.org/abs/2407.09732)|**[link](https://github.com/xi-j/Mamba-TasNet)**|\n", "2407.08618": "|**2024-08-12**|**Tamil Language Computing: the Present and the 
Future**|Kengatharaiyer Sarveswaran et.al.|[2407.08618](http://arxiv.org/abs/2407.08618)|null|\n", "2407.08658": "|**2024-07-10**|**Evaluating Voice Command Pipelines for Drone Control: From STT and LLM to Direct Classification and Siamese Networks**|Lucca Emmanuel Pineli Sim\u00f5es et.al.|[2407.08658](http://arxiv.org/abs/2407.08658)|null|\n", "2407.07566": "|**2024-07-10**|**HebDB: a Weakly Supervised Dataset for Hebrew Speech Processing**|Arnon Turetzky et.al.|[2407.07566](http://arxiv.org/abs/2407.07566)|null|\n", "2407.18930": "|**2024-07-10**|**Dynamic Encoder Size Based on Data-Driven Layer-wise Pruning for Speech Recognition**|Jingjing Xu et.al.|[2407.18930](http://arxiv.org/abs/2407.18930)|null|\n", "2407.17416": "|**2024-07-10**|**Explaining Spectrograms in Machine Learning: A Study on Neural Networks for Speech Classification**|Jesin James et.al.|[2407.17416](http://arxiv.org/abs/2407.17416)|null|\n", "2407.06606": "|**2024-07-09**|**Tailored Design of Audio-Visual Speech Recognition Models using Branchformers**|David Gimeno-G\u00f3mez et.al.|[2407.06606](http://arxiv.org/abs/2407.06606)|**[link](https://github.com/david-gimeno/tailored-avsr)**|\n", "2407.06310": "|**2024-07-08**|**Homogeneous Speaker Features for On-the-Fly Dysarthric and Elderly Speaker Adaptation**|Mengzhe Geng et.al.|[2407.06310](http://arxiv.org/abs/2407.06310)|null|\n", "2407.18332": "|**2024-07-08**|**Analyzing Speech Unit Selection for Textless Speech-to-Speech Translation**|Jarod Duret et.al.|[2407.18332](http://arxiv.org/abs/2407.18332)|null|\n", "2407.05407": "|**2024-07-09**|**CosyVoice: A Scalable Multilingual Zero-shot Text-to-speech Synthesizer based on Supervised Semantic Tokens**|Zhihao Du et.al.|[2407.05407](http://arxiv.org/abs/2407.05407)|null|\n", "2407.14525": "|**2024-07-07**|**Morse Code-Enabled Speech Recognition for Individuals with Visual and Hearing Impairments**|Ritabrata Roy Choudhury et.al.|[2407.14525](http://arxiv.org/abs/2407.14525)|null|\n", 
"2407.04675": "|**2024-07-10**|**Seed-ASR: Understanding Diverse Speech and Contexts with LLM-based Speech Recognition**|Ye Bai et.al.|[2407.04675](http://arxiv.org/abs/2407.04675)|null|\n", "2407.04662": "|**2024-07-05**|**Multitaper mel-spectrograms for keyword spotting**|Douglas Baptista de Souza et.al.|[2407.04662](http://arxiv.org/abs/2407.04662)|null|\n", "2407.04652": "|**2024-07-05**|**Pretraining End-to-End Keyword Search with Automatically Discovered Acoustic Units**|Bolaji Yusuf et.al.|[2407.04652](http://arxiv.org/abs/2407.04652)|**[link](https://github.com/beer-asr/beer)**|\n", "2407.04641": "|**2024-07-05**|**Speculative Speech Recognition by Audio-Prefixed Low-Rank Adaptation of Language Models**|Bolaji Yusuf et.al.|[2407.04641](http://arxiv.org/abs/2407.04641)|null|\n", "2407.04601": "|**2024-07-05**|**Written Term Detection Improves Spoken Term Detection**|Bolaji Yusuf et.al.|[2407.04601](http://arxiv.org/abs/2407.04601)|**[link](https://github.com/bolajiy/golden-retriever)**|\n", "2407.04533": "|**2024-07-09**|**Performance Analysis of Speech Encoders for Low-Resource SLU and ASR in Tunisian Dialect**|Salima Mdhaffar et.al.|[2407.04533](http://arxiv.org/abs/2407.04533)|**[link](https://github.com/speechbrain/speechbrain)**|\n", "2407.04482": "|**2024-07-05**|**Controlling Whisper: Universal Acoustic Adversarial Attacks to Control Speech Foundation Models**|Vyas Raina et.al.|[2407.04482](http://arxiv.org/abs/2407.04482)|null|\n", "2407.04439": "|**2024-07-05**|**XLSR-Transducer: Streaming ASR for Self-Supervised Pretrained Models**|Shashi Kumar et.al.|[2407.04439](http://arxiv.org/abs/2407.04439)|null|\n", "2407.04368": "|**2024-07-05**|**Romanization Encoding For Multilingual ASR**|Wen Ding et.al.|[2407.04368](http://arxiv.org/abs/2407.04368)|null|\n", "2407.04280": "|**2024-07-05**|**LearnerVoice: A Dataset of Non-Native English Learners' Spontaneous Speech**|Haechan Kim et.al.|[2407.04280](http://arxiv.org/abs/2407.04280)|null|\n", "2407.04219": 
"|**2024-07-05**|**Semi-supervised Learning for Code-Switching ASR with Large Language Model Filter**|Yu Xi et.al.|[2407.04219](http://arxiv.org/abs/2407.04219)|null|\n", "2407.04051": "|**2024-07-11**|**FunAudioLLM: Voice Understanding and Generation Foundation Models for Natural Interaction Between Humans and LLMs**|Keyu An et.al.|[2407.04051](http://arxiv.org/abs/2407.04051)|**[link](https://github.com/FunAudioLLM/SenseVoice)**|\n", "2407.04047": "|**2024-07-04**|**Improving Accented Speech Recognition using Data Augmentation based on Unsupervised Text-to-Speech Synthesis**|Cong-Thanh Do et.al.|[2407.04047](http://arxiv.org/abs/2407.04047)|null|\n", "2407.03966": "|**2024-07-04**|**Serialized Output Training by Learned Dominance**|Ying Shi et.al.|[2407.03966](http://arxiv.org/abs/2407.03966)|null|\n", "2407.03809": "|**2024-07-04**|**Finetuning End-to-End Models for Estonian Conversational Spoken Language Translation**|Tiia Sildam et.al.|[2407.03809](http://arxiv.org/abs/2407.03809)|null|\n", "2407.03734": "|**2024-07-04**|**Improving Self-supervised Pre-training using Accent-Specific Codebooks**|Darshan Prabhu et.al.|[2407.03734](http://arxiv.org/abs/2407.03734)|**[link](https://github.com/csalt-research/accented-codebooks-asr)**|\n", "2407.03718": "|**2024-07-24**|**Multi-Convformer: Extending Conformer with Multiple Convolution Kernels**|Darshan Prabhu et.al.|[2407.03718](http://arxiv.org/abs/2407.03718)|**[link](https://github.com/espnet/espnet)**|\n", "2407.03563": "|**2024-07-04**|**Learning Video Temporal Dynamics with Cross-Modal Attention for Robust Audio-Visual Speech Recognition**|Sungnyun Kim et.al.|[2407.03563](http://arxiv.org/abs/2407.03563)|null|\n", "2407.03495": "|**2024-07-03**|**Codec-ASR: Training Performant Automatic Speech Recognition Systems with Discrete Speech Representations**|Kunal Dhawan et.al.|[2407.03495](http://arxiv.org/abs/2407.03495)|null|\n", "2407.03440": "|**2024-07-03**|**Advanced Framework for Animal Sound Classification 
With Features Optimization**|Qiang Yang et.al.|[2407.03440](http://arxiv.org/abs/2407.03440)|null|\n", "2407.03026": "|**2024-07-03**|**Qifusion-Net: Layer-adapted Stream/Non-stream Model for End-to-End Multi-Accent Speech Recognition**|Jinming Chen et.al.|[2407.03026](http://arxiv.org/abs/2407.03026)|null|\n", "2407.13782": "|**2024-07-03**|**Self-supervised ASR Models and Features For Dysarthric and Elderly Speech Recognition**|Shujie Hu et.al.|[2407.13782](http://arxiv.org/abs/2407.13782)|null|\n", "2407.02052": "|**2024-07-02**|**The USTC-NERCSLIP Systems for The ICMC-ASR Challenge**|Minghui Wu et.al.|[2407.02052](http://arxiv.org/abs/2407.02052)|null|\n", "2407.02543": "|**2024-07-02**|**Towards the Next Frontier in Speech Representation Learning Using Disentanglement**|Varun Krishna et.al.|[2407.02543](http://arxiv.org/abs/2407.02543)|null|\n", "2407.01909": "|**2024-07-02**|**Pinyin Regularization in Error Correction for Chinese Speech Recognition with Large Language Models**|Zhiyuan Tang et.al.|[2407.01909](http://arxiv.org/abs/2407.01909)|**[link](https://github.com/tzyll/ChineseHP)**|\n", "2407.17477": "|**2024-07-30**|**Toward Automated Detection of Biased Social Signals from the Content of Clinical Conversations**|Feng Chen et.al.|[2407.17477](http://arxiv.org/abs/2407.17477)|null|\n", "2407.00756": "|**2024-06-30**|**Less Forgetting for Better Generalization: Exploring Continual-learning Fine-tuning Methods for Speech Self-supervised Representations**|Salah Zaiem et.al.|[2407.00756](http://arxiv.org/abs/2407.00756)|null|\n", "2407.00518": "|**2024-06-29**|**When Robots Get Chatty: Grounding Multimodal Human-Robot Conversation and Collaboration**|Philipp Allgeuer et.al.|[2407.00518](http://arxiv.org/abs/2407.00518)|null|\n", "2407.12817": "|**2024-06-29**|**Error Correction by Paying Attention to Both Acoustic and Confidence References for Automatic Speech Recognition**|Yuchun Shu et.al.|[2407.12817](http://arxiv.org/abs/2407.12817)|null|\n", 
"2407.00463": "|**2024-07-18**|**Open-Source Conversational AI with SpeechBrain 1.0**|Mirco Ravanelli et.al.|[2407.00463](http://arxiv.org/abs/2407.00463)|null|\n", "2407.12029": "|**2024-06-29**|**A Quality-Aware Voltage Overscaling Framework to Improve the Energy Efficiency and Lifetime of TPUs based on Statistical Error Modeling**|Alireza Senobari et.al.|[2407.12029](http://arxiv.org/abs/2407.12029)|null|\n", "2407.12028": "|**2024-06-28**|**TreeSeg: Hierarchical Topic Segmentation of Large Transcripts**|Dimitrios C. Gklezakos et.al.|[2407.12028](http://arxiv.org/abs/2407.12028)|null|\n", "2406.19706": "|**2024-06-28**|**SAML: Speaker Adaptive Mixture of LoRA Experts for End-to-End ASR**|Qiuming Zhao et.al.|[2406.19706](http://arxiv.org/abs/2406.19706)|null|\n", "2406.19674": "|**2024-06-28**|**Less is More: Accurate Speech Recognition & Translation without Web-Scale Data**|Krishna C. Puvvada et.al.|[2406.19674](http://arxiv.org/abs/2406.19674)|null|\n", "2406.19564": "|**2024-06-27**|**Voices Unheard: NLP Resources and Models for Yor\u00f9b\u00e1 Regional Dialects**|Orevaoghene Ahia et.al.|[2406.19564](http://arxiv.org/abs/2406.19564)|**[link](https://github.com/orevaahia/yorulect)**|\n", "2406.19363": "|**2024-06-27**|**Tradition or Innovation: A Comparison of Modern ASR Methods for Forced Alignment**|Rotem Rousso et.al.|[2406.19363](http://arxiv.org/abs/2406.19363)|null|\n", "2406.19311": "|**2024-06-27**|**Zero-Query Adversarial Attack on Black-box Automatic Speech Recognition Systems**|Zheng Fang et.al.|[2406.19311](http://arxiv.org/abs/2406.19311)|null|\n", "2406.18972": "|**2024-06-27**|**Applying LLMs for Rescoring N-best ASR Hypotheses of Casual Conversations: Effects of Domain Adaptation and Context Carry-over**|Atsunori Ogawa et.al.|[2406.18972](http://arxiv.org/abs/2406.18972)|null|\n", "2406.18928": "|**2024-06-27**|**Enhanced ASR Robustness to Packet Loss with a Front-End Adaptation Network**|Yehoshua Dissen 
et.al.|[2406.18928](http://arxiv.org/abs/2406.18928)|null|\n", "2406.18862": "|**2024-06-27**|**Streaming Decoder-Only Automatic Speech Recognition with Discrete Speech Units: A Pilot Study**|Peikun Chen et.al.|[2406.18862](http://arxiv.org/abs/2406.18862)|**[link](https://github.com/chenpk00/IS2024_stream_decoder_only_asr)**|\n", "2406.18373": "|**2024-06-26**|**Dynamic Data Pruning for Automatic Speech Recognition**|Qiao Xiao et.al.|[2406.18373](http://arxiv.org/abs/2406.18373)|null|\n", "2406.18301": "|**2024-06-26**|**MSR-86K: An Evolving, Multilingual Corpus with 86,300 Hours of Transcribed Audio for Speech Recognition Research**|Song Li et.al.|[2406.18301](http://arxiv.org/abs/2406.18301)|null|\n", "2406.18135": "|**2024-06-26**|**Automatic Speech Recognition for Hindi**|Anish Saha et.al.|[2406.18135](http://arxiv.org/abs/2406.18135)|null|\n", "2406.18120": "|**2024-07-12**|**ArzEn-LLM: Code-Switched Egyptian Arabic-English Translation and Speech Recognition Using LLMs**|Ahmed Heakl et.al.|[2406.18120](http://arxiv.org/abs/2406.18120)|**[link](https://github.com/ahmedheakl/arazn-llm)**|\n", "2406.18021": "|**2024-06-26**|**SC-MoE: Switch Conformer Mixture of Experts for Unified Streaming and Non-streaming Code-Switching ASR**|Shuaishuai Ye et.al.|[2406.18021](http://arxiv.org/abs/2406.18021)|null|\n", "2406.17935": "|**2024-06-25**|**Sequential Editing for Lifelong Training of Speech Recognition Models**|Devang Kulshreshtha et.al.|[2406.17935](http://arxiv.org/abs/2406.17935)|null|\n", "2406.17926": "|**2024-06-25**|**FASA: a Flexible and Automatic Speech Aligner for Extracting High-quality Aligned Children Speech Data**|Dancheng Liu et.al.|[2406.17926](http://arxiv.org/abs/2406.17926)|**[link](https://github.com/DanchengLiu/FASA)**|\n", "2406.17618": "|**2024-06-25**|**Towards Building an End-to-End Multilingual Automatic Lyrics Transcription Model**|Jiawen Huang 
et.al.|[2406.17618](http://arxiv.org/abs/2406.17618)|**[link](https://github.com/jhuang448/MultilingualALT)**|\n", "2406.17614": "|**2024-06-25**|**MSRS: Training Multimodal Speech Recognition Models from Scratch with Sparse Mask Optimization**|Adriana Fernandez-Lopez et.al.|[2406.17614](http://arxiv.org/abs/2406.17614)|null|\n", "2406.17825": "|**2024-06-25**|**Automatic speech recognition for the Nepali language using CNN, bidirectional LSTM and ResNet**|Manish Dhakal et.al.|[2406.17825](http://arxiv.org/abs/2406.17825)|**[link](https://github.com/manishdhakal/asr-nepali-using-cnn-bilstm-resnet)**|\n", "2406.17272": "|**2024-06-25**|**A Comprehensive Solution to Connect Speech Encoder and Large Language Model for ASR**|Van Tung Pham et.al.|[2406.17272](http://arxiv.org/abs/2406.17272)|null|\n", "2406.17124": "|**2024-06-24**|**Investigating Confidence Estimation Measures for Speaker Diarization**|Anurag Chowdhury et.al.|[2406.17124](http://arxiv.org/abs/2406.17124)|null|\n", "2406.16808": "|**2024-06-24**|**Exploring the Capability of Mamba in Speech Applications**|Koichi Miyazaki et.al.|[2406.16808](http://arxiv.org/abs/2406.16808)|null|\n", "2406.16777": "|**2024-06-24**|**Blending LLMs into Cascaded Speech Translation: KIT's Offline Speech Translation System for IWSLT 2024**|Sai Koneru et.al.|[2406.16777](http://arxiv.org/abs/2406.16777)|null|\n", "2406.16120": "|**2024-06-23**|**Contextualized End-to-end Automatic Speech Recognition with Intermediate Biasing Loss**|Muhammad Shakeel et.al.|[2406.16120](http://arxiv.org/abs/2406.16120)|null|\n", "2406.16107": "|**2024-08-01**|**Decoder-only Architecture for Streaming End-to-end Speech Recognition**|Emiru Tsunoo et.al.|[2406.16107](http://arxiv.org/abs/2406.16107)|null|\n", "2406.15723": "|**2024-06-22**|**Acoustic Feature Mixup for Balanced Multi-aspect Pronunciation Assessment**|Heejin Do et.al.|[2406.15723](http://arxiv.org/abs/2406.15723)|null|\n", "2406.15668": "|**2024-06-21**|**PI-Whisper: An Adaptive and 
Incremental ASR Framework for Diverse and Evolving Speaker Characteristics**|Amir Nassereldine et.al.|[2406.15668](http://arxiv.org/abs/2406.15668)|null|\n", "2406.15265": "|**2024-06-21**|**Perception of Phonological Assimilation by Neural Speech Recognition Models**|Charlotte Pouw et.al.|[2406.15265](http://arxiv.org/abs/2406.15265)|null|\n", "2406.14890": "|**2024-06-21**|**InterBiasing: Boost Unseen Word Recognition through Biasing Intermediate Predictions**|Yu Nakagome et.al.|[2406.14890](http://arxiv.org/abs/2406.14890)|null|\n", "2406.14747": "|**2024-06-20**|**An Adapter-Based Unified Model for Multiple Spoken Language Processing Tasks**|Varsha Suresh et.al.|[2406.14747](http://arxiv.org/abs/2406.14747)|null|\n", "2406.14294": "|**2024-06-21**|**DASB - Discrete Audio and Speech Benchmark**|Pooneh Mousavi et.al.|[2406.14294](http://arxiv.org/abs/2406.14294)|null|\n", "2406.14266": "|**2024-06-20**|**Intelligent Interface: Enhancing Lecture Engagement with Didactic Activity Summaries**|Anna Wr\u00f3blewska et.al.|[2406.14266](http://arxiv.org/abs/2406.14266)|null|\n", "2406.13842": "|**2024-06-19**|**Joint vs Sequential Speaker-Role Detection and Automatic Speech Recognition for Air-traffic Control**|Alexander Blatt et.al.|[2406.13842](http://arxiv.org/abs/2406.13842)|null|\n", "2406.13502": "|**2024-06-19**|**ManWav: The First Manchu ASR Model**|Jean Seo et.al.|[2406.13502](http://arxiv.org/abs/2406.13502)|null|\n", "2406.13431": "|**2024-06-24**|**Children's Speech Recognition through Discrete Token Enhancement**|Vrunda N. 
Sukhadia et.al.|[2406.13431](http://arxiv.org/abs/2406.13431)|null|\n", "2406.12699": "|**2024-06-18**|**Bridging the Gap: Integrating Pre-trained Speech Enhancement and Recognition Models for Robust Speech Recognition**|Kuan-Chen Wang et.al.|[2406.12699](http://arxiv.org/abs/2406.12699)|null|\n", "2406.12674": "|**2024-06-18**|**Transcribe, Align and Segment: Creating speech datasets for low-resource languages**|Taras Sereda et.al.|[2406.12674](http://arxiv.org/abs/2406.12674)|null|\n", "2406.12621": "|**2024-06-18**|**Growing Trees on Sounds: Assessing Strategies for End-to-End Dependency Parsing of Speech**|Adrien Pupier et.al.|[2406.12621](http://arxiv.org/abs/2406.12621)|**[link](https://github.com/Pupiera/Growing_tree_on_sound)**|\n", "2406.12611": "|**2024-06-18**|**Rapid Language Adaptation for Multilingual E2E Speech Recognition Using Encoder Prompting**|Yosuke Kashiwagi et.al.|[2406.12611](http://arxiv.org/abs/2406.12611)|null|\n", "2406.12503": "|**2024-06-18**|**Unsupervised Online Continual Learning for Automatic Speech Recognition**|Steven Vander Eeckt et.al.|[2406.12503](http://arxiv.org/abs/2406.12503)|**[link](https://github.com/stevenvdeeckt/unsupervised-ocl-for-asr)**|\n", "2406.12387": "|**2024-06-18**|**Performant ASR Models for Medical Entities in Accented Speech**|Tejumade Afonja et.al.|[2406.12387](http://arxiv.org/abs/2406.12387)|null|\n", "2406.12317": "|**2024-06-18**|**Finding Task-specific Subnetworks in Multi-task Spoken Language Understanding Model**|Hayato Futami et.al.|[2406.12317](http://arxiv.org/abs/2406.12317)|null|\n", "2406.12233": "|**2024-06-18**|**SyncVSR: Data-Efficient Visual Speech Recognition with End-to-End Crossmodal Audio Token Synchronization**|Young Jin Ahn et.al.|[2406.12233](http://arxiv.org/abs/2406.12233)|**[link](https://github.com/KAIST-AILab/SyncVSR)**|\n", "2406.11546": "|**2024-06-17**|**GigaSpeech 2: An Evolving, Large-Scale and Multi-domain ASR Corpus for Low-Resource Languages with Automated Crawling, 
Transcription and Refinement**|Yifan Yang et.al.|[2406.11546](http://arxiv.org/abs/2406.11546)|**[link](https://github.com/SpeechColab/GigaSpeech2)**|\n", "2406.12937": "|**2024-06-17**|**Self-Train Before You Transcribe**|Robert Flynn et.al.|[2406.12937](http://arxiv.org/abs/2406.12937)|**[link](https://github.com/robflynnyh/Self-Train-Before-You-Transcribe)**|\n", "2406.11064": "|**2024-06-16**|**Continual Test-time Adaptation for End-to-end Speech Recognition on Noisy Speech**|Guan-Ting Lin et.al.|[2406.11064](http://arxiv.org/abs/2406.11064)|null|\n", "2406.11037": "|**2024-06-16**|**NAST: Noise Aware Speech Tokenization for Speech Language Models**|Shoval Messica et.al.|[2406.11037](http://arxiv.org/abs/2406.11037)|**[link](https://github.com/ShovalMessica/NAST)**|\n", "2406.11025": "|**2024-06-16**|**Large Language Models for Dysfluency Detection in Stuttered Speech**|Dominik Wagner et.al.|[2406.11025](http://arxiv.org/abs/2406.11025)|null|\n", "2406.11022": "|**2024-06-16**|**Outlier Reduction with Gated Attention for Improved Post-training Quantization in Large Sequence-to-sequence Speech Foundation Models**|Dominik Wagner et.al.|[2406.11022](http://arxiv.org/abs/2406.11022)|null|\n", "2406.11016": "|**2024-06-16**|**Optimized Speculative Sampling for GPU Hardware Accelerators**|Dominik Wagner et.al.|[2406.11016](http://arxiv.org/abs/2406.11016)|null|\n", "2406.10993": "|**2024-06-16**|**CoSTA: Code-Switched Speech Translation using Aligned Speech-Text Interleaving**|Bhavani Shankar et.al.|[2406.10993](http://arxiv.org/abs/2406.10993)|null|\n", "2406.10932": "|**2024-06-16**|**Imperceptible Rhythm Backdoor Attacks: Exploring Rhythm Transformation for Embedding Undetectable Vulnerabilities on Speech Recognition**|Wenhan Yao et.al.|[2406.10932](http://arxiv.org/abs/2406.10932)|null|\n", "2406.12931": "|**2024-06-16**|**Automatic Speech Recognition for Biomedical Data in Bengali Language**|Shariar Kabir 
et.al.|[2406.12931](http://arxiv.org/abs/2406.12931)|null|\n", "2406.10741": "|**2024-06-15**|**Speech Emotion Recognition Using CNN and Its Use Case in Digital Healthcare**|Nishargo Nigar et.al.|[2406.10741](http://arxiv.org/abs/2406.10741)|null|\n", "2406.10719": "|**2024-06-21**|**Trading Devil: Robust backdoor attack via Stochastic investment models and Bayesian approach**|Orson Mengara et.al.|[2406.10719](http://arxiv.org/abs/2406.10719)|null|\n", "2406.10177": "|**2024-06-14**|**Inclusive ASR for Disfluent Speech: Cascaded Large-Scale Self-Supervised Learning with Targeted Fine-Tuning and Data Augmentation**|Dena Mujtaba et.al.|[2406.10177](http://arxiv.org/abs/2406.10177)|null|\n", "2406.10083": "|**2024-06-14**|**On the Evaluation of Speech Foundation Models for Spoken Language Understanding**|Siddhant Arora et.al.|[2406.10083](http://arxiv.org/abs/2406.10083)|null|\n", "2406.10082": "|**2024-06-14**|**Whisper-Flamingo: Integrating Visual Features into Whisper for Audio-Visual Speech Recognition and Translation**|Andrew Rouditchenko et.al.|[2406.10082](http://arxiv.org/abs/2406.10082)|**[link](https://github.com/roudimit/whisper-flamingo)**|\n", "2406.10052": "|**2024-06-14**|**Simul-Whisper: Attention-Guided Streaming Whisper with Truncation Detection**|Haoyu Wang et.al.|[2406.10052](http://arxiv.org/abs/2406.10052)|**[link](https://github.com/backspacetg/simul_whisper)**|\n", "2406.09999": "|**2024-06-14**|**ROAR: Reinforcing Original to Augmented Data Ratio Dynamics for Wav2Vec2.0 Based ASR**|Vishwanath Pratap Singh et.al.|[2406.09999](http://arxiv.org/abs/2406.09999)|null|\n", "2406.10313": "|**2024-06-14**|**CNVSRC 2023: The First Chinese Continuous Visual Speech Recognition Challenge**|Chen Chen et.al.|[2406.10313](http://arxiv.org/abs/2406.10313)|null|\n", "2406.09950": "|**2024-06-14**|**An efficient text augmentation approach for contextualized Mandarin speech recognition**|Naijun Zheng et.al.|[2406.09950](http://arxiv.org/abs/2406.09950)|null|\n", 
"2406.09873": "|**2024-06-14**|**Perceiver-Prompt: Flexible Speaker Adaptation in Whisper for Chinese Disordered Speech Recognition**|Yicong Jiang et.al.|[2406.09873](http://arxiv.org/abs/2406.09873)|null|\n", "2406.09869": "|**2024-06-14**|**MMM: Multi-Layer Multi-Residual Multi-Stream Discrete Speech Representation from Self-supervised Learning Model**|Jiatong Shi et.al.|[2406.09869](http://arxiv.org/abs/2406.09869)|null|\n", "2406.09676": "|**2024-06-14**|**Optimizing Byte-level Representation for End-to-end ASR**|Roger Hsiao et.al.|[2406.09676](http://arxiv.org/abs/2406.09676)|null|\n", "2406.09662": "|**2024-06-14**|**Learning Language Structures through Grounding**|Freda Shi et.al.|[2406.09662](http://arxiv.org/abs/2406.09662)|null|\n", "2406.09618": "|**2024-06-13**|**Multi-Modal Retrieval For Large Language Model Based Speech Recognition**|Jari Kolehmainen et.al.|[2406.09618](http://arxiv.org/abs/2406.09618)|null|\n", "2406.09569": "|**2024-06-13**|**Speech ReaLLM -- Real-time Streaming Speech Recognition with Multimodal LLMs by Teaching the Flow of Time**|Frank Seide et.al.|[2406.09569](http://arxiv.org/abs/2406.09569)|null|\n", "2406.09494": "|**2024-06-13**|**The Second DISPLACE Challenge : DIarization of SPeaker and LAnguage in Conversational Environments**|Shareef Babu Kalluri et.al.|[2406.09494](http://arxiv.org/abs/2406.09494)|null|\n", "2406.09202": "|**2024-06-13**|**Language Complexity and Speech Recognition Accuracy: Orthographic Complexity Hurts, Phonological Complexity Doesn't**|Chihiro Taguchi et.al.|[2406.09202](http://arxiv.org/abs/2406.09202)|**[link](https://github.com/ctaguchi/asrcomplexity)**|\n", "2406.09153": "|**2024-06-13**|**LASER: Learning by Aligning Self-supervised Representations of Speech for Improving Content-related Tasks**|Amit Meghanani et.al.|[2406.09153](http://arxiv.org/abs/2406.09153)|**[link](https://github.com/Trikaldarshi/LASER)**|\n", "2406.08914": "|**2024-06-13**|**Transcription-Free Fine-Tuning of Speech 
Separation Models for Noisy and Reverberant Multi-Speaker Automatic Speech Recognition**|William Ravenscroft et.al.|[2406.08914](http://arxiv.org/abs/2406.08914)|null|\n", "2406.08904": "|**2024-06-13**|**AdaPTwin: Low-Cost Adaptive Compression of Product Twins in Transformers**|Emil Biju et.al.|[2406.08904](http://arxiv.org/abs/2406.08904)|null|\n", "2406.08641": "|**2024-06-12**|**ML-SUPERB 2.0: Benchmarking Multilingual Speech Models Across Modeling Constraints, Languages, and Datasets**|Jiatong Shi et.al.|[2406.08641](http://arxiv.org/abs/2406.08641)|null|\n", "2406.08396": "|**2024-06-12**|**Neural Blind Source Separation and Diarization for Distant Speech Recognition**|Yoshiaki Bando et.al.|[2406.08396](http://arxiv.org/abs/2406.08396)|null|\n", "2406.08380": "|**2024-06-12**|**Towards Unsupervised Speech Recognition Without Pronunciation Models**|Junrui Ni et.al.|[2406.08380](http://arxiv.org/abs/2406.08380)|null|\n", "2406.08353": "|**2024-06-12**|**Speech Emotion Recognition with ASR Transcripts: A Comprehensive Study on Word Error Rate and Fusion Techniques**|Yuanchao Li et.al.|[2406.08353](http://arxiv.org/abs/2406.08353)|**[link](https://github.com/yc-li20/SER-on-WER-and-Fusion)**|\n", "2406.08266": "|**2024-06-13**|**Refining Self-Supervised Learnt Speech Representation using Brain Activations**|Hengyu Li et.al.|[2406.08266](http://arxiv.org/abs/2406.08266)|null|\n", "2406.08207": "|**2024-06-12**|**Transformer-based Model for ASR N-Best Rescoring and Rewriting**|Iwen E. 
Kang et.al.|[2406.08207](http://arxiv.org/abs/2406.08207)|null|\n", "2406.08111": "|**2024-06-12**|**Audio-conditioned phonemic and prosodic annotation for building text-to-speech models from unlabeled speech data**|Yuma Shirahata et.al.|[2406.08111](http://arxiv.org/abs/2406.08111)|null|\n", "2406.10284": "|**2024-06-12**|**Improving child speech recognition with augmented child-like speech**|Yuanyuan Zhang et.al.|[2406.10284](http://arxiv.org/abs/2406.10284)|null|\n", "2406.07914": "|**2024-06-14**|**Can Large Language Models Understand Spatial Audio?**|Changli Tang et.al.|[2406.07914](http://arxiv.org/abs/2406.07914)|null|\n", "2406.07909": "|**2024-06-12**|**Guiding Frame-Level CTC Alignments Using Self-knowledge Distillation**|Eungbeom Kim et.al.|[2406.07909](http://arxiv.org/abs/2406.07909)|null|\n", "2406.07846": "|**2024-06-12**|**DualVC 3: Leveraging Language Model Generated Pseudo Context for End-to-end Low Latency Streaming Voice Conversion**|Ziqian Ning et.al.|[2406.07846](http://arxiv.org/abs/2406.07846)|null|\n", "2406.07842": "|**2024-06-12**|**Dual-Pipeline with Low-Rank Adaptation for New Language Integration in Multilingual ASR**|Yerbolat Khassanov et.al.|[2406.07842](http://arxiv.org/abs/2406.07842)|null|\n", "2406.07823": "|**2024-06-12**|**PRoDeliberation: Parallel Robust Deliberation for End-to-End Spoken Language Understanding**|Trang Le et.al.|[2406.07823](http://arxiv.org/abs/2406.07823)|null|\n", "2406.07801": "|**2024-06-12**|**PolySpeech: Exploring Unified Multitask Speech Models for Competitiveness with Single-task Models**|Runyan Yang et.al.|[2406.07801](http://arxiv.org/abs/2406.07801)|null|\n", "2406.09443": "|**2024-06-12**|**Comparative Analysis of Personalized Voice Activity Detection Systems: Assessing Real-World Effectiveness**|Satyam Kumar et.al.|[2406.09443](http://arxiv.org/abs/2406.09443)|null|\n", "2406.07725": "|**2024-06-11**|**The Interspeech 2024 Challenge on Speech Processing Using Discrete Units**|Xuankai Chang 
et.al.|[2406.07725](http://arxiv.org/abs/2406.07725)|null|\n", "2406.07256": "|**2024-06-11**|**AS-70: A Mandarin stuttered speech dataset for automatic speech recognition and stuttering event detection**|Rong Gong et.al.|[2406.07256](http://arxiv.org/abs/2406.07256)|null|\n", "2406.07589": "|**2024-06-11**|**Tag and correct: high precision post-editing approach to correction of speech recognition errors**|Tomasz Zi\u0119tkiewicz et.al.|[2406.07589](http://arxiv.org/abs/2406.07589)|null|\n", "2406.07096": "|**2024-06-11**|**Fast Context-Biasing for CTC and Transducer ASR models with CTC-based Word Spotter**|Andrei Andrusenko et.al.|[2406.07096](http://arxiv.org/abs/2406.07096)|null|\n", "2406.07090": "|**2024-07-29**|**Spoken Language Corpora Augmentation with Domain-Specific Voice-Cloned Speech**|Mateusz Czy\u017cnikiewicz et.al.|[2406.07090](http://arxiv.org/abs/2406.07090)|null|\n", "2406.07060": "|**2024-06-11**|**Reading Miscue Detection in Primary School through Automatic Speech Recognition**|Lingyun Gao et.al.|[2406.07060](http://arxiv.org/abs/2406.07060)|null|\n", "2406.06729": "|**2024-06-10**|**Synthetic Query Generation using Large Language Models for Virtual Assistants**|Sonal Sannigrahi et.al.|[2406.06729](http://arxiv.org/abs/2406.06729)|null|\n", "2406.06664": "|**2024-06-13**|**ASTRA: Aligning Speech and Text Representations for Asr without Sampling**|Neeraj Gaur et.al.|[2406.06664](http://arxiv.org/abs/2406.06664)|null|\n", "2406.06329": "|**2024-06-10**|**A Parameter-efficient Language Extension Framework for Multilingual ASR**|Wei Liu et.al.|[2406.06329](http://arxiv.org/abs/2406.06329)|null|\n", "2406.05968": "|**2024-06-10**|**Prompting Large Language Models with Audio for General-Purpose Speech Summarization**|Wonjune Kang et.al.|[2406.05968](http://arxiv.org/abs/2406.05968)|**[link](https://github.com/wonjune-kang/llm-speech-summarization)**|\n", "2406.05806": "|**2024-07-18**|**Do Prompts Really Prompt? 
Exploring the Prompt Understanding Capability of Whisper**|Chih-Kai Yang et.al.|[2406.05806](http://arxiv.org/abs/2406.05806)|null|\n", "2406.05784": "|**2024-07-20**|**Optimizing Multi-Stuttered Speech Classification: Leveraging Whisper's Encoder for Efficient Parameter Reduction in Automated Assessment**|Huma Ameer et.al.|[2406.05784](http://arxiv.org/abs/2406.05784)|null|\n", "2406.05661": "|**2024-06-09**|**MS-HuBERT: Mitigating Pre-training and Inference Mismatch in Masked Language Modelling methods for learning Speech Representations**|Hemant Yadav et.al.|[2406.05661](http://arxiv.org/abs/2406.05661)|null|\n", "2406.04927": "|**2024-06-07**|**LLM-based speaker diarization correction: A generalizable approach**|Georgios Efstathiadis et.al.|[2406.04927](http://arxiv.org/abs/2406.04927)|**[link](https://github.com/GeorgeEfstathiadis/LLM-Diarize-ASR-Agnostic)**|\n", "2406.04791": "|**2024-07-02**|**Speaker-Smoothed kNN Speaker Adaptation for End-to-End ASR**|Shaojun Li et.al.|[2406.04791](http://arxiv.org/abs/2406.04791)|null|\n", "2406.06619": "|**2024-06-07**|**LoRA-Whisper: Parameter-Efficient and Extensible Multilingual ASR**|Zheshu Song et.al.|[2406.06619](http://arxiv.org/abs/2406.06619)|null|\n", "2406.04595": "|**2024-06-07**|**Pitch-Aware RNN-T for Mandarin Chinese Mispronunciation Detection and Diagnosis**|Xintong Wang et.al.|[2406.04595](http://arxiv.org/abs/2406.04595)|null|\n", "2406.04552": "|**2024-06-06**|**Flexible Multichannel Speech Enhancement for Noise-Robust Frontend**|Ante Juki\u0107 et.al.|[2406.04552](http://arxiv.org/abs/2406.04552)|null|\n", "2406.04541": "|**2024-06-06**|**Label-Synchronous Neural Transducer for E2E Simultaneous Speech Translation**|Keqi Deng et.al.|[2406.04541](http://arxiv.org/abs/2406.04541)|**[link](https://github.com/D-Keqi/LS-Transducer-SST)**|\n", "2406.04512": "|**2024-06-06**|**To Distill or Not to Distill? 
On the Robustness of Robust Knowledge Distillation**|Abdul Waheed et.al.|[2406.04512](http://arxiv.org/abs/2406.04512)|null|\n", "2406.04432": "|**2024-06-06**|**LipGER: Visually-Conditioned Generative Error Correction for Robust Automatic Speech Recognition**|Sreyan Ghosh et.al.|[2406.04432](http://arxiv.org/abs/2406.04432)|**[link](https://github.com/sreyan88/lipger)**|\n", "2406.04269": "|**2024-06-06**|**Beyond Performance Plateaus: A Comprehensive Study on Scalability in Speech Enhancement**|Wangyou Zhang et.al.|[2406.04269](http://arxiv.org/abs/2406.04269)|**[link](https://github.com/emrys365/se-scaling)**|\n", "2406.04240": "|**2024-07-02**|**Hypernetworks for Personalizing ASR to Atypical Speech**|Max M\u00fcller-Eberstein et.al.|[2406.04240](http://arxiv.org/abs/2406.04240)|null|\n", "2406.04123": "|**2024-06-06**|**Helsinki Speech Challenge 2024**|Martin Ludvigsen et.al.|[2406.04123](http://arxiv.org/abs/2406.04123)|null|\n", "2406.03872": "|**2024-06-06**|**BLSP-Emo: Towards Empathetic Large Speech-Language Models**|Chen Wang et.al.|[2406.03872](http://arxiv.org/abs/2406.03872)|**[link](https://github.com/cwang621/blsp-emo)**|\n", "2406.03814": "|**2024-06-14**|**Improving Zero-Shot Chinese-English Code-Switching ASR with kNN-CTC and Gated Monolingual Datastores**|Jiaming Zhou et.al.|[2406.03814](http://arxiv.org/abs/2406.03814)|null|\n", "2406.03791": "|**2024-06-06**|**Speed of Light Exact Greedy Decoding for RNN-T Speech Recognition Models on GPU**|Daniel Galvez et.al.|[2406.03791](http://arxiv.org/abs/2406.03791)|null|\n", "2406.03274": "|**2024-06-11**|**Enhancing CTC-based speech recognition with diverse modeling units**|Shiyi Han et.al.|[2406.03274](http://arxiv.org/abs/2406.03274)|null|\n", "2406.03235": "|**2024-06-05**|**Error-preserving Automatic Speech Recognition of Young English Learners' Language**|Janick Michot et.al.|[2406.03235](http://arxiv.org/abs/2406.03235)|**[link](https://github.com/mict-zhaw/chall_e2e_stt)**|\n", "2406.03049": 
"|**2024-06-05**|**StreamSpeech: Simultaneous Speech-to-Speech Translation with Multi-task Learning**|Shaolei Zhang et.al.|[2406.03049](http://arxiv.org/abs/2406.03049)|**[link](https://github.com/ictnlp/streamspeech)**|\n", "2406.02950": "|**2024-06-05**|**4D ASR: Joint Beam Search Integrating CTC, Attention, Transducer, and Mask Predict Decoders**|Yui Sudo et.al.|[2406.02950](http://arxiv.org/abs/2406.02950)|null|\n", "2406.02925": "|**2024-06-15**|**Task Arithmetic can Mitigate Synthetic-to-Real Gap in Automatic Speech Recognition**|Hsuan Su et.al.|[2406.02925](http://arxiv.org/abs/2406.02925)|null|\n", "2406.02921": "|**2024-06-11**|**Text Injection for Neural Contextual Biasing**|Zhong Meng et.al.|[2406.02921](http://arxiv.org/abs/2406.02921)|null|\n", "2406.06582": "|**2024-06-25**|**Discrete Multimodal Transformers with a Pretrained Large Language Model for Mixed-Supervision Speech Processing**|Viet Anh Trinh et.al.|[2406.06582](http://arxiv.org/abs/2406.06582)|null|\n", "2406.02649": "|**2024-06-04**|**Keyword-Guided Adaptation of Automatic Speech Recognition**|Aviv Shamsian et.al.|[2406.02649](http://arxiv.org/abs/2406.02649)|null|\n", "2406.02166": "|**2024-06-04**|**Whistle: Data-Efficient Multilingual and Crosslingual Speech Recognition via Weakly Phonetic Supervision**|Saierdaer Yusuyin et.al.|[2406.02166](http://arxiv.org/abs/2406.02166)|**[link](https://github.com/thu-spmi/cat)**|\n", "2406.02004": "|**2024-06-05**|**Efficiently Train ASR Models that Memorize Less and Perform Better with Per-core Clipping**|Lun Wang et.al.|[2406.02004](http://arxiv.org/abs/2406.02004)|null|\n", "2406.01446": "|**2024-06-03**|**Enabling ASR for Low-Resource Languages: A Comprehensive Dataset Creation Approach**|Ara Yeroyan et.al.|[2406.01446](http://arxiv.org/abs/2406.01446)|null|\n", "2406.01314": "|**2024-06-03**|**Compute-Efficient Medical Image Classification with Softmax-Free Transformers and Sequence Normalization**|Firas Khader 
et.al.|[2406.01314](http://arxiv.org/abs/2406.01314)|null|\n", "2406.00899": "|**2024-06-02**|**YODAS: Youtube-Oriented Dataset for Audio and Speech**|Xinjian Li et.al.|[2406.00899](http://arxiv.org/abs/2406.00899)|null|\n", "2406.00522": "|**2024-06-01**|**Wav2Prompt: End-to-End Speech Prompt Generation and Tuning For LLM in Zero and Few-shot Learning**|Keqi Deng et.al.|[2406.00522](http://arxiv.org/abs/2406.00522)|null|\n", "2407.11982": "|**2024-05-31**|**Open the Data! Chuvash Datasets**|Nikolay Plotnikov et.al.|[2407.11982](http://arxiv.org/abs/2407.11982)|null|\n", "2405.18669": "|**2024-05-31**|**Zipper: A Multi-Tower Decoder Architecture for Fusing Modalities**|Vicky Zayats et.al.|[2405.18669](http://arxiv.org/abs/2405.18669)|null|\n", "2405.18537": "|**2024-05-28**|**Augmented Conversation with Embedded Speech-Driven On-the-Fly Referencing in AR**|Shivesh Jadon et.al.|[2405.18537](http://arxiv.org/abs/2405.18537)|null|\n", "2405.18346": "|**2024-05-28**|**Intelligent Clinical Documentation: Harnessing Generative AI for Patient-Centric Clinical Note Generation**|Anjanava Biswas et.al.|[2405.18346](http://arxiv.org/abs/2405.18346)|null|\n", "2405.17874": "|**2024-05-28**|**NUTS, NARS, and Speech**|D. 
van der Sluis et.al.|[2405.17874](http://arxiv.org/abs/2405.17874)|null|\n", "2405.17809": "|**2024-05-28**|**TransVIP: Speech to Speech Translation System with Voice and Isochrony Preservation**|Chenyang Le et.al.|[2405.17809](http://arxiv.org/abs/2405.17809)|null|\n", "2405.17376": "|**2024-05-27**|**Federating Dynamic Models using Early-Exit Architectures for Automatic Speech Recognition on Heterogeneous Clients**|Mohamed Nabih Ali et.al.|[2405.17376](http://arxiv.org/abs/2405.17376)|null|\n", "2405.17250": "|**2024-05-27**|**\"Pass the butter\": A study on desktop-classic multitasking robotic arm based on advanced YOLOv7 and BERT**|Haohua Que et.al.|[2405.17250](http://arxiv.org/abs/2405.17250)|null|\n", "2406.00038": "|**2024-05-27**|**ViSpeR: Multilingual Audio-Visual Speech Recognition**|Sanath Narayan et.al.|[2406.00038](http://arxiv.org/abs/2406.00038)|null|\n", "2405.16952": "|**2024-05-27**|**A Variance-Preserving Interpolation Approach for Diffusion Models with Applications to Single Channel Speech Enhancement and Recognition**|Zilu Guo et.al.|[2405.16952](http://arxiv.org/abs/2405.16952)|**[link](https://github.com/zelokuo/VPIDM)**|\n", "2405.15216": "|**2024-05-24**|**Denoising LM: Pushing the Limits of Error Correction Models for Speech Recognition**|Zijin Gu et.al.|[2405.15216](http://arxiv.org/abs/2405.15216)|null|\n", "2405.15097": "|**2024-05-23**|**Contrastive and Consistency Learning for Neural Noisy-Channel Model in Spoken Language Understanding**|Suyoung Kim et.al.|[2405.15097](http://arxiv.org/abs/2405.15097)|**[link](https://github.com/syoung7388/ccl)**|\n", "2405.14259": "|**2024-06-02**|**Let's Fuse Step by Step: A Generative Fusion Decoding Algorithm with LLMs for Multi-modal Text Recognition**|Chan-Jan Hsu et.al.|[2405.14259](http://arxiv.org/abs/2405.14259)|**[link](https://github.com/mtkresearch/generative-fusion-decoding)**|\n", "2405.14161": "|**2024-05-23**|**Self-Taught Recognizer: Toward Unsupervised Adaptation for Speech 
Foundation Models**|Yuchen Hu et.al.|[2405.14161](http://arxiv.org/abs/2405.14161)|**[link](https://github.com/yuchen005/star-adapt)**|\n", "2405.14093": "|**2024-05-23**|**A Survey on Vision-Language-Action Models for Embodied AI**|Yueen Ma et.al.|[2405.14093](http://arxiv.org/abs/2405.14093)|null|\n", "2405.13903": "|**2024-05-22**|**ST-Gait++: Leveraging spatio-temporal convolutions for gait-based emotion recognition on videos**|Maria Lu\u00edsa Lima et.al.|[2405.13903](http://arxiv.org/abs/2405.13903)|null|\n", "2405.13514": "|**2024-05-22**|**Joint Optimization of Streaming and Non-Streaming Automatic Speech Recognition with Multi-Decoder and Knowledge Distillation**|Muhammad Shakeel et.al.|[2405.13514](http://arxiv.org/abs/2405.13514)|null|\n", "2405.13477": "|**2024-05-22**|**A Near-Real-Time Processing Ego Speech Filtering Pipeline Designed for Speech Interruption During Human-Robot Interaction**|Yue Li et.al.|[2405.13477](http://arxiv.org/abs/2405.13477)|null|\n", "2405.13379": "|**2024-05-22**|**You don't understand me!: Comparing ASR results for L1 and L2 speakers of Swedish**|Ronald Cumbal et.al.|[2405.13379](http://arxiv.org/abs/2405.13379)|null|\n", "2405.13344": "|**2024-05-22**|**Contextualized Automatic Speech Recognition with Dynamic Vocabulary**|Yui Sudo et.al.|[2405.13344](http://arxiv.org/abs/2405.13344)|null|\n", "2405.13166": "|**2024-05-28**|**FairLENS: Assessing Fairness in Law Enforcement Speech Recognition**|Yicheng Wang et.al.|[2405.13166](http://arxiv.org/abs/2405.13166)|null|\n", "2405.13162": "|**2024-05-21**|**Non-autoregressive real-time Accent Conversion model with voice cloning**|Vladimir Nechaev et.al.|[2405.13162](http://arxiv.org/abs/2405.13162)|null|\n", "2405.12815": "|**2024-05-21**|**Could a Computer Architect Understand our Brain?**|Valentin Puente-Varona et.al.|[2405.12815](http://arxiv.org/abs/2405.12815)|null|\n", "2405.12609": "|**2024-07-01**|**Mamba in Speech: Towards an Alternative to Self-Attention**|Xiangyu Zhang 
et.al.|[2405.12609](http://arxiv.org/abs/2405.12609)|null|\n", "2405.12018": "|**2024-05-20**|**Continuous Sign Language Recognition with Adapted Conformer via Unsupervised Pretraining**|Neena Aloysius et.al.|[2405.12018](http://arxiv.org/abs/2405.12018)|null|\n", "2405.11078": "|**2024-05-17**|**Acoustic modeling for Overlapping Speech Recognition: JHU Chime-5 Challenge System**|Vimal Manohar et.al.|[2405.11078](http://arxiv.org/abs/2405.11078)|**[link](https://github.com/fgnt/nara_wpe)**|\n", "2405.10025": "|**2024-05-16**|**Listen Again and Choose the Right Answer: A New Paradigm for Automatic Speech Recognition with Large Language Models**|Yuchen Hu et.al.|[2405.10025](http://arxiv.org/abs/2405.10025)|null|\n", "2405.09708": "|**2024-05-15**|**No More Mumbles: Enhancing Robot Intelligibility through Speech Adaptation**|Qiaoqiao Ren et.al.|[2405.09708](http://arxiv.org/abs/2405.09708)|**[link](https://github.com/qiaoqiao2323/robot-speech-intelligibility)**|\n", "2405.09470": "|**2024-05-15**|**Towards Evaluating the Robustness of Automatic Speech Recognition Systems via Audio Style Transfer**|Weifei Jin et.al.|[2405.09470](http://arxiv.org/abs/2405.09470)|null|\n", "2405.13018": "|**2024-05-15**|**Continued Pretraining for Domain Adaptation of Wav2vec2.0 in Automatic Speech Recognition for Elementary Math Classroom Settings**|Ahmed Adel Attia et.al.|[2405.13018](http://arxiv.org/abs/2405.13018)|null|\n", "2405.19342": "|**2024-05-14**|**Sonos Voice Control Bias Assessment Dataset: A Methodology for Demographic Bias Assessment in Voice Assistants**|Chlo\u00e9 Sekkat et.al.|[2405.19342](http://arxiv.org/abs/2405.19342)|null|\n", "2405.08402": "|**2024-05-14**|**Investigating the 'Autoencoder Behavior' in Speech Self-Supervised Models: a focus on HuBERT's Pretraining**|Valentin Vielzeuf et.al.|[2405.08402](http://arxiv.org/abs/2405.08402)|null|\n", "2405.08295": "|**2024-05-31**|**SpeechVerse: A Large-scale Generalizable Audio Language Model**|Nilaksh Das 
et.al.|[2405.08295](http://arxiv.org/abs/2405.08295)|null|\n", "2405.07442": "|**2024-06-07**|**Rene: A Pre-trained Multi-modal Architecture for Auscultation of Respiratory Diseases**|Pengfei Zhang et.al.|[2405.07442](http://arxiv.org/abs/2405.07442)|**[link](https://github.com/zpforlove/rene)**|\n", "2405.07354": "|**2024-05-12**|**SoccerNet-Echoes: A Soccer Game Audio Commentary Dataset**|Sushant Gautam et.al.|[2405.07354](http://arxiv.org/abs/2405.07354)|**[link](https://github.com/SoccerNet/sn-echoes)**|\n", "2405.13001": "|**2024-05-12**|**Large Language Models for Education: A Survey**|Hanyi Xu et.al.|[2405.13001](http://arxiv.org/abs/2405.13001)|null|\n", "2405.06368": "|**2024-07-22**|**DP-DyLoRA: Fine-Tuning Transformer-Based Models On-Device under Differentially Private Federated Learning using Dynamic Low-Rank Adaptation**|Jie Xu et.al.|[2405.06368](http://arxiv.org/abs/2405.06368)|null|\n", "2405.06150": "|**2024-05-10**|**Lost in Transcription: Identifying and Quantifying the Accuracy Biases of Automatic Speech Recognition Systems Against Disfluent Speech**|Dena Mujtaba et.al.|[2405.06150](http://arxiv.org/abs/2405.06150)|null|\n", "2405.06134": "|**2024-07-17**|**Muting Whisper: A Universal Acoustic Adversarial Attack on Speech Foundation Models**|Vyas Raina et.al.|[2405.06134](http://arxiv.org/abs/2405.06134)|**[link](https://github.com/rainavyas/prepend_acoustic_attack)**|\n", "2405.05498": "|**2024-05-09**|**The RoyalFlush Automatic Speech Diarization and Recognition System for In-Car Multi-Channel Automatic Speech Recognition Challenge**|Jingguang Tian et.al.|[2405.05498](http://arxiv.org/abs/2405.05498)|null|\n", "2405.04296": "|**2024-05-07**|**Open Implementation and Study of BEST-RQ for Speech Processing**|Ryan Whetten et.al.|[2405.04296](http://arxiv.org/abs/2405.04296)|**[link](https://github.com/speechbrain/speechbrain)**|\n", "2405.03484": "|**2024-05-06**|**Whispy: Adapting STT Whisper Models to Real-Time Environments**|Antonio Bevilacqua 
et.al.|[2405.03484](http://arxiv.org/abs/2405.03484)|null|\n", "2405.03152": "|**2024-05-06**|**MMGER: Multi-modal and Multi-granularity Generative Error Correction with LLM for Joint Accent and Speech Recognition**|Bingshen Mu et.al.|[2405.03152](http://arxiv.org/abs/2405.03152)|null|\n", "2405.02995": "|**2024-05-11**|**Analysis about Theoretical Foundations for Method to Enhancing ASR Performance using OCR Word Frequency Differences**|Kyudan Jung et.al.|[2405.02995](http://arxiv.org/abs/2405.02995)|null|\n", "2405.02578": "|**2024-05-04**|**Mixat: A Data Set of Bilingual Emirati-English Speech**|Maryam Al Ali et.al.|[2405.02578](http://arxiv.org/abs/2405.02578)|**[link](https://github.com/mbzuai-nlp/mixat)**|\n", "2406.02566": "|**2024-05-03**|**Combining X-Vectors and Bayesian Batch Active Learning: Two-Stage Active Learning Pipeline for Speech Recognition**|Ognjen Kundacina et.al.|[2406.02566](http://arxiv.org/abs/2406.02566)|null|\n", "2405.02132": "|**2024-05-06**|**Unveiling the Potential of LLM-Based ASR on Chinese Open-Source Datasets**|Xuelong Geng et.al.|[2405.02132](http://arxiv.org/abs/2405.02132)|null|\n", "2406.02565": "|**2024-05-02**|**Sequence-to-sequence models in peer-to-peer learning: A practical application**|Robert \u0160ajina et.al.|[2406.02565](http://arxiv.org/abs/2406.02565)|null|\n", "2405.01293": "|**2024-05-02**|**Low-resource speech recognition and dialect identification of Irish in a multi-task framework**|Liam Lonergan et.al.|[2405.01293](http://arxiv.org/abs/2405.01293)|null|\n", "2405.01207": "|**2024-05-02**|**Improving Membership Inference in ASR Model Auditing with Perturbed Loss Features**|Francisco Teixeira et.al.|[2405.01207](http://arxiv.org/abs/2405.01207)|null|\n", "2405.01004": "|**2024-05-02**|**Deep Learning Models in Speech Recognition: Measuring GPU Energy Consumption, Impact of Noise and Model Quantization for Edge Deployment**|Aditya Chakravarty 
et.al.|[2405.01004](http://arxiv.org/abs/2405.01004)|**[link](https://github.com/zzadiues3338/asr-energy-jetson)**|\n", "2405.00966": "|**2024-05-02**|**Efficient Compression of Multitask Multilingual Speech Models**|Thomas Palmeira Ferraz et.al.|[2405.00966](http://arxiv.org/abs/2405.00966)|null|\n", "2405.01601": "|**2024-05-01**|**Efficient Sample-Specific Encoder Perturbations**|Yassir Fathullah et.al.|[2405.01601](http://arxiv.org/abs/2405.01601)|null|\n", "2405.00307": "|**2024-05-01**|**Active Learning with Task Adaptation Pre-training for Speech Emotion Recognition**|Dongyuan Li et.al.|[2405.00307](http://arxiv.org/abs/2405.00307)|null|\n", "2405.00223": "|**2024-07-24**|**Confides: A Visual Analytics Solution for Automated Speech Recognition Analysis and Exploration**|Sunwoo Ha et.al.|[2405.00223](http://arxiv.org/abs/2405.00223)|null|\n", "2404.19310": "|**2024-05-09**|**Does Whisper understand Swiss German? An automatic, qualitative, and human evaluation**|Eyal Liron Dolev et.al.|[2404.19310](http://arxiv.org/abs/2404.19310)|null|\n", "2404.19214": "|**2024-04-30**|**EfficientASR: Speech Recognition Network Compression via Attention Redundancy and Chunk-Level FFN Optimization**|Jianzong Wang et.al.|[2404.19214](http://arxiv.org/abs/2404.19214)|null|\n", "2404.18739": "|**2024-04-29**|**Towards Dog Bark Decoding: Leveraging Human Speech Processing for Automated Bark Classification**|Artem Abzaliev et.al.|[2404.18739](http://arxiv.org/abs/2404.18739)|null|\n", "2406.02563": "|**2024-04-29**|**A cost minimization approach to fix the vocabulary size in a tokenizer for an End-to-End ASR system**|Sunil Kumar Kopparapu et.al.|[2406.02563](http://arxiv.org/abs/2406.02563)|null|\n", "2404.17394": "|**2024-04-26**|**Child Speech Recognition in Human-Robot Interaction: Problem Solved?**|Ruben Janssens et.al.|[2404.17394](http://arxiv.org/abs/2404.17394)|null|\n", "2404.16743": "|**2024-04-26**|**Automatic Speech Recognition System-Independent Word Error Rate 
Estimation**|Chanho Park et.al.|[2404.16743](http://arxiv.org/abs/2404.16743)|null|\n", "2404.16547": "|**2024-04-25**|**Developing Acoustic Models for Automatic Speech Recognition in Swedish**|Giampiero Salvi et.al.|[2404.16547](http://arxiv.org/abs/2404.16547)|null|\n", "2404.16407": "|**2024-04-25**|**U2++ MoE: Scaling 4.7x parameters with minimal impact on RTF**|Xingchen Song et.al.|[2404.16407](http://arxiv.org/abs/2404.16407)|null|\n", "2404.16112": "|**2024-04-24**|**Mamba-360: Survey of State Space Models as Transformer Alternative for Long Sequence Modelling: Methods, Applications, and Challenges**|Badri Narayana Patro et.al.|[2404.16112](http://arxiv.org/abs/2404.16112)|**[link](https://github.com/badripatro/mamba360)**|\n", "2406.02562": "|**2024-04-24**|**Gated Low-rank Adaptation for personalized Code-Switching Automatic Speech Recognition on the low-spec devices**|Gwantae Kim et.al.|[2406.02562](http://arxiv.org/abs/2406.02562)|null|\n", "2404.15501": "|**2024-04-23**|**Killkan: The Automatic Speech Recognition Dataset for Kichwa with Morphosyntactic Information**|Chihiro Taguchi et.al.|[2404.15501](http://arxiv.org/abs/2404.15501)|**[link](https://github.com/ctaguchi/killkan)**|\n", "2406.02561": "|**2024-04-23**|**Breaking Walls: Pioneering Automatic Speech Recognition for Central Kurdish: End-to-End Transformer Paradigm**|Abdulhady Abas Abdullah et.al.|[2406.02561](http://arxiv.org/abs/2406.02561)|null|\n", "2404.14860": "|**2024-04-23**|**Rethinking Processing Distortions: Disentangling the Impact of Speech Enhancement Errors on Speech Recognition Performance**|Tsubasa Ochiai et.al.|[2404.14860](http://arxiv.org/abs/2404.14860)|null|\n", "2404.14605": "|**2024-04-22**|**Assessment of Sign Language-Based versus Touch-Based Input for Deaf Users Interacting with Intelligent Personal Assistants**|Nina Tran et.al.|[2404.14605](http://arxiv.org/abs/2404.14605)|null|\n", "2406.02560": "|**2024-07-18**|**Less Peaky and More Accurate CTC Forced Alignment 
by Label Priors**|Ruizhe Huang et.al.|[2406.02560](http://arxiv.org/abs/2406.02560)|**[link](https://github.com/huangruizhe/audio)**|\n", "2404.14024": "|**2024-04-22**|**Exploring neural oscillations during speech perception via surrogate gradient spiking neural networks**|Alexandre Bittar et.al.|[2404.14024](http://arxiv.org/abs/2404.14024)|null|\n", "2404.13362": "|**2024-04-20**|**Semantically Corrected Amharic Automatic Speech Recognition**|Samuael Adnew et.al.|[2404.13362](http://arxiv.org/abs/2404.13362)|**[link](https://github.com/samuael/postprocessed_geez_asr)**|\n", "2404.12888": "|**2024-04-19**|**Learn2Talk: 3D Talking Face Learns from 2D Talking Face**|Yixiang Zhuang et.al.|[2404.12888](http://arxiv.org/abs/2404.12888)|null|\n", "2404.12628": "|**2024-04-19**|**Efficient infusion of self-supervised representations in Automatic Speech Recognition**|Darshan Prabhu et.al.|[2404.12628](http://arxiv.org/abs/2404.12628)|null|\n", "2404.15168": "|**2024-04-18**|**Artificial Neural Networks to Recognize Speakers Division from Continuous Bengali Speech**|Hasmot Ali et.al.|[2404.15168](http://arxiv.org/abs/2404.15168)|null|\n", "2404.10922": "|**2024-04-16**|**Teaching a Multilingual Large Language Model to Understand Multilingual Speech via Multi-Instructional Training**|Pavel Denisov et.al.|[2404.10922](http://arxiv.org/abs/2404.10922)|**[link](https://github.com/akreal/bloomzmms)**|\n", "2404.09841": "|**2024-04-16**|**Anatomy of Industrial Scale Multilingual ASR**|Francis McCann Ramirez et.al.|[2404.09841](http://arxiv.org/abs/2404.09841)|null|\n", "2404.09754": "|**2024-04-15**|**Resilience of Large Language Models for Noisy Instructions**|Bin Wang et.al.|[2404.09754](http://arxiv.org/abs/2404.09754)|null|\n", "2406.09425": "|**2024-04-13**|**SGPRS: Seamless GPU Partitioning Real-Time Scheduler for Periodic Deep Learning Workloads**|Amir Fakhim Babaei et.al.|[2406.09425](http://arxiv.org/abs/2406.09425)|null|\n", "2404.08424": "|**2024-04-12**|**Comparing 
Apples to Oranges: LLM-powered Multimodal Intention Prediction in an Object Categorization Task**|Hassan Ali et.al.|[2404.08424](http://arxiv.org/abs/2404.08424)|null|\n", "2404.08368": "|**2024-07-26**|**Automatic Speech Recognition Advancements for Indigenous Languages of the Americas**|Monica Romero et.al.|[2404.08368](http://arxiv.org/abs/2404.08368)|null|\n", "2404.07575": "|**2024-04-12**|**An Effective Automated Speaking Assessment Approach to Mitigating Data Scarcity and Imbalanced Distribution**|Tien-Hong Lo et.al.|[2404.07575](http://arxiv.org/abs/2404.07575)|null|\n", "2404.07341": "|**2024-04-12**|**Conformer-1: Robust ASR via Large-Scale Semisupervised Bootstrapping**|Kevin Zhang et.al.|[2404.07341](http://arxiv.org/abs/2404.07341)|null|\n", "2404.08011": "|**2024-04-10**|**An inclusive review on deep learning techniques and their scope in handwriting recognition**|Sukhdeep Singh et.al.|[2404.08011](http://arxiv.org/abs/2404.08011)|null|\n", "2404.06079": "|**2024-04-10**|**The X-LANCE Technical Report for Interspeech 2024 Speech Processing Using Discrete Speech Unit Challenge**|Yiwei Guo et.al.|[2404.06079](http://arxiv.org/abs/2404.06079)|null|\n", "2404.05659": "|**2024-05-28**|**VietMed: A Dataset and Benchmark for Automatic Speech Recognition of Vietnamese in the Medical Domain**|Khai Le-Duc et.al.|[2404.05659](http://arxiv.org/abs/2404.05659)|**[link](https://github.com/leduckhai/multimed)**|\n", "2404.04769": "|**2024-04-07**|**Safeguarding Voice Privacy: Harnessing Near-Ultrasonic Interference To Protect Against Unauthorized Audio Recording**|Forrest McKee et.al.|[2404.04769](http://arxiv.org/abs/2404.04769)|null|\n", "2404.04295": "|**2024-04-04**|**Transducers with Pronunciation-aware Embeddings for Automatic Speech Recognition**|Hainan Xu et.al.|[2404.04295](http://arxiv.org/abs/2404.04295)|null|\n", "2404.03073": "|**2024-04-03**|**Mai Ho'om\u0101una i ka 'Ai: Language Models Improve Automatic Speech Recognition in Hawaiian**|Kaavya 
Chaparala et.al.|[2404.03073](http://arxiv.org/abs/2404.03073)|null|\n", "2404.02408": "|**2024-04-03**|**CMULAB: An Open-Source Framework for Training and Deployment of Natural Language Processing Models**|Zaid Sheikh et.al.|[2404.02408](http://arxiv.org/abs/2404.02408)|**[link](https://github.com/neulab/cmulab)**|\n", "2404.02098": "|**2024-04-02**|**BRAVEn: Improving Self-Supervised Pre-training for Visual and Auditory Speech Recognition**|Alexandros Haliassos et.al.|[2404.02098](http://arxiv.org/abs/2404.02098)|**[link](https://github.com/ahaliassos/raven)**|\n", "2404.02052": "|**2024-04-02**|**Noise Masking Attacks and Defenses for Pretrained Speech Models**|Matthew Jagielski et.al.|[2404.02052](http://arxiv.org/abs/2404.02052)|null|\n", "2404.01991": "|**2024-04-02**|**Kallaama: A Transcribed Speech Dataset about Agriculture in the Three Most Widely Spoken Languages in Senegal**|Elodie Gauthier et.al.|[2404.01991](http://arxiv.org/abs/2404.01991)|**[link](https://github.com/gauthelo/kallaama-speech-dataset)**|\n", "2404.01737": "|**2024-04-02**|**Transfer Learning from Whisper for Microscopic Intelligibility Prediction**|Paul Best et.al.|[2404.01737](http://arxiv.org/abs/2404.01737)|null|\n", "2404.07226": "|**2024-03-31**|**Houston we have a Divergence: A Subgroup Performance Analysis of ASR Models**|Alkis Koudounas et.al.|[2404.07226](http://arxiv.org/abs/2404.07226)|null|\n", "2403.20262": "|**2024-07-22**|**ELITR-Bench: A Meeting Assistant Benchmark for Long-Context Language Models**|Thibaut Thonet et.al.|[2403.20262](http://arxiv.org/abs/2403.20262)|**[link](https://github.com/utter-project/elitr-bench)**|\n", "2403.19822": "|**2024-03-28**|**Multi-Stage Multi-Modal Pre-Training for Automatic Speech Recognition**|Yash Jain et.al.|[2403.19822](http://arxiv.org/abs/2403.19822)|null|\n", "2403.19224": "|**2024-03-28**|**Emotion Neural Transducer for Fine-Grained Speech Emotion Recognition**|Siyuan Shen 
et.al.|[2403.19224](http://arxiv.org/abs/2403.19224)|**[link](https://github.com/ecnu-cross-innovation-lab/ent)**|\n", "2403.19207": "|**2024-03-28**|**LV-CTC: Non-autoregressive ASR with CTC and latent variable models**|Yuya Fujita et.al.|[2403.19207](http://arxiv.org/abs/2403.19207)|null|\n", "2403.18721": "|**2024-06-04**|**PhysicsAssistant: An LLM-Powered Interactive Learning Robot for Physics Lab Investigations**|Ehsan Latif et.al.|[2403.18721](http://arxiv.org/abs/2403.18721)|null|\n", "2406.02555": "|**2024-03-27**|**PhoWhisper: Automatic Speech Recognition for Vietnamese**|Thanh-Thien Le et.al.|[2406.02555](http://arxiv.org/abs/2406.02555)|**[link](https://github.com/vinairesearch/phowhisper)**|\n", "2403.18182": "|**2024-03-27**|**ZAEBUC-Spoken: A Multilingual Multidialectal Arabic-English Speech Corpus**|Injy Hamed et.al.|[2403.18182](http://arxiv.org/abs/2403.18182)|null|\n", "2403.17645": "|**2024-04-11**|**DANCER: Entity Description Augmented Named Entity Corrector for Automatic Speech Recognition**|Yi-Cheng Wang et.al.|[2403.17645](http://arxiv.org/abs/2403.17645)|null|\n", "2403.17363": "|**2024-03-26**|**Extracting Biomedical Entities from Noisy Audio Transcripts**|Nima Ebadi et.al.|[2403.17363](http://arxiv.org/abs/2403.17363)|null|\n", "2403.19709": "|**2024-03-25**|**Hierarchical Recurrent Adapters for Efficient Multi-Task Adaptation of Large Speech Models**|Tsendsuren Munkhdalai et.al.|[2403.19709](http://arxiv.org/abs/2403.19709)|null|\n", "2403.16655": "|**2024-03-25**|**Grammatical vs Spelling Error Correction: An Investigation into the Responsiveness of Transformer-based Language Models using BART and MarianMT**|Rohit Raju et.al.|[2403.16655](http://arxiv.org/abs/2403.16655)|null|\n", "2403.15510": "|**2024-03-22**|**Privacy-Preserving End-to-End Spoken Language Understanding**|Yinggui Wang et.al.|[2403.15510](http://arxiv.org/abs/2403.15510)|null|\n", "2403.14438": "|**2024-03-26**|**A Multimodal Approach to Device-Directed Speech Detection 
with Large Language Models**|Dominik Wagner et.al.|[2403.14438](http://arxiv.org/abs/2403.14438)|null|\n", "2403.14402": "|**2024-03-21**|**XLAVS-R: Cross-Lingual Audio-Visual Speech Representation Learning for Noise-Robust Speech Perception**|HyoJung Han et.al.|[2403.14402](http://arxiv.org/abs/2403.14402)|null|\n", "2403.14168": "|**2024-06-04**|**M$^3$AV: A Multimodal, Multigenre, and Multipurpose Audio-Visual Academic Lecture Dataset**|Zhe Chen et.al.|[2403.14168](http://arxiv.org/abs/2403.14168)|null|\n", "2403.13960": "|**2024-03-20**|**Open Access NAO (OAN): a ROS2-based software framework for HRI applications with the NAO robot**|Antonio Bono et.al.|[2403.13960](http://arxiv.org/abs/2403.13960)|null|\n", "2403.13465": "|**2024-03-20**|**BanglaNum -- A Public Dataset for Bengali Digit Recognition from Speech**|Mir Sayeed Mohammad et.al.|[2403.13465](http://arxiv.org/abs/2403.13465)|null|\n", "2403.13423": "|**2024-03-20**|**Advanced Long-Content Speech Recognition With Factorized Neural Transducer**|Xun Gong et.al.|[2403.13423](http://arxiv.org/abs/2403.13423)|null|\n", "2403.15469": "|**2024-03-20**|**Isometric Neural Machine Translation using Phoneme Count Ratio Reward-based Reinforcement Learning**|Shivam Ratnakant Mhaskar et.al.|[2403.15469](http://arxiv.org/abs/2403.15469)|null|\n", "2403.12821": "|**2024-03-21**|**FlowerFormer: Empowering Neural Architecture Encoding using a Flow-aware Graph Transformer**|Dongyeong Hwang et.al.|[2403.12821](http://arxiv.org/abs/2403.12821)|**[link](https://github.com/y0ngjaenius/cvpr2024_flowerformer)**|\n", "2403.12477": "|**2024-03-19**|**Real-time Speech Extraction Using Spatially Regularized Independent Low-rank Matrix Analysis and Rank-constrained Spatial Covariance Matrix Estimation**|Yuto Ishikawa et.al.|[2403.12477](http://arxiv.org/abs/2403.12477)|null|\n", "2403.12273": "|**2024-03-18**|**Multimodal Human-Autonomous Agents Interaction Using Pre-Trained Language and Visual Foundation Models**|Linus Nwankwo 
et.al.|[2403.12273](http://arxiv.org/abs/2403.12273)|null|\n", "2403.11578": "|**2024-03-18**|**AdaMER-CTC: Connectionist Temporal Classification with Adaptive Maximum Entropy Regularization for Automatic Speech Recognition**|SooHwan Eom et.al.|[2403.11578](http://arxiv.org/abs/2403.11578)|null|\n", "2403.15442": "|**2024-07-21**|**Artificial Intelligence for Cochlear Implants: Review of Strategies, Challenges, and Perspectives**|Billel Essaid et.al.|[2403.15442](http://arxiv.org/abs/2403.15442)|null|\n", "2403.10961": "|**2024-03-16**|**Energy-Based Models with Applications to Speech and Language Processing**|Zhijian Ou et.al.|[2403.10961](http://arxiv.org/abs/2403.10961)|null|\n", "2403.10937": "|**2024-03-16**|**Initial Decoding with Minimally Augmented Language Model for Improved Lattice Rescoring in Low Resource ASR**|Savitha Murthy et.al.|[2403.10937](http://arxiv.org/abs/2403.10937)|null|\n", "2403.10420": "|**2024-03-15**|**Neural Networks Hear You Loud And Clear: Hearing Loss Compensation Using Deep Neural Networks**|Peter Leer et.al.|[2403.10420](http://arxiv.org/abs/2403.10420)|null|\n", "2403.09753": "|**2024-03-14**|**SpokeN-100: A Cross-Lingual Benchmarking Dataset for The Classification of Spoken Numbers in Different Languages**|Ren\u00e9 Groh et.al.|[2403.09753](http://arxiv.org/abs/2403.09753)|**[link](https://github.com/ankilab/spoken-100)**|\n", "2403.09298": "|**2024-03-14**|**More than words: Advancements and challenges in speech recognition for singing**|Anna Kruspe et.al.|[2403.09298](http://arxiv.org/abs/2403.09298)|null|\n", "2405.12983": "|**2024-03-14**|**Multilingual Audio-Visual Speech Recognition with Hybrid CTC/RNN-T Fast Conformer**|Maxime Burchi et.al.|[2405.12983](http://arxiv.org/abs/2405.12983)|null|\n", "2403.08258": "|**2024-05-21**|**Skipformer: A Skip-and-Recover Strategy for Efficient Speech Recognition**|Wenjing Zhu et.al.|[2403.08258](http://arxiv.org/abs/2403.08258)|null|\n", "2403.08196": "|**2024-03-13**|**SpeechColab 
Leaderboard: An Open-Source Platform for Automatic Speech Recognition Evaluation**|Jiayu Du et.al.|[2403.08196](http://arxiv.org/abs/2403.08196)|**[link](https://github.com/speechcolab/leaderboard)**|\n", "2403.08187": "|**2024-03-13**|**Automatic Speech Recognition (ASR) for the Diagnosis of pronunciation of Speech Sound Disorders in Korean children**|Taekyung Ahn et.al.|[2403.08187](http://arxiv.org/abs/2403.08187)|null|\n", "2403.08011": "|**2024-03-12**|**Gujarati-English Code-Switching Speech Recognition using ensemble prediction of spoken language**|Yash Sharma et.al.|[2403.08011](http://arxiv.org/abs/2403.08011)|null|\n", "2403.07767": "|**2024-03-12**|**Beyond the Labels: Unveiling Text-Dependency in Paralinguistic Speech Recognition Datasets**|Jan Pe\u0161\u00e1n et.al.|[2403.07767](http://arxiv.org/abs/2403.07767)|null|\n", "2403.07947": "|**2024-03-11**|**The evaluation of a code-switched Sepedi-English automatic speech recognition system**|Amanda Phaladi et.al.|[2403.07947](http://arxiv.org/abs/2403.07947)|null|\n", "2403.06734": "|**2024-03-11**|**Real-Time Multimodal Cognitive Assistant for Emergency Medical Services**|Keshara Weerasinghe et.al.|[2403.06734](http://arxiv.org/abs/2403.06734)|**[link](https://github.com/uva-dsa/ems-pipeline)**|\n", "2403.06387": "|**2024-03-11**|**Towards Decoupling Frontend Enhancement and Backend Recognition in Monaural Robust ASR**|Yufeng Yang et.al.|[2403.06387](http://arxiv.org/abs/2403.06387)|null|\n", "2403.06260": "|**2024-03-10**|**SCORE: Self-supervised Correspondence Fine-tuning for Improved Content Representations**|Amit Meghanani et.al.|[2403.06260](http://arxiv.org/abs/2403.06260)|**[link](https://github.com/trikaldarshi/score_finetuning)**|\n", "2403.05887": "|**2024-03-09**|**Aligning Speech to Languages to Enhance Code-switching Speech Recognition**|Hexin Liu et.al.|[2403.05887](http://arxiv.org/abs/2403.05887)|null|\n", "2403.07937": "|**2024-03-08**|**Speech Robust Bench: A Robustness Benchmark For 
Speech Recognition**|Muhammad A. Shah et.al.|[2403.07937](http://arxiv.org/abs/2403.07937)|null|\n", "2403.04445": "|**2024-03-07**|**Classist Tools: Social Class Correlates with Performance in NLP**|Amanda Cercas Curry et.al.|[2403.04445](http://arxiv.org/abs/2403.04445)|null|\n", "2403.04280": "|**2024-05-30**|**A New Benchmark for Evaluating Automatic Speech Recognition in the Arabic Call Domain**|Qusai Abo Obaidah et.al.|[2403.04280](http://arxiv.org/abs/2403.04280)|null|\n", "2403.04245": "|**2024-03-07**|**A Study of Dropout-Induced Modality Bias on Robustness to Missing Video Frames for Audio-Visual Speech Recognition**|Yusheng Dai et.al.|[2403.04245](http://arxiv.org/abs/2403.04245)|**[link](https://github.com/dalision/modalbiasavsr)**|\n", "2403.03538": "|**2024-03-06**|**RADIA -- Radio Advertisement Detection with Intelligent Analytics**|Jorge \u00c1lvarez et.al.|[2403.03538](http://arxiv.org/abs/2403.03538)|null|\n", "2403.03522": "|**2024-03-13**|**Non-verbal information in spontaneous speech -- towards a new framework of analysis**|Tirza Biron et.al.|[2403.03522](http://arxiv.org/abs/2403.03522)|null|\n", "2403.02938": "|**2024-03-05**|**AIx Speed: Playback Speed Optimization Using Listening Comprehension of Speech Recognition Models**|Kazuki Kawamura et.al.|[2403.02938](http://arxiv.org/abs/2403.02938)|null|\n", "2403.02288": "|**2024-03-04**|**PixIT: Joint Training of Speaker Diarization and Speech Separation from Real-world Multi-speaker Recordings**|Joonas Kalda et.al.|[2403.02288](http://arxiv.org/abs/2403.02288)|**[link](https://github.com/joonaskalda/pixit)**|\n", "2403.02173": "|**2024-03-04**|**What has LeBenchmark Learnt about French Syntax?**|Zdravko Dugonji\u0107 et.al.|[2403.02173](http://arxiv.org/abs/2403.02173)|null|\n", "2403.02010": "|**2024-03-04**|**SA-SOT: Speaker-Aware Serialized Output Training for Multi-Talker ASR**|Zhiyun Fan et.al.|[2403.02010](http://arxiv.org/abs/2403.02010)|null|\n", "2403.01983": 
"|**2024-03-04**|**Language and Speech Technology for Central Kurdish Varieties**|Sina Ahmadi et.al.|[2403.01983](http://arxiv.org/abs/2403.01983)|**[link](https://github.com/sinaahmadi/cordi)**|\n", "2403.18843": "|**2024-03-04**|**JEP-KD: Joint-Embedding Predictive Architecture Based Knowledge Distillation for Visual Speech Recognition**|Chang Sun et.al.|[2403.18843](http://arxiv.org/abs/2403.18843)|null|\n", "2403.01369": "|**2024-03-03**|**A Closer Look at Wav2Vec2 Embeddings for On-Device Single-Channel Speech Enhancement**|Ravi Shankar et.al.|[2403.01369](http://arxiv.org/abs/2403.01369)|null|\n", "2403.05583": "|**2024-03-02**|**A Cross-Modal Approach to Silent Speech with LLM-Enhanced Recognition**|Tyler Benster et.al.|[2403.05583](http://arxiv.org/abs/2403.05583)|**[link](https://github.com/tbenst/silent_speech)**|\n", "2403.01255": "|**2024-04-18**|**Automatic Speech Recognition using Advanced Deep Learning Approaches: A survey**|Hamza Kheddar et.al.|[2403.01255](http://arxiv.org/abs/2403.01255)|null|\n", "2403.00370": "|**2024-03-01**|**Post-decoder Biasing for End-to-End Speech Recognition of Multi-turn Medical Interview**|Heyang Liu et.al.|[2403.00370](http://arxiv.org/abs/2403.00370)|null|\n", "2402.19443": "|**2024-02-29**|**Probing the Information Encoded in Neural-based Acoustic Models of Automatic Speech Recognition Systems**|Quentin Raymondaud et.al.|[2402.19443](http://arxiv.org/abs/2402.19443)|null|\n", "2402.18923": "|**2024-02-29**|**Inappropriate Pause Detection In Dysarthric Speech Using Large-Scale Speech Recognition**|Jeehyun Lee et.al.|[2402.18923](http://arxiv.org/abs/2402.18923)|null|\n", "2402.18275": "|**2024-06-04**|**Exploration of Adapter for Noise Robust Automatic Speech Recognition**|Hao Shi et.al.|[2402.18275](http://arxiv.org/abs/2402.18275)|null|\n", "2402.17954": "|**2024-06-19**|**Twists, Humps, and Pebbles: Multilingual Speech Recognition Models Exhibit Gender Performance Gaps**|Giuseppe Attanasio 
et.al.|[2402.17954](http://arxiv.org/abs/2402.17954)|**[link](https://github.com/g8a9/multilingual-asr-gender-gap)**|\n", "2402.17189": "|**2024-02-27**|**An Effective Mixture-Of-Experts Approach For Code-Switching Speech Recognition Leveraging Encoder Disentanglement**|Tzu-Ting Yang et.al.|[2402.17189](http://arxiv.org/abs/2402.17189)|null|\n", "2402.17184": "|**2024-02-27**|**Extreme Encoder Output Frame Rate Reduction: Improving Computational Latencies of Large End-to-End Models**|Rohit Prabhavalkar et.al.|[2402.17184](http://arxiv.org/abs/2402.17184)|null|\n", "2402.15733": "|**2024-04-01**|**ArEEG_Chars: Dataset for Envisioned Speech Recognition using EEG for Arabic Characters**|Hazem Darwish et.al.|[2402.15733](http://arxiv.org/abs/2402.15733)|null|\n", "2402.15151": "|**2024-05-14**|**Where Visual Speech Meets Language: VSP-LLM Framework for Efficient and Context-Aware Visual Speech Processing**|Jeong Hun Yeo et.al.|[2402.15151](http://arxiv.org/abs/2402.15151)|**[link](https://github.com/sally-sh/vsp-llm)**|\n", "2402.14563": "|**2024-02-22**|**Wizard of Oz Experimentation for Language Technology Applications: Challenges and Tools**|Stephan Schl\u00f6gl et.al.|[2402.14563](http://arxiv.org/abs/2402.14563)|null|\n", "2402.14888": "|**2024-02-22**|**Efficient data selection employing Semantic Similarity-based Graph Structures for model training**|Roxana Petcu et.al.|[2402.14888](http://arxiv.org/abs/2402.14888)|null|\n", "2402.14185": "|**2024-02-22**|**HINT: High-quality INPainting Transformer with Mask-Aware Encoding and Enhanced Attention**|Shuang Chen et.al.|[2402.14185](http://arxiv.org/abs/2402.14185)|**[link](https://github.com/chrischen1023/hint)**|\n", "2402.13687": "|**2024-02-21**|**An Augmented Lagrangian Method for Training Recurrent Neural Networks**|Yue Wang et.al.|[2402.13687](http://arxiv.org/abs/2402.13687)|null|\n", "2402.13511": "|**2024-02-22**|**Mel-FullSubNet: Mel-Spectrogram Enhancement for Improving Both Speech Quality and ASR**|Rui 
Zhou et.al.|[2402.13511](http://arxiv.org/abs/2402.13511)|null|\n", "2402.13208": "|**2024-02-20**|**How do Hyenas deal with Human Speech? Speech Recognition and Translation with ConfHyena**|Marco Gaido et.al.|[2402.13208](http://arxiv.org/abs/2402.13208)|**[link](https://github.com/hlt-mt/fbk-fairseq)**|\n", "2402.13076": "|**2024-02-20**|**Not All Weights Are Created Equal: Enhancing Energy Efficiency in On-Device Streaming Speech Recognition**|Yang Li et.al.|[2402.13076](http://arxiv.org/abs/2402.13076)|null|\n", "2402.13004": "|**2024-02-20**|**Comparison of Conventional Hybrid and CTC/Attention Decoders for Continuous Visual Speech Recognition**|David Gimeno-G\u00f3mez et.al.|[2402.13004](http://arxiv.org/abs/2402.13004)|null|\n", "2402.12654": "|**2024-06-16**|**OWSM-CTC: An Open Encoder-Only Speech Foundation Model for Speech Recognition, Translation, and Language Identification**|Yifan Peng et.al.|[2402.12654](http://arxiv.org/abs/2402.12654)|null|\n", "2402.11954": "|**2024-02-19**|**Multimodal Emotion Recognition from Raw Audio with Sinc-convolution**|Xiaohui Zhang et.al.|[2402.11954](http://arxiv.org/abs/2402.11954)|null|\n", "2402.11571": "|**2024-02-18**|**Ain't Misbehavin' -- Using LLMs to Generate Expressive Robot Behavior in Conversations with the Tabletop Robot Haru**|Zining Wang et.al.|[2402.11571](http://arxiv.org/abs/2402.11571)|null|\n", "2402.11520": "|**2024-02-18**|**Cross-Attention Fusion of Visual and Geometric Features for Large Vocabulary Arabic Lipreading**|Samar Daou et.al.|[2402.11520](http://arxiv.org/abs/2402.11520)|null|\n", "2402.09797": "|**2024-02-15**|**A cross-talk robust multichannel VAD model for multiparty agent interactions trained using synthetic re-recordings**|Hyewon Han et.al.|[2402.09797](http://arxiv.org/abs/2402.09797)|null|\n", "2402.08932": "|**2024-02-14**|**Listening to Multi-talker Conversations: Modular and End-to-end Perspectives**|Desh Raj et.al.|[2402.08932](http://arxiv.org/abs/2402.08932)|null|\n", 
"2402.08898": "|**2024-02-14**|**UniEnc-CASSNAT: An Encoder-only Non-autoregressive ASR for Speech SSL Models**|Ruchao Fan et.al.|[2402.08898](http://arxiv.org/abs/2402.08898)|null|\n", "2402.08846": "|**2024-02-13**|**An Embarrassingly Simple Approach for LLM with Strong ASR Capacity**|Ziyang Ma et.al.|[2402.08846](http://arxiv.org/abs/2402.08846)|**[link](https://github.com/X-LANCE/SLAM-LLM)**|\n", "2402.08788": "|**2024-02-13**|**Syllable based DNN-HMM Cantonese Speech to Text System**|Timothy Wong et.al.|[2402.08788](http://arxiv.org/abs/2402.08788)|null|\n", "2402.08021": "|**2024-05-03**|**Careless Whisper: Speech-to-Text Hallucination Harms**|Allison Koenecke et.al.|[2402.08021](http://arxiv.org/abs/2402.08021)|**[link](https://github.com/koenecke/hallucination_harms)**|\n", "2402.07729": "|**2024-07-26**|**AIR-Bench: Benchmarking Large Audio-Language Models via Generative Comprehension**|Qian Yang et.al.|[2402.07729](http://arxiv.org/abs/2402.07729)|**[link](https://github.com/ofa-sys/air-bench)**|\n", "2402.07658": "|**2024-02-12**|**The Sound of Healthcare: Improving Medical Transcription ASR Accuracy with Large Language Models**|Ayo Adedeji et.al.|[2402.07658](http://arxiv.org/abs/2402.07658)|null|\n", "2402.07513": "|**2024-02-12**|**The Balancing Act: Unmasking and Alleviating ASR Biases in Portuguese**|Ajinkya Kulkarni et.al.|[2402.07513](http://arxiv.org/abs/2402.07513)|null|\n", "2402.07431": "|**2024-02-13**|**SALAD: Smart AI Language Assistant Daily**|Ragib Amin Nihal et.al.|[2402.07431](http://arxiv.org/abs/2402.07431)|null|\n", "2402.07095": "|**2024-02-11**|**Does ChatGPT and Whisper Make Humanoid Robots More Relatable?**|Xiaohui Chen et.al.|[2402.07095](http://arxiv.org/abs/2402.07095)|null|\n", "2402.06966": "|**2024-02-10**|**DeepCover: Advancing RNN Test Coverage and Online Error Prediction using State Machine Extraction**|Pouria Golshanrad 
et.al.|[2402.06966](http://arxiv.org/abs/2402.06966)|**[link](https://github.com/pouriagr/deep-cover)**|\n", "2402.06923": "|**2024-02-10**|**CochCeps-Augment: A Novel Self-Supervised Contrastive Learning Using Cochlear Cepstrum-based Masking for Speech Emotion Recognition**|Ioannis Ziogas et.al.|[2402.06923](http://arxiv.org/abs/2402.06923)|null|\n", "2402.06592": "|**2024-02-09**|**Self-consistent context aware conformer transducer for speech recognition**|Konstantin Kolokolov et.al.|[2402.06592](http://arxiv.org/abs/2402.06592)|null|\n", "2402.05706": "|**2024-02-08**|**Unified Speech-Text Pretraining for Spoken Dialog Modeling**|Heeseung Kim et.al.|[2402.05706](http://arxiv.org/abs/2402.05706)|null|\n", "2402.05457": "|**2024-02-08**|**It's Never Too Late: Fusing Acoustic Information into Large Language Models for Automatic Speech Recognition**|Chen Chen et.al.|[2402.05457](http://arxiv.org/abs/2402.05457)|null|\n", "2402.04805": "|**2024-02-07**|**Progressive unsupervised domain adaptation for ASR using ensemble models and multi-stage training**|Rehan Ahmad et.al.|[2402.04805](http://arxiv.org/abs/2402.04805)|null|\n", "2402.03988": "|**2024-05-28**|**REBORN: Reinforcement-Learned Boundary Segmentation with Iterative Training for Unsupervised ASR**|Liang-Hsuan Tseng et.al.|[2402.03988](http://arxiv.org/abs/2402.03988)|**[link](https://github.com/andybi7676/reborn-uasr)**|\n", "2402.03519": "|**2024-02-05**|**Resolving Transcription Ambiguity in Spanish: A Hybrid Acoustic-Lexical System for Punctuation Restoration**|Xiliang Zhu et.al.|[2402.03519](http://arxiv.org/abs/2402.03519)|null|\n", "2402.03050": "|**2024-02-05**|**A Comprehensive Study of the Current State-of-the-Art in Nepali Automatic Speech Recognition Systems**|Rupak Raj Ghimire et.al.|[2402.03050](http://arxiv.org/abs/2402.03050)|null|\n", "2402.02302": "|**2024-02-03**|**Predicting positive transfer for improved low-resource speech recognition using acoustic pseudo-tokens**|Nay San 
et.al.|[2402.02302](http://arxiv.org/abs/2402.02302)|null|\n", "2402.01931": "|**2024-02-02**|**Digits micro-model for accurate and secure transactions**|Chirag Chhablani et.al.|[2402.01931](http://arxiv.org/abs/2402.01931)|null|\n", "2402.01917": "|**2024-02-02**|**Whispering in Norwegian: Navigating Orthographic and Dialectic Challenges**|Per E Kummervold et.al.|[2402.01917](http://arxiv.org/abs/2402.01917)|null|\n", "2402.01172": "|**2024-02-02**|**Streaming Sequence Transduction through Dynamic Compression**|Weiting Tan et.al.|[2402.01172](http://arxiv.org/abs/2402.01172)|**[link](https://github.com/steventan0110/star)**|\n", "2402.01152": "|**2024-02-05**|**AccentFold: A Journey through African Accents for Zero-Shot ASR Adaptation to Target Accents**|Abraham Toluwase Owodunni et.al.|[2402.01152](http://arxiv.org/abs/2402.01152)|null|\n", "2402.01778": "|**2024-02-01**|**Introduction to speech recognition**|Gabriel Dauphin et.al.|[2402.01778](http://arxiv.org/abs/2402.01778)|null|\n", "2402.00632": "|**2024-02-01**|**Prosody in Cascade and Direct Speech-to-Text Translation: a case study on Korean Wh-Phrases**|Giulio Zhou et.al.|[2402.00632](http://arxiv.org/abs/2402.00632)|null|\n", "2402.00235": "|**2024-01-31**|**Exploring the limits of decoder-only models trained on public speech recognition corpora**|Ankit Gupta et.al.|[2402.00235](http://arxiv.org/abs/2402.00235)|null|\n", "2401.18045": "|**2024-01-31**|**SpeechComposer: Unifying Multiple Speech Tasks with Prompt Composition**|Yihan Wu et.al.|[2401.18045](http://arxiv.org/abs/2401.18045)|null|\n", "2401.17604": "|**2024-02-08**|**Computation and Parameter Efficient Multi-Modal Fusion Transformer for Cued Speech Recognition**|Lei Liu et.al.|[2401.17604](http://arxiv.org/abs/2401.17604)|null|\n", "2401.16658": "|**2024-06-16**|**OWSM v3.1: Better and Faster Open Whisper-Style Speech Models based on E-Branchformer**|Yifan Peng et.al.|[2401.16658](http://arxiv.org/abs/2401.16658)|null|\n", "2401.15704": 
"|**2024-01-28**|**Phoneme-Based Proactive Anti-Eavesdropping with Controlled Recording Privilege**|Peng Huang et.al.|[2401.15704](http://arxiv.org/abs/2401.15704)|null|\n", "2401.15676": "|**2024-01-28**|**On Speaker Attribution with SURT**|Desh Raj et.al.|[2401.15676](http://arxiv.org/abs/2401.15676)|**[link](https://github.com/k2-fsa/icefall)**|\n", "2401.15532": "|**2024-01-28**|**Byte Pair Encoding Is All You Need For Automatic Bengali Speech Recognition**|Ahnaf Mozib Samin et.al.|[2401.15532](http://arxiv.org/abs/2401.15532)|null|\n", "2401.15385": "|**2024-01-27**|**Towards Event Extraction from Speech with Contextual Clues**|Jingqi Kang et.al.|[2401.15385](http://arxiv.org/abs/2401.15385)|**[link](https://github.com/jodie-kang/speechee)**|\n", "2401.14890": "|**2024-01-26**|**Comparison of parameters of vowel sounds of russian and english languages**|V. I. Fedoseev et.al.|[2401.14890](http://arxiv.org/abs/2401.14890)|null|\n", "2401.14625": "|**2024-01-26**|**Toward Practical Automatic Speech Recognition and Post-Processing: a Call for Explainable Error Benchmark Guideline**|Seonmin Koo et.al.|[2401.14625](http://arxiv.org/abs/2401.14625)|null|\n", "2401.14185": "|**2024-01-25**|**TDFNet: An Efficient Audio-Visual Speech Separation Model with Top-down Fusion**|Samuel Pegg et.al.|[2401.14185](http://arxiv.org/abs/2401.14185)|**[link](https://github.com/spkgyk/TDFNet)**|\n", "2401.13575": "|**2024-01-24**|**CNN architecture extraction on edge GPU**|Peter Horvath et.al.|[2401.13575](http://arxiv.org/abs/2401.13575)|null|\n", "2401.13463": "|**2024-03-18**|**SpeechDPR: End-to-End Spoken Passage Retrieval for Open-Domain Spoken Question Answering**|Chyi-Jiunn Lin et.al.|[2401.13463](http://arxiv.org/abs/2401.13463)|null|\n", "2401.13260": "|**2024-05-28**|**MF-AED-AEC: Speech Emotion Recognition by Leveraging Multimodal Fusion, Asr Error Detection, and Asr Error Correction**|Jiajun He et.al.|[2401.13260](http://arxiv.org/abs/2401.13260)|null|\n", "2401.13146": 
"|**2024-01-23**|**Locality enhanced dynamic biasing and sampling strategies for contextual ASR**|Md Asif Jalal et.al.|[2401.13146](http://arxiv.org/abs/2401.13146)|null|\n", "2401.12789": "|**2024-01-23**|**Multilingual and Fully Non-Autoregressive ASR with Large Language Model Fusion: A Comprehensive Study**|W. Ronny Huang et.al.|[2401.12789](http://arxiv.org/abs/2401.12789)|null|\n", "2401.12085": "|**2024-01-22**|**Consistency Based Unsupervised Self-training For ASR Personalisation**|Jisi Zhang et.al.|[2401.12085](http://arxiv.org/abs/2401.12085)|null|\n", "2401.11983": "|**2024-01-22**|**Lightweight Protection for Privacy in Offloaded Speech Understanding**|Dongqi Cai et.al.|[2401.11983](http://arxiv.org/abs/2401.11983)|null|\n", "2401.11700": "|**2024-01-22**|**Keep Decoding Parallel with Effective Knowledge Distillation from Language Models to End-to-end Speech Recognisers**|Michael Hentschel et.al.|[2401.11700](http://arxiv.org/abs/2401.11700)|null|\n", "2401.11382": "|**2024-06-06**|**Using Large Language Model for End-to-End Chinese ASR and NER**|Yuang Li et.al.|[2401.11382](http://arxiv.org/abs/2401.11382)|null|\n", "2401.11268": "|**2024-02-02**|**Word-Level ASR Quality Estimation for Efficient Corpus Sampling and Post-Editing through Analyzing Attentions of a Reference-Free Metric**|Golara Javadi et.al.|[2401.11268](http://arxiv.org/abs/2401.11268)|**[link](https://github.com/aixplain/NoRefER)**|\n", "2401.11132": "|**2024-01-20**|**ConceptThread: Visualizing Threaded Concepts in MOOC Videos**|Zhiguang Zhou et.al.|[2401.11132](http://arxiv.org/abs/2401.11132)|null|\n", "2401.10449": "|**2024-01-19**|**Contextualized Automatic Speech Recognition with Attention-Based Bias Phrase Boosted Beam Search**|Yui Sudo et.al.|[2401.10449](http://arxiv.org/abs/2401.10449)|null|\n", "2401.10447": "|**2024-01-19**|**Investigating Training Strategies and Model Robustness of Low-Rank Adaptation for Language Modeling in Speech Recognition**|Yu Yu 
et.al.|[2401.10447](http://arxiv.org/abs/2401.10447)|null|\n", "2401.10446": "|**2024-01-19**|**Large Language Models are Efficient Learners of Noise-Robust Speech Recognition**|Yuchen Hu et.al.|[2401.10446](http://arxiv.org/abs/2401.10446)|**[link](https://github.com/yuchen005/robustger)**|\n", "2401.10411": "|**2024-01-18**|**AGADIR: Towards Array-Geometry Agnostic Directional Speech Recognition**|Ju Lin et.al.|[2401.10411](http://arxiv.org/abs/2401.10411)|null|\n", "2401.10070": "|**2024-01-18**|**Communication-Efficient Personalized Federated Learning for Speech-to-Text Tasks**|Yichao Du et.al.|[2401.10070](http://arxiv.org/abs/2401.10070)|null|\n", "2401.09802": "|**2024-07-18**|**Efficient Training for Multilingual Visual Speech Recognition: Pre-training with Discretized Visual Speech Representation**|Minsu Kim et.al.|[2401.09802](http://arxiv.org/abs/2401.09802)|null|\n", "2401.09759": "|**2024-07-02**|**SlideAVSR: A Dataset of Paper Explanation Videos for Audio-Visual Speech Recognition**|Hao Wang et.al.|[2401.09759](http://arxiv.org/abs/2401.09759)|null|\n", "2401.09315": "|**2024-01-17**|**On Speech Pre-emphasis as a Simple and Inexpensive Method to Boost Speech Enhancement**|Iv\u00e1n L\u00f3pez-Espejo et.al.|[2401.09315](http://arxiv.org/abs/2401.09315)|null|\n", "2401.08916": "|**2024-01-17**|**Two-pass Endpoint Detection for Speech Recognition**|Anirudh Raju et.al.|[2401.08916](http://arxiv.org/abs/2401.08916)|null|\n", "2401.08887": "|**2024-01-16**|**NOTSOFAR-1 Challenge: New Datasets, Baseline, and Tasks for Distant Meeting Transcription**|Alon Vinnikov et.al.|[2401.08887](http://arxiv.org/abs/2401.08887)|null|\n", "2401.08835": "|**2024-01-16**|**Improving ASR Contextual Biasing with Guided Attention**|Jiyang Tang et.al.|[2401.08835](http://arxiv.org/abs/2401.08835)|null|\n", "2401.08833": "|**2024-01-16**|**Revisiting Self-supervised Learning of Speech Representation from a Mutual Information Perspective**|Alexander H. 
Liu et.al.|[2401.08833](http://arxiv.org/abs/2401.08833)|null|\n", "2401.08052": "|**2024-03-01**|**Multi-Input Multi-Output Target-Speaker Voice Activity Detection For Unified, Flexible, and Robust Audio-Visual Speaker Diarization**|Ming Cheng et.al.|[2401.08052](http://arxiv.org/abs/2401.08052)|null|\n", "2401.07957": "|**2024-01-15**|**Machine Perceptual Quality: Evaluating the Impact of Severe Lossy Compression on Audio and Image Models**|Dan Jacobellis et.al.|[2401.07957](http://arxiv.org/abs/2401.07957)|**[link](https://github.com/danjacobellis/mpq)**|\n", "2401.07575": "|**2024-07-24**|**Cascaded Cross-Modal Transformer for Audio-Textual Classification**|Nicolae-Catalin Ristea et.al.|[2401.07575](http://arxiv.org/abs/2401.07575)|**[link](https://github.com/ristea/ccmt)**|\n", "2401.07506": "|**2024-01-15**|**SeMaScore : a new evaluation metric for automatic speech recognition tasks**|Zitha Sasindran et.al.|[2401.07506](http://arxiv.org/abs/2401.07506)|null|\n", "2401.07360": "|**2024-01-14**|**Promptformer: Prompted Conformer Transducer for ASR**|Sergio Duarte-Torres et.al.|[2401.07360](http://arxiv.org/abs/2401.07360)|null|\n", "2401.06980": "|**2024-01-13**|**Joint Unsupervised and Supervised Training for Automatic Speech Recognition via Bilevel Optimization**|A F M Saif et.al.|[2401.06980](http://arxiv.org/abs/2401.06980)|**[link](https://github.com/afmsaif/joint-unsupervised-and-supervised-training-for-automatic-speech-recognition-via-bilevel-optimization)**|\n", "2401.09354": "|**2024-01-12**|**Transcending Controlled Environments Assessing the Transferability of ASRRobust NLU Models to Real-World Applications**|Hania Khan et.al.|[2401.09354](http://arxiv.org/abs/2401.09354)|null|\n", "2401.06588": "|**2024-01-12**|**Dynamic Behaviour of Connectionist Speech Recognition with Strong Latency Constraints**|Giampiero Salvi et.al.|[2401.06588](http://arxiv.org/abs/2401.06588)|null|\n", "2401.06832": "|**2024-01-12**|**XLS-R Deep Learning Model for 
Multilingual ASR on Low- Resource Languages: Indonesian, Javanese, and Sundanese**|Panji Arisaputra et.al.|[2401.06832](http://arxiv.org/abs/2401.06832)|null|\n", "2401.06390": "|**2024-01-12**|**LCB-net: Long-Context Biasing for Audio-Visual Speech Recognition**|Fan Yu et.al.|[2401.06390](http://arxiv.org/abs/2401.06390)|**[link](https://github.com/alibaba-damo-academy/FunASR)**|\n", "2401.05689": "|**2024-01-11**|**UCorrect: An Unsupervised Framework for Automatic Speech Recognition Error Correction**|Jiaxin Guo et.al.|[2401.05689](http://arxiv.org/abs/2401.05689)|null|\n", "2401.06183": "|**2024-01-11**|**End to end Hindi to English speech conversion using Bark, mBART and a finetuned XLSR Wav2Vec2**|Aniket Tathe et.al.|[2401.06183](http://arxiv.org/abs/2401.06183)|null|\n", "2401.05551": "|**2024-01-10**|**Useful Blunders: Can Automated Speech Recognition Errors Improve Downstream Dementia Classification?**|Changye Li et.al.|[2401.05551](http://arxiv.org/abs/2401.05551)|null|\n", "2401.05336": "|**2024-01-10**|**Towards Online Sign Language Recognition and Translation**|Ronglai Zuo et.al.|[2401.05336](http://arxiv.org/abs/2401.05336)|**[link](https://github.com/FangyunWei/SLRT)**|\n", "2401.04482": "|**2024-07-17**|**Continuously Learning New Words in Automatic Speech Recognition**|Christian Huber et.al.|[2401.04482](http://arxiv.org/abs/2401.04482)|null|\n", "2401.04235": "|**2024-01-08**|**High-precision Voice Search Query Correction via Retrievable Speech-text Embedings**|Christopher Li et.al.|[2401.04235](http://arxiv.org/abs/2401.04235)|null|\n", "2401.04152": "|**2024-07-22**|**Cross-Speaker Encoding Network for Multi-Talker Speech Recognition**|Jiawen Kang et.al.|[2401.04152](http://arxiv.org/abs/2401.04152)|**[link](https://github.com/kjw11/csenet-asr)**|\n", "2401.03936": "|**2024-01-08**|**Exploratory Evaluation of Speech Content Masking**|Jennifer Williams et.al.|[2401.03936](http://arxiv.org/abs/2401.03936)|null|\n", "2401.03697": 
"|**2024-03-07**|**An audio-quality-based multi-strategy approach for target speaker extraction in the MISP 2023 Challenge**|Runduo Han et.al.|[2401.03697](http://arxiv.org/abs/2401.03697)|null|\n", "2401.03689": "|**2024-06-10**|**LUPET: Incorporating Hierarchical Information Path into Multilingual ASR**|Wei Liu et.al.|[2401.03689](http://arxiv.org/abs/2401.03689)|null|\n", "2401.03687": "|**2024-01-08**|**BS-PLCNet: Band-split Packet Loss Concealment Network with Multi-task Learning Framework and Multi-discriminators**|Zihan Zhang et.al.|[2401.03687](http://arxiv.org/abs/2401.03687)|null|\n", "2401.03506": "|**2024-07-22**|**DiarizationLM: Speaker Diarization Post-Processing with Large Language Models**|Quan Wang et.al.|[2401.03506](http://arxiv.org/abs/2401.03506)|**[link](https://github.com/google/speaker-id)**|\n", "2401.06788": "|**2024-02-29**|**The NPU-ASLP-LiAuto System Description for Visual Speech Recognition in CNVSRC 2023**|He Wang et.al.|[2401.06788](http://arxiv.org/abs/2401.06788)|**[link](https://github.com/mkt-dataoceanai/cnvsrc2023baseline)**|\n", "2401.03473": "|**2024-02-21**|**ICMC-ASR: The ICASSP 2024 In-Car Multi-Channel Automatic Speech Recognition Challenge**|He Wang et.al.|[2401.03473](http://arxiv.org/abs/2401.03473)|null|\n", "2401.03468": "|**2024-01-07**|**Multichannel AV-wav2vec2: A Framework for Learning Multichannel Multi-Modal Speech Representation**|Qiushi Zhu et.al.|[2401.03468](http://arxiv.org/abs/2401.03468)|**[link](https://github.com/zqs01/multi-channel-wav2vec2)**|\n", "2401.03424": "|**2024-04-08**|**MLCA-AVSR: Multi-Layer Cross Attention Fusion based Audio-Visual Speech Recognition**|He Wang et.al.|[2401.03424](http://arxiv.org/abs/2401.03424)|null|\n", "2401.03251": "|**2024-01-06**|**TeLeS: Temporal Lexeme Similarity Score to Estimate Confidence in End-to-End ASR**|Nagarathna Ravi et.al.|[2401.03251](http://arxiv.org/abs/2401.03251)|**[link](https://github.com/madhavlab/2023_teles_wlc)**|\n", "2401.03175": 
"|**2024-01-06**|**Part-of-Speech Tagger for Bodo Language using Deep Learning approach**|Dhrubajyoti Pathak et.al.|[2401.03175](http://arxiv.org/abs/2401.03175)|null|\n", "2401.02921": "|**2024-01-05**|**Towards ASR Robust Spoken Language Understanding Through In-Context Learning With Word Confusion Networks**|Kevin Everson et.al.|[2401.02921](http://arxiv.org/abs/2401.02921)|null|\n", "2401.02890": "|**2024-01-05**|**Nonlinear functional regression by functional deep neural network with kernel embedding**|Zhongjie Shi et.al.|[2401.02890](http://arxiv.org/abs/2401.02890)|null|\n", "2401.02673": "|**2024-01-05**|**A unified multichannel far-field speech recognition system: combining neural beamforming with attention based end-to-end model**|Dongdi Zhao et.al.|[2401.02673](http://arxiv.org/abs/2401.02673)|null|\n", "2401.02417": "|**2024-01-04**|**Task Oriented Dialogue as a Catalyst for Self-Supervised Automatic Speech Recognition**|David M. Chan et.al.|[2401.02417](http://arxiv.org/abs/2401.02417)|**[link](https://github.com/amazon-science/amazon-od3)**|\n", "2402.10218": "|**2024-01-04**|**AntiDeepFake: AI for Deep Fake Speech Recognition**|Enkhtogtokh Togootogtokh et.al.|[2402.10218](http://arxiv.org/abs/2402.10218)|null|\n", "2401.02046": "|**2024-01-04**|**CTC Blank Triggered Dynamic Layer-Skipping for Efficient CTC-based Speech Recognition**|Junfeng Hou et.al.|[2401.02046](http://arxiv.org/abs/2401.02046)|null|\n", "2401.01572": "|**2024-01-03**|**Hallucinations in Neural Automatic Speech Recognition: Identifying Errors and Hallucinatory Models**|Rita Frieske et.al.|[2401.01572](http://arxiv.org/abs/2401.01572)|null|\n", "2401.01537": "|**2024-06-04**|**The Art of Deception: Robust Backdoor Attack using Dynamic Stacking of Triggers**|Orson Mengara et.al.|[2401.01537](http://arxiv.org/abs/2401.01537)|null|\n", "2401.00662": "|**2024-01-01**|**Enhancing Pre-trained ASR System Fine-tuning for Dysarthric Speech Recognition using Adversarial Data 
Augmentation**|Huimeng Wang et.al.|[2401.00662](http://arxiv.org/abs/2401.00662)|null|\n", "2312.17279": "|**2024-05-02**|**Stateful Conformer with Cache-based Inference for Streaming Automatic Speech Recognition**|Vahid Noroozi et.al.|[2312.17279](http://arxiv.org/abs/2312.17279)|null|\n", "2312.16002": "|**2023-12-26**|**The NUS-HLT System for ICASSP2024 ICMC-ASR Grand Challenge**|Meng Ge et.al.|[2312.16002](http://arxiv.org/abs/2312.16002)|null|\n", "2312.15922": "|**2023-12-26**|**Towards Probing Contact Center Large Language Models**|Varun Nathan et.al.|[2312.15922](http://arxiv.org/abs/2312.15922)|null|\n", "2312.15499": "|**2023-12-24**|**Exploring data augmentation in bias mitigation against non-native-accented speech**|Yuanyuan Zhang et.al.|[2312.15499](http://arxiv.org/abs/2312.15499)|null|\n", "2312.14609": "|**2023-12-22**|**BLSTM-Based Confidence Estimation for End-to-End Speech Recognition**|Atsunori Ogawa et.al.|[2312.14609](http://arxiv.org/abs/2312.14609)|null|\n", "2312.14378": "|**2024-02-09**|**Multimodal Attention Merging for Improved Speech Recognition and Audio Event Classification**|Anirudh S. 
Sundar et.al.|[2312.14378](http://arxiv.org/abs/2312.14378)|null|\n", "2312.14055": "|**2024-07-22**|**Multi-Sentence Grounding for Long-term Instructional Video**|Zeqian Li et.al.|[2312.14055](http://arxiv.org/abs/2312.14055)|null|\n", "2312.14020": "|**2023-12-21**|**BANSpEmo: A Bangla Emotional Speech Recognition Dataset**|Md Gulzar Hussain et.al.|[2312.14020](http://arxiv.org/abs/2312.14020)|null|\n", "2312.13873": "|**2023-12-21**|**Self-Supervised Adaptive AV Fusion Module for Pre-Trained ASR Models**|Christopher Simic et.al.|[2312.13873](http://arxiv.org/abs/2312.13873)|null|\n", "2312.13560": "|**2024-02-03**|**kNN-CTC: Enhancing ASR via Retrieval of CTC Pseudo Labels**|Jiaming Zhou et.al.|[2312.13560](http://arxiv.org/abs/2312.13560)|**[link](https://github.com/nku-hlt/knn-ctc)**|\n", "2408.02582": "|**2024-08-05**|**Clustering and Mining Accented Speech for Inclusive and Fair Speech Recognition**|Jaeyoung Kim et.al.|[2408.02582](http://arxiv.org/abs/2408.02582)|null|\n", "2408.02369": "|**2024-08-08**|**The NPU-ASLP System Description for Visual Speech Recognition in CNVSRC 2024**|He Wang et.al.|[2408.02369](http://arxiv.org/abs/2408.02369)|**[link](https://gitlab.com/csltstu/sunine)**|\n", "2408.02178": "|**2024-08-05**|**StreamVoice+: Evolving into End-to-end Streaming Zero-shot Voice Conversion**|Zhichao Wang et.al.|[2408.02178](http://arxiv.org/abs/2408.02178)|null|\n", "2408.01808": "|**2024-08-03**|**ALIF: Low-Cost Adversarial Audio Attacks on Black-Box Speech Platforms using Linguistic Features**|Peng Cheng et.al.|[2408.01808](http://arxiv.org/abs/2408.01808)|**[link](https://github.com/TASER2023/TASER)**|\n", "2408.02978": "|**2024-08-06**|**ASR-enhanced Multimodal Representation Learning for Cross-Domain Product Retrieval**|Ruixiang Zhao et.al.|[2408.02978](http://arxiv.org/abs/2408.02978)|null|\n", "2408.02945": "|**2024-08-06**|**Self-Supervised Learning for Multi-Channel Neural Transducer**|Atsushi Kojima 
et.al.|[2408.02945](http://arxiv.org/abs/2408.02945)|null|\n", "2408.04325": "|**2024-08-08**|**HydraFormer: One Encoder For All Subsampling Rates**|Yaoxun Xu et.al.|[2408.04325](http://arxiv.org/abs/2408.04325)|**[link](https://github.com/hydraformer/hydraformer)**|\n", "2408.04306": "|**2024-08-08**|**Preserving spoken content in voice anonymisation with character-level vocoder conditioning**|Michele Panariello et.al.|[2408.04306](http://arxiv.org/abs/2408.04306)|**[link](https://github.com/m-pana/spk_anon_nac_lm)**|\n", "2408.04174": "|**2024-08-08**|**wav2graph: A Framework for Supervised Learning Knowledge Graph from Speech**|Khai Le-Duc et.al.|[2408.04174](http://arxiv.org/abs/2408.04174)|**[link](https://github.com/leduckhai/wav2graph)**|\n", "2408.03979": "|**2024-08-07**|**Speaker Adaptation for Quantised End-to-End ASR Models**|Qiuming Zhao et.al.|[2408.03979](http://arxiv.org/abs/2408.03979)|null|\n", "2408.05101": "|**2024-08-09**|**MooER: LLM-based Speech Recognition and Translation Models from Moore Threads**|Junhao Xu et.al.|[2408.05101](http://arxiv.org/abs/2408.05101)|**[link](https://github.com/moorethreads/mooer)**|\n", "2408.06264": "|**2024-08-12**|**Audio Enhancement for Computer Audition -- An Iterative Training Paradigm Using Sample Importance**|Manuel Milling et.al.|[2408.06264](http://arxiv.org/abs/2408.06264)|null|\n", "2408.06043": "|**2024-08-12**|**Enhancing Dialogue Speech Recognition with Robust Contextual Awareness via Noise Representation Learning**|Wonjun Lee et.al.|[2408.06043](http://arxiv.org/abs/2408.06043)|null|\n", "2408.05769": "|**2024-08-11**|**LI-TTA: Language Informed Test-Time Adaptation for Automatic Speech Recognition**|Eunseop Yoon et.al.|[2408.05769](http://arxiv.org/abs/2408.05769)|null|\n", "2408.05758": "|**2024-08-11**|**VQ-CTAP: Cross-Modal Fine-Grained Sequence Representation Learning for Speech Processing**|Chunyu Qiang et.al.|[2408.05758](http://arxiv.org/abs/2408.05758)|null|\n", "2408.05554": 
"|**2024-08-10**|**Improving Whisper's Recognition Performance for Under-Represented Language Kazakh Leveraging Unpaired Speech and Text**|Jinpeng Li et.al.|[2408.05554](http://arxiv.org/abs/2408.05554)|null|\n", "2408.06484": "|**2024-08-12**|**Cross-Lingual Conversational Speech Summarization with Large Language Models**|Max Nelson et.al.|[2408.06484](http://arxiv.org/abs/2408.06484)|null|\n", "2408.07388": "|**2024-08-14**|**DPSNN: Spiking Neural Network for Low-Latency Streaming Speech Enhancement**|Tao Sun et.al.|[2408.07388](http://arxiv.org/abs/2408.07388)|null|\n", "2408.08027": "|**2024-08-15**|**Enhancing Large Language Model-based Speech Recognition by Contextualization for Rare and Ambiguous Words**|Kento Nozawa et.al.|[2408.08027](http://arxiv.org/abs/2408.08027)|null|\n", "2408.07851": "|**2024-08-14**|**SER Evals: In-domain and Out-of-domain Benchmarking for Speech Emotion Recognition**|Mohamed Osman et.al.|[2408.07851](http://arxiv.org/abs/2408.07851)|**[link](https://github.com/spaghettiSystems/serval)**|\n", "2408.07081": "|**2024-08-16**|**MathBridge: A Large Corpus Dataset for Translating Spoken Mathematical Expressions into $LaTeX$ Formulas for Improved Readability**|Kyudan Jung et.al.|[2408.07081](http://arxiv.org/abs/2408.07081)|null|\n", "2408.09688": "|**2024-08-19**|**Recording for Eyes, Not Echoing to Ears: Contextualized Spoken-to-Written Conversion of ASR Transcripts**|Jiaqing Liu et.al.|[2408.09688](http://arxiv.org/abs/2408.09688)|null|\n", "2408.09491": "|**2024-08-18**|**A Transcription Prompt-based Efficient Audio Large Language Model for Robust Speech Recognition**|Yangze Li et.al.|[2408.09491](http://arxiv.org/abs/2408.09491)|null|\n", "2408.09215": "|**2024-08-17**|**Generating Data with Text-to-Speech and Large-Language Models for Conversational Speech Recognition**|Samuele Cornell et.al.|[2408.09215](http://arxiv.org/abs/2408.09215)|**[link](https://github.com/popcornell/ASRLightningFT)**|\n", "2408.10524": 
"|**2024-08-20**|**XCB: an effective contextual biasing approach to bias cross-lingual phrases in speech recognition**|Xucheng Wan et.al.|[2408.10524](http://arxiv.org/abs/2408.10524)|null|\n", "2408.11804": "|**2024-08-21**|**Approaching Deep Learning through the Spectral Dynamics of Weights**|David Yunis et.al.|[2408.11804](http://arxiv.org/abs/2408.11804)|**[link](https://github.com/dyunis/spectral_dynamics)**|\n", "2408.11258": "|**2024-08-21**|**Improving Speech Recognition Error Prediction for Modern and Off-the-shelf Speech Recognizers**|Prashant Serai et.al.|[2408.11258](http://arxiv.org/abs/2408.11258)|null|\n", "2408.12500": "|**2024-08-22**|**WhisperMask: A Noise Suppressive Mask-Type Microphone for Whisper Speech**|Hirotaka Hiraki et.al.|[2408.12500](http://arxiv.org/abs/2408.12500)|null|\n", "2408.12430": "|**2024-08-22**|**Positional Description for Numerical Normalization**|Deepanshu Gupta et.al.|[2408.12430](http://arxiv.org/abs/2408.12430)|null|\n", "2408.12279": "|**2024-08-22**|**Developing vocal system impaired patient-aimed voice quality assessment approach using ASR representation-included multiple features**|Shaoxiang Dang et.al.|[2408.12279](http://arxiv.org/abs/2408.12279)|null|\n", "2408.11940": "|**2024-08-21**|**The State of Commercial Automatic French Legal Speech Recognition Systems and their Impact on Court Reporters et al**|Nicolas Garneau et.al.|[2408.11940](http://arxiv.org/abs/2408.11940)|null|\n", "2408.11873": "|**2024-08-19**|**Parameter-Efficient Transfer Learning under Federated Learning for Automatic Speech Recognition**|Xuan Kan et.al.|[2408.11873](http://arxiv.org/abs/2408.11873)|null|\n", "2408.11849": "|**2024-08-13**|**Style-Talker: Finetuning Audio Language Model and Style-Based Text-to-Speech Model for Fast Spoken Dialogue Generation**|Yinghao Aaron Li et.al.|[2408.11849](http://arxiv.org/abs/2408.11849)|null|\n", "2408.13106": "|**2024-08-28**|**NEST: Self-supervised Fast Conformer as All-purpose Seasoning to Speech
Processing Tasks**|He Huang et.al.|[2408.13106](http://arxiv.org/abs/2408.13106)|null|\n", "2408.13008": "|**2024-08-23**|**Focused Discriminative Training For Streaming CTC-Trained Automatic Speech Recognition Models**|Adnan Haider et.al.|[2408.13008](http://arxiv.org/abs/2408.13008)|null|\n", "2408.12734": "|**2024-08-22**|**Towards measuring fairness in speech recognition: Fair-Speech dataset**|Irina-Elena Veliche et.al.|[2408.12734](http://arxiv.org/abs/2408.12734)|null|\n", "2408.14418": "|**2024-08-26**|**MEDSAGE: Enhancing Robustness of Medical Dialogue Summarization to ASR Errors with LLM-generated Synthetic Dialogues**|Kuluhan Binici et.al.|[2408.14418](http://arxiv.org/abs/2408.14418)|null|\n", "2408.14262": "|**2024-08-26**|**Self-supervised Speech Representations Still Struggle with African American Vernacular English**|Kalvin Chang et.al.|[2408.14262](http://arxiv.org/abs/2408.14262)|**[link](https://github.com/cmu-llab/s3m-aave)**|\n", "2408.14082": "|**2024-08-26**|**Automatic recognition and detection of aphasic natural speech**|Mara Barberis et.al.|[2408.14082](http://arxiv.org/abs/2408.14082)|null|\n", "2408.13996": "|**2024-08-28**|**Research Advances and New Paradigms for Biology-inspired Spiking Neural Networks**|Tianyu Zheng et.al.|[2408.13996](http://arxiv.org/abs/2408.13996)|null|\n", "2408.13739": "|**2024-08-25**|**Literary and Colloquial Tamil Dialect Identification**|M. Nanmalar et.al.|[2408.13739](http://arxiv.org/abs/2408.13739)|null|\n", "2408.13644": "|**2024-08-24**|**Studying the Effect of Audio Filters in Pre-Trained Models for Environmental Sound Classification**|Aditya Dawn et.al.|[2408.13644](http://arxiv.org/abs/2408.13644)|null|\n", "2408.14991": "|**2024-08-27**|**Speech Recognition Transformers: Topological-lingualism Perspective**|Shruti Singh et.al.|[2408.14991](http://arxiv.org/abs/2408.14991)|null|\n", "2408.14887": "|**2024-08-27**|**Literary and Colloquial Dialect Identification for Tamil using Acoustic Features**|M. 
Nanmalar et.al.|[2408.14887](http://arxiv.org/abs/2408.14887)|null|\n", "2408.15616": "|**2024-08-28**|**Beyond Levenshtein: Leveraging Multiple Algorithms for Robust Word Error Rate Computations And Granular Error Classifications**|Korbinian Kuhn et.al.|[2408.15616](http://arxiv.org/abs/2408.15616)|**[link](https://github.com/shuffle-project/beyond-levenshtein)**|\n", "2408.15585": "|**2024-08-28**|**Whisper-PMFA: Partial Multi-Scale Feature Aggregation for Speaker Verification using Whisper Models**|Yiyang Zhao et.al.|[2408.15585](http://arxiv.org/abs/2408.15585)|null|\n", "2408.16589": "|**2024-08-29**|**CrisperWhisper: Accurate Timestamps on Verbatim Speech Transcriptions**|Laurin Wagner et.al.|[2408.16589](http://arxiv.org/abs/2408.16589)|null|\n", "2408.16564": "|**2024-08-29**|**Human-Inspired Audio-Visual Speech Recognition: Spike Activity, Cueing Interaction and Causal Processing**|Qianhui Liu et.al.|[2408.16564](http://arxiv.org/abs/2408.16564)|null|\n", "2408.16287": "|**2024-08-29**|**Measuring the Accuracy of Automatic Speech Recognition Solutions**|Korbinian Kuhn et.al.|[2408.16287](http://arxiv.org/abs/2408.16287)|**[link](https://github.com/shuffle-project/asr-comparison)**|\n", "2408.16204": "|**2024-08-29**|**Revisit Micro-batch Clipping: Adaptive Data Pruning via Gradient Manipulation**|Lun Wang et.al.|[2408.16204](http://arxiv.org/abs/2408.16204)|null|\n", "2408.16180": "|**2024-08-29**|**Benchmarking Japanese Speech Recognition on ASR-LLM Setups with Multi-Pass Augmented Generative Error Correction**|Yuka Ko et.al.|[2408.16180](http://arxiv.org/abs/2408.16180)|null|\n"}, "TTS": {"2408.06227": "|**2024-08-12**|**FLEURS-R: A Restored Multilingual Speech Corpus for Generation Tasks**|Min Ma et.al.|[2408.06227](http://arxiv.org/abs/2408.06227)|null|\n", "2408.05758": "|**2024-08-11**|**VQ-CTAP: Cross-Modal Fine-Grained Sequence Representation Learning for Speech Processing**|Chunyu Qiang 
et.al.|[2408.05758](http://arxiv.org/abs/2408.05758)|null|\n", "2408.03887": "|**2024-08-06**|**Central Kurdish Text-to-Speech Synthesis with Novel End-to-End Transformer Training**|Hawraz A. Ahmad et.al.|[2408.03887](http://arxiv.org/abs/2408.03887)|null|\n", "2408.01808": "|**2024-08-03**|**ALIF: Low-Cost Adversarial Audio Attacks on Black-Box Speech Platforms using Linguistic Features**|Peng Cheng et.al.|[2408.01808](http://arxiv.org/abs/2408.01808)|**[link](https://github.com/TASER2023/TASER)**|\n", "2408.00284": "|**2024-08-01**|**Bailing-TTS: Chinese Dialectal Speech Synthesis Towards Human-like Spontaneous Representation**|Xinhan Di et.al.|[2408.00284](http://arxiv.org/abs/2408.00284)|null|\n", "2407.21491": "|**2024-08-01**|**Generative Expressive Conversational Speech Synthesis**|Rui Liu et.al.|[2407.21491](http://arxiv.org/abs/2407.21491)|**[link](https://github.com/ai-s2-lab/gpt-talker)**|\n", "2407.21476": "|**2024-07-31**|**On the Problem of Text-To-Speech Model Selection for Synthetic Data Generation in Automatic Speech Recognition**|Nick Rossenbach et.al.|[2407.21476](http://arxiv.org/abs/2407.21476)|null|\n", "2407.18571": "|**2024-07-29**|**Speech Bandwidth Expansion Via High Fidelity Generative Adversarial Networks**|Mahmoud Salhab et.al.|[2407.18571](http://arxiv.org/abs/2407.18571)|null|\n", "2407.18541": "|**2024-07-26**|**Towards Improving NAM-to-Speech Synthesis Intelligibility using Self-Supervised Speech Models**|Neil Shah et.al.|[2407.18541](http://arxiv.org/abs/2407.18541)|null|\n", "2407.18505": "|**2024-07-26**|**VoxSim: A perceptual voice similarity dataset**|Junseok Ahn et.al.|[2407.18505](http://arxiv.org/abs/2407.18505)|null|\n", "2407.17997": "|**2024-07-25**|**On the Effect of Purely Synthetic Training Data for Different Automatic Speech Recognition Architectures**|Nick Rossenbach et.al.|[2407.17997](http://arxiv.org/abs/2407.17997)|null|\n", "2407.17167": "|**2024-07-24**|**Zero-Shot vs. 
Few-Shot Multi-Speaker TTS Using Pre-trained Czech SpeechT5 Model**|Jan Lehe\u010dka et.al.|[2407.17167](http://arxiv.org/abs/2407.17167)|null|\n", "2407.16840": "|**2024-07-23**|**Synth4Kws: Synthesized Speech for User Defined Keyword Spotting in Low Resource Environments**|Pai Zhu et.al.|[2407.16840](http://arxiv.org/abs/2407.16840)|null|\n", "2407.15835": "|**2024-07-22**|**dMel: Speech Tokenization made Simple**|He Bai et.al.|[2407.15835](http://arxiv.org/abs/2407.15835)|null|\n", "2407.15188": "|**2024-07-21**|**Overview of Speaker Modeling and Its Applications: From the Lens of Deep Speaker Representation Learning**|Shuai Wang et.al.|[2407.15188](http://arxiv.org/abs/2407.15188)|null|\n", "2407.14212": "|**2024-07-19**|**Braille-to-Speech Generator: Audio Generation Based on Joint Fine-Tuning of CLIP and Fastspeech2**|Chun Xu et.al.|[2407.14212](http://arxiv.org/abs/2407.14212)|null|\n", "2407.14056": "|**2024-07-19**|**Rasa: Building Expressive Speech Synthesis Systems for Indian Languages in Low-resource Settings**|Praveen Srinivasa Varadhan et.al.|[2407.14056](http://arxiv.org/abs/2407.14056)|**[link](https://github.com/AI4Bharat/Rasa)**|\n", "2407.14006": "|**2024-07-19**|**MSceneSpeech: A Multi-Scene Speech Dataset For Expressive Speech Synthesis**|Qian Yang et.al.|[2407.14006](http://arxiv.org/abs/2407.14006)|null|\n", "2407.13509": "|**2024-07-18**|**Spontaneous Style Text-to-Speech Synthesis with Controllable Spontaneous Behaviors Based on Language Models**|Weiqin Li et.al.|[2407.13509](http://arxiv.org/abs/2407.13509)|null|\n", "2408.00004": "|**2024-07-18**|**Handling Numeric Expressions in Automatic Speech Recognition**|Christian Huber et.al.|[2408.00004](http://arxiv.org/abs/2408.00004)|null|\n", "2407.12707": "|**2024-07-22**|**TTSDS -- Text-to-Speech Distribution Score**|Christoph Minixhofer et.al.|[2407.12707](http://arxiv.org/abs/2407.12707)|**[link](https://github.com/ttsds/ttsds)**|\n", "2408.00788": "|**2024-07-17**|**SpikeVoice: 
High-Quality Text-to-Speech Via Efficient Spiking Neural Network**|Kexin Wang et.al.|[2408.00788](http://arxiv.org/abs/2408.00788)|null|\n", "2407.12229": "|**2024-07-17**|**Laugh Now Cry Later: Controlling Time-Varying Emotional States of Flow-Matching-Based Zero-Shot Text-to-Speech**|Haibin Wu et.al.|[2407.12229](http://arxiv.org/abs/2407.12229)|null|\n", "2407.12206": "|**2024-07-16**|**A Language Modeling Approach to Diacritic-Free Hebrew TTS**|Amit Roth et.al.|[2407.12206](http://arxiv.org/abs/2407.12206)|null|\n", "2407.09732": "|**2024-07-13**|**Speech Slytherin: Examining the Performance and Efficiency of Mamba for Speech Separation, Recognition, and Synthesis**|Xilin Jiang et.al.|[2407.09732](http://arxiv.org/abs/2407.09732)|**[link](https://github.com/xi-j/Mamba-TasNet)**|\n", "2407.09370": "|**2024-07-17**|**Learning High-Frequency Functions Made Easy with Sinusoidal Positional Encoding**|Chuanhao Sun et.al.|[2407.09370](http://arxiv.org/abs/2407.09370)|**[link](https://github.com/zhyuan11/SPE)**|\n", "2407.08551": "|**2024-07-11**|**Autoregressive Speech Synthesis without Vector Quantization**|Lingwei Meng et.al.|[2407.08551](http://arxiv.org/abs/2407.08551)|null|\n", "2407.08248": "|**2024-07-11**|**Toward accessible comics for blind and low vision readers**|Christophe Rigaud et.al.|[2407.08248](http://arxiv.org/abs/2407.08248)|null|\n", "2407.08016": "|**2024-07-10**|**Source Tracing of Audio Deepfake Systems**|Nicholas Klein et.al.|[2407.08016](http://arxiv.org/abs/2407.08016)|null|\n", "2407.18332": "|**2024-07-08**|**Analyzing Speech Unit Selection for Textless Speech-to-Speech Translation**|Jarod Duret et.al.|[2407.18332](http://arxiv.org/abs/2407.18332)|null|\n", "2407.05471": "|**2024-07-07**|**Fine-Grained and Interpretable Neural Speech Editing**|Max Morrison et.al.|[2407.05471](http://arxiv.org/abs/2407.05471)|**[link](https://github.com/maxrmorrison/torbi)**|\n", "2407.05421": "|**2024-07-07**|**ASRRL-TTS: Agile Speaker Representation 
Reinforcement Learning for Text-to-Speech Speaker Adaptation**|Ruibo Fu et.al.|[2407.05421](http://arxiv.org/abs/2407.05421)|null|\n", "2407.05407": "|**2024-07-09**|**CosyVoice: A Scalable Multilingual Zero-shot Text-to-speech Synthesizer based on Supervised Semantic Tokens**|Zhihao Du et.al.|[2407.05407](http://arxiv.org/abs/2407.05407)|null|\n", "2407.04575": "|**2024-07-05**|**FA-GAN: Artifacts-free and Phase-aware High-fidelity GAN-based Vocoder**|Rubing Shen et.al.|[2407.04575](http://arxiv.org/abs/2407.04575)|null|\n", "2407.04291": "|**2024-07-05**|**We Need Variations in Speech Synthesis: Sub-center Modelling for Speaker Embeddings**|Ismail Rasim Ulgen et.al.|[2407.04291](http://arxiv.org/abs/2407.04291)|null|\n", "2407.04047": "|**2024-07-04**|**Improving Accented Speech Recognition using Data Augmentation based on Unsupervised Text-to-Speech Synthesis**|Cong-Thanh Do et.al.|[2407.04047](http://arxiv.org/abs/2407.04047)|null|\n", "2407.04034": "|**2024-07-04**|**Optimizing a-DCF for Spoofing-Robust Speaker Verification**|O\u011fuzhan Kurnaz et.al.|[2407.04034](http://arxiv.org/abs/2407.04034)|null|\n", "2407.03892": "|**2024-07-04**|**On the Effectiveness of Acoustic BPE in Decoder-Only TTS**|Bohan Li et.al.|[2407.03892](http://arxiv.org/abs/2407.03892)|null|\n", "2407.03236": "|**2024-07-14**|**CATT: Character-based Arabic Tashkeel Transformer**|Faris Alasmary et.al.|[2407.03236](http://arxiv.org/abs/2407.03236)|**[link](https://github.com/abjadai/catt)**|\n", "2407.02937": "|**2024-07-03**|**Probing the Feasibility of Multilingual Speaker Anonymization**|Sarina Meyer et.al.|[2407.02937](http://arxiv.org/abs/2407.02937)|**[link](https://github.com/digitalphonetics/speaker-anonymization)**|\n", "2407.02243": "|**2024-07-02**|**Robust Zero-Shot Text-to-Speech Synthesis with Reverse Inference Optimization**|Yuchen Hu et.al.|[2407.02243](http://arxiv.org/abs/2407.02243)|null|\n", "2407.01927": "|**2024-07-02**|**TTSlow: Slow Down Text-to-Speech with 
Efficiency Robustness Evaluations**|Xiaoxue Gao et.al.|[2407.01927](http://arxiv.org/abs/2407.01927)|null|\n", "2407.01291": "|**2024-07-01**|**Lightweight Zero-shot Text-to-Speech with Mixture of Adapters**|Kenichi Fujita et.al.|[2407.01291](http://arxiv.org/abs/2407.01291)|null|\n", "2407.12038": "|**2024-07-31**|**ICAGC 2024: Inspirational and Convincing Audio Generation Challenge 2024**|Ruibo Fu et.al.|[2407.12038](http://arxiv.org/abs/2407.12038)|null|\n", "2407.00826": "|**2024-06-30**|**NAIST Simultaneous Speech Translation System for IWSLT 2024**|Yuka Ko et.al.|[2407.00826](http://arxiv.org/abs/2407.00826)|null|\n", "2407.00766": "|**2024-06-30**|**An Attribute Interpolation Method in Speech Synthesis by Model Merging**|Masato Murata et.al.|[2407.00766](http://arxiv.org/abs/2407.00766)|null|\n", "2407.00753": "|**2024-06-30**|**FLY-TTS: Fast, Lightweight and High-Quality End-to-End Text-to-Speech Synthesis**|Yinlin Guo et.al.|[2407.00753](http://arxiv.org/abs/2407.00753)|null|\n", "2407.00463": "|**2024-07-18**|**Open-Source Conversational AI with SpeechBrain 1.0**|Mirco Ravanelli et.al.|[2407.00463](http://arxiv.org/abs/2407.00463)|null|\n", "2406.19243": "|**2024-06-27**|**Application of ASV for Voice Identification after VC and Duration Predictor Improvement in TTS Models**|Borodin Kirill Nikolayevich et.al.|[2406.19243](http://arxiv.org/abs/2406.19243)|null|\n", "2406.19135": "|**2024-06-27**|**DEX-TTS: Diffusion-based EXpressive Text-to-Speech with Style Modeling on Time Variability**|Hyun Joon Park et.al.|[2406.19135](http://arxiv.org/abs/2406.19135)|**[link](https://github.com/winddori2002/dex-tts)**|\n", "2406.18135": "|**2024-06-26**|**Automatic Speech Recognition for Hindi**|Anish Saha et.al.|[2406.18135](http://arxiv.org/abs/2406.18135)|null|\n", "2406.18089": "|**2024-06-26**|**A Study on Synthesizing Expressive Violin Performances: Approaches and Comparisons**|Tzu-Yun Hung et.al.|[2406.18089](http://arxiv.org/abs/2406.18089)|null|\n", 
"2406.18088": "|**2024-06-29**|**LLM-Driven Multimodal Opinion Expression Identification**|Bonian Jia et.al.|[2406.18088](http://arxiv.org/abs/2406.18088)|null|\n", "2406.18009": "|**2024-06-26**|**E2 TTS: Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS**|Sefik Emre Eskimez et.al.|[2406.18009](http://arxiv.org/abs/2406.18009)|null|\n", "2406.17957": "|**2024-06-25**|**Improving Robustness of LLM-based Speech Synthesis by Learning Monotonic Alignment**|Paarth Neekhara et.al.|[2406.17957](http://arxiv.org/abs/2406.17957)|null|\n", "2406.17310": "|**2024-06-25**|**High Fidelity Text-to-Speech Via Discrete Tokens Using Token Transducer and Group Masked Language Model**|Joun Yeop Lee et.al.|[2406.17310](http://arxiv.org/abs/2406.17310)|null|\n", "2406.17257": "|**2024-06-25**|**Leveraging Parameter-Efficient Transfer Learning for Multi-Lingual Text-to-Speech Adaptation**|Yingting Li et.al.|[2406.17257](http://arxiv.org/abs/2406.17257)|null|\n", "2406.16808": "|**2024-06-24**|**Exploring the Capability of Mamba in Speech Applications**|Koichi Miyazaki et.al.|[2406.16808](http://arxiv.org/abs/2406.16808)|null|\n", "2406.16751": "|**2024-07-07**|**Towards Zero-Shot Text-To-Speech for Arabic Dialects**|Khai Duy Doan et.al.|[2406.16751](http://arxiv.org/abs/2406.16751)|null|\n", "2406.16716": "|**2024-06-24**|**One-Class Learning with Adaptive Centroid Shift for Audio Deepfake Detection**|Hyun Myung Kim et.al.|[2406.16716](http://arxiv.org/abs/2406.16716)|null|\n", "2406.17801": "|**2024-06-22**|**A multi-speaker multi-lingual voice cloning system based on vits2 for limmits 2024 challenge**|Xiaopeng Wang et.al.|[2406.17801](http://arxiv.org/abs/2406.17801)|null|\n", "2406.15752": "|**2024-06-22**|**TacoLM: GaTed Attention Equipped Codec Language Model are Efficient Zero-Shot Text to Speech Synthesizers**|Yakun Song et.al.|[2406.15752](http://arxiv.org/abs/2406.15752)|**[link](https://github.com/Ereboas/TacoLM)**|\n", "2406.14890": "|**2024-06-21**|**InterBiasing: 
Boost Unseen Word Recognition through Biasing Intermediate Predictions**|Yu Nakagome et.al.|[2406.14890](http://arxiv.org/abs/2406.14890)|null|\n", "2406.14875": "|**2024-06-21**|**GLOBE: A High-quality English Corpus with Global Accents for Zero-shot Speaker Adaptive Text-to-Speech**|Wenbin Wang et.al.|[2406.14875](http://arxiv.org/abs/2406.14875)|null|\n", "2406.14294": "|**2024-06-21**|**DASB - Discrete Audio and Speech Benchmark**|Pooneh Mousavi et.al.|[2406.14294](http://arxiv.org/abs/2406.14294)|null|\n", "2406.12946": "|**2024-06-18**|**Instruction Data Generation and Unsupervised Adaptation for Speech Language Models**|Vahid Noroozi et.al.|[2406.12946](http://arxiv.org/abs/2406.12946)|null|\n", "2406.12164": "|**2024-07-09**|**A Mel Spectrogram Enhancement Paradigm Based on CWT in Speech Synthesis**|Guoqiang Hu et.al.|[2406.12164](http://arxiv.org/abs/2406.12164)|null|\n", "2406.11727": "|**2024-06-27**|**1000 African Voices: Advancing inclusive multi-speaker multi-accent speech synthesis**|Sewade Ogun et.al.|[2406.11727](http://arxiv.org/abs/2406.11727)|null|\n", "2406.11427": "|**2024-06-17**|**DiTTo-TTS: Efficient and Scalable Zero-Shot Text-to-Speech with Diffusion Transformer**|Keon Lee et.al.|[2406.11427](http://arxiv.org/abs/2406.11427)|null|\n", "2406.11037": "|**2024-06-16**|**NAST: Noise Aware Speech Tokenization for Speech Language Models**|Shoval Messica et.al.|[2406.11037](http://arxiv.org/abs/2406.11037)|**[link](https://github.com/ShovalMessica/NAST)**|\n", "2406.10844": "|**2024-06-16**|**Multi-Scale Accent Modeling with Disentangling for Multi-Speaker Multi-Accent TTS Synthesis**|Xuehao Zhou et.al.|[2406.10844](http://arxiv.org/abs/2406.10844)|null|\n", "2406.10514": "|**2024-06-15**|**GTR-Voice: Articulatory Phonetics Informed Controllable Expressive Speech Synthesis**|Zehua Kcriss Li et.al.|[2406.10514](http://arxiv.org/abs/2406.10514)|null|\n", "2406.10422": "|**2024-06-14**|**Phoneme Discretized Saliency Maps for Explainable Detection 
of AI-Generated Voice**|Shubham Gupta et.al.|[2406.10422](http://arxiv.org/abs/2406.10422)|null|\n", "2406.10056": "|**2024-06-14**|**UniAudio 1.5: Large Language Model-driven Audio Codec is A Few-shot Audio Task Learner**|Dongchao Yang et.al.|[2406.10056](http://arxiv.org/abs/2406.10056)|**[link](https://github.com/yangdongchao/llm-codec)**|\n", "2406.09869": "|**2024-06-14**|**MMM: Multi-Layer Multi-Residual Multi-Stream Discrete Speech Representation from Self-supervised Learning Model**|Jiatong Shi et.al.|[2406.09869](http://arxiv.org/abs/2406.09869)|null|\n", "2406.08989": "|**2024-06-13**|**ToneUnit: A Speech Discretization Approach for Tonal Language Speech Synthesis**|Dehua Tao et.al.|[2406.08989](http://arxiv.org/abs/2406.08989)|null|\n", "2406.08820": "|**2024-06-13**|**DisfluencySpeech -- Single-Speaker Conversational Speech Dataset with Paralanguage**|Kyra Wang et.al.|[2406.08820](http://arxiv.org/abs/2406.08820)|null|\n", "2406.08812": "|**2024-06-13**|**Generating Speakers by Prompting Listener Impressions for Pre-trained Multi-Speaker Text-to-Speech Systems**|Zhengyang Chen et.al.|[2406.08812](http://arxiv.org/abs/2406.08812)|null|\n", "2406.08802": "|**2024-06-13**|**DubWise: Video-Guided Speech Duration Control in Multimodal LLM-based Text-to-Speech for Dubbing**|Neha Sahipjohn et.al.|[2406.08802](http://arxiv.org/abs/2406.08802)|null|\n", "2406.08568": "|**2024-06-12**|**Training Data Augmentation for Dysarthric Automatic Speech Recognition by Text-to-Dysarthric-Speech Synthesis**|Wing-Zin Leung et.al.|[2406.08568](http://arxiv.org/abs/2406.08568)|null|\n", "2406.08416": "|**2024-06-20**|**TokSing: Singing Voice Synthesis based on Discrete Tokens**|Yuning Wu et.al.|[2406.08416](http://arxiv.org/abs/2406.08416)|null|\n", "2406.08196": "|**2024-06-12**|**FreeV: Free Lunch For Vocoders Through Pseudo Inversed Mel Filter**|Yuanjun Lv et.al.|[2406.08196](http://arxiv.org/abs/2406.08196)|**[link](https://github.com/bakerbunker/freev)**|\n", 
"2406.08111": "|**2024-06-12**|**Audio-conditioned phonemic and prosodic annotation for building text-to-speech models from unlabeled speech data**|Yuma Shirahata et.al.|[2406.08111](http://arxiv.org/abs/2406.08111)|null|\n", "2406.08076": "|**2024-06-12**|**VECL-TTS: Voice identity and Emotional style controllable Cross-Lingual Text-to-Speech**|Ashishkumar Gudmalwar et.al.|[2406.08076](http://arxiv.org/abs/2406.08076)|null|\n", "2406.07969": "|**2024-06-12**|**LibriTTS-P: A Corpus with Speaking Style and Speaker Identity Prompts for Text-to-Speech and Style Captioning**|Masaya Kawamura et.al.|[2406.07969](http://arxiv.org/abs/2406.07969)|**[link](https://github.com/line/libritts-p)**|\n", "2406.07855": "|**2024-06-12**|**VALL-E R: Robust and Efficient Zero-Shot Text-to-Speech Synthesis via Monotonic Alignment**|Bing Han et.al.|[2406.07855](http://arxiv.org/abs/2406.07855)|null|\n", "2406.07803": "|**2024-06-12**|**EmoSphere-TTS: Emotional Style and Intensity Modeling via Spherical Emotion Vector for Controllable Emotional Text-to-Speech**|Deok-Hyeon Cho et.al.|[2406.07803](http://arxiv.org/abs/2406.07803)|**[link](https://github.com/Choddeok/EmoSphere-TTS)**|\n", "2406.07801": "|**2024-06-12**|**PolySpeech: Exploring Unified Multitask Speech Models for Competitiveness with Single-task Models**|Runyan Yang et.al.|[2406.07801](http://arxiv.org/abs/2406.07801)|null|\n", "2406.07725": "|**2024-06-11**|**The Interspeech 2024 Challenge on Speech Processing Using Discrete Units**|Xuankai Chang et.al.|[2406.07725](http://arxiv.org/abs/2406.07725)|null|\n", "2406.07289": "|**2024-06-11**|**Can We Achieve High-quality Direct Speech-to-Speech Translation without Parallel Speech Data?**|Qingkai Fang et.al.|[2406.07289](http://arxiv.org/abs/2406.07289)|null|\n", "2406.07237": "|**2024-06-11**|**CodecFake: Enhancing Anti-Spoofing Models Against Deepfake Audios from Codec-Based Speech Synthesis Systems**|Haibin Wu et.al.|[2406.07237](http://arxiv.org/abs/2406.07237)|null|\n", 
"2406.06979": "|**2024-06-11**|**AudioMarkBench: Benchmarking Robustness of Audio Watermarking**|Hongbin Liu et.al.|[2406.06979](http://arxiv.org/abs/2406.06979)|**[link](https://github.com/moyangkuo/audiomarkbench)**|\n", "2406.06406": "|**2024-06-11**|**Controlling Emotion in Text-to-Speech with Natural Language Prompts**|Thomas Bott et.al.|[2406.06406](http://arxiv.org/abs/2406.06406)|**[link](https://github.com/digitalphonetics/ims-toucan)**|\n", "2406.06403": "|**2024-06-10**|**Meta Learning Text-to-Speech Synthesis in over 7000 Languages**|Florian Lux et.al.|[2406.06403](http://arxiv.org/abs/2406.06403)|**[link](https://github.com/digitalphonetics/ims-toucan)**|\n", "2406.06111": "|**2024-06-10**|**JenGAN: Stacked Shifted Filters in GAN-Based Speech Synthesis**|Hyunjae Cho et.al.|[2406.06111](http://arxiv.org/abs/2406.06111)|null|\n", "2406.05965": "|**2024-06-10**|**MakeSinger: A Semi-Supervised Training Method for Data-Efficient Singing Voice Synthesis via Classifier-free Diffusion Guidance**|Semin Kim et.al.|[2406.05965](http://arxiv.org/abs/2406.05965)|null|\n", "2406.05763": "|**2024-06-19**|**WenetSpeech4TTS: A 12,800-hour Mandarin TTS Corpus for Large Speech Generation Model Benchmark**|Linhan Ma et.al.|[2406.05763](http://arxiv.org/abs/2406.05763)|**[link](https://github.com/dukGuo/valle-audiodec)**|\n", "2406.05699": "|**2024-06-09**|**An Investigation of Noise Robustness for Flow-Matching-Based Zero-Shot TTS**|Xiaofei Wang et.al.|[2406.05699](http://arxiv.org/abs/2406.05699)|null|\n", "2406.05681": "|**2024-06-11**|**Towards Expressive Zero-Shot Speech Synthesis with Hierarchical Prosody Modeling**|Yuepeng Jiang et.al.|[2406.05681](http://arxiv.org/abs/2406.05681)|null|\n", "2406.05672": "|**2024-06-12**|**Text-aware and Context-aware Expressive Audiobook Speech Synthesis**|Dake Guo et.al.|[2406.05672](http://arxiv.org/abs/2406.05672)|null|\n", "2408.06906": "|**2024-08-13**|**VNet: A GAN-based Multi-Tier Discriminator Network for Speech Synthesis 
Vocoders**|Yubing Cao et.al.|[2408.06906](http://arxiv.org/abs/2408.06906)|null|\n", "2408.06858": "|**2024-08-13**|**SaSLaW: Dialogue Speech Corpus with Audio-visual Egocentric Information Toward Environment-adaptive Dialogue Speech Synthesis**|Osamu Take et.al.|[2408.06858](http://arxiv.org/abs/2408.06858)|**[link](https://github.com/sarulab-speech/saslaw)**|\n", "2408.06827": "|**2024-08-13**|**PRESENT: Zero-Shot Text-to-Prosody Control**|Perry Lam et.al.|[2408.06827](http://arxiv.org/abs/2408.06827)|**[link](https://github.com/iamanigeeit/present)**|\n", "2408.07547": "|**2024-08-14**|**PeriodWave: Multi-Period Flow Matching for High-Fidelity Waveform Generation**|Sang-Hoon Lee et.al.|[2408.07547](http://arxiv.org/abs/2408.07547)|**[link](https://github.com/sh-lee-prml/periodwave)**|\n", "2408.07414": "|**2024-08-14**|**WavLM model ensemble for audio deepfake detection**|David Combei et.al.|[2408.07414](http://arxiv.org/abs/2408.07414)|null|\n", "2408.09215": "|**2024-08-17**|**Generating Data with Text-to-Speech and Large-Language Models for Conversational Speech Recognition**|Samuele Cornell et.al.|[2408.09215](http://arxiv.org/abs/2408.09215)|**[link](https://github.com/popcornell/ASRLightningFT)**|\n", "2408.10852": "|**2024-08-20**|**EELE: Exploring Efficient and Extensible LoRA Integration in Emotional Text-to-Speech**|Xin Qi et.al.|[2408.10852](http://arxiv.org/abs/2408.10852)|null|\n", "2408.10771": "|**2024-08-20**|**SSL-TTS: Leveraging Self-Supervised Embeddings and kNN Retrieval for Zero-Shot Multi-speaker TTS**|Karl El Hajal et.al.|[2408.10771](http://arxiv.org/abs/2408.10771)|null|\n", "2408.10549": "|**2024-08-20**|**AI-Based IVR**|Gassyrbek Kosherbay et.al.|[2408.10549](http://arxiv.org/abs/2408.10549)|null|\n", "2408.10463": "|**2024-08-20**|**Adversarial training of Keyword Spotting to Minimize TTS Data Overfitting**|Hyun Jin Park et.al.|[2408.10463](http://arxiv.org/abs/2408.10463)|null|\n", "2408.10207": "|**2024-07-01**|**A Comprehensive 
Survey on Diffusion Models and Their Applications**|Md Manjurul Ahsan et.al.|[2408.10207](http://arxiv.org/abs/2408.10207)|null|\n", "2408.12430": "|**2024-08-22**|**Positional Description for Numerical Normalization**|Deepanshu Gupta et.al.|[2408.12430](http://arxiv.org/abs/2408.12430)|null|\n", "2408.12170": "|**2024-08-22**|**VoiceX: A Text-To-Speech Framework for Custom Voices**|Silvan Mertes et.al.|[2408.12170](http://arxiv.org/abs/2408.12170)|null|\n", "2408.11849": "|**2024-08-13**|**Style-Talker: Finetuning Audio Language Model and Style-Based Text-to-Speech Model for Fast Spoken Dialogue Generation**|Yinghao Aaron Li et.al.|[2408.11849](http://arxiv.org/abs/2408.11849)|null|\n", "2408.13240": "|**2024-08-23**|**Which Prosodic Features Matter Most for Pragmatics?**|Nigel G. Ward et.al.|[2408.13240](http://arxiv.org/abs/2408.13240)|null|\n", "2408.14423": "|**2024-08-27**|**DualSpeech: Enhancing Speaker-Fidelity and Text-Intelligibility Through Dual Classifier-Free Guidance**|Jinhyeok Yang et.al.|[2408.14423](http://arxiv.org/abs/2408.14423)|null|\n", "2408.13970": "|**2024-08-26**|**Anonymization of Voices in Spaces for Civic Dialogue: Measuring Impact on Empathy, Trust, and Feeling Heard**|Wonjune Kang et.al.|[2408.13970](http://arxiv.org/abs/2408.13970)|null|\n", "2408.13893": "|**2024-08-28**|**SimpleSpeech 2: Towards Simple and Efficient Text-to-Speech with Flow-based Scalar Latent Transformer Diffusion Models**|Dongchao Yang et.al.|[2408.13893](http://arxiv.org/abs/2408.13893)|null|\n", "2408.13608": "|**2024-08-24**|**SpeechCraft: A Fine-grained Expressive Speech Dataset with Natural Language Description**|Zeyu Jin et.al.|[2408.13608](http://arxiv.org/abs/2408.13608)|**[link](https://github.com/thuhcsi/speechcraft)**|\n", "2408.14887": "|**2024-08-27**|**Literary and Colloquial Dialect Identification for Tamil using Acoustic Features**|M. 
Nanmalar et.al.|[2408.14887](http://arxiv.org/abs/2408.14887)|null|\n", "2408.14739": "|**2024-08-28**|**VoiceTailor: Lightweight Plug-In Adapter for Diffusion-Based Personalized Text-to-Speech**|Heeseung Kim et.al.|[2408.14739](http://arxiv.org/abs/2408.14739)|null|\n", "2408.14713": "|**2024-08-27**|**StyleSpeech: Parameter-efficient Fine Tuning for Pre-trained Controllable Text-to-Speech**|Haowei Lou et.al.|[2408.14713](http://arxiv.org/abs/2408.14713)|null|\n", "2408.15916": "|**2024-08-28**|**Multi-modal Adversarial Training for Zero-Shot Voice Cloning**|John Janiczek et.al.|[2408.15916](http://arxiv.org/abs/2408.15916)|null|\n", "2408.15775": "|**2024-08-29**|**Easy, Interpretable, Effective: openSMILE for voice deepfake detection**|Octavian Pascu et.al.|[2408.15775](http://arxiv.org/abs/2408.15775)|null|\n", "2408.15676": "|**2024-08-28**|**VoxInstruct: Expressive Human Instruction-to-Speech Generation with Unified Multilingual Codec Language Modelling**|Yixuan Zhou et.al.|[2408.15676](http://arxiv.org/abs/2408.15676)|null|\n", "2408.16725": "|**2024-08-29**|**Mini-Omni: Language Models Can Hear, Talk While Thinking in Streaming**|Zhifei Xie et.al.|[2408.16725](http://arxiv.org/abs/2408.16725)|null|\n", "2408.16546": "|**2024-08-29**|**RAVE for Speech: Efficient Voice Conversion at High Sampling Rates**|Anders R. 
Bargum et.al.|[2408.16546](http://arxiv.org/abs/2408.16546)|null|\n", "2408.16373": "|**2024-08-29**|**Enabling Beam Search for Language Model-Based Text-to-Speech Synthesis**|Zehai Tu et.al.|[2408.16373](http://arxiv.org/abs/2408.16373)|null|\n"}} \ No newline at end of file +{"ASR": {"2408.00624": "|**2024-08-01**|**SynesLM: A Unified Approach for Audio-visual Speech Recognition and Translation via Language Model and Synthetic Data**|Yichen Lu et.al.|[2408.00624](http://arxiv.org/abs/2408.00624)|**[link](https://github.com/espnet/espnet)**|\n", "2408.00205": "|**2024-08-01**|**Sentence-wise Speech Summarization: Task, Datasets, and End-to-End Modeling with LM Knowledge Distillation**|Kohei Matsuura et.al.|[2408.00205](http://arxiv.org/abs/2408.00205)|null|\n", "2407.21783": "|**2024-08-15**|**The Llama 3 Herd of Models**|Abhimanyu Dubey et.al.|[2407.21783](http://arxiv.org/abs/2407.21783)|null|\n", "2407.21476": "|**2024-07-31**|**On the Problem of Text-To-Speech Model Selection for Synthetic Data Generation in Automatic Speech Recognition**|Nick Rossenbach et.al.|[2407.21476](http://arxiv.org/abs/2407.21476)|null|\n", "2407.21414": "|**2024-07-31**|**Towards interfacing large language models with ASR systems using confidence measures and prompting**|Maryam Naderi et.al.|[2407.21414](http://arxiv.org/abs/2407.21414)|null|\n", "2407.21211": "|**2024-07-30**|**Self-Supervised Models in Automatic Whispered Speech Recognition**|Aref Farhadipour et.al.|[2407.21211](http://arxiv.org/abs/2407.21211)|null|\n", "2407.21066": "|**2024-07-28**|**ELP-Adapters: Parameter Efficient Adapter Tuning for Various Speech Processing Tasks**|Nakamasa Inoue et.al.|[2407.21066](http://arxiv.org/abs/2407.21066)|null|\n", "2407.21061": "|**2024-07-26**|**Improving noisy student training for low-resource languages in End-to-End ASR using CycleGAN and inter-domain losses**|Chia-Yu Li et.al.|[2407.21061](http://arxiv.org/abs/2407.21061)|null|\n", "2407.18581": "|**2024-08-07**|**Dynamic 
Language Group-Based MoE: Enhancing Code-Switching Speech Recognition with Hierarchical Routing**|Hukai Huang et.al.|[2407.18581](http://arxiv.org/abs/2407.18581)|**[link](https://github.com/kaihuhuang/language-group)**|\n", "2407.18571": "|**2024-07-29**|**Speech Bandwidth Expansion Via High Fidelity Generative Adversarial Networks**|Mahmoud Salhab et.al.|[2407.18571](http://arxiv.org/abs/2407.18571)|null|\n", "2407.18461": "|**2024-07-26**|**Enhancing Dysarthric Speech Recognition for Unseen Speakers via Prototype-Based Adaptation**|Shiyao Wang et.al.|[2407.18461](http://arxiv.org/abs/2407.18461)|**[link](https://github.com/nku-hlt/pb-dsr)**|\n", "2407.17997": "|**2024-07-25**|**On the Effect of Purely Synthetic Training Data for Different Automatic Speech Recognition Architectures**|Nick Rossenbach et.al.|[2407.17997](http://arxiv.org/abs/2407.17997)|null|\n", "2407.17874": "|**2024-07-25**|**Improving Domain-Specific ASR with LLM-Generated Contextual Descriptions**|Jiwon Suh et.al.|[2407.17874](http://arxiv.org/abs/2407.17874)|null|\n", "2407.17852": "|**2024-07-25**|**Scaling A Simple Approach to Zero-Shot Speech Recognition**|Jinming Zhao et.al.|[2407.17852](http://arxiv.org/abs/2407.17852)|**[link](https://github.com/facebookresearch/fairseq)**|\n", "2407.17605": "|**2024-07-24**|**Coupling Speech Encoders with Downstream Text Models**|Ciprian Chelba et.al.|[2407.17605](http://arxiv.org/abs/2407.17605)|null|\n", "2407.17160": "|**2024-07-24**|**A Comparative Analysis of Bilingual and Trilingual Wav2Vec Models for Automatic Speech Recognition in Multilingual Oral History Archives**|Jan Lehe\u010dka et.al.|[2407.17160](http://arxiv.org/abs/2407.17160)|null|\n", "2407.16537": "|**2024-07-23**|**Quantifying the Role of Textual Predictability in Automatic Speech Recognition**|Sean Robertson et.al.|[2407.16537](http://arxiv.org/abs/2407.16537)|null|\n", "2407.16447": "|**2024-07-23**|**The CHiME-8 DASR Challenge for Generalizable and Array Agnostic Distant 
Automatic Speech Recognition and Diarization**|Samuele Cornell et.al.|[2407.16447](http://arxiv.org/abs/2407.16447)|null|\n", "2407.16370": "|**2024-07-23**|**Evolutionary Prompt Design for LLM-Based Post-ASR Error Correction**|Rithik Sachdev et.al.|[2407.16370](http://arxiv.org/abs/2407.16370)|**[link](https://github.com/rithiksachdev/PostASR-Correction-SLT2024)**|\n", "2407.15835": "|**2024-07-22**|**dMel: Speech Tokenization made Simple**|He Bai et.al.|[2407.15835](http://arxiv.org/abs/2407.15835)|null|\n", "2407.15749": "|**2024-07-22**|**Robustness of Speech Separation Models for Similar-pitch Speakers**|Bunlong Lay et.al.|[2407.15749](http://arxiv.org/abs/2407.15749)|null|\n", "2407.15300": "|**2024-07-22**|**SELM: Enhancing Speech Emotion Recognition for Out-of-Domain Scenarios**|Hazim Bukhari et.al.|[2407.15300](http://arxiv.org/abs/2407.15300)|null|\n", "2407.14573": "|**2024-08-24**|**Trading Devil Final: Backdoor attack via Stock market and Bayesian Optimization**|Orson Mengara et.al.|[2407.14573](http://arxiv.org/abs/2407.14573)|null|\n", "2407.14021": "|**2024-07-19**|**GE2E-AC: Generalized End-to-End Loss Training for Accent Classification**|Chihiro Watanabe et.al.|[2407.14021](http://arxiv.org/abs/2407.14021)|null|\n", "2407.13982": "|**2024-07-19**|**Reexamining Racial Disparities in Automatic Speech Recognition Performance: The Role of Confounding by Provenance**|Changye Li et.al.|[2407.13982](http://arxiv.org/abs/2407.13982)|null|\n", "2408.00005": "|**2024-07-18**|**Framework for Curating Speech Datasets and Evaluating ASR Systems: A Case Study for Polish**|Micha\u0142 Junczyk et.al.|[2408.00005](http://arxiv.org/abs/2408.00005)|**[link](https://github.com/goodmike31/pl-asr-bigos-tools)**|\n", "2408.00004": "|**2024-07-18**|**Handling Numeric Expressions in Automatic Speech Recognition**|Christian Huber et.al.|[2408.00004](http://arxiv.org/abs/2408.00004)|null|\n", "2407.13300": "|**2024-07-18**|**Robust ASR Error Correction with Conservative 
Data Filtering**|Takuma Udagawa et.al.|[2407.13300](http://arxiv.org/abs/2407.13300)|null|\n", "2407.13292": "|**2024-07-18**|**Low-Resourced Speech Recognition for Iu Mien Language via Weakly-Supervised Phoneme-based Multilingual Pre-training**|Lukuan Dong et.al.|[2407.13292](http://arxiv.org/abs/2407.13292)|null|\n", "2407.13266": "|**2024-07-18**|**How Private is Low-Frequency Speech Audio in the Wild? An Analysis of Verbal Intelligibility by Humans and Machines**|Ailin Liu et.al.|[2407.13266](http://arxiv.org/abs/2407.13266)|null|\n", "2407.13142": "|**2024-07-18**|**A light-weight and efficient punctuation and word casing prediction model for on-device streaming ASR**|Jian You et.al.|[2407.13142](http://arxiv.org/abs/2407.13142)|null|\n", "2407.12389": "|**2024-07-17**|**Morphosyntactic Analysis for CHILDES**|Houjun Liu et.al.|[2407.12389](http://arxiv.org/abs/2407.12389)|null|\n", "2407.12240": "|**2024-07-17**|**Adaptive Cascading Network for Continual Test-Time Adaptation**|Kien X. 
Nguyen et.al.|[2407.12240](http://arxiv.org/abs/2407.12240)|null|\n", "2407.12094": "|**2024-07-16**|**Identifying Speakers in Dialogue Transcripts: A Text-based Approach Using Pretrained Language Models**|Minh Nguyen et.al.|[2407.12094](http://arxiv.org/abs/2407.12094)|**[link](https://github.com/adobe-research/speaker-identification)**|\n", "2407.11828": "|**2024-07-17**|**Vibravox: A Dataset of French Speech Captured with Body-conduction Audio Sensors**|Julien Hauret et.al.|[2407.11828](http://arxiv.org/abs/2407.11828)|**[link](https://github.com/jhauret/vibravox)**|\n", "2407.11641": "|**2024-07-16**|**Investigating the Effect of Label Topology and Training Criterion on ASR Performance and Alignment Quality**|Tina Raissi et.al.|[2407.11641](http://arxiv.org/abs/2407.11641)|null|\n", "2407.11516": "|**2024-07-16**|**The VoicePrivacy 2022 Challenge: Progress and Perspectives in Voice Anonymisation**|Michele Panariello et.al.|[2407.11516](http://arxiv.org/abs/2407.11516)|null|\n", "2407.11345": "|**2024-07-16**|**Beyond Binary: Multiclass Paraphasia Detection with Generative Pretrained Transformers and End-to-End Models**|Matthew Perez et.al.|[2407.11345](http://arxiv.org/abs/2407.11345)|null|\n", "2407.10603": "|**2024-07-15**|**Leave No Knowledge Behind During Knowledge Distillation: Towards Practical and Effective Knowledge Distillation for Code-Switching ASR Using Realistic Data**|Liang-Hsuan Tseng et.al.|[2407.10603](http://arxiv.org/abs/2407.10603)|null|\n", "2407.10303": "|**2024-07-14**|**Improving Neural Biasing for Contextual Speech Recognition by Early Context Injection and Text Perturbation**|Ruizhe Huang et.al.|[2407.10303](http://arxiv.org/abs/2407.10303)|null|\n", "2407.10255": "|**2024-07-14**|**CUSIDE-T: Chunking, Simulating Future and Decoding for Transducer based Streaming ASR**|Wenbo Zhao et.al.|[2407.10255](http://arxiv.org/abs/2407.10255)|null|\n", "2407.10118": "|**2024-07-14**|**Textless Dependency Parsing by Labeled Sequence 
Prediction**|Shunsuke Kando et.al.|[2407.10118](http://arxiv.org/abs/2407.10118)|**[link](https://github.com/mynlp/speechparser)**|\n", "2407.10048": "|**2024-07-14**|**Whisper-SV: Adapting Whisper for Low-data-resource Speaker Verification**|Li Zhang et.al.|[2407.10048](http://arxiv.org/abs/2407.10048)|null|\n", "2407.09849": "|**2024-07-13**|**Text-Based Detection of On-Hold Scripts in Contact Center Calls**|Dmitrii Galimzianov et.al.|[2407.09849](http://arxiv.org/abs/2407.09849)|**[link](https://github.com/gal-dmitry/HOLD_DETECTION_PUBLIC)**|\n", "2407.09817": "|**2024-08-24**|**Empowering Whisper as a Joint Multi-Talker and Target-Talker Speech Recognition System**|Lingwei Meng et.al.|[2407.09817](http://arxiv.org/abs/2407.09817)|**[link](https://github.com/LingweiMeng/Whisper-Sidecar)**|\n", "2407.09807": "|**2024-07-13**|**A Streaming Multi-Channel End-to-End Speech Recognition System with Realistic Evaluations**|Xiangzhu Kong et.al.|[2407.09807](http://arxiv.org/abs/2407.09807)|**[link](https://github.com/thu-spmi/cat)**|\n", "2407.09732": "|**2024-07-13**|**Speech Slytherin: Examining the Performance and Efficiency of Mamba for Speech Separation, Recognition, and Synthesis**|Xilin Jiang et.al.|[2407.09732](http://arxiv.org/abs/2407.09732)|**[link](https://github.com/xi-j/Mamba-TasNet)**|\n", "2407.08618": "|**2024-08-12**|**Tamil Language Computing: the Present and the Future**|Kengatharaiyer Sarveswaran et.al.|[2407.08618](http://arxiv.org/abs/2407.08618)|null|\n", "2407.08658": "|**2024-07-10**|**Evaluating Voice Command Pipelines for Drone Control: From STT and LLM to Direct Classification and Siamese Networks**|Lucca Emmanuel Pineli Sim\u00f5es et.al.|[2407.08658](http://arxiv.org/abs/2407.08658)|null|\n", "2407.07566": "|**2024-07-10**|**HebDB: a Weakly Supervised Dataset for Hebrew Speech Processing**|Arnon Turetzky et.al.|[2407.07566](http://arxiv.org/abs/2407.07566)|null|\n", "2407.18930": "|**2024-07-10**|**Dynamic Encoder Size Based on Data-Driven 
Layer-wise Pruning for Speech Recognition**|Jingjing Xu et.al.|[2407.18930](http://arxiv.org/abs/2407.18930)|null|\n", "2407.17416": "|**2024-07-10**|**Explaining Spectrograms in Machine Learning: A Study on Neural Networks for Speech Classification**|Jesin James et.al.|[2407.17416](http://arxiv.org/abs/2407.17416)|null|\n", "2407.06606": "|**2024-07-09**|**Tailored Design of Audio-Visual Speech Recognition Models using Branchformers**|David Gimeno-G\u00f3mez et.al.|[2407.06606](http://arxiv.org/abs/2407.06606)|**[link](https://github.com/david-gimeno/tailored-avsr)**|\n", "2407.06310": "|**2024-07-08**|**Homogeneous Speaker Features for On-the-Fly Dysarthric and Elderly Speaker Adaptation**|Mengzhe Geng et.al.|[2407.06310](http://arxiv.org/abs/2407.06310)|null|\n", "2407.18332": "|**2024-07-08**|**Analyzing Speech Unit Selection for Textless Speech-to-Speech Translation**|Jarod Duret et.al.|[2407.18332](http://arxiv.org/abs/2407.18332)|null|\n", "2407.05407": "|**2024-07-09**|**CosyVoice: A Scalable Multilingual Zero-shot Text-to-speech Synthesizer based on Supervised Semantic Tokens**|Zhihao Du et.al.|[2407.05407](http://arxiv.org/abs/2407.05407)|null|\n", "2407.14525": "|**2024-07-07**|**Morse Code-Enabled Speech Recognition for Individuals with Visual and Hearing Impairments**|Ritabrata Roy Choudhury et.al.|[2407.14525](http://arxiv.org/abs/2407.14525)|null|\n", "2407.04675": "|**2024-07-10**|**Seed-ASR: Understanding Diverse Speech and Contexts with LLM-based Speech Recognition**|Ye Bai et.al.|[2407.04675](http://arxiv.org/abs/2407.04675)|null|\n", "2407.04662": "|**2024-07-05**|**Multitaper mel-spectrograms for keyword spotting**|Douglas Baptista de Souza et.al.|[2407.04662](http://arxiv.org/abs/2407.04662)|null|\n", "2407.04652": "|**2024-07-05**|**Pretraining End-to-End Keyword Search with Automatically Discovered Acoustic Units**|Bolaji Yusuf et.al.|[2407.04652](http://arxiv.org/abs/2407.04652)|**[link](https://github.com/beer-asr/beer)**|\n", 
"2407.04641": "|**2024-07-05**|**Speculative Speech Recognition by Audio-Prefixed Low-Rank Adaptation of Language Models**|Bolaji Yusuf et.al.|[2407.04641](http://arxiv.org/abs/2407.04641)|null|\n", "2407.04601": "|**2024-07-05**|**Written Term Detection Improves Spoken Term Detection**|Bolaji Yusuf et.al.|[2407.04601](http://arxiv.org/abs/2407.04601)|**[link](https://github.com/bolajiy/golden-retriever)**|\n", "2407.04533": "|**2024-07-09**|**Performance Analysis of Speech Encoders for Low-Resource SLU and ASR in Tunisian Dialect**|Salima Mdhaffar et.al.|[2407.04533](http://arxiv.org/abs/2407.04533)|**[link](https://github.com/speechbrain/speechbrain)**|\n", "2407.04482": "|**2024-07-05**|**Controlling Whisper: Universal Acoustic Adversarial Attacks to Control Speech Foundation Models**|Vyas Raina et.al.|[2407.04482](http://arxiv.org/abs/2407.04482)|null|\n", "2407.04439": "|**2024-07-05**|**XLSR-Transducer: Streaming ASR for Self-Supervised Pretrained Models**|Shashi Kumar et.al.|[2407.04439](http://arxiv.org/abs/2407.04439)|null|\n", "2407.04368": "|**2024-07-05**|**Romanization Encoding For Multilingual ASR**|Wen Ding et.al.|[2407.04368](http://arxiv.org/abs/2407.04368)|null|\n", "2407.04280": "|**2024-07-05**|**LearnerVoice: A Dataset of Non-Native English Learners' Spontaneous Speech**|Haechan Kim et.al.|[2407.04280](http://arxiv.org/abs/2407.04280)|null|\n", "2407.04219": "|**2024-07-05**|**Semi-supervised Learning for Code-Switching ASR with Large Language Model Filter**|Yu Xi et.al.|[2407.04219](http://arxiv.org/abs/2407.04219)|null|\n", "2407.04051": "|**2024-07-11**|**FunAudioLLM: Voice Understanding and Generation Foundation Models for Natural Interaction Between Humans and LLMs**|Keyu An et.al.|[2407.04051](http://arxiv.org/abs/2407.04051)|**[link](https://github.com/FunAudioLLM/SenseVoice)**|\n", "2407.04047": "|**2024-07-04**|**Improving Accented Speech Recognition using Data Augmentation based on Unsupervised Text-to-Speech Synthesis**|Cong-Thanh Do 
et.al.|[2407.04047](http://arxiv.org/abs/2407.04047)|null|\n", "2407.03966": "|**2024-07-04**|**Serialized Output Training by Learned Dominance**|Ying Shi et.al.|[2407.03966](http://arxiv.org/abs/2407.03966)|null|\n", "2407.03809": "|**2024-07-04**|**Finetuning End-to-End Models for Estonian Conversational Spoken Language Translation**|Tiia Sildam et.al.|[2407.03809](http://arxiv.org/abs/2407.03809)|null|\n", "2407.03734": "|**2024-07-04**|**Improving Self-supervised Pre-training using Accent-Specific Codebooks**|Darshan Prabhu et.al.|[2407.03734](http://arxiv.org/abs/2407.03734)|**[link](https://github.com/csalt-research/accented-codebooks-asr)**|\n", "2407.03718": "|**2024-07-24**|**Multi-Convformer: Extending Conformer with Multiple Convolution Kernels**|Darshan Prabhu et.al.|[2407.03718](http://arxiv.org/abs/2407.03718)|**[link](https://github.com/espnet/espnet)**|\n", "2407.03563": "|**2024-07-04**|**Learning Video Temporal Dynamics with Cross-Modal Attention for Robust Audio-Visual Speech Recognition**|Sungnyun Kim et.al.|[2407.03563](http://arxiv.org/abs/2407.03563)|null|\n", "2407.03495": "|**2024-07-03**|**Codec-ASR: Training Performant Automatic Speech Recognition Systems with Discrete Speech Representations**|Kunal Dhawan et.al.|[2407.03495](http://arxiv.org/abs/2407.03495)|null|\n", "2407.03440": "|**2024-07-03**|**Advanced Framework for Animal Sound Classification With Features Optimization**|Qiang Yang et.al.|[2407.03440](http://arxiv.org/abs/2407.03440)|null|\n", "2407.03026": "|**2024-07-03**|**Qifusion-Net: Layer-adapted Stream/Non-stream Model for End-to-End Multi-Accent Speech Recognition**|Jinming Chen et.al.|[2407.03026](http://arxiv.org/abs/2407.03026)|null|\n", "2407.13782": "|**2024-07-03**|**Self-supervised ASR Models and Features For Dysarthric and Elderly Speech Recognition**|Shujie Hu et.al.|[2407.13782](http://arxiv.org/abs/2407.13782)|null|\n", "2407.02052": "|**2024-07-02**|**The USTC-NERCSLIP Systems for The ICMC-ASR 
Challenge**|Minghui Wu et.al.|[2407.02052](http://arxiv.org/abs/2407.02052)|null|\n", "2407.02543": "|**2024-07-02**|**Towards the Next Frontier in Speech Representation Learning Using Disentanglement**|Varun Krishna et.al.|[2407.02543](http://arxiv.org/abs/2407.02543)|null|\n", "2407.01909": "|**2024-07-02**|**Pinyin Regularization in Error Correction for Chinese Speech Recognition with Large Language Models**|Zhiyuan Tang et.al.|[2407.01909](http://arxiv.org/abs/2407.01909)|**[link](https://github.com/tzyll/ChineseHP)**|\n", "2407.17477": "|**2024-07-30**|**Toward Automated Detection of Biased Social Signals from the Content of Clinical Conversations**|Feng Chen et.al.|[2407.17477](http://arxiv.org/abs/2407.17477)|null|\n", "2407.00756": "|**2024-06-30**|**Less Forgetting for Better Generalization: Exploring Continual-learning Fine-tuning Methods for Speech Self-supervised Representations**|Salah Zaiem et.al.|[2407.00756](http://arxiv.org/abs/2407.00756)|null|\n", "2407.00518": "|**2024-06-29**|**When Robots Get Chatty: Grounding Multimodal Human-Robot Conversation and Collaboration**|Philipp Allgeuer et.al.|[2407.00518](http://arxiv.org/abs/2407.00518)|null|\n", "2407.12817": "|**2024-06-29**|**Error Correction by Paying Attention to Both Acoustic and Confidence References for Automatic Speech Recognition**|Yuchun Shu et.al.|[2407.12817](http://arxiv.org/abs/2407.12817)|null|\n", "2407.00463": "|**2024-07-18**|**Open-Source Conversational AI with SpeechBrain 1.0**|Mirco Ravanelli et.al.|[2407.00463](http://arxiv.org/abs/2407.00463)|null|\n", "2407.12029": "|**2024-06-29**|**A Quality-Aware Voltage Overscaling Framework to Improve the Energy Efficiency and Lifetime of TPUs based on Statistical Error Modeling**|Alireza Senobari et.al.|[2407.12029](http://arxiv.org/abs/2407.12029)|null|\n", "2407.12028": "|**2024-06-28**|**TreeSeg: Hierarchical Topic Segmentation of Large Transcripts**|Dimitrios C. 
Gklezakos et.al.|[2407.12028](http://arxiv.org/abs/2407.12028)|null|\n", "2406.19706": "|**2024-06-28**|**SAML: Speaker Adaptive Mixture of LoRA Experts for End-to-End ASR**|Qiuming Zhao et.al.|[2406.19706](http://arxiv.org/abs/2406.19706)|null|\n", "2406.19674": "|**2024-06-28**|**Less is More: Accurate Speech Recognition & Translation without Web-Scale Data**|Krishna C. Puvvada et.al.|[2406.19674](http://arxiv.org/abs/2406.19674)|null|\n", "2406.19564": "|**2024-06-27**|**Voices Unheard: NLP Resources and Models for Yor\u00f9b\u00e1 Regional Dialects**|Orevaoghene Ahia et.al.|[2406.19564](http://arxiv.org/abs/2406.19564)|**[link](https://github.com/orevaahia/yorulect)**|\n", "2406.19363": "|**2024-06-27**|**Tradition or Innovation: A Comparison of Modern ASR Methods for Forced Alignment**|Rotem Rousso et.al.|[2406.19363](http://arxiv.org/abs/2406.19363)|null|\n", "2406.19311": "|**2024-06-27**|**Zero-Query Adversarial Attack on Black-box Automatic Speech Recognition Systems**|Zheng Fang et.al.|[2406.19311](http://arxiv.org/abs/2406.19311)|null|\n", "2406.18972": "|**2024-06-27**|**Applying LLMs for Rescoring N-best ASR Hypotheses of Casual Conversations: Effects of Domain Adaptation and Context Carry-over**|Atsunori Ogawa et.al.|[2406.18972](http://arxiv.org/abs/2406.18972)|null|\n", "2406.18928": "|**2024-06-27**|**Enhanced ASR Robustness to Packet Loss with a Front-End Adaptation Network**|Yehoshua Dissen et.al.|[2406.18928](http://arxiv.org/abs/2406.18928)|null|\n", "2406.18862": "|**2024-06-27**|**Streaming Decoder-Only Automatic Speech Recognition with Discrete Speech Units: A Pilot Study**|Peikun Chen et.al.|[2406.18862](http://arxiv.org/abs/2406.18862)|**[link](https://github.com/chenpk00/IS2024_stream_decoder_only_asr)**|\n", "2406.18373": "|**2024-06-26**|**Dynamic Data Pruning for Automatic Speech Recognition**|Qiao Xiao et.al.|[2406.18373](http://arxiv.org/abs/2406.18373)|null|\n", "2406.18301": "|**2024-06-26**|**MSR-86K: An Evolving, Multilingual 
Corpus with 86,300 Hours of Transcribed Audio for Speech Recognition Research**|Song Li et.al.|[2406.18301](http://arxiv.org/abs/2406.18301)|null|\n", "2406.18135": "|**2024-06-26**|**Automatic Speech Recognition for Hindi**|Anish Saha et.al.|[2406.18135](http://arxiv.org/abs/2406.18135)|null|\n", "2406.18120": "|**2024-07-12**|**ArzEn-LLM: Code-Switched Egyptian Arabic-English Translation and Speech Recognition Using LLMs**|Ahmed Heakl et.al.|[2406.18120](http://arxiv.org/abs/2406.18120)|**[link](https://github.com/ahmedheakl/arazn-llm)**|\n", "2406.18021": "|**2024-06-26**|**SC-MoE: Switch Conformer Mixture of Experts for Unified Streaming and Non-streaming Code-Switching ASR**|Shuaishuai Ye et.al.|[2406.18021](http://arxiv.org/abs/2406.18021)|null|\n", "2406.17935": "|**2024-06-25**|**Sequential Editing for Lifelong Training of Speech Recognition Models**|Devang Kulshreshtha et.al.|[2406.17935](http://arxiv.org/abs/2406.17935)|null|\n", "2406.17926": "|**2024-06-25**|**FASA: a Flexible and Automatic Speech Aligner for Extracting High-quality Aligned Children Speech Data**|Dancheng Liu et.al.|[2406.17926](http://arxiv.org/abs/2406.17926)|**[link](https://github.com/DanchengLiu/FASA)**|\n", "2406.17618": "|**2024-06-25**|**Towards Building an End-to-End Multilingual Automatic Lyrics Transcription Model**|Jiawen Huang et.al.|[2406.17618](http://arxiv.org/abs/2406.17618)|**[link](https://github.com/jhuang448/MultilingualALT)**|\n", "2406.17614": "|**2024-06-25**|**MSRS: Training Multimodal Speech Recognition Models from Scratch with Sparse Mask Optimization**|Adriana Fernandez-Lopez et.al.|[2406.17614](http://arxiv.org/abs/2406.17614)|null|\n", "2406.17825": "|**2024-06-25**|**Automatic speech recognition for the Nepali language using CNN, bidirectional LSTM and ResNet**|Manish Dhakal et.al.|[2406.17825](http://arxiv.org/abs/2406.17825)|**[link](https://github.com/manishdhakal/asr-nepali-using-cnn-bilstm-resnet)**|\n", "2406.17272": "|**2024-06-25**|**A 
Comprehensive Solution to Connect Speech Encoder and Large Language Model for ASR**|Van Tung Pham et.al.|[2406.17272](http://arxiv.org/abs/2406.17272)|null|\n", "2406.17124": "|**2024-06-24**|**Investigating Confidence Estimation Measures for Speaker Diarization**|Anurag Chowdhury et.al.|[2406.17124](http://arxiv.org/abs/2406.17124)|null|\n", "2406.16808": "|**2024-06-24**|**Exploring the Capability of Mamba in Speech Applications**|Koichi Miyazaki et.al.|[2406.16808](http://arxiv.org/abs/2406.16808)|null|\n", "2406.16777": "|**2024-06-24**|**Blending LLMs into Cascaded Speech Translation: KIT's Offline Speech Translation System for IWSLT 2024**|Sai Koneru et.al.|[2406.16777](http://arxiv.org/abs/2406.16777)|null|\n", "2406.16120": "|**2024-06-23**|**Contextualized End-to-end Automatic Speech Recognition with Intermediate Biasing Loss**|Muhammad Shakeel et.al.|[2406.16120](http://arxiv.org/abs/2406.16120)|null|\n", "2406.16107": "|**2024-08-01**|**Decoder-only Architecture for Streaming End-to-end Speech Recognition**|Emiru Tsunoo et.al.|[2406.16107](http://arxiv.org/abs/2406.16107)|null|\n", "2406.15723": "|**2024-06-22**|**Acoustic Feature Mixup for Balanced Multi-aspect Pronunciation Assessment**|Heejin Do et.al.|[2406.15723](http://arxiv.org/abs/2406.15723)|null|\n", "2406.15668": "|**2024-06-21**|**PI-Whisper: An Adaptive and Incremental ASR Framework for Diverse and Evolving Speaker Characteristics**|Amir Nassereldine et.al.|[2406.15668](http://arxiv.org/abs/2406.15668)|null|\n", "2406.15265": "|**2024-06-21**|**Perception of Phonological Assimilation by Neural Speech Recognition Models**|Charlotte Pouw et.al.|[2406.15265](http://arxiv.org/abs/2406.15265)|null|\n", "2406.14890": "|**2024-06-21**|**InterBiasing: Boost Unseen Word Recognition through Biasing Intermediate Predictions**|Yu Nakagome et.al.|[2406.14890](http://arxiv.org/abs/2406.14890)|null|\n", "2406.14747": "|**2024-06-20**|**An Adapter-Based Unified Model for Multiple Spoken Language Processing 
Tasks**|Varsha Suresh et.al.|[2406.14747](http://arxiv.org/abs/2406.14747)|null|\n", "2406.14294": "|**2024-06-21**|**DASB - Discrete Audio and Speech Benchmark**|Pooneh Mousavi et.al.|[2406.14294](http://arxiv.org/abs/2406.14294)|null|\n", "2406.14266": "|**2024-06-20**|**Intelligent Interface: Enhancing Lecture Engagement with Didactic Activity Summaries**|Anna Wr\u00f3blewska et.al.|[2406.14266](http://arxiv.org/abs/2406.14266)|null|\n", "2406.13842": "|**2024-06-19**|**Joint vs Sequential Speaker-Role Detection and Automatic Speech Recognition for Air-traffic Control**|Alexander Blatt et.al.|[2406.13842](http://arxiv.org/abs/2406.13842)|null|\n", "2406.13502": "|**2024-06-19**|**ManWav: The First Manchu ASR Model**|Jean Seo et.al.|[2406.13502](http://arxiv.org/abs/2406.13502)|null|\n", "2406.13431": "|**2024-06-24**|**Children's Speech Recognition through Discrete Token Enhancement**|Vrunda N. Sukhadia et.al.|[2406.13431](http://arxiv.org/abs/2406.13431)|null|\n", "2406.12699": "|**2024-06-18**|**Bridging the Gap: Integrating Pre-trained Speech Enhancement and Recognition Models for Robust Speech Recognition**|Kuan-Chen Wang et.al.|[2406.12699](http://arxiv.org/abs/2406.12699)|null|\n", "2406.12674": "|**2024-06-18**|**Transcribe, Align and Segment: Creating speech datasets for low-resource languages**|Taras Sereda et.al.|[2406.12674](http://arxiv.org/abs/2406.12674)|null|\n", "2406.12621": "|**2024-06-18**|**Growing Trees on Sounds: Assessing Strategies for End-to-End Dependency Parsing of Speech**|Adrien Pupier et.al.|[2406.12621](http://arxiv.org/abs/2406.12621)|**[link](https://github.com/Pupiera/Growing_tree_on_sound)**|\n", "2406.12611": "|**2024-06-18**|**Rapid Language Adaptation for Multilingual E2E Speech Recognition Using Encoder Prompting**|Yosuke Kashiwagi et.al.|[2406.12611](http://arxiv.org/abs/2406.12611)|null|\n", "2406.12503": "|**2024-06-18**|**Unsupervised Online Continual Learning for Automatic Speech Recognition**|Steven Vander Eeckt 
et.al.|[2406.12503](http://arxiv.org/abs/2406.12503)|**[link](https://github.com/stevenvdeeckt/unsupervised-ocl-for-asr)**|\n", "2406.12387": "|**2024-06-18**|**Performant ASR Models for Medical Entities in Accented Speech**|Tejumade Afonja et.al.|[2406.12387](http://arxiv.org/abs/2406.12387)|null|\n", "2406.12317": "|**2024-06-18**|**Finding Task-specific Subnetworks in Multi-task Spoken Language Understanding Model**|Hayato Futami et.al.|[2406.12317](http://arxiv.org/abs/2406.12317)|null|\n", "2406.12233": "|**2024-06-18**|**SyncVSR: Data-Efficient Visual Speech Recognition with End-to-End Crossmodal Audio Token Synchronization**|Young Jin Ahn et.al.|[2406.12233](http://arxiv.org/abs/2406.12233)|**[link](https://github.com/KAIST-AILab/SyncVSR)**|\n", "2406.11546": "|**2024-06-17**|**GigaSpeech 2: An Evolving, Large-Scale and Multi-domain ASR Corpus for Low-Resource Languages with Automated Crawling, Transcription and Refinement**|Yifan Yang et.al.|[2406.11546](http://arxiv.org/abs/2406.11546)|**[link](https://github.com/SpeechColab/GigaSpeech2)**|\n", "2406.12937": "|**2024-06-17**|**Self-Train Before You Transcribe**|Robert Flynn et.al.|[2406.12937](http://arxiv.org/abs/2406.12937)|**[link](https://github.com/robflynnyh/Self-Train-Before-You-Transcribe)**|\n", "2406.11064": "|**2024-06-16**|**Continual Test-time Adaptation for End-to-end Speech Recognition on Noisy Speech**|Guan-Ting Lin et.al.|[2406.11064](http://arxiv.org/abs/2406.11064)|null|\n", "2406.11037": "|**2024-06-16**|**NAST: Noise Aware Speech Tokenization for Speech Language Models**|Shoval Messica et.al.|[2406.11037](http://arxiv.org/abs/2406.11037)|**[link](https://github.com/ShovalMessica/NAST)**|\n", "2406.11025": "|**2024-06-16**|**Large Language Models for Dysfluency Detection in Stuttered Speech**|Dominik Wagner et.al.|[2406.11025](http://arxiv.org/abs/2406.11025)|null|\n", "2406.11022": "|**2024-06-16**|**Outlier Reduction with Gated Attention for Improved Post-training Quantization in 
Large Sequence-to-sequence Speech Foundation Models**|Dominik Wagner et.al.|[2406.11022](http://arxiv.org/abs/2406.11022)|null|\n", "2406.11016": "|**2024-06-16**|**Optimized Speculative Sampling for GPU Hardware Accelerators**|Dominik Wagner et.al.|[2406.11016](http://arxiv.org/abs/2406.11016)|null|\n", "2406.10993": "|**2024-06-16**|**CoSTA: Code-Switched Speech Translation using Aligned Speech-Text Interleaving**|Bhavani Shankar et.al.|[2406.10993](http://arxiv.org/abs/2406.10993)|null|\n", "2406.10932": "|**2024-06-16**|**Imperceptible Rhythm Backdoor Attacks: Exploring Rhythm Transformation for Embedding Undetectable Vulnerabilities on Speech Recognition**|Wenhan Yao et.al.|[2406.10932](http://arxiv.org/abs/2406.10932)|null|\n", "2406.12931": "|**2024-06-16**|**Automatic Speech Recognition for Biomedical Data in Bengali Language**|Shariar Kabir et.al.|[2406.12931](http://arxiv.org/abs/2406.12931)|null|\n", "2406.10741": "|**2024-06-15**|**Speech Emotion Recognition Using CNN and Its Use Case in Digital Healthcare**|Nishargo Nigar et.al.|[2406.10741](http://arxiv.org/abs/2406.10741)|null|\n", "2406.10719": "|**2024-06-21**|**Trading Devil: Robust backdoor attack via Stochastic investment models and Bayesian approach**|Orson Mengara et.al.|[2406.10719](http://arxiv.org/abs/2406.10719)|null|\n", "2406.10177": "|**2024-06-14**|**Inclusive ASR for Disfluent Speech: Cascaded Large-Scale Self-Supervised Learning with Targeted Fine-Tuning and Data Augmentation**|Dena Mujtaba et.al.|[2406.10177](http://arxiv.org/abs/2406.10177)|null|\n", "2406.10083": "|**2024-06-14**|**On the Evaluation of Speech Foundation Models for Spoken Language Understanding**|Siddhant Arora et.al.|[2406.10083](http://arxiv.org/abs/2406.10083)|null|\n", "2406.10082": "|**2024-06-14**|**Whisper-Flamingo: Integrating Visual Features into Whisper for Audio-Visual Speech Recognition and Translation**|Andrew Rouditchenko 
et.al.|[2406.10082](http://arxiv.org/abs/2406.10082)|**[link](https://github.com/roudimit/whisper-flamingo)**|\n", "2406.10052": "|**2024-06-14**|**Simul-Whisper: Attention-Guided Streaming Whisper with Truncation Detection**|Haoyu Wang et.al.|[2406.10052](http://arxiv.org/abs/2406.10052)|**[link](https://github.com/backspacetg/simul_whisper)**|\n", "2406.09999": "|**2024-06-14**|**ROAR: Reinforcing Original to Augmented Data Ratio Dynamics for Wav2Vec2.0 Based ASR**|Vishwanath Pratap Singh et.al.|[2406.09999](http://arxiv.org/abs/2406.09999)|null|\n", "2406.10313": "|**2024-06-14**|**CNVSRC 2023: The First Chinese Continuous Visual Speech Recognition Challenge**|Chen Chen et.al.|[2406.10313](http://arxiv.org/abs/2406.10313)|null|\n", "2406.09950": "|**2024-06-14**|**An efficient text augmentation approach for contextualized Mandarin speech recognition**|Naijun Zheng et.al.|[2406.09950](http://arxiv.org/abs/2406.09950)|null|\n", "2406.09873": "|**2024-06-14**|**Perceiver-Prompt: Flexible Speaker Adaptation in Whisper for Chinese Disordered Speech Recognition**|Yicong Jiang et.al.|[2406.09873](http://arxiv.org/abs/2406.09873)|null|\n", "2406.09869": "|**2024-06-14**|**MMM: Multi-Layer Multi-Residual Multi-Stream Discrete Speech Representation from Self-supervised Learning Model**|Jiatong Shi et.al.|[2406.09869](http://arxiv.org/abs/2406.09869)|null|\n", "2406.09676": "|**2024-06-14**|**Optimizing Byte-level Representation for End-to-end ASR**|Roger Hsiao et.al.|[2406.09676](http://arxiv.org/abs/2406.09676)|null|\n", "2406.09662": "|**2024-06-14**|**Learning Language Structures through Grounding**|Freda Shi et.al.|[2406.09662](http://arxiv.org/abs/2406.09662)|null|\n", "2406.09618": "|**2024-06-13**|**Multi-Modal Retrieval For Large Language Model Based Speech Recognition**|Jari Kolehmainen et.al.|[2406.09618](http://arxiv.org/abs/2406.09618)|null|\n", "2406.09569": "|**2024-06-13**|**Speech ReaLLM -- Real-time Streaming Speech Recognition with Multimodal LLMs by 
Teaching the Flow of Time**|Frank Seide et.al.|[2406.09569](http://arxiv.org/abs/2406.09569)|null|\n", "2406.09494": "|**2024-06-13**|**The Second DISPLACE Challenge : DIarization of SPeaker and LAnguage in Conversational Environments**|Shareef Babu Kalluri et.al.|[2406.09494](http://arxiv.org/abs/2406.09494)|null|\n", "2406.09202": "|**2024-06-13**|**Language Complexity and Speech Recognition Accuracy: Orthographic Complexity Hurts, Phonological Complexity Doesn't**|Chihiro Taguchi et.al.|[2406.09202](http://arxiv.org/abs/2406.09202)|**[link](https://github.com/ctaguchi/asrcomplexity)**|\n", "2406.09153": "|**2024-06-13**|**LASER: Learning by Aligning Self-supervised Representations of Speech for Improving Content-related Tasks**|Amit Meghanani et.al.|[2406.09153](http://arxiv.org/abs/2406.09153)|**[link](https://github.com/Trikaldarshi/LASER)**|\n", "2406.08914": "|**2024-06-13**|**Transcription-Free Fine-Tuning of Speech Separation Models for Noisy and Reverberant Multi-Speaker Automatic Speech Recognition**|William Ravenscroft et.al.|[2406.08914](http://arxiv.org/abs/2406.08914)|null|\n", "2406.08904": "|**2024-06-13**|**AdaPTwin: Low-Cost Adaptive Compression of Product Twins in Transformers**|Emil Biju et.al.|[2406.08904](http://arxiv.org/abs/2406.08904)|null|\n", "2406.08641": "|**2024-06-12**|**ML-SUPERB 2.0: Benchmarking Multilingual Speech Models Across Modeling Constraints, Languages, and Datasets**|Jiatong Shi et.al.|[2406.08641](http://arxiv.org/abs/2406.08641)|null|\n", "2406.08396": "|**2024-06-12**|**Neural Blind Source Separation and Diarization for Distant Speech Recognition**|Yoshiaki Bando et.al.|[2406.08396](http://arxiv.org/abs/2406.08396)|null|\n", "2406.08380": "|**2024-06-12**|**Towards Unsupervised Speech Recognition Without Pronunciation Models**|Junrui Ni et.al.|[2406.08380](http://arxiv.org/abs/2406.08380)|null|\n", "2406.08353": "|**2024-06-12**|**Speech Emotion Recognition with ASR Transcripts: A Comprehensive Study on Word Error Rate 
and Fusion Techniques**|Yuanchao Li et.al.|[2406.08353](http://arxiv.org/abs/2406.08353)|**[link](https://github.com/yc-li20/SER-on-WER-and-Fusion)**|\n", "2406.08266": "|**2024-06-13**|**Refining Self-Supervised Learnt Speech Representation using Brain Activations**|Hengyu Li et.al.|[2406.08266](http://arxiv.org/abs/2406.08266)|null|\n", "2406.08207": "|**2024-06-12**|**Transformer-based Model for ASR N-Best Rescoring and Rewriting**|Iwen E. Kang et.al.|[2406.08207](http://arxiv.org/abs/2406.08207)|null|\n", "2406.08111": "|**2024-06-12**|**Audio-conditioned phonemic and prosodic annotation for building text-to-speech models from unlabeled speech data**|Yuma Shirahata et.al.|[2406.08111](http://arxiv.org/abs/2406.08111)|null|\n", "2406.10284": "|**2024-06-12**|**Improving child speech recognition with augmented child-like speech**|Yuanyuan Zhang et.al.|[2406.10284](http://arxiv.org/abs/2406.10284)|null|\n", "2406.07914": "|**2024-06-14**|**Can Large Language Models Understand Spatial Audio?**|Changli Tang et.al.|[2406.07914](http://arxiv.org/abs/2406.07914)|null|\n", "2406.07909": "|**2024-06-12**|**Guiding Frame-Level CTC Alignments Using Self-knowledge Distillation**|Eungbeom Kim et.al.|[2406.07909](http://arxiv.org/abs/2406.07909)|null|\n", "2406.07846": "|**2024-06-12**|**DualVC 3: Leveraging Language Model Generated Pseudo Context for End-to-end Low Latency Streaming Voice Conversion**|Ziqian Ning et.al.|[2406.07846](http://arxiv.org/abs/2406.07846)|null|\n", "2406.07842": "|**2024-06-12**|**Dual-Pipeline with Low-Rank Adaptation for New Language Integration in Multilingual ASR**|Yerbolat Khassanov et.al.|[2406.07842](http://arxiv.org/abs/2406.07842)|null|\n", "2406.07823": "|**2024-06-12**|**PRoDeliberation: Parallel Robust Deliberation for End-to-End Spoken Language Understanding**|Trang Le et.al.|[2406.07823](http://arxiv.org/abs/2406.07823)|null|\n", "2406.07801": "|**2024-06-12**|**PolySpeech: Exploring Unified Multitask Speech Models for Competitiveness 
with Single-task Models**|Runyan Yang et.al.|[2406.07801](http://arxiv.org/abs/2406.07801)|null|\n", "2406.09443": "|**2024-06-12**|**Comparative Analysis of Personalized Voice Activity Detection Systems: Assessing Real-World Effectiveness**|Satyam Kumar et.al.|[2406.09443](http://arxiv.org/abs/2406.09443)|null|\n", "2406.07725": "|**2024-06-11**|**The Interspeech 2024 Challenge on Speech Processing Using Discrete Units**|Xuankai Chang et.al.|[2406.07725](http://arxiv.org/abs/2406.07725)|null|\n", "2406.07256": "|**2024-06-11**|**AS-70: A Mandarin stuttered speech dataset for automatic speech recognition and stuttering event detection**|Rong Gong et.al.|[2406.07256](http://arxiv.org/abs/2406.07256)|null|\n", "2406.07589": "|**2024-06-11**|**Tag and correct: high precision post-editing approach to correction of speech recognition errors**|Tomasz Zi\u0119tkiewicz et.al.|[2406.07589](http://arxiv.org/abs/2406.07589)|null|\n", "2406.07096": "|**2024-06-11**|**Fast Context-Biasing for CTC and Transducer ASR models with CTC-based Word Spotter**|Andrei Andrusenko et.al.|[2406.07096](http://arxiv.org/abs/2406.07096)|null|\n", "2406.07090": "|**2024-07-29**|**Spoken Language Corpora Augmentation with Domain-Specific Voice-Cloned Speech**|Mateusz Czy\u017cnikiewicz et.al.|[2406.07090](http://arxiv.org/abs/2406.07090)|null|\n", "2406.07060": "|**2024-06-11**|**Reading Miscue Detection in Primary School through Automatic Speech Recognition**|Lingyun Gao et.al.|[2406.07060](http://arxiv.org/abs/2406.07060)|null|\n", "2406.06729": "|**2024-06-10**|**Synthetic Query Generation using Large Language Models for Virtual Assistants**|Sonal Sannigrahi et.al.|[2406.06729](http://arxiv.org/abs/2406.06729)|null|\n", "2406.06664": "|**2024-06-13**|**ASTRA: Aligning Speech and Text Representations for Asr without Sampling**|Neeraj Gaur et.al.|[2406.06664](http://arxiv.org/abs/2406.06664)|null|\n", "2406.06329": "|**2024-06-10**|**A Parameter-efficient Language Extension Framework for 
Multilingual ASR**|Wei Liu et.al.|[2406.06329](http://arxiv.org/abs/2406.06329)|null|\n", "2406.05968": "|**2024-06-10**|**Prompting Large Language Models with Audio for General-Purpose Speech Summarization**|Wonjune Kang et.al.|[2406.05968](http://arxiv.org/abs/2406.05968)|**[link](https://github.com/wonjune-kang/llm-speech-summarization)**|\n", "2406.05806": "|**2024-07-18**|**Do Prompts Really Prompt? Exploring the Prompt Understanding Capability of Whisper**|Chih-Kai Yang et.al.|[2406.05806](http://arxiv.org/abs/2406.05806)|null|\n", "2406.05784": "|**2024-07-20**|**Optimizing Multi-Stuttered Speech Classification: Leveraging Whisper's Encoder for Efficient Parameter Reduction in Automated Assessment**|Huma Ameer et.al.|[2406.05784](http://arxiv.org/abs/2406.05784)|null|\n", "2406.05661": "|**2024-06-09**|**MS-HuBERT: Mitigating Pre-training and Inference Mismatch in Masked Language Modelling methods for learning Speech Representations**|Hemant Yadav et.al.|[2406.05661](http://arxiv.org/abs/2406.05661)|null|\n", "2406.04927": "|**2024-06-07**|**LLM-based speaker diarization correction: A generalizable approach**|Georgios Efstathiadis et.al.|[2406.04927](http://arxiv.org/abs/2406.04927)|**[link](https://github.com/GeorgeEfstathiadis/LLM-Diarize-ASR-Agnostic)**|\n", "2406.04791": "|**2024-07-02**|**Speaker-Smoothed kNN Speaker Adaptation for End-to-End ASR**|Shaojun Li et.al.|[2406.04791](http://arxiv.org/abs/2406.04791)|null|\n", "2406.06619": "|**2024-06-07**|**LoRA-Whisper: Parameter-Efficient and Extensible Multilingual ASR**|Zheshu Song et.al.|[2406.06619](http://arxiv.org/abs/2406.06619)|null|\n", "2406.04595": "|**2024-06-07**|**Pitch-Aware RNN-T for Mandarin Chinese Mispronunciation Detection and Diagnosis**|Xintong Wang et.al.|[2406.04595](http://arxiv.org/abs/2406.04595)|null|\n", "2406.04552": "|**2024-06-06**|**Flexible Multichannel Speech Enhancement for Noise-Robust Frontend**|Ante Juki\u0107 
et.al.|[2406.04552](http://arxiv.org/abs/2406.04552)|null|\n", "2406.04541": "|**2024-06-06**|**Label-Synchronous Neural Transducer for E2E Simultaneous Speech Translation**|Keqi Deng et.al.|[2406.04541](http://arxiv.org/abs/2406.04541)|**[link](https://github.com/D-Keqi/LS-Transducer-SST)**|\n", "2406.04512": "|**2024-06-06**|**To Distill or Not to Distill? On the Robustness of Robust Knowledge Distillation**|Abdul Waheed et.al.|[2406.04512](http://arxiv.org/abs/2406.04512)|null|\n", "2406.04432": "|**2024-06-06**|**LipGER: Visually-Conditioned Generative Error Correction for Robust Automatic Speech Recognition**|Sreyan Ghosh et.al.|[2406.04432](http://arxiv.org/abs/2406.04432)|**[link](https://github.com/sreyan88/lipger)**|\n", "2406.04269": "|**2024-06-06**|**Beyond Performance Plateaus: A Comprehensive Study on Scalability in Speech Enhancement**|Wangyou Zhang et.al.|[2406.04269](http://arxiv.org/abs/2406.04269)|**[link](https://github.com/emrys365/se-scaling)**|\n", "2406.04240": "|**2024-07-02**|**Hypernetworks for Personalizing ASR to Atypical Speech**|Max M\u00fcller-Eberstein et.al.|[2406.04240](http://arxiv.org/abs/2406.04240)|null|\n", "2406.04123": "|**2024-06-06**|**Helsinki Speech Challenge 2024**|Martin Ludvigsen et.al.|[2406.04123](http://arxiv.org/abs/2406.04123)|null|\n", "2406.03872": "|**2024-06-06**|**BLSP-Emo: Towards Empathetic Large Speech-Language Models**|Chen Wang et.al.|[2406.03872](http://arxiv.org/abs/2406.03872)|**[link](https://github.com/cwang621/blsp-emo)**|\n", "2406.03814": "|**2024-06-14**|**Improving Zero-Shot Chinese-English Code-Switching ASR with kNN-CTC and Gated Monolingual Datastores**|Jiaming Zhou et.al.|[2406.03814](http://arxiv.org/abs/2406.03814)|null|\n", "2406.03791": "|**2024-06-06**|**Speed of Light Exact Greedy Decoding for RNN-T Speech Recognition Models on GPU**|Daniel Galvez et.al.|[2406.03791](http://arxiv.org/abs/2406.03791)|null|\n", "2406.03274": "|**2024-06-11**|**Enhancing CTC-based speech recognition 
with diverse modeling units**|Shiyi Han et.al.|[2406.03274](http://arxiv.org/abs/2406.03274)|null|\n", "2406.03235": "|**2024-06-05**|**Error-preserving Automatic Speech Recognition of Young English Learners' Language**|Janick Michot et.al.|[2406.03235](http://arxiv.org/abs/2406.03235)|**[link](https://github.com/mict-zhaw/chall_e2e_stt)**|\n", "2406.03049": "|**2024-06-05**|**StreamSpeech: Simultaneous Speech-to-Speech Translation with Multi-task Learning**|Shaolei Zhang et.al.|[2406.03049](http://arxiv.org/abs/2406.03049)|**[link](https://github.com/ictnlp/streamspeech)**|\n", "2406.02950": "|**2024-06-05**|**4D ASR: Joint Beam Search Integrating CTC, Attention, Transducer, and Mask Predict Decoders**|Yui Sudo et.al.|[2406.02950](http://arxiv.org/abs/2406.02950)|null|\n", "2406.02925": "|**2024-06-15**|**Task Arithmetic can Mitigate Synthetic-to-Real Gap in Automatic Speech Recognition**|Hsuan Su et.al.|[2406.02925](http://arxiv.org/abs/2406.02925)|null|\n", "2406.02921": "|**2024-06-11**|**Text Injection for Neural Contextual Biasing**|Zhong Meng et.al.|[2406.02921](http://arxiv.org/abs/2406.02921)|null|\n", "2406.06582": "|**2024-06-25**|**Discrete Multimodal Transformers with a Pretrained Large Language Model for Mixed-Supervision Speech Processing**|Viet Anh Trinh et.al.|[2406.06582](http://arxiv.org/abs/2406.06582)|null|\n", "2406.02649": "|**2024-06-04**|**Keyword-Guided Adaptation of Automatic Speech Recognition**|Aviv Shamsian et.al.|[2406.02649](http://arxiv.org/abs/2406.02649)|null|\n", "2406.02166": "|**2024-06-04**|**Whistle: Data-Efficient Multilingual and Crosslingual Speech Recognition via Weakly Phonetic Supervision**|Saierdaer Yusuyin et.al.|[2406.02166](http://arxiv.org/abs/2406.02166)|**[link](https://github.com/thu-spmi/cat)**|\n", "2406.02004": "|**2024-06-05**|**Efficiently Train ASR Models that Memorize Less and Perform Better with Per-core Clipping**|Lun Wang et.al.|[2406.02004](http://arxiv.org/abs/2406.02004)|null|\n", "2406.01446": 
"|**2024-06-03**|**Enabling ASR for Low-Resource Languages: A Comprehensive Dataset Creation Approach**|Ara Yeroyan et.al.|[2406.01446](http://arxiv.org/abs/2406.01446)|null|\n", "2406.01314": "|**2024-06-03**|**Compute-Efficient Medical Image Classification with Softmax-Free Transformers and Sequence Normalization**|Firas Khader et.al.|[2406.01314](http://arxiv.org/abs/2406.01314)|null|\n", "2406.00899": "|**2024-06-02**|**YODAS: Youtube-Oriented Dataset for Audio and Speech**|Xinjian Li et.al.|[2406.00899](http://arxiv.org/abs/2406.00899)|null|\n", "2406.00522": "|**2024-06-01**|**Wav2Prompt: End-to-End Speech Prompt Generation and Tuning For LLM in Zero and Few-shot Learning**|Keqi Deng et.al.|[2406.00522](http://arxiv.org/abs/2406.00522)|null|\n", "2407.11982": "|**2024-05-31**|**Open the Data! Chuvash Datasets**|Nikolay Plotnikov et.al.|[2407.11982](http://arxiv.org/abs/2407.11982)|null|\n", "2405.18669": "|**2024-05-31**|**Zipper: A Multi-Tower Decoder Architecture for Fusing Modalities**|Vicky Zayats et.al.|[2405.18669](http://arxiv.org/abs/2405.18669)|null|\n", "2405.18537": "|**2024-05-28**|**Augmented Conversation with Embedded Speech-Driven On-the-Fly Referencing in AR**|Shivesh Jadon et.al.|[2405.18537](http://arxiv.org/abs/2405.18537)|null|\n", "2405.18346": "|**2024-05-28**|**Intelligent Clinical Documentation: Harnessing Generative AI for Patient-Centric Clinical Note Generation**|Anjanava Biswas et.al.|[2405.18346](http://arxiv.org/abs/2405.18346)|null|\n", "2405.17874": "|**2024-05-28**|**NUTS, NARS, and Speech**|D. 
van der Sluis et.al.|[2405.17874](http://arxiv.org/abs/2405.17874)|null|\n", "2405.17809": "|**2024-05-28**|**TransVIP: Speech to Speech Translation System with Voice and Isochrony Preservation**|Chenyang Le et.al.|[2405.17809](http://arxiv.org/abs/2405.17809)|null|\n", "2405.17376": "|**2024-05-27**|**Federating Dynamic Models using Early-Exit Architectures for Automatic Speech Recognition on Heterogeneous Clients**|Mohamed Nabih Ali et.al.|[2405.17376](http://arxiv.org/abs/2405.17376)|null|\n", "2405.17250": "|**2024-05-27**|**\"Pass the butter\": A study on desktop-classic multitasking robotic arm based on advanced YOLOv7 and BERT**|Haohua Que et.al.|[2405.17250](http://arxiv.org/abs/2405.17250)|null|\n", "2406.00038": "|**2024-05-27**|**ViSpeR: Multilingual Audio-Visual Speech Recognition**|Sanath Narayan et.al.|[2406.00038](http://arxiv.org/abs/2406.00038)|null|\n", "2405.16952": "|**2024-05-27**|**A Variance-Preserving Interpolation Approach for Diffusion Models with Applications to Single Channel Speech Enhancement and Recognition**|Zilu Guo et.al.|[2405.16952](http://arxiv.org/abs/2405.16952)|**[link](https://github.com/zelokuo/VPIDM)**|\n", "2405.15216": "|**2024-05-24**|**Denoising LM: Pushing the Limits of Error Correction Models for Speech Recognition**|Zijin Gu et.al.|[2405.15216](http://arxiv.org/abs/2405.15216)|null|\n", "2405.15097": "|**2024-05-23**|**Contrastive and Consistency Learning for Neural Noisy-Channel Model in Spoken Language Understanding**|Suyoung Kim et.al.|[2405.15097](http://arxiv.org/abs/2405.15097)|**[link](https://github.com/syoung7388/ccl)**|\n", "2405.14259": "|**2024-06-02**|**Let's Fuse Step by Step: A Generative Fusion Decoding Algorithm with LLMs for Multi-modal Text Recognition**|Chan-Jan Hsu et.al.|[2405.14259](http://arxiv.org/abs/2405.14259)|**[link](https://github.com/mtkresearch/generative-fusion-decoding)**|\n", "2405.14161": "|**2024-05-23**|**Self-Taught Recognizer: Toward Unsupervised Adaptation for Speech 
Foundation Models**|Yuchen Hu et.al.|[2405.14161](http://arxiv.org/abs/2405.14161)|**[link](https://github.com/yuchen005/star-adapt)**|\n", "2405.14093": "|**2024-05-23**|**A Survey on Vision-Language-Action Models for Embodied AI**|Yueen Ma et.al.|[2405.14093](http://arxiv.org/abs/2405.14093)|null|\n", "2405.13903": "|**2024-05-22**|**ST-Gait++: Leveraging spatio-temporal convolutions for gait-based emotion recognition on videos**|Maria Lu\u00edsa Lima et.al.|[2405.13903](http://arxiv.org/abs/2405.13903)|null|\n", "2405.13514": "|**2024-05-22**|**Joint Optimization of Streaming and Non-Streaming Automatic Speech Recognition with Multi-Decoder and Knowledge Distillation**|Muhammad Shakeel et.al.|[2405.13514](http://arxiv.org/abs/2405.13514)|null|\n", "2405.13477": "|**2024-05-22**|**A Near-Real-Time Processing Ego Speech Filtering Pipeline Designed for Speech Interruption During Human-Robot Interaction**|Yue Li et.al.|[2405.13477](http://arxiv.org/abs/2405.13477)|null|\n", "2405.13379": "|**2024-05-22**|**You don't understand me!: Comparing ASR results for L1 and L2 speakers of Swedish**|Ronald Cumbal et.al.|[2405.13379](http://arxiv.org/abs/2405.13379)|null|\n", "2405.13344": "|**2024-05-22**|**Contextualized Automatic Speech Recognition with Dynamic Vocabulary**|Yui Sudo et.al.|[2405.13344](http://arxiv.org/abs/2405.13344)|null|\n", "2405.13166": "|**2024-05-28**|**FairLENS: Assessing Fairness in Law Enforcement Speech Recognition**|Yicheng Wang et.al.|[2405.13166](http://arxiv.org/abs/2405.13166)|null|\n", "2405.13162": "|**2024-05-21**|**Non-autoregressive real-time Accent Conversion model with voice cloning**|Vladimir Nechaev et.al.|[2405.13162](http://arxiv.org/abs/2405.13162)|null|\n", "2405.12815": "|**2024-05-21**|**Could a Computer Architect Understand our Brain?**|Valentin Puente-Varona et.al.|[2405.12815](http://arxiv.org/abs/2405.12815)|null|\n", "2405.12609": "|**2024-07-01**|**Mamba in Speech: Towards an Alternative to Self-Attention**|Xiangyu Zhang 
et.al.|[2405.12609](http://arxiv.org/abs/2405.12609)|null|\n", "2405.12018": "|**2024-05-20**|**Continuous Sign Language Recognition with Adapted Conformer via Unsupervised Pretraining**|Neena Aloysius et.al.|[2405.12018](http://arxiv.org/abs/2405.12018)|null|\n", "2405.11078": "|**2024-05-17**|**Acoustic modeling for Overlapping Speech Recognition: JHU Chime-5 Challenge System**|Vimal Manohar et.al.|[2405.11078](http://arxiv.org/abs/2405.11078)|**[link](https://github.com/fgnt/nara_wpe)**|\n", "2405.10025": "|**2024-05-16**|**Listen Again and Choose the Right Answer: A New Paradigm for Automatic Speech Recognition with Large Language Models**|Yuchen Hu et.al.|[2405.10025](http://arxiv.org/abs/2405.10025)|null|\n", "2405.09708": "|**2024-05-15**|**No More Mumbles: Enhancing Robot Intelligibility through Speech Adaptation**|Qiaoqiao Ren et.al.|[2405.09708](http://arxiv.org/abs/2405.09708)|**[link](https://github.com/qiaoqiao2323/robot-speech-intelligibility)**|\n", "2405.09470": "|**2024-05-15**|**Towards Evaluating the Robustness of Automatic Speech Recognition Systems via Audio Style Transfer**|Weifei Jin et.al.|[2405.09470](http://arxiv.org/abs/2405.09470)|null|\n", "2405.13018": "|**2024-05-15**|**Continued Pretraining for Domain Adaptation of Wav2vec2.0 in Automatic Speech Recognition for Elementary Math Classroom Settings**|Ahmed Adel Attia et.al.|[2405.13018](http://arxiv.org/abs/2405.13018)|null|\n", "2405.19342": "|**2024-05-14**|**Sonos Voice Control Bias Assessment Dataset: A Methodology for Demographic Bias Assessment in Voice Assistants**|Chlo\u00e9 Sekkat et.al.|[2405.19342](http://arxiv.org/abs/2405.19342)|null|\n", "2405.08402": "|**2024-05-14**|**Investigating the 'Autoencoder Behavior' in Speech Self-Supervised Models: a focus on HuBERT's Pretraining**|Valentin Vielzeuf et.al.|[2405.08402](http://arxiv.org/abs/2405.08402)|null|\n", "2405.08295": "|**2024-05-31**|**SpeechVerse: A Large-scale Generalizable Audio Language Model**|Nilaksh Das 
et.al.|[2405.08295](http://arxiv.org/abs/2405.08295)|null|\n", "2405.07442": "|**2024-06-07**|**Rene: A Pre-trained Multi-modal Architecture for Auscultation of Respiratory Diseases**|Pengfei Zhang et.al.|[2405.07442](http://arxiv.org/abs/2405.07442)|**[link](https://github.com/zpforlove/rene)**|\n", "2405.07354": "|**2024-05-12**|**SoccerNet-Echoes: A Soccer Game Audio Commentary Dataset**|Sushant Gautam et.al.|[2405.07354](http://arxiv.org/abs/2405.07354)|**[link](https://github.com/SoccerNet/sn-echoes)**|\n", "2405.13001": "|**2024-05-12**|**Large Language Models for Education: A Survey**|Hanyi Xu et.al.|[2405.13001](http://arxiv.org/abs/2405.13001)|null|\n", "2405.06368": "|**2024-07-22**|**DP-DyLoRA: Fine-Tuning Transformer-Based Models On-Device under Differentially Private Federated Learning using Dynamic Low-Rank Adaptation**|Jie Xu et.al.|[2405.06368](http://arxiv.org/abs/2405.06368)|null|\n", "2405.06150": "|**2024-05-10**|**Lost in Transcription: Identifying and Quantifying the Accuracy Biases of Automatic Speech Recognition Systems Against Disfluent Speech**|Dena Mujtaba et.al.|[2405.06150](http://arxiv.org/abs/2405.06150)|null|\n", "2405.06134": "|**2024-07-17**|**Muting Whisper: A Universal Acoustic Adversarial Attack on Speech Foundation Models**|Vyas Raina et.al.|[2405.06134](http://arxiv.org/abs/2405.06134)|**[link](https://github.com/rainavyas/prepend_acoustic_attack)**|\n", "2405.05498": "|**2024-05-09**|**The RoyalFlush Automatic Speech Diarization and Recognition System for In-Car Multi-Channel Automatic Speech Recognition Challenge**|Jingguang Tian et.al.|[2405.05498](http://arxiv.org/abs/2405.05498)|null|\n", "2405.04296": "|**2024-05-07**|**Open Implementation and Study of BEST-RQ for Speech Processing**|Ryan Whetten et.al.|[2405.04296](http://arxiv.org/abs/2405.04296)|**[link](https://github.com/speechbrain/speechbrain)**|\n", "2405.03484": "|**2024-05-06**|**Whispy: Adapting STT Whisper Models to Real-Time Environments**|Antonio Bevilacqua 
et.al.|[2405.03484](http://arxiv.org/abs/2405.03484)|null|\n", "2405.03152": "|**2024-05-06**|**MMGER: Multi-modal and Multi-granularity Generative Error Correction with LLM for Joint Accent and Speech Recognition**|Bingshen Mu et.al.|[2405.03152](http://arxiv.org/abs/2405.03152)|null|\n", "2405.02995": "|**2024-05-11**|**Analysis about Theoretical Foundations for Method to Enhancing ASR Performance using OCR Word Frequency Differences**|Kyudan Jung et.al.|[2405.02995](http://arxiv.org/abs/2405.02995)|null|\n", "2405.02578": "|**2024-05-04**|**Mixat: A Data Set of Bilingual Emirati-English Speech**|Maryam Al Ali et.al.|[2405.02578](http://arxiv.org/abs/2405.02578)|**[link](https://github.com/mbzuai-nlp/mixat)**|\n", "2406.02566": "|**2024-05-03**|**Combining X-Vectors and Bayesian Batch Active Learning: Two-Stage Active Learning Pipeline for Speech Recognition**|Ognjen Kundacina et.al.|[2406.02566](http://arxiv.org/abs/2406.02566)|null|\n", "2405.02132": "|**2024-05-06**|**Unveiling the Potential of LLM-Based ASR on Chinese Open-Source Datasets**|Xuelong Geng et.al.|[2405.02132](http://arxiv.org/abs/2405.02132)|null|\n", "2406.02565": "|**2024-05-02**|**Sequence-to-sequence models in peer-to-peer learning: A practical application**|Robert \u0160ajina et.al.|[2406.02565](http://arxiv.org/abs/2406.02565)|null|\n", "2405.01293": "|**2024-05-02**|**Low-resource speech recognition and dialect identification of Irish in a multi-task framework**|Liam Lonergan et.al.|[2405.01293](http://arxiv.org/abs/2405.01293)|null|\n", "2405.01207": "|**2024-05-02**|**Improving Membership Inference in ASR Model Auditing with Perturbed Loss Features**|Francisco Teixeira et.al.|[2405.01207](http://arxiv.org/abs/2405.01207)|null|\n", "2405.01004": "|**2024-05-02**|**Deep Learning Models in Speech Recognition: Measuring GPU Energy Consumption, Impact of Noise and Model Quantization for Edge Deployment**|Aditya Chakravarty 
et.al.|[2405.01004](http://arxiv.org/abs/2405.01004)|**[link](https://github.com/zzadiues3338/asr-energy-jetson)**|\n", "2405.00966": "|**2024-05-02**|**Efficient Compression of Multitask Multilingual Speech Models**|Thomas Palmeira Ferraz et.al.|[2405.00966](http://arxiv.org/abs/2405.00966)|null|\n", "2405.01601": "|**2024-05-01**|**Efficient Sample-Specific Encoder Perturbations**|Yassir Fathullah et.al.|[2405.01601](http://arxiv.org/abs/2405.01601)|null|\n", "2405.00307": "|**2024-05-01**|**Active Learning with Task Adaptation Pre-training for Speech Emotion Recognition**|Dongyuan Li et.al.|[2405.00307](http://arxiv.org/abs/2405.00307)|null|\n", "2405.00223": "|**2024-07-24**|**Confides: A Visual Analytics Solution for Automated Speech Recognition Analysis and Exploration**|Sunwoo Ha et.al.|[2405.00223](http://arxiv.org/abs/2405.00223)|null|\n", "2404.19310": "|**2024-05-09**|**Does Whisper understand Swiss German? An automatic, qualitative, and human evaluation**|Eyal Liron Dolev et.al.|[2404.19310](http://arxiv.org/abs/2404.19310)|null|\n", "2404.19214": "|**2024-04-30**|**EfficientASR: Speech Recognition Network Compression via Attention Redundancy and Chunk-Level FFN Optimization**|Jianzong Wang et.al.|[2404.19214](http://arxiv.org/abs/2404.19214)|null|\n", "2404.18739": "|**2024-04-29**|**Towards Dog Bark Decoding: Leveraging Human Speech Processing for Automated Bark Classification**|Artem Abzaliev et.al.|[2404.18739](http://arxiv.org/abs/2404.18739)|null|\n", "2406.02563": "|**2024-04-29**|**A cost minimization approach to fix the vocabulary size in a tokenizer for an End-to-End ASR system**|Sunil Kumar Kopparapu et.al.|[2406.02563](http://arxiv.org/abs/2406.02563)|null|\n", "2404.17394": "|**2024-04-26**|**Child Speech Recognition in Human-Robot Interaction: Problem Solved?**|Ruben Janssens et.al.|[2404.17394](http://arxiv.org/abs/2404.17394)|null|\n", "2404.16743": "|**2024-04-26**|**Automatic Speech Recognition System-Independent Word Error Rate 
Estimation**|Chanho Park et.al.|[2404.16743](http://arxiv.org/abs/2404.16743)|null|\n", "2404.16547": "|**2024-04-25**|**Developing Acoustic Models for Automatic Speech Recognition in Swedish**|Giampiero Salvi et.al.|[2404.16547](http://arxiv.org/abs/2404.16547)|null|\n", "2404.16407": "|**2024-04-25**|**U2++ MoE: Scaling 4.7x parameters with minimal impact on RTF**|Xingchen Song et.al.|[2404.16407](http://arxiv.org/abs/2404.16407)|null|\n", "2404.16112": "|**2024-04-24**|**Mamba-360: Survey of State Space Models as Transformer Alternative for Long Sequence Modelling: Methods, Applications, and Challenges**|Badri Narayana Patro et.al.|[2404.16112](http://arxiv.org/abs/2404.16112)|**[link](https://github.com/badripatro/mamba360)**|\n", "2406.02562": "|**2024-04-24**|**Gated Low-rank Adaptation for personalized Code-Switching Automatic Speech Recognition on the low-spec devices**|Gwantae Kim et.al.|[2406.02562](http://arxiv.org/abs/2406.02562)|null|\n", "2404.15501": "|**2024-04-23**|**Killkan: The Automatic Speech Recognition Dataset for Kichwa with Morphosyntactic Information**|Chihiro Taguchi et.al.|[2404.15501](http://arxiv.org/abs/2404.15501)|**[link](https://github.com/ctaguchi/killkan)**|\n", "2406.02561": "|**2024-04-23**|**Breaking Walls: Pioneering Automatic Speech Recognition for Central Kurdish: End-to-End Transformer Paradigm**|Abdulhady Abas Abdullah et.al.|[2406.02561](http://arxiv.org/abs/2406.02561)|null|\n", "2404.14860": "|**2024-04-23**|**Rethinking Processing Distortions: Disentangling the Impact of Speech Enhancement Errors on Speech Recognition Performance**|Tsubasa Ochiai et.al.|[2404.14860](http://arxiv.org/abs/2404.14860)|null|\n", "2404.14605": "|**2024-04-22**|**Assessment of Sign Language-Based versus Touch-Based Input for Deaf Users Interacting with Intelligent Personal Assistants**|Nina Tran et.al.|[2404.14605](http://arxiv.org/abs/2404.14605)|null|\n", "2406.02560": "|**2024-07-18**|**Less Peaky and More Accurate CTC Forced Alignment 
by Label Priors**|Ruizhe Huang et.al.|[2406.02560](http://arxiv.org/abs/2406.02560)|**[link](https://github.com/huangruizhe/audio)**|\n", "2404.14024": "|**2024-04-22**|**Exploring neural oscillations during speech perception via surrogate gradient spiking neural networks**|Alexandre Bittar et.al.|[2404.14024](http://arxiv.org/abs/2404.14024)|null|\n", "2404.13362": "|**2024-04-20**|**Semantically Corrected Amharic Automatic Speech Recognition**|Samuael Adnew et.al.|[2404.13362](http://arxiv.org/abs/2404.13362)|**[link](https://github.com/samuael/postprocessed_geez_asr)**|\n", "2404.12888": "|**2024-04-19**|**Learn2Talk: 3D Talking Face Learns from 2D Talking Face**|Yixiang Zhuang et.al.|[2404.12888](http://arxiv.org/abs/2404.12888)|null|\n", "2404.12628": "|**2024-04-19**|**Efficient infusion of self-supervised representations in Automatic Speech Recognition**|Darshan Prabhu et.al.|[2404.12628](http://arxiv.org/abs/2404.12628)|null|\n", "2404.15168": "|**2024-04-18**|**Artificial Neural Networks to Recognize Speakers Division from Continuous Bengali Speech**|Hasmot Ali et.al.|[2404.15168](http://arxiv.org/abs/2404.15168)|null|\n", "2404.10922": "|**2024-04-16**|**Teaching a Multilingual Large Language Model to Understand Multilingual Speech via Multi-Instructional Training**|Pavel Denisov et.al.|[2404.10922](http://arxiv.org/abs/2404.10922)|**[link](https://github.com/akreal/bloomzmms)**|\n", "2404.09841": "|**2024-04-16**|**Anatomy of Industrial Scale Multilingual ASR**|Francis McCann Ramirez et.al.|[2404.09841](http://arxiv.org/abs/2404.09841)|null|\n", "2404.09754": "|**2024-04-15**|**Resilience of Large Language Models for Noisy Instructions**|Bin Wang et.al.|[2404.09754](http://arxiv.org/abs/2404.09754)|null|\n", "2406.09425": "|**2024-04-13**|**SGPRS: Seamless GPU Partitioning Real-Time Scheduler for Periodic Deep Learning Workloads**|Amir Fakhim Babaei et.al.|[2406.09425](http://arxiv.org/abs/2406.09425)|null|\n", "2404.08424": "|**2024-04-12**|**Comparing 
Apples to Oranges: LLM-powered Multimodal Intention Prediction in an Object Categorization Task**|Hassan Ali et.al.|[2404.08424](http://arxiv.org/abs/2404.08424)|null|\n", "2404.08368": "|**2024-07-26**|**Automatic Speech Recognition Advancements for Indigenous Languages of the Americas**|Monica Romero et.al.|[2404.08368](http://arxiv.org/abs/2404.08368)|null|\n", "2404.07575": "|**2024-04-12**|**An Effective Automated Speaking Assessment Approach to Mitigating Data Scarcity and Imbalanced Distribution**|Tien-Hong Lo et.al.|[2404.07575](http://arxiv.org/abs/2404.07575)|null|\n", "2404.07341": "|**2024-04-12**|**Conformer-1: Robust ASR via Large-Scale Semisupervised Bootstrapping**|Kevin Zhang et.al.|[2404.07341](http://arxiv.org/abs/2404.07341)|null|\n", "2404.08011": "|**2024-04-10**|**An inclusive review on deep learning techniques and their scope in handwriting recognition**|Sukhdeep Singh et.al.|[2404.08011](http://arxiv.org/abs/2404.08011)|null|\n", "2404.06079": "|**2024-04-10**|**The X-LANCE Technical Report for Interspeech 2024 Speech Processing Using Discrete Speech Unit Challenge**|Yiwei Guo et.al.|[2404.06079](http://arxiv.org/abs/2404.06079)|null|\n", "2404.05659": "|**2024-05-28**|**VietMed: A Dataset and Benchmark for Automatic Speech Recognition of Vietnamese in the Medical Domain**|Khai Le-Duc et.al.|[2404.05659](http://arxiv.org/abs/2404.05659)|**[link](https://github.com/leduckhai/multimed)**|\n", "2404.04769": "|**2024-04-07**|**Safeguarding Voice Privacy: Harnessing Near-Ultrasonic Interference To Protect Against Unauthorized Audio Recording**|Forrest McKee et.al.|[2404.04769](http://arxiv.org/abs/2404.04769)|null|\n", "2404.04295": "|**2024-04-04**|**Transducers with Pronunciation-aware Embeddings for Automatic Speech Recognition**|Hainan Xu et.al.|[2404.04295](http://arxiv.org/abs/2404.04295)|null|\n", "2404.03073": "|**2024-04-03**|**Mai Ho'om\u0101una i ka 'Ai: Language Models Improve Automatic Speech Recognition in Hawaiian**|Kaavya 
Chaparala et.al.|[2404.03073](http://arxiv.org/abs/2404.03073)|null|\n", "2404.02408": "|**2024-04-03**|**CMULAB: An Open-Source Framework for Training and Deployment of Natural Language Processing Models**|Zaid Sheikh et.al.|[2404.02408](http://arxiv.org/abs/2404.02408)|**[link](https://github.com/neulab/cmulab)**|\n", "2404.02098": "|**2024-04-02**|**BRAVEn: Improving Self-Supervised Pre-training for Visual and Auditory Speech Recognition**|Alexandros Haliassos et.al.|[2404.02098](http://arxiv.org/abs/2404.02098)|**[link](https://github.com/ahaliassos/raven)**|\n", "2404.02052": "|**2024-04-02**|**Noise Masking Attacks and Defenses for Pretrained Speech Models**|Matthew Jagielski et.al.|[2404.02052](http://arxiv.org/abs/2404.02052)|null|\n", "2404.01991": "|**2024-04-02**|**Kallaama: A Transcribed Speech Dataset about Agriculture in the Three Most Widely Spoken Languages in Senegal**|Elodie Gauthier et.al.|[2404.01991](http://arxiv.org/abs/2404.01991)|**[link](https://github.com/gauthelo/kallaama-speech-dataset)**|\n", "2404.01737": "|**2024-04-02**|**Transfer Learning from Whisper for Microscopic Intelligibility Prediction**|Paul Best et.al.|[2404.01737](http://arxiv.org/abs/2404.01737)|null|\n", "2404.07226": "|**2024-03-31**|**Houston we have a Divergence: A Subgroup Performance Analysis of ASR Models**|Alkis Koudounas et.al.|[2404.07226](http://arxiv.org/abs/2404.07226)|null|\n", "2403.20262": "|**2024-07-22**|**ELITR-Bench: A Meeting Assistant Benchmark for Long-Context Language Models**|Thibaut Thonet et.al.|[2403.20262](http://arxiv.org/abs/2403.20262)|**[link](https://github.com/utter-project/elitr-bench)**|\n", "2403.19822": "|**2024-03-28**|**Multi-Stage Multi-Modal Pre-Training for Automatic Speech Recognition**|Yash Jain et.al.|[2403.19822](http://arxiv.org/abs/2403.19822)|null|\n", "2403.19224": "|**2024-03-28**|**Emotion Neural Transducer for Fine-Grained Speech Emotion Recognition**|Siyuan Shen 
et.al.|[2403.19224](http://arxiv.org/abs/2403.19224)|**[link](https://github.com/ecnu-cross-innovation-lab/ent)**|\n", "2403.19207": "|**2024-03-28**|**LV-CTC: Non-autoregressive ASR with CTC and latent variable models**|Yuya Fujita et.al.|[2403.19207](http://arxiv.org/abs/2403.19207)|null|\n", "2403.18721": "|**2024-06-04**|**PhysicsAssistant: An LLM-Powered Interactive Learning Robot for Physics Lab Investigations**|Ehsan Latif et.al.|[2403.18721](http://arxiv.org/abs/2403.18721)|null|\n", "2406.02555": "|**2024-03-27**|**PhoWhisper: Automatic Speech Recognition for Vietnamese**|Thanh-Thien Le et.al.|[2406.02555](http://arxiv.org/abs/2406.02555)|**[link](https://github.com/vinairesearch/phowhisper)**|\n", "2403.18182": "|**2024-03-27**|**ZAEBUC-Spoken: A Multilingual Multidialectal Arabic-English Speech Corpus**|Injy Hamed et.al.|[2403.18182](http://arxiv.org/abs/2403.18182)|null|\n", "2403.17645": "|**2024-04-11**|**DANCER: Entity Description Augmented Named Entity Corrector for Automatic Speech Recognition**|Yi-Cheng Wang et.al.|[2403.17645](http://arxiv.org/abs/2403.17645)|null|\n", "2403.17363": "|**2024-03-26**|**Extracting Biomedical Entities from Noisy Audio Transcripts**|Nima Ebadi et.al.|[2403.17363](http://arxiv.org/abs/2403.17363)|null|\n", "2403.19709": "|**2024-03-25**|**Hierarchical Recurrent Adapters for Efficient Multi-Task Adaptation of Large Speech Models**|Tsendsuren Munkhdalai et.al.|[2403.19709](http://arxiv.org/abs/2403.19709)|null|\n", "2403.16655": "|**2024-03-25**|**Grammatical vs Spelling Error Correction: An Investigation into the Responsiveness of Transformer-based Language Models using BART and MarianMT**|Rohit Raju et.al.|[2403.16655](http://arxiv.org/abs/2403.16655)|null|\n", "2403.15510": "|**2024-03-22**|**Privacy-Preserving End-to-End Spoken Language Understanding**|Yinggui Wang et.al.|[2403.15510](http://arxiv.org/abs/2403.15510)|null|\n", "2403.14438": "|**2024-03-26**|**A Multimodal Approach to Device-Directed Speech Detection 
with Large Language Models**|Dominik Wagner et.al.|[2403.14438](http://arxiv.org/abs/2403.14438)|null|\n", "2403.14402": "|**2024-03-21**|**XLAVS-R: Cross-Lingual Audio-Visual Speech Representation Learning for Noise-Robust Speech Perception**|HyoJung Han et.al.|[2403.14402](http://arxiv.org/abs/2403.14402)|null|\n", "2403.14168": "|**2024-06-04**|**M$^3$AV: A Multimodal, Multigenre, and Multipurpose Audio-Visual Academic Lecture Dataset**|Zhe Chen et.al.|[2403.14168](http://arxiv.org/abs/2403.14168)|null|\n", "2403.13960": "|**2024-03-20**|**Open Access NAO (OAN): a ROS2-based software framework for HRI applications with the NAO robot**|Antonio Bono et.al.|[2403.13960](http://arxiv.org/abs/2403.13960)|null|\n", "2403.13465": "|**2024-03-20**|**BanglaNum -- A Public Dataset for Bengali Digit Recognition from Speech**|Mir Sayeed Mohammad et.al.|[2403.13465](http://arxiv.org/abs/2403.13465)|null|\n", "2403.13423": "|**2024-03-20**|**Advanced Long-Content Speech Recognition With Factorized Neural Transducer**|Xun Gong et.al.|[2403.13423](http://arxiv.org/abs/2403.13423)|null|\n", "2403.15469": "|**2024-03-20**|**Isometric Neural Machine Translation using Phoneme Count Ratio Reward-based Reinforcement Learning**|Shivam Ratnakant Mhaskar et.al.|[2403.15469](http://arxiv.org/abs/2403.15469)|null|\n", "2403.12821": "|**2024-03-21**|**FlowerFormer: Empowering Neural Architecture Encoding using a Flow-aware Graph Transformer**|Dongyeong Hwang et.al.|[2403.12821](http://arxiv.org/abs/2403.12821)|**[link](https://github.com/y0ngjaenius/cvpr2024_flowerformer)**|\n", "2403.12477": "|**2024-03-19**|**Real-time Speech Extraction Using Spatially Regularized Independent Low-rank Matrix Analysis and Rank-constrained Spatial Covariance Matrix Estimation**|Yuto Ishikawa et.al.|[2403.12477](http://arxiv.org/abs/2403.12477)|null|\n", "2403.12273": "|**2024-03-18**|**Multimodal Human-Autonomous Agents Interaction Using Pre-Trained Language and Visual Foundation Models**|Linus Nwankwo 
et.al.|[2403.12273](http://arxiv.org/abs/2403.12273)|null|\n", "2403.11578": "|**2024-03-18**|**AdaMER-CTC: Connectionist Temporal Classification with Adaptive Maximum Entropy Regularization for Automatic Speech Recognition**|SooHwan Eom et.al.|[2403.11578](http://arxiv.org/abs/2403.11578)|null|\n", "2403.15442": "|**2024-07-21**|**Artificial Intelligence for Cochlear Implants: Review of Strategies, Challenges, and Perspectives**|Billel Essaid et.al.|[2403.15442](http://arxiv.org/abs/2403.15442)|null|\n", "2403.10961": "|**2024-03-16**|**Energy-Based Models with Applications to Speech and Language Processing**|Zhijian Ou et.al.|[2403.10961](http://arxiv.org/abs/2403.10961)|null|\n", "2403.10937": "|**2024-03-16**|**Initial Decoding with Minimally Augmented Language Model for Improved Lattice Rescoring in Low Resource ASR**|Savitha Murthy et.al.|[2403.10937](http://arxiv.org/abs/2403.10937)|null|\n", "2403.10420": "|**2024-03-15**|**Neural Networks Hear You Loud And Clear: Hearing Loss Compensation Using Deep Neural Networks**|Peter Leer et.al.|[2403.10420](http://arxiv.org/abs/2403.10420)|null|\n", "2403.09753": "|**2024-03-14**|**SpokeN-100: A Cross-Lingual Benchmarking Dataset for The Classification of Spoken Numbers in Different Languages**|Ren\u00e9 Groh et.al.|[2403.09753](http://arxiv.org/abs/2403.09753)|**[link](https://github.com/ankilab/spoken-100)**|\n", "2403.09298": "|**2024-03-14**|**More than words: Advancements and challenges in speech recognition for singing**|Anna Kruspe et.al.|[2403.09298](http://arxiv.org/abs/2403.09298)|null|\n", "2405.12983": "|**2024-03-14**|**Multilingual Audio-Visual Speech Recognition with Hybrid CTC/RNN-T Fast Conformer**|Maxime Burchi et.al.|[2405.12983](http://arxiv.org/abs/2405.12983)|null|\n", "2403.08258": "|**2024-05-21**|**Skipformer: A Skip-and-Recover Strategy for Efficient Speech Recognition**|Wenjing Zhu et.al.|[2403.08258](http://arxiv.org/abs/2403.08258)|null|\n", "2403.08196": "|**2024-03-13**|**SpeechColab 
Leaderboard: An Open-Source Platform for Automatic Speech Recognition Evaluation**|Jiayu Du et.al.|[2403.08196](http://arxiv.org/abs/2403.08196)|**[link](https://github.com/speechcolab/leaderboard)**|\n", "2403.08187": "|**2024-03-13**|**Automatic Speech Recognition (ASR) for the Diagnosis of pronunciation of Speech Sound Disorders in Korean children**|Taekyung Ahn et.al.|[2403.08187](http://arxiv.org/abs/2403.08187)|null|\n", "2403.08011": "|**2024-03-12**|**Gujarati-English Code-Switching Speech Recognition using ensemble prediction of spoken language**|Yash Sharma et.al.|[2403.08011](http://arxiv.org/abs/2403.08011)|null|\n", "2403.07767": "|**2024-03-12**|**Beyond the Labels: Unveiling Text-Dependency in Paralinguistic Speech Recognition Datasets**|Jan Pe\u0161\u00e1n et.al.|[2403.07767](http://arxiv.org/abs/2403.07767)|null|\n", "2403.07947": "|**2024-03-11**|**The evaluation of a code-switched Sepedi-English automatic speech recognition system**|Amanda Phaladi et.al.|[2403.07947](http://arxiv.org/abs/2403.07947)|null|\n", "2403.06734": "|**2024-03-11**|**Real-Time Multimodal Cognitive Assistant for Emergency Medical Services**|Keshara Weerasinghe et.al.|[2403.06734](http://arxiv.org/abs/2403.06734)|**[link](https://github.com/uva-dsa/ems-pipeline)**|\n", "2403.06387": "|**2024-03-11**|**Towards Decoupling Frontend Enhancement and Backend Recognition in Monaural Robust ASR**|Yufeng Yang et.al.|[2403.06387](http://arxiv.org/abs/2403.06387)|null|\n", "2403.06260": "|**2024-03-10**|**SCORE: Self-supervised Correspondence Fine-tuning for Improved Content Representations**|Amit Meghanani et.al.|[2403.06260](http://arxiv.org/abs/2403.06260)|**[link](https://github.com/trikaldarshi/score_finetuning)**|\n", "2403.05887": "|**2024-03-09**|**Aligning Speech to Languages to Enhance Code-switching Speech Recognition**|Hexin Liu et.al.|[2403.05887](http://arxiv.org/abs/2403.05887)|null|\n", "2403.07937": "|**2024-03-08**|**Speech Robust Bench: A Robustness Benchmark For 
Speech Recognition**|Muhammad A. Shah et.al.|[2403.07937](http://arxiv.org/abs/2403.07937)|null|\n", "2403.04445": "|**2024-03-07**|**Classist Tools: Social Class Correlates with Performance in NLP**|Amanda Cercas Curry et.al.|[2403.04445](http://arxiv.org/abs/2403.04445)|null|\n", "2403.04280": "|**2024-05-30**|**A New Benchmark for Evaluating Automatic Speech Recognition in the Arabic Call Domain**|Qusai Abo Obaidah et.al.|[2403.04280](http://arxiv.org/abs/2403.04280)|null|\n", "2403.04245": "|**2024-03-07**|**A Study of Dropout-Induced Modality Bias on Robustness to Missing Video Frames for Audio-Visual Speech Recognition**|Yusheng Dai et.al.|[2403.04245](http://arxiv.org/abs/2403.04245)|**[link](https://github.com/dalision/modalbiasavsr)**|\n", "2403.03538": "|**2024-03-06**|**RADIA -- Radio Advertisement Detection with Intelligent Analytics**|Jorge \u00c1lvarez et.al.|[2403.03538](http://arxiv.org/abs/2403.03538)|null|\n", "2403.03522": "|**2024-03-13**|**Non-verbal information in spontaneous speech -- towards a new framework of analysis**|Tirza Biron et.al.|[2403.03522](http://arxiv.org/abs/2403.03522)|null|\n", "2403.02938": "|**2024-03-05**|**AIx Speed: Playback Speed Optimization Using Listening Comprehension of Speech Recognition Models**|Kazuki Kawamura et.al.|[2403.02938](http://arxiv.org/abs/2403.02938)|null|\n", "2403.02288": "|**2024-03-04**|**PixIT: Joint Training of Speaker Diarization and Speech Separation from Real-world Multi-speaker Recordings**|Joonas Kalda et.al.|[2403.02288](http://arxiv.org/abs/2403.02288)|**[link](https://github.com/joonaskalda/pixit)**|\n", "2403.02173": "|**2024-03-04**|**What has LeBenchmark Learnt about French Syntax?**|Zdravko Dugonji\u0107 et.al.|[2403.02173](http://arxiv.org/abs/2403.02173)|null|\n", "2403.02010": "|**2024-03-04**|**SA-SOT: Speaker-Aware Serialized Output Training for Multi-Talker ASR**|Zhiyun Fan et.al.|[2403.02010](http://arxiv.org/abs/2403.02010)|null|\n", "2403.01983": 
"|**2024-03-04**|**Language and Speech Technology for Central Kurdish Varieties**|Sina Ahmadi et.al.|[2403.01983](http://arxiv.org/abs/2403.01983)|**[link](https://github.com/sinaahmadi/cordi)**|\n", "2403.18843": "|**2024-03-04**|**JEP-KD: Joint-Embedding Predictive Architecture Based Knowledge Distillation for Visual Speech Recognition**|Chang Sun et.al.|[2403.18843](http://arxiv.org/abs/2403.18843)|null|\n", "2403.01369": "|**2024-03-03**|**A Closer Look at Wav2Vec2 Embeddings for On-Device Single-Channel Speech Enhancement**|Ravi Shankar et.al.|[2403.01369](http://arxiv.org/abs/2403.01369)|null|\n", "2403.05583": "|**2024-03-02**|**A Cross-Modal Approach to Silent Speech with LLM-Enhanced Recognition**|Tyler Benster et.al.|[2403.05583](http://arxiv.org/abs/2403.05583)|**[link](https://github.com/tbenst/silent_speech)**|\n", "2403.01255": "|**2024-04-18**|**Automatic Speech Recognition using Advanced Deep Learning Approaches: A survey**|Hamza Kheddar et.al.|[2403.01255](http://arxiv.org/abs/2403.01255)|null|\n", "2403.00370": "|**2024-03-01**|**Post-decoder Biasing for End-to-End Speech Recognition of Multi-turn Medical Interview**|Heyang Liu et.al.|[2403.00370](http://arxiv.org/abs/2403.00370)|null|\n", "2402.19443": "|**2024-02-29**|**Probing the Information Encoded in Neural-based Acoustic Models of Automatic Speech Recognition Systems**|Quentin Raymondaud et.al.|[2402.19443](http://arxiv.org/abs/2402.19443)|null|\n", "2402.18923": "|**2024-02-29**|**Inappropriate Pause Detection In Dysarthric Speech Using Large-Scale Speech Recognition**|Jeehyun Lee et.al.|[2402.18923](http://arxiv.org/abs/2402.18923)|null|\n", "2402.18275": "|**2024-06-04**|**Exploration of Adapter for Noise Robust Automatic Speech Recognition**|Hao Shi et.al.|[2402.18275](http://arxiv.org/abs/2402.18275)|null|\n", "2402.17954": "|**2024-06-19**|**Twists, Humps, and Pebbles: Multilingual Speech Recognition Models Exhibit Gender Performance Gaps**|Giuseppe Attanasio 
et.al.|[2402.17954](http://arxiv.org/abs/2402.17954)|**[link](https://github.com/g8a9/multilingual-asr-gender-gap)**|\n", "2402.17189": "|**2024-02-27**|**An Effective Mixture-Of-Experts Approach For Code-Switching Speech Recognition Leveraging Encoder Disentanglement**|Tzu-Ting Yang et.al.|[2402.17189](http://arxiv.org/abs/2402.17189)|null|\n", "2402.17184": "|**2024-02-27**|**Extreme Encoder Output Frame Rate Reduction: Improving Computational Latencies of Large End-to-End Models**|Rohit Prabhavalkar et.al.|[2402.17184](http://arxiv.org/abs/2402.17184)|null|\n", "2402.15733": "|**2024-04-01**|**ArEEG_Chars: Dataset for Envisioned Speech Recognition using EEG for Arabic Characters**|Hazem Darwish et.al.|[2402.15733](http://arxiv.org/abs/2402.15733)|null|\n", "2402.15151": "|**2024-05-14**|**Where Visual Speech Meets Language: VSP-LLM Framework for Efficient and Context-Aware Visual Speech Processing**|Jeong Hun Yeo et.al.|[2402.15151](http://arxiv.org/abs/2402.15151)|**[link](https://github.com/sally-sh/vsp-llm)**|\n", "2402.14563": "|**2024-02-22**|**Wizard of Oz Experimentation for Language Technology Applications: Challenges and Tools**|Stephan Schl\u00f6gl et.al.|[2402.14563](http://arxiv.org/abs/2402.14563)|null|\n", "2402.14888": "|**2024-02-22**|**Efficient data selection employing Semantic Similarity-based Graph Structures for model training**|Roxana Petcu et.al.|[2402.14888](http://arxiv.org/abs/2402.14888)|null|\n", "2402.14185": "|**2024-02-22**|**HINT: High-quality INPainting Transformer with Mask-Aware Encoding and Enhanced Attention**|Shuang Chen et.al.|[2402.14185](http://arxiv.org/abs/2402.14185)|**[link](https://github.com/chrischen1023/hint)**|\n", "2402.13687": "|**2024-02-21**|**An Augmented Lagrangian Method for Training Recurrent Neural Networks**|Yue Wang et.al.|[2402.13687](http://arxiv.org/abs/2402.13687)|null|\n", "2402.13511": "|**2024-02-22**|**Mel-FullSubNet: Mel-Spectrogram Enhancement for Improving Both Speech Quality and ASR**|Rui 
Zhou et.al.|[2402.13511](http://arxiv.org/abs/2402.13511)|null|\n", "2402.13208": "|**2024-02-20**|**How do Hyenas deal with Human Speech? Speech Recognition and Translation with ConfHyena**|Marco Gaido et.al.|[2402.13208](http://arxiv.org/abs/2402.13208)|**[link](https://github.com/hlt-mt/fbk-fairseq)**|\n", "2402.13076": "|**2024-02-20**|**Not All Weights Are Created Equal: Enhancing Energy Efficiency in On-Device Streaming Speech Recognition**|Yang Li et.al.|[2402.13076](http://arxiv.org/abs/2402.13076)|null|\n", "2402.13004": "|**2024-02-20**|**Comparison of Conventional Hybrid and CTC/Attention Decoders for Continuous Visual Speech Recognition**|David Gimeno-G\u00f3mez et.al.|[2402.13004](http://arxiv.org/abs/2402.13004)|null|\n", "2402.12654": "|**2024-06-16**|**OWSM-CTC: An Open Encoder-Only Speech Foundation Model for Speech Recognition, Translation, and Language Identification**|Yifan Peng et.al.|[2402.12654](http://arxiv.org/abs/2402.12654)|null|\n", "2402.11954": "|**2024-02-19**|**Multimodal Emotion Recognition from Raw Audio with Sinc-convolution**|Xiaohui Zhang et.al.|[2402.11954](http://arxiv.org/abs/2402.11954)|null|\n", "2402.11571": "|**2024-02-18**|**Ain't Misbehavin' -- Using LLMs to Generate Expressive Robot Behavior in Conversations with the Tabletop Robot Haru**|Zining Wang et.al.|[2402.11571](http://arxiv.org/abs/2402.11571)|null|\n", "2402.11520": "|**2024-02-18**|**Cross-Attention Fusion of Visual and Geometric Features for Large Vocabulary Arabic Lipreading**|Samar Daou et.al.|[2402.11520](http://arxiv.org/abs/2402.11520)|null|\n", "2402.09797": "|**2024-02-15**|**A cross-talk robust multichannel VAD model for multiparty agent interactions trained using synthetic re-recordings**|Hyewon Han et.al.|[2402.09797](http://arxiv.org/abs/2402.09797)|null|\n", "2402.08932": "|**2024-02-14**|**Listening to Multi-talker Conversations: Modular and End-to-end Perspectives**|Desh Raj et.al.|[2402.08932](http://arxiv.org/abs/2402.08932)|null|\n", 
"2402.08898": "|**2024-02-14**|**UniEnc-CASSNAT: An Encoder-only Non-autoregressive ASR for Speech SSL Models**|Ruchao Fan et.al.|[2402.08898](http://arxiv.org/abs/2402.08898)|null|\n", "2402.08846": "|**2024-02-13**|**An Embarrassingly Simple Approach for LLM with Strong ASR Capacity**|Ziyang Ma et.al.|[2402.08846](http://arxiv.org/abs/2402.08846)|**[link](https://github.com/X-LANCE/SLAM-LLM)**|\n", "2402.08788": "|**2024-02-13**|**Syllable based DNN-HMM Cantonese Speech to Text System**|Timothy Wong et.al.|[2402.08788](http://arxiv.org/abs/2402.08788)|null|\n", "2402.08021": "|**2024-05-03**|**Careless Whisper: Speech-to-Text Hallucination Harms**|Allison Koenecke et.al.|[2402.08021](http://arxiv.org/abs/2402.08021)|**[link](https://github.com/koenecke/hallucination_harms)**|\n", "2402.07729": "|**2024-07-26**|**AIR-Bench: Benchmarking Large Audio-Language Models via Generative Comprehension**|Qian Yang et.al.|[2402.07729](http://arxiv.org/abs/2402.07729)|**[link](https://github.com/ofa-sys/air-bench)**|\n", "2402.07658": "|**2024-02-12**|**The Sound of Healthcare: Improving Medical Transcription ASR Accuracy with Large Language Models**|Ayo Adedeji et.al.|[2402.07658](http://arxiv.org/abs/2402.07658)|null|\n", "2402.07513": "|**2024-02-12**|**The Balancing Act: Unmasking and Alleviating ASR Biases in Portuguese**|Ajinkya Kulkarni et.al.|[2402.07513](http://arxiv.org/abs/2402.07513)|null|\n", "2402.07431": "|**2024-02-13**|**SALAD: Smart AI Language Assistant Daily**|Ragib Amin Nihal et.al.|[2402.07431](http://arxiv.org/abs/2402.07431)|null|\n", "2402.07095": "|**2024-02-11**|**Does ChatGPT and Whisper Make Humanoid Robots More Relatable?**|Xiaohui Chen et.al.|[2402.07095](http://arxiv.org/abs/2402.07095)|null|\n", "2402.06966": "|**2024-02-10**|**DeepCover: Advancing RNN Test Coverage and Online Error Prediction using State Machine Extraction**|Pouria Golshanrad 
et.al.|[2402.06966](http://arxiv.org/abs/2402.06966)|**[link](https://github.com/pouriagr/deep-cover)**|\n", "2402.06923": "|**2024-02-10**|**CochCeps-Augment: A Novel Self-Supervised Contrastive Learning Using Cochlear Cepstrum-based Masking for Speech Emotion Recognition**|Ioannis Ziogas et.al.|[2402.06923](http://arxiv.org/abs/2402.06923)|null|\n", "2402.06592": "|**2024-02-09**|**Self-consistent context aware conformer transducer for speech recognition**|Konstantin Kolokolov et.al.|[2402.06592](http://arxiv.org/abs/2402.06592)|null|\n", "2402.05706": "|**2024-02-08**|**Unified Speech-Text Pretraining for Spoken Dialog Modeling**|Heeseung Kim et.al.|[2402.05706](http://arxiv.org/abs/2402.05706)|null|\n", "2402.05457": "|**2024-02-08**|**It's Never Too Late: Fusing Acoustic Information into Large Language Models for Automatic Speech Recognition**|Chen Chen et.al.|[2402.05457](http://arxiv.org/abs/2402.05457)|null|\n", "2402.04805": "|**2024-02-07**|**Progressive unsupervised domain adaptation for ASR using ensemble models and multi-stage training**|Rehan Ahmad et.al.|[2402.04805](http://arxiv.org/abs/2402.04805)|null|\n", "2402.03988": "|**2024-05-28**|**REBORN: Reinforcement-Learned Boundary Segmentation with Iterative Training for Unsupervised ASR**|Liang-Hsuan Tseng et.al.|[2402.03988](http://arxiv.org/abs/2402.03988)|**[link](https://github.com/andybi7676/reborn-uasr)**|\n", "2402.03519": "|**2024-02-05**|**Resolving Transcription Ambiguity in Spanish: A Hybrid Acoustic-Lexical System for Punctuation Restoration**|Xiliang Zhu et.al.|[2402.03519](http://arxiv.org/abs/2402.03519)|null|\n", "2402.03050": "|**2024-02-05**|**A Comprehensive Study of the Current State-of-the-Art in Nepali Automatic Speech Recognition Systems**|Rupak Raj Ghimire et.al.|[2402.03050](http://arxiv.org/abs/2402.03050)|null|\n", "2402.02302": "|**2024-02-03**|**Predicting positive transfer for improved low-resource speech recognition using acoustic pseudo-tokens**|Nay San 
et.al.|[2402.02302](http://arxiv.org/abs/2402.02302)|null|\n", "2402.01931": "|**2024-02-02**|**Digits micro-model for accurate and secure transactions**|Chirag Chhablani et.al.|[2402.01931](http://arxiv.org/abs/2402.01931)|null|\n", "2402.01917": "|**2024-02-02**|**Whispering in Norwegian: Navigating Orthographic and Dialectic Challenges**|Per E Kummervold et.al.|[2402.01917](http://arxiv.org/abs/2402.01917)|null|\n", "2402.01172": "|**2024-02-02**|**Streaming Sequence Transduction through Dynamic Compression**|Weiting Tan et.al.|[2402.01172](http://arxiv.org/abs/2402.01172)|**[link](https://github.com/steventan0110/star)**|\n", "2402.01152": "|**2024-02-05**|**AccentFold: A Journey through African Accents for Zero-Shot ASR Adaptation to Target Accents**|Abraham Toluwase Owodunni et.al.|[2402.01152](http://arxiv.org/abs/2402.01152)|null|\n", "2402.01778": "|**2024-02-01**|**Introduction to speech recognition**|Gabriel Dauphin et.al.|[2402.01778](http://arxiv.org/abs/2402.01778)|null|\n", "2402.00632": "|**2024-02-01**|**Prosody in Cascade and Direct Speech-to-Text Translation: a case study on Korean Wh-Phrases**|Giulio Zhou et.al.|[2402.00632](http://arxiv.org/abs/2402.00632)|null|\n", "2402.00235": "|**2024-01-31**|**Exploring the limits of decoder-only models trained on public speech recognition corpora**|Ankit Gupta et.al.|[2402.00235](http://arxiv.org/abs/2402.00235)|null|\n", "2401.18045": "|**2024-01-31**|**SpeechComposer: Unifying Multiple Speech Tasks with Prompt Composition**|Yihan Wu et.al.|[2401.18045](http://arxiv.org/abs/2401.18045)|null|\n", "2401.17604": "|**2024-02-08**|**Computation and Parameter Efficient Multi-Modal Fusion Transformer for Cued Speech Recognition**|Lei Liu et.al.|[2401.17604](http://arxiv.org/abs/2401.17604)|null|\n", "2401.16658": "|**2024-06-16**|**OWSM v3.1: Better and Faster Open Whisper-Style Speech Models based on E-Branchformer**|Yifan Peng et.al.|[2401.16658](http://arxiv.org/abs/2401.16658)|null|\n", "2401.15704": 
"|**2024-01-28**|**Phoneme-Based Proactive Anti-Eavesdropping with Controlled Recording Privilege**|Peng Huang et.al.|[2401.15704](http://arxiv.org/abs/2401.15704)|null|\n", "2401.15676": "|**2024-01-28**|**On Speaker Attribution with SURT**|Desh Raj et.al.|[2401.15676](http://arxiv.org/abs/2401.15676)|**[link](https://github.com/k2-fsa/icefall)**|\n", "2401.15532": "|**2024-01-28**|**Byte Pair Encoding Is All You Need For Automatic Bengali Speech Recognition**|Ahnaf Mozib Samin et.al.|[2401.15532](http://arxiv.org/abs/2401.15532)|null|\n", "2401.15385": "|**2024-01-27**|**Towards Event Extraction from Speech with Contextual Clues**|Jingqi Kang et.al.|[2401.15385](http://arxiv.org/abs/2401.15385)|**[link](https://github.com/jodie-kang/speechee)**|\n", "2401.14890": "|**2024-01-26**|**Comparison of parameters of vowel sounds of russian and english languages**|V. I. Fedoseev et.al.|[2401.14890](http://arxiv.org/abs/2401.14890)|null|\n", "2401.14625": "|**2024-01-26**|**Toward Practical Automatic Speech Recognition and Post-Processing: a Call for Explainable Error Benchmark Guideline**|Seonmin Koo et.al.|[2401.14625](http://arxiv.org/abs/2401.14625)|null|\n", "2401.14185": "|**2024-01-25**|**TDFNet: An Efficient Audio-Visual Speech Separation Model with Top-down Fusion**|Samuel Pegg et.al.|[2401.14185](http://arxiv.org/abs/2401.14185)|**[link](https://github.com/spkgyk/TDFNet)**|\n", "2401.13575": "|**2024-01-24**|**CNN architecture extraction on edge GPU**|Peter Horvath et.al.|[2401.13575](http://arxiv.org/abs/2401.13575)|null|\n", "2401.13463": "|**2024-03-18**|**SpeechDPR: End-to-End Spoken Passage Retrieval for Open-Domain Spoken Question Answering**|Chyi-Jiunn Lin et.al.|[2401.13463](http://arxiv.org/abs/2401.13463)|null|\n", "2401.13260": "|**2024-05-28**|**MF-AED-AEC: Speech Emotion Recognition by Leveraging Multimodal Fusion, Asr Error Detection, and Asr Error Correction**|Jiajun He et.al.|[2401.13260](http://arxiv.org/abs/2401.13260)|null|\n", "2401.13146": 
"|**2024-01-23**|**Locality enhanced dynamic biasing and sampling strategies for contextual ASR**|Md Asif Jalal et.al.|[2401.13146](http://arxiv.org/abs/2401.13146)|null|\n", "2401.12789": "|**2024-01-23**|**Multilingual and Fully Non-Autoregressive ASR with Large Language Model Fusion: A Comprehensive Study**|W. Ronny Huang et.al.|[2401.12789](http://arxiv.org/abs/2401.12789)|null|\n", "2401.12085": "|**2024-01-22**|**Consistency Based Unsupervised Self-training For ASR Personalisation**|Jisi Zhang et.al.|[2401.12085](http://arxiv.org/abs/2401.12085)|null|\n", "2401.11983": "|**2024-01-22**|**Lightweight Protection for Privacy in Offloaded Speech Understanding**|Dongqi Cai et.al.|[2401.11983](http://arxiv.org/abs/2401.11983)|null|\n", "2401.11700": "|**2024-01-22**|**Keep Decoding Parallel with Effective Knowledge Distillation from Language Models to End-to-end Speech Recognisers**|Michael Hentschel et.al.|[2401.11700](http://arxiv.org/abs/2401.11700)|null|\n", "2401.11382": "|**2024-06-06**|**Using Large Language Model for End-to-End Chinese ASR and NER**|Yuang Li et.al.|[2401.11382](http://arxiv.org/abs/2401.11382)|null|\n", "2401.11268": "|**2024-02-02**|**Word-Level ASR Quality Estimation for Efficient Corpus Sampling and Post-Editing through Analyzing Attentions of a Reference-Free Metric**|Golara Javadi et.al.|[2401.11268](http://arxiv.org/abs/2401.11268)|**[link](https://github.com/aixplain/NoRefER)**|\n", "2401.11132": "|**2024-01-20**|**ConceptThread: Visualizing Threaded Concepts in MOOC Videos**|Zhiguang Zhou et.al.|[2401.11132](http://arxiv.org/abs/2401.11132)|null|\n", "2401.10449": "|**2024-01-19**|**Contextualized Automatic Speech Recognition with Attention-Based Bias Phrase Boosted Beam Search**|Yui Sudo et.al.|[2401.10449](http://arxiv.org/abs/2401.10449)|null|\n", "2401.10447": "|**2024-01-19**|**Investigating Training Strategies and Model Robustness of Low-Rank Adaptation for Language Modeling in Speech Recognition**|Yu Yu 
et.al.|[2401.10447](http://arxiv.org/abs/2401.10447)|null|\n", "2401.10446": "|**2024-01-19**|**Large Language Models are Efficient Learners of Noise-Robust Speech Recognition**|Yuchen Hu et.al.|[2401.10446](http://arxiv.org/abs/2401.10446)|**[link](https://github.com/yuchen005/robustger)**|\n", "2401.10411": "|**2024-01-18**|**AGADIR: Towards Array-Geometry Agnostic Directional Speech Recognition**|Ju Lin et.al.|[2401.10411](http://arxiv.org/abs/2401.10411)|null|\n", "2401.10070": "|**2024-01-18**|**Communication-Efficient Personalized Federated Learning for Speech-to-Text Tasks**|Yichao Du et.al.|[2401.10070](http://arxiv.org/abs/2401.10070)|null|\n", "2401.09802": "|**2024-07-18**|**Efficient Training for Multilingual Visual Speech Recognition: Pre-training with Discretized Visual Speech Representation**|Minsu Kim et.al.|[2401.09802](http://arxiv.org/abs/2401.09802)|null|\n", "2401.09759": "|**2024-07-02**|**SlideAVSR: A Dataset of Paper Explanation Videos for Audio-Visual Speech Recognition**|Hao Wang et.al.|[2401.09759](http://arxiv.org/abs/2401.09759)|null|\n", "2401.09315": "|**2024-01-17**|**On Speech Pre-emphasis as a Simple and Inexpensive Method to Boost Speech Enhancement**|Iv\u00e1n L\u00f3pez-Espejo et.al.|[2401.09315](http://arxiv.org/abs/2401.09315)|null|\n", "2401.08916": "|**2024-01-17**|**Two-pass Endpoint Detection for Speech Recognition**|Anirudh Raju et.al.|[2401.08916](http://arxiv.org/abs/2401.08916)|null|\n", "2401.08887": "|**2024-01-16**|**NOTSOFAR-1 Challenge: New Datasets, Baseline, and Tasks for Distant Meeting Transcription**|Alon Vinnikov et.al.|[2401.08887](http://arxiv.org/abs/2401.08887)|null|\n", "2401.08835": "|**2024-01-16**|**Improving ASR Contextual Biasing with Guided Attention**|Jiyang Tang et.al.|[2401.08835](http://arxiv.org/abs/2401.08835)|null|\n", "2401.08833": "|**2024-01-16**|**Revisiting Self-supervised Learning of Speech Representation from a Mutual Information Perspective**|Alexander H. 
Liu et.al.|[2401.08833](http://arxiv.org/abs/2401.08833)|null|\n", "2401.08052": "|**2024-03-01**|**Multi-Input Multi-Output Target-Speaker Voice Activity Detection For Unified, Flexible, and Robust Audio-Visual Speaker Diarization**|Ming Cheng et.al.|[2401.08052](http://arxiv.org/abs/2401.08052)|null|\n", "2401.07957": "|**2024-01-15**|**Machine Perceptual Quality: Evaluating the Impact of Severe Lossy Compression on Audio and Image Models**|Dan Jacobellis et.al.|[2401.07957](http://arxiv.org/abs/2401.07957)|**[link](https://github.com/danjacobellis/mpq)**|\n", "2401.07575": "|**2024-07-24**|**Cascaded Cross-Modal Transformer for Audio-Textual Classification**|Nicolae-Catalin Ristea et.al.|[2401.07575](http://arxiv.org/abs/2401.07575)|**[link](https://github.com/ristea/ccmt)**|\n", "2401.07506": "|**2024-01-15**|**SeMaScore : a new evaluation metric for automatic speech recognition tasks**|Zitha Sasindran et.al.|[2401.07506](http://arxiv.org/abs/2401.07506)|null|\n", "2401.07360": "|**2024-01-14**|**Promptformer: Prompted Conformer Transducer for ASR**|Sergio Duarte-Torres et.al.|[2401.07360](http://arxiv.org/abs/2401.07360)|null|\n", "2401.06980": "|**2024-01-13**|**Joint Unsupervised and Supervised Training for Automatic Speech Recognition via Bilevel Optimization**|A F M Saif et.al.|[2401.06980](http://arxiv.org/abs/2401.06980)|**[link](https://github.com/afmsaif/joint-unsupervised-and-supervised-training-for-automatic-speech-recognition-via-bilevel-optimization)**|\n", "2401.09354": "|**2024-01-12**|**Transcending Controlled Environments Assessing the Transferability of ASRRobust NLU Models to Real-World Applications**|Hania Khan et.al.|[2401.09354](http://arxiv.org/abs/2401.09354)|null|\n", "2401.06588": "|**2024-01-12**|**Dynamic Behaviour of Connectionist Speech Recognition with Strong Latency Constraints**|Giampiero Salvi et.al.|[2401.06588](http://arxiv.org/abs/2401.06588)|null|\n", "2401.06832": "|**2024-01-12**|**XLS-R Deep Learning Model for 
Multilingual ASR on Low- Resource Languages: Indonesian, Javanese, and Sundanese**|Panji Arisaputra et.al.|[2401.06832](http://arxiv.org/abs/2401.06832)|null|\n", "2401.06390": "|**2024-01-12**|**LCB-net: Long-Context Biasing for Audio-Visual Speech Recognition**|Fan Yu et.al.|[2401.06390](http://arxiv.org/abs/2401.06390)|**[link](https://github.com/alibaba-damo-academy/FunASR)**|\n", "2401.05689": "|**2024-01-11**|**UCorrect: An Unsupervised Framework for Automatic Speech Recognition Error Correction**|Jiaxin Guo et.al.|[2401.05689](http://arxiv.org/abs/2401.05689)|null|\n", "2401.06183": "|**2024-01-11**|**End to end Hindi to English speech conversion using Bark, mBART and a finetuned XLSR Wav2Vec2**|Aniket Tathe et.al.|[2401.06183](http://arxiv.org/abs/2401.06183)|null|\n", "2401.05551": "|**2024-01-10**|**Useful Blunders: Can Automated Speech Recognition Errors Improve Downstream Dementia Classification?**|Changye Li et.al.|[2401.05551](http://arxiv.org/abs/2401.05551)|null|\n", "2401.05336": "|**2024-01-10**|**Towards Online Sign Language Recognition and Translation**|Ronglai Zuo et.al.|[2401.05336](http://arxiv.org/abs/2401.05336)|**[link](https://github.com/FangyunWei/SLRT)**|\n", "2401.04482": "|**2024-07-17**|**Continuously Learning New Words in Automatic Speech Recognition**|Christian Huber et.al.|[2401.04482](http://arxiv.org/abs/2401.04482)|null|\n", "2401.04235": "|**2024-01-08**|**High-precision Voice Search Query Correction via Retrievable Speech-text Embedings**|Christopher Li et.al.|[2401.04235](http://arxiv.org/abs/2401.04235)|null|\n", "2401.04152": "|**2024-07-22**|**Cross-Speaker Encoding Network for Multi-Talker Speech Recognition**|Jiawen Kang et.al.|[2401.04152](http://arxiv.org/abs/2401.04152)|**[link](https://github.com/kjw11/csenet-asr)**|\n", "2401.03936": "|**2024-01-08**|**Exploratory Evaluation of Speech Content Masking**|Jennifer Williams et.al.|[2401.03936](http://arxiv.org/abs/2401.03936)|null|\n", "2401.03697": 
"|**2024-03-07**|**An audio-quality-based multi-strategy approach for target speaker extraction in the MISP 2023 Challenge**|Runduo Han et.al.|[2401.03697](http://arxiv.org/abs/2401.03697)|null|\n", "2401.03689": "|**2024-06-10**|**LUPET: Incorporating Hierarchical Information Path into Multilingual ASR**|Wei Liu et.al.|[2401.03689](http://arxiv.org/abs/2401.03689)|null|\n", "2401.03687": "|**2024-01-08**|**BS-PLCNet: Band-split Packet Loss Concealment Network with Multi-task Learning Framework and Multi-discriminators**|Zihan Zhang et.al.|[2401.03687](http://arxiv.org/abs/2401.03687)|null|\n", "2401.03506": "|**2024-07-22**|**DiarizationLM: Speaker Diarization Post-Processing with Large Language Models**|Quan Wang et.al.|[2401.03506](http://arxiv.org/abs/2401.03506)|**[link](https://github.com/google/speaker-id)**|\n", "2401.06788": "|**2024-02-29**|**The NPU-ASLP-LiAuto System Description for Visual Speech Recognition in CNVSRC 2023**|He Wang et.al.|[2401.06788](http://arxiv.org/abs/2401.06788)|**[link](https://github.com/mkt-dataoceanai/cnvsrc2023baseline)**|\n", "2401.03473": "|**2024-02-21**|**ICMC-ASR: The ICASSP 2024 In-Car Multi-Channel Automatic Speech Recognition Challenge**|He Wang et.al.|[2401.03473](http://arxiv.org/abs/2401.03473)|null|\n", "2401.03468": "|**2024-01-07**|**Multichannel AV-wav2vec2: A Framework for Learning Multichannel Multi-Modal Speech Representation**|Qiushi Zhu et.al.|[2401.03468](http://arxiv.org/abs/2401.03468)|**[link](https://github.com/zqs01/multi-channel-wav2vec2)**|\n", "2401.03424": "|**2024-04-08**|**MLCA-AVSR: Multi-Layer Cross Attention Fusion based Audio-Visual Speech Recognition**|He Wang et.al.|[2401.03424](http://arxiv.org/abs/2401.03424)|null|\n", "2401.03251": "|**2024-01-06**|**TeLeS: Temporal Lexeme Similarity Score to Estimate Confidence in End-to-End ASR**|Nagarathna Ravi et.al.|[2401.03251](http://arxiv.org/abs/2401.03251)|**[link](https://github.com/madhavlab/2023_teles_wlc)**|\n", "2401.03175": 
"|**2024-01-06**|**Part-of-Speech Tagger for Bodo Language using Deep Learning approach**|Dhrubajyoti Pathak et.al.|[2401.03175](http://arxiv.org/abs/2401.03175)|null|\n", "2401.02921": "|**2024-01-05**|**Towards ASR Robust Spoken Language Understanding Through In-Context Learning With Word Confusion Networks**|Kevin Everson et.al.|[2401.02921](http://arxiv.org/abs/2401.02921)|null|\n", "2401.02890": "|**2024-01-05**|**Nonlinear functional regression by functional deep neural network with kernel embedding**|Zhongjie Shi et.al.|[2401.02890](http://arxiv.org/abs/2401.02890)|null|\n", "2401.02673": "|**2024-01-05**|**A unified multichannel far-field speech recognition system: combining neural beamforming with attention based end-to-end model**|Dongdi Zhao et.al.|[2401.02673](http://arxiv.org/abs/2401.02673)|null|\n", "2401.02417": "|**2024-01-04**|**Task Oriented Dialogue as a Catalyst for Self-Supervised Automatic Speech Recognition**|David M. Chan et.al.|[2401.02417](http://arxiv.org/abs/2401.02417)|**[link](https://github.com/amazon-science/amazon-od3)**|\n", "2402.10218": "|**2024-01-04**|**AntiDeepFake: AI for Deep Fake Speech Recognition**|Enkhtogtokh Togootogtokh et.al.|[2402.10218](http://arxiv.org/abs/2402.10218)|null|\n", "2401.02046": "|**2024-01-04**|**CTC Blank Triggered Dynamic Layer-Skipping for Efficient CTC-based Speech Recognition**|Junfeng Hou et.al.|[2401.02046](http://arxiv.org/abs/2401.02046)|null|\n", "2401.01572": "|**2024-01-03**|**Hallucinations in Neural Automatic Speech Recognition: Identifying Errors and Hallucinatory Models**|Rita Frieske et.al.|[2401.01572](http://arxiv.org/abs/2401.01572)|null|\n", "2401.01537": "|**2024-06-04**|**The Art of Deception: Robust Backdoor Attack using Dynamic Stacking of Triggers**|Orson Mengara et.al.|[2401.01537](http://arxiv.org/abs/2401.01537)|null|\n", "2401.00662": "|**2024-01-01**|**Enhancing Pre-trained ASR System Fine-tuning for Dysarthric Speech Recognition using Adversarial Data 
Augmentation**|Huimeng Wang et.al.|[2401.00662](http://arxiv.org/abs/2401.00662)|null|\n", "2312.17279": "|**2024-05-02**|**Stateful Conformer with Cache-based Inference for Streaming Automatic Speech Recognition**|Vahid Noroozi et.al.|[2312.17279](http://arxiv.org/abs/2312.17279)|null|\n", "2312.16002": "|**2023-12-26**|**The NUS-HLT System for ICASSP2024 ICMC-ASR Grand Challenge**|Meng Ge et.al.|[2312.16002](http://arxiv.org/abs/2312.16002)|null|\n", "2312.15922": "|**2023-12-26**|**Towards Probing Contact Center Large Language Models**|Varun Nathan et.al.|[2312.15922](http://arxiv.org/abs/2312.15922)|null|\n", "2312.15499": "|**2023-12-24**|**Exploring data augmentation in bias mitigation against non-native-accented speech**|Yuanyuan Zhang et.al.|[2312.15499](http://arxiv.org/abs/2312.15499)|null|\n", "2312.14609": "|**2023-12-22**|**BLSTM-Based Confidence Estimation for End-to-End Speech Recognition**|Atsunori Ogawa et.al.|[2312.14609](http://arxiv.org/abs/2312.14609)|null|\n", "2312.14378": "|**2024-02-09**|**Multimodal Attention Merging for Improved Speech Recognition and Audio Event Classification**|Anirudh S. 
Sundar et.al.|[2312.14378](http://arxiv.org/abs/2312.14378)|null|\n", "2312.14055": "|**2024-07-22**|**Multi-Sentence Grounding for Long-term Instructional Video**|Zeqian Li et.al.|[2312.14055](http://arxiv.org/abs/2312.14055)|null|\n", "2312.14020": "|**2023-12-21**|**BANSpEmo: A Bangla Emotional Speech Recognition Dataset**|Md Gulzar Hussain et.al.|[2312.14020](http://arxiv.org/abs/2312.14020)|null|\n", "2312.13873": "|**2023-12-21**|**Self-Supervised Adaptive AV Fusion Module for Pre-Trained ASR Models**|Christopher Simic et.al.|[2312.13873](http://arxiv.org/abs/2312.13873)|null|\n", "2312.13560": "|**2024-02-03**|**kNN-CTC: Enhancing ASR via Retrieval of CTC Pseudo Labels**|Jiaming Zhou et.al.|[2312.13560](http://arxiv.org/abs/2312.13560)|**[link](https://github.com/nku-hlt/knn-ctc)**|\n", "2408.02582": "|**2024-08-05**|**Clustering and Mining Accented Speech for Inclusive and Fair Speech Recognition**|Jaeyoung Kim et.al.|[2408.02582](http://arxiv.org/abs/2408.02582)|null|\n", "2408.02369": "|**2024-08-08**|**The NPU-ASLP System Description for Visual Speech Recognition in CNVSRC 2024**|He Wang et.al.|[2408.02369](http://arxiv.org/abs/2408.02369)|**[link](https://gitlab.com/csltstu/sunine)**|\n", "2408.02178": "|**2024-08-05**|**StreamVoice+: Evolving into End-to-end Streaming Zero-shot Voice Conversion**|Zhichao Wang et.al.|[2408.02178](http://arxiv.org/abs/2408.02178)|null|\n", "2408.01808": "|**2024-08-03**|**ALIF: Low-Cost Adversarial Audio Attacks on Black-Box Speech Platforms using Linguistic Features**|Peng Cheng et.al.|[2408.01808](http://arxiv.org/abs/2408.01808)|**[link](https://github.com/TASER2023/TASER)**|\n", "2408.02978": "|**2024-08-06**|**ASR-enhanced Multimodal Representation Learning for Cross-Domain Product Retrieval**|Ruixiang Zhao et.al.|[2408.02978](http://arxiv.org/abs/2408.02978)|null|\n", "2408.02945": "|**2024-08-06**|**Self-Supervised Learning for Multi-Channel Neural Transducer**|Atsushi Kojima 
et.al.|[2408.02945](http://arxiv.org/abs/2408.02945)|null|\n", "2408.04325": "|**2024-08-08**|**HydraFormer: One Encoder For All Subsampling Rates**|Yaoxun Xu et.al.|[2408.04325](http://arxiv.org/abs/2408.04325)|**[link](https://github.com/hydraformer/hydraformer)**|\n", "2408.04306": "|**2024-08-08**|**Preserving spoken content in voice anonymisation with character-level vocoder conditioning**|Michele Panariello et.al.|[2408.04306](http://arxiv.org/abs/2408.04306)|**[link](https://github.com/m-pana/spk_anon_nac_lm)**|\n", "2408.04174": "|**2024-08-08**|**wav2graph: A Framework for Supervised Learning Knowledge Graph from Speech**|Khai Le-Duc et.al.|[2408.04174](http://arxiv.org/abs/2408.04174)|**[link](https://github.com/leduckhai/wav2graph)**|\n", "2408.03979": "|**2024-08-07**|**Speaker Adaptation for Quantised End-to-End ASR Models**|Qiuming Zhao et.al.|[2408.03979](http://arxiv.org/abs/2408.03979)|null|\n", "2408.05101": "|**2024-08-09**|**MooER: LLM-based Speech Recognition and Translation Models from Moore Threads**|Junhao Xu et.al.|[2408.05101](http://arxiv.org/abs/2408.05101)|**[link](https://github.com/moorethreads/mooer)**|\n", "2408.06264": "|**2024-08-12**|**Audio Enhancement for Computer Audition -- An Iterative Training Paradigm Using Sample Importance**|Manuel Milling et.al.|[2408.06264](http://arxiv.org/abs/2408.06264)|null|\n", "2408.06043": "|**2024-08-12**|**Enhancing Dialogue Speech Recognition with Robust Contextual Awareness via Noise Representation Learning**|Wonjun Lee et.al.|[2408.06043](http://arxiv.org/abs/2408.06043)|null|\n", "2408.05769": "|**2024-08-11**|**LI-TTA: Language Informed Test-Time Adaptation for Automatic Speech Recognition**|Eunseop Yoon et.al.|[2408.05769](http://arxiv.org/abs/2408.05769)|null|\n", "2408.05758": "|**2024-08-11**|**VQ-CTAP: Cross-Modal Fine-Grained Sequence Representation Learning for Speech Processing**|Chunyu Qiang et.al.|[2408.05758](http://arxiv.org/abs/2408.05758)|null|\n", "2408.05554": 
"|**2024-08-10**|**Improving Whisper's Recognition Performance for Under-Represented Language Kazakh Leveraging Unpaired Speech and Text**|Jinpeng Li et.al.|[2408.05554](http://arxiv.org/abs/2408.05554)|null|\n", "2408.06484": "|**2024-08-12**|**Cross-Lingual Conversational Speech Summarization with Large Language Models**|Max Nelson et.al.|[2408.06484](http://arxiv.org/abs/2408.06484)|null|\n", "2408.07388": "|**2024-08-14**|**DPSNN: Spiking Neural Network for Low-Latency Streaming Speech Enhancement**|Tao Sun et.al.|[2408.07388](http://arxiv.org/abs/2408.07388)|null|\n", "2408.08027": "|**2024-08-15**|**Enhancing Large Language Model-based Speech Recognition by Contextualization for Rare and Ambiguous Words**|Kento Nozawa et.al.|[2408.08027](http://arxiv.org/abs/2408.08027)|null|\n", "2408.07851": "|**2024-08-14**|**SER Evals: In-domain and Out-of-domain Benchmarking for Speech Emotion Recognition**|Mohamed Osman et.al.|[2408.07851](http://arxiv.org/abs/2408.07851)|**[link](https://github.com/spaghettiSystems/serval)**|\n", "2408.07081": "|**2024-08-16**|**MathBridge: A Large Corpus Dataset for Translating Spoken Mathematical Expressions into $LaTeX$ Formulas for Improved Readability**|Kyudan Jung et.al.|[2408.07081](http://arxiv.org/abs/2408.07081)|null|\n", "2408.09688": "|**2024-08-19**|**Recording for Eyes, Not Echoing to Ears: Contextualized Spoken-to-Written Conversion of ASR Transcripts**|Jiaqing Liu et.al.|[2408.09688](http://arxiv.org/abs/2408.09688)|null|\n", "2408.09491": "|**2024-08-18**|**A Transcription Prompt-based Efficient Audio Large Language Model for Robust Speech Recognition**|Yangze Li et.al.|[2408.09491](http://arxiv.org/abs/2408.09491)|null|\n", "2408.09215": "|**2024-08-17**|**Generating Data with Text-to-Speech and Large-Language Models for Conversational Speech Recognition**|Samuele Cornell et.al.|[2408.09215](http://arxiv.org/abs/2408.09215)|**[link](https://github.com/popcornell/ASRLightningFT)**|\n", "2408.10524": 
"|**2024-08-20**|**XCB: an effective contextual biasing approach to bias cross-lingual phrases in speech recognition**|Xucheng Wan et.al.|[2408.10524](http://arxiv.org/abs/2408.10524)|null|\n", "2408.11804": "|**2024-08-21**|**Approaching Deep Learning through the Spectral Dynamics of Weights**|David Yunis et.al.|[2408.11804](http://arxiv.org/abs/2408.11804)|**[link](https://github.com/dyunis/spectral_dynamics)**|\n", "2408.11258": "|**2024-08-21**|**Improving Speech Recognition Error Prediction for Modern and Off-the-shelf Speech Recognizers**|Prashant Serai et.al.|[2408.11258](http://arxiv.org/abs/2408.11258)|null|\n", "2408.12500": "|**2024-08-22**|**WhisperMask: A Noise Suppressive Mask-Type Microphone for Whisper Speech**|Hirotaka Hiraki et.al.|[2408.12500](http://arxiv.org/abs/2408.12500)|null|\n", "2408.12430": "|**2024-08-22**|**Positional Description for Numerical Normalization**|Deepanshu Gupta et.al.|[2408.12430](http://arxiv.org/abs/2408.12430)|null|\n", "2408.12279": "|**2024-08-22**|**Developing vocal system impaired patient-aimed voice quality assessment approach using ASR representation-included multiple features**|Shaoxiang Dang et.al.|[2408.12279](http://arxiv.org/abs/2408.12279)|null|\n", "2408.11940": "|**2024-08-21**|**The State of Commercial Automatic French Legal Speech Recognition Systems and their Impact on Court Reporters et al**|Nicolad Garneau et.al.|[2408.11940](http://arxiv.org/abs/2408.11940)|null|\n", "2408.11873": "|**2024-08-19**|**Parameter-Efficient Transfer Learning under Federated Learning for Automatic Speech Recognition**|Xuan Kan et.al.|[2408.11873](http://arxiv.org/abs/2408.11873)|null|\n", "2408.11849": "|**2024-08-13**|**Style-Talker: Finetuning Audio Language Model and Style-Based Text-to-Speech Model for Fast Spoken Dialogue Generation**|Yinghao Aaron Li et.al.|[2408.11849](http://arxiv.org/abs/2408.11849)|null|\n", "2408.13106": "|**2024-08-28**|**NEST: Self-supervised Fast Conformer as All-purpose Seasoning to Speech 
Processing Tasks**|He Huang et.al.|[2408.13106](http://arxiv.org/abs/2408.13106)|null|\n", "2408.13008": "|**2024-08-23**|**Focused Discriminative Training For Streaming CTC-Trained Automatic Speech Recognition Models**|Adnan Haider et.al.|[2408.13008](http://arxiv.org/abs/2408.13008)|null|\n", "2408.12734": "|**2024-08-22**|**Towards measuring fairness in speech recognition: Fair-Speech dataset**|Irina-Elena Veliche et.al.|[2408.12734](http://arxiv.org/abs/2408.12734)|null|\n", "2408.14418": "|**2024-08-26**|**MEDSAGE: Enhancing Robustness of Medical Dialogue Summarization to ASR Errors with LLM-generated Synthetic Dialogues**|Kuluhan Binici et.al.|[2408.14418](http://arxiv.org/abs/2408.14418)|null|\n", "2408.14262": "|**2024-08-26**|**Self-supervised Speech Representations Still Struggle with African American Vernacular English**|Kalvin Chang et.al.|[2408.14262](http://arxiv.org/abs/2408.14262)|**[link](https://github.com/cmu-llab/s3m-aave)**|\n", "2408.14082": "|**2024-08-26**|**Automatic recognition and detection of aphasic natural speech**|Mara Barberis et.al.|[2408.14082](http://arxiv.org/abs/2408.14082)|null|\n", "2408.13996": "|**2024-08-28**|**Research Advances and New Paradigms for Biology-inspired Spiking Neural Networks**|Tianyu Zheng et.al.|[2408.13996](http://arxiv.org/abs/2408.13996)|null|\n", "2408.13739": "|**2024-08-25**|**Literary and Colloquial Tamil Dialect Identification**|M. Nanmalar et.al.|[2408.13739](http://arxiv.org/abs/2408.13739)|null|\n", "2408.13644": "|**2024-08-24**|**Studying the Effect of Audio Filters in Pre-Trained Models for Environmental Sound Classification**|Aditya Dawn et.al.|[2408.13644](http://arxiv.org/abs/2408.13644)|null|\n", "2408.14991": "|**2024-08-27**|**Speech Recognition Transformers: Topological-lingualism Perspective**|Shruti Singh et.al.|[2408.14991](http://arxiv.org/abs/2408.14991)|null|\n", "2408.14887": "|**2024-08-27**|**Literary and Colloquial Dialect Identification for Tamil using Acoustic Features**|M. 
Nanmalar et.al.|[2408.14887](http://arxiv.org/abs/2408.14887)|null|\n", "2408.15616": "|**2024-08-28**|**Beyond Levenshtein: Leveraging Multiple Algorithms for Robust Word Error Rate Computations And Granular Error Classifications**|Korbinian Kuhn et.al.|[2408.15616](http://arxiv.org/abs/2408.15616)|**[link](https://github.com/shuffle-project/beyond-levenshtein)**|\n", "2408.15585": "|**2024-08-28**|**Whisper-PMFA: Partial Multi-Scale Feature Aggregation for Speaker Verification using Whisper Models**|Yiyang Zhao et.al.|[2408.15585](http://arxiv.org/abs/2408.15585)|null|\n", "2408.16589": "|**2024-08-29**|**CrisperWhisper: Accurate Timestamps on Verbatim Speech Transcriptions**|Laurin Wagner et.al.|[2408.16589](http://arxiv.org/abs/2408.16589)|null|\n", "2408.16564": "|**2024-08-29**|**Human-Inspired Audio-Visual Speech Recognition: Spike Activity, Cueing Interaction and Causal Processing**|Qianhui Liu et.al.|[2408.16564](http://arxiv.org/abs/2408.16564)|null|\n", "2408.16287": "|**2024-08-29**|**Measuring the Accuracy of Automatic Speech Recognition Solutions**|Korbinian Kuhn et.al.|[2408.16287](http://arxiv.org/abs/2408.16287)|**[link](https://github.com/shuffle-project/asr-comparison)**|\n", "2408.16204": "|**2024-08-29**|**Revisit Micro-batch Clipping: Adaptive Data Pruning via Gradient Manipulation**|Lun Wang et.al.|[2408.16204](http://arxiv.org/abs/2408.16204)|null|\n", "2408.16180": "|**2024-08-29**|**Benchmarking Japanese Speech Recognition on ASR-LLM Setups with Multi-Pass Augmented Generative Error Correction**|Yuka Ko et.al.|[2408.16180](http://arxiv.org/abs/2408.16180)|null|\n"}, "TTS": {"2408.06227": "|**2024-08-12**|**FLEURS-R: A Restored Multilingual Speech Corpus for Generation Tasks**|Min Ma et.al.|[2408.06227](http://arxiv.org/abs/2408.06227)|null|\n", "2408.05758": "|**2024-08-11**|**VQ-CTAP: Cross-Modal Fine-Grained Sequence Representation Learning for Speech Processing**|Chunyu Qiang 
et.al.|[2408.05758](http://arxiv.org/abs/2408.05758)|null|\n", "2408.03887": "|**2024-08-06**|**Central Kurdish Text-to-Speech Synthesis with Novel End-to-End Transformer Training**|Hawraz A. Ahmad et.al.|[2408.03887](http://arxiv.org/abs/2408.03887)|null|\n", "2408.01808": "|**2024-08-03**|**ALIF: Low-Cost Adversarial Audio Attacks on Black-Box Speech Platforms using Linguistic Features**|Peng Cheng et.al.|[2408.01808](http://arxiv.org/abs/2408.01808)|**[link](https://github.com/TASER2023/TASER)**|\n", "2408.00284": "|**2024-08-01**|**Bailing-TTS: Chinese Dialectal Speech Synthesis Towards Human-like Spontaneous Representation**|Xinhan Di et.al.|[2408.00284](http://arxiv.org/abs/2408.00284)|null|\n", "2407.21491": "|**2024-08-01**|**Generative Expressive Conversational Speech Synthesis**|Rui Liu et.al.|[2407.21491](http://arxiv.org/abs/2407.21491)|**[link](https://github.com/ai-s2-lab/gpt-talker)**|\n", "2407.21476": "|**2024-07-31**|**On the Problem of Text-To-Speech Model Selection for Synthetic Data Generation in Automatic Speech Recognition**|Nick Rossenbach et.al.|[2407.21476](http://arxiv.org/abs/2407.21476)|null|\n", "2407.18571": "|**2024-07-29**|**Speech Bandwidth Expansion Via High Fidelity Generative Adversarial Networks**|Mahmoud Salhab et.al.|[2407.18571](http://arxiv.org/abs/2407.18571)|null|\n", "2407.18541": "|**2024-07-26**|**Towards Improving NAM-to-Speech Synthesis Intelligibility using Self-Supervised Speech Models**|Neil Shah et.al.|[2407.18541](http://arxiv.org/abs/2407.18541)|null|\n", "2407.18505": "|**2024-07-26**|**VoxSim: A perceptual voice similarity dataset**|Junseok Ahn et.al.|[2407.18505](http://arxiv.org/abs/2407.18505)|null|\n", "2407.17997": "|**2024-07-25**|**On the Effect of Purely Synthetic Training Data for Different Automatic Speech Recognition Architectures**|Nick Rossenbach et.al.|[2407.17997](http://arxiv.org/abs/2407.17997)|null|\n", "2407.17167": "|**2024-07-24**|**Zero-Shot vs. 
Few-Shot Multi-Speaker TTS Using Pre-trained Czech SpeechT5 Model**|Jan Lehe\u010dka et.al.|[2407.17167](http://arxiv.org/abs/2407.17167)|null|\n", "2407.16840": "|**2024-07-23**|**Synth4Kws: Synthesized Speech for User Defined Keyword Spotting in Low Resource Environments**|Pai Zhu et.al.|[2407.16840](http://arxiv.org/abs/2407.16840)|null|\n", "2407.15835": "|**2024-07-22**|**dMel: Speech Tokenization made Simple**|He Bai et.al.|[2407.15835](http://arxiv.org/abs/2407.15835)|null|\n", "2407.15188": "|**2024-07-21**|**Overview of Speaker Modeling and Its Applications: From the Lens of Deep Speaker Representation Learning**|Shuai Wang et.al.|[2407.15188](http://arxiv.org/abs/2407.15188)|null|\n", "2407.14212": "|**2024-07-19**|**Braille-to-Speech Generator: Audio Generation Based on Joint Fine-Tuning of CLIP and Fastspeech2**|Chun Xu et.al.|[2407.14212](http://arxiv.org/abs/2407.14212)|null|\n", "2407.14056": "|**2024-07-19**|**Rasa: Building Expressive Speech Synthesis Systems for Indian Languages in Low-resource Settings**|Praveen Srinivasa Varadhan et.al.|[2407.14056](http://arxiv.org/abs/2407.14056)|**[link](https://github.com/AI4Bharat/Rasa)**|\n", "2407.14006": "|**2024-07-19**|**MSceneSpeech: A Multi-Scene Speech Dataset For Expressive Speech Synthesis**|Qian Yang et.al.|[2407.14006](http://arxiv.org/abs/2407.14006)|null|\n", "2407.13509": "|**2024-07-18**|**Spontaneous Style Text-to-Speech Synthesis with Controllable Spontaneous Behaviors Based on Language Models**|Weiqin Li et.al.|[2407.13509](http://arxiv.org/abs/2407.13509)|null|\n", "2408.00004": "|**2024-07-18**|**Handling Numeric Expressions in Automatic Speech Recognition**|Christian Huber et.al.|[2408.00004](http://arxiv.org/abs/2408.00004)|null|\n", "2407.12707": "|**2024-07-22**|**TTSDS -- Text-to-Speech Distribution Score**|Christoph Minixhofer et.al.|[2407.12707](http://arxiv.org/abs/2407.12707)|**[link](https://github.com/ttsds/ttsds)**|\n", "2408.00788": "|**2024-07-17**|**SpikeVoice: 
High-Quality Text-to-Speech Via Efficient Spiking Neural Network**|Kexin Wang et.al.|[2408.00788](http://arxiv.org/abs/2408.00788)|null|\n", "2407.12229": "|**2024-07-17**|**Laugh Now Cry Later: Controlling Time-Varying Emotional States of Flow-Matching-Based Zero-Shot Text-to-Speech**|Haibin Wu et.al.|[2407.12229](http://arxiv.org/abs/2407.12229)|null|\n", "2407.12206": "|**2024-07-16**|**A Language Modeling Approach to Diacritic-Free Hebrew TTS**|Amit Roth et.al.|[2407.12206](http://arxiv.org/abs/2407.12206)|null|\n", "2407.09732": "|**2024-07-13**|**Speech Slytherin: Examining the Performance and Efficiency of Mamba for Speech Separation, Recognition, and Synthesis**|Xilin Jiang et.al.|[2407.09732](http://arxiv.org/abs/2407.09732)|**[link](https://github.com/xi-j/Mamba-TasNet)**|\n", "2407.09370": "|**2024-07-17**|**Learning High-Frequency Functions Made Easy with Sinusoidal Positional Encoding**|Chuanhao Sun et.al.|[2407.09370](http://arxiv.org/abs/2407.09370)|**[link](https://github.com/zhyuan11/SPE)**|\n", "2407.08551": "|**2024-07-11**|**Autoregressive Speech Synthesis without Vector Quantization**|Lingwei Meng et.al.|[2407.08551](http://arxiv.org/abs/2407.08551)|null|\n", "2407.08248": "|**2024-07-11**|**Toward accessible comics for blind and low vision readers**|Christophe Rigaud et.al.|[2407.08248](http://arxiv.org/abs/2407.08248)|null|\n", "2407.08016": "|**2024-07-10**|**Source Tracing of Audio Deepfake Systems**|Nicholas Klein et.al.|[2407.08016](http://arxiv.org/abs/2407.08016)|null|\n", "2407.18332": "|**2024-07-08**|**Analyzing Speech Unit Selection for Textless Speech-to-Speech Translation**|Jarod Duret et.al.|[2407.18332](http://arxiv.org/abs/2407.18332)|null|\n", "2407.05471": "|**2024-07-07**|**Fine-Grained and Interpretable Neural Speech Editing**|Max Morrison et.al.|[2407.05471](http://arxiv.org/abs/2407.05471)|**[link](https://github.com/maxrmorrison/torbi)**|\n", "2407.05421": "|**2024-07-07**|**ASRRL-TTS: Agile Speaker Representation 
Reinforcement Learning for Text-to-Speech Speaker Adaptation**|Ruibo Fu et.al.|[2407.05421](http://arxiv.org/abs/2407.05421)|null|\n", "2407.05407": "|**2024-07-09**|**CosyVoice: A Scalable Multilingual Zero-shot Text-to-speech Synthesizer based on Supervised Semantic Tokens**|Zhihao Du et.al.|[2407.05407](http://arxiv.org/abs/2407.05407)|null|\n", "2407.04575": "|**2024-07-05**|**FA-GAN: Artifacts-free and Phase-aware High-fidelity GAN-based Vocoder**|Rubing Shen et.al.|[2407.04575](http://arxiv.org/abs/2407.04575)|null|\n", "2407.04291": "|**2024-07-05**|**We Need Variations in Speech Synthesis: Sub-center Modelling for Speaker Embeddings**|Ismail Rasim Ulgen et.al.|[2407.04291](http://arxiv.org/abs/2407.04291)|null|\n", "2407.04047": "|**2024-07-04**|**Improving Accented Speech Recognition using Data Augmentation based on Unsupervised Text-to-Speech Synthesis**|Cong-Thanh Do et.al.|[2407.04047](http://arxiv.org/abs/2407.04047)|null|\n", "2407.04034": "|**2024-07-04**|**Optimizing a-DCF for Spoofing-Robust Speaker Verification**|O\u011fuzhan Kurnaz et.al.|[2407.04034](http://arxiv.org/abs/2407.04034)|null|\n", "2407.03892": "|**2024-07-04**|**On the Effectiveness of Acoustic BPE in Decoder-Only TTS**|Bohan Li et.al.|[2407.03892](http://arxiv.org/abs/2407.03892)|null|\n", "2407.03236": "|**2024-07-14**|**CATT: Character-based Arabic Tashkeel Transformer**|Faris Alasmary et.al.|[2407.03236](http://arxiv.org/abs/2407.03236)|**[link](https://github.com/abjadai/catt)**|\n", "2407.02937": "|**2024-07-03**|**Probing the Feasibility of Multilingual Speaker Anonymization**|Sarina Meyer et.al.|[2407.02937](http://arxiv.org/abs/2407.02937)|**[link](https://github.com/digitalphonetics/speaker-anonymization)**|\n", "2407.02243": "|**2024-07-02**|**Robust Zero-Shot Text-to-Speech Synthesis with Reverse Inference Optimization**|Yuchen Hu et.al.|[2407.02243](http://arxiv.org/abs/2407.02243)|null|\n", "2407.01927": "|**2024-07-02**|**TTSlow: Slow Down Text-to-Speech with 
Efficiency Robustness Evaluations**|Xiaoxue Gao et.al.|[2407.01927](http://arxiv.org/abs/2407.01927)|null|\n", "2407.01291": "|**2024-07-01**|**Lightweight Zero-shot Text-to-Speech with Mixture of Adapters**|Kenichi Fujita et.al.|[2407.01291](http://arxiv.org/abs/2407.01291)|null|\n", "2407.12038": "|**2024-07-31**|**ICAGC 2024: Inspirational and Convincing Audio Generation Challenge 2024**|Ruibo Fu et.al.|[2407.12038](http://arxiv.org/abs/2407.12038)|null|\n", "2407.00826": "|**2024-06-30**|**NAIST Simultaneous Speech Translation System for IWSLT 2024**|Yuka Ko et.al.|[2407.00826](http://arxiv.org/abs/2407.00826)|null|\n", "2407.00766": "|**2024-06-30**|**An Attribute Interpolation Method in Speech Synthesis by Model Merging**|Masato Murata et.al.|[2407.00766](http://arxiv.org/abs/2407.00766)|null|\n", "2407.00753": "|**2024-06-30**|**FLY-TTS: Fast, Lightweight and High-Quality End-to-End Text-to-Speech Synthesis**|Yinlin Guo et.al.|[2407.00753](http://arxiv.org/abs/2407.00753)|null|\n", "2407.00463": "|**2024-07-18**|**Open-Source Conversational AI with SpeechBrain 1.0**|Mirco Ravanelli et.al.|[2407.00463](http://arxiv.org/abs/2407.00463)|null|\n", "2406.19243": "|**2024-06-27**|**Application of ASV for Voice Identification after VC and Duration Predictor Improvement in TTS Models**|Borodin Kirill Nikolayevich et.al.|[2406.19243](http://arxiv.org/abs/2406.19243)|null|\n", "2406.19135": "|**2024-06-27**|**DEX-TTS: Diffusion-based EXpressive Text-to-Speech with Style Modeling on Time Variability**|Hyun Joon Park et.al.|[2406.19135](http://arxiv.org/abs/2406.19135)|**[link](https://github.com/winddori2002/dex-tts)**|\n", "2406.18135": "|**2024-06-26**|**Automatic Speech Recognition for Hindi**|Anish Saha et.al.|[2406.18135](http://arxiv.org/abs/2406.18135)|null|\n", "2406.18089": "|**2024-06-26**|**A Study on Synthesizing Expressive Violin Performances: Approaches and Comparisons**|Tzu-Yun Hung et.al.|[2406.18089](http://arxiv.org/abs/2406.18089)|null|\n", 
"2406.18088": "|**2024-06-29**|**LLM-Driven Multimodal Opinion Expression Identification**|Bonian Jia et.al.|[2406.18088](http://arxiv.org/abs/2406.18088)|null|\n", "2406.18009": "|**2024-06-26**|**E2 TTS: Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS**|Sefik Emre Eskimez et.al.|[2406.18009](http://arxiv.org/abs/2406.18009)|null|\n", "2406.17957": "|**2024-06-25**|**Improving Robustness of LLM-based Speech Synthesis by Learning Monotonic Alignment**|Paarth Neekhara et.al.|[2406.17957](http://arxiv.org/abs/2406.17957)|null|\n", "2406.17310": "|**2024-06-25**|**High Fidelity Text-to-Speech Via Discrete Tokens Using Token Transducer and Group Masked Language Model**|Joun Yeop Lee et.al.|[2406.17310](http://arxiv.org/abs/2406.17310)|null|\n", "2406.17257": "|**2024-06-25**|**Leveraging Parameter-Efficient Transfer Learning for Multi-Lingual Text-to-Speech Adaptation**|Yingting Li et.al.|[2406.17257](http://arxiv.org/abs/2406.17257)|null|\n", "2406.16808": "|**2024-06-24**|**Exploring the Capability of Mamba in Speech Applications**|Koichi Miyazaki et.al.|[2406.16808](http://arxiv.org/abs/2406.16808)|null|\n", "2406.16751": "|**2024-07-07**|**Towards Zero-Shot Text-To-Speech for Arabic Dialects**|Khai Duy Doan et.al.|[2406.16751](http://arxiv.org/abs/2406.16751)|null|\n", "2406.16716": "|**2024-06-24**|**One-Class Learning with Adaptive Centroid Shift for Audio Deepfake Detection**|Hyun Myung Kim et.al.|[2406.16716](http://arxiv.org/abs/2406.16716)|null|\n", "2406.17801": "|**2024-06-22**|**A multi-speaker multi-lingual voice cloning system based on vits2 for limmits 2024 challenge**|Xiaopeng Wang et.al.|[2406.17801](http://arxiv.org/abs/2406.17801)|null|\n", "2406.15752": "|**2024-06-22**|**TacoLM: GaTed Attention Equipped Codec Language Model are Efficient Zero-Shot Text to Speech Synthesizers**|Yakun Song et.al.|[2406.15752](http://arxiv.org/abs/2406.15752)|**[link](https://github.com/Ereboas/TacoLM)**|\n", "2406.14890": "|**2024-06-21**|**InterBiasing: 
Boost Unseen Word Recognition through Biasing Intermediate Predictions**|Yu Nakagome et.al.|[2406.14890](http://arxiv.org/abs/2406.14890)|null|\n", "2406.14875": "|**2024-06-21**|**GLOBE: A High-quality English Corpus with Global Accents for Zero-shot Speaker Adaptive Text-to-Speech**|Wenbin Wang et.al.|[2406.14875](http://arxiv.org/abs/2406.14875)|null|\n", "2406.14294": "|**2024-06-21**|**DASB - Discrete Audio and Speech Benchmark**|Pooneh Mousavi et.al.|[2406.14294](http://arxiv.org/abs/2406.14294)|null|\n", "2406.12946": "|**2024-06-18**|**Instruction Data Generation and Unsupervised Adaptation for Speech Language Models**|Vahid Noroozi et.al.|[2406.12946](http://arxiv.org/abs/2406.12946)|null|\n", "2406.12164": "|**2024-07-09**|**A Mel Spectrogram Enhancement Paradigm Based on CWT in Speech Synthesis**|Guoqiang Hu et.al.|[2406.12164](http://arxiv.org/abs/2406.12164)|null|\n", "2406.11727": "|**2024-06-27**|**1000 African Voices: Advancing inclusive multi-speaker multi-accent speech synthesis**|Sewade Ogun et.al.|[2406.11727](http://arxiv.org/abs/2406.11727)|null|\n", "2406.11427": "|**2024-06-17**|**DiTTo-TTS: Efficient and Scalable Zero-Shot Text-to-Speech with Diffusion Transformer**|Keon Lee et.al.|[2406.11427](http://arxiv.org/abs/2406.11427)|null|\n", "2406.11037": "|**2024-06-16**|**NAST: Noise Aware Speech Tokenization for Speech Language Models**|Shoval Messica et.al.|[2406.11037](http://arxiv.org/abs/2406.11037)|**[link](https://github.com/ShovalMessica/NAST)**|\n", "2406.10844": "|**2024-06-16**|**Multi-Scale Accent Modeling with Disentangling for Multi-Speaker Multi-Accent TTS Synthesis**|Xuehao Zhou et.al.|[2406.10844](http://arxiv.org/abs/2406.10844)|null|\n", "2406.10514": "|**2024-06-15**|**GTR-Voice: Articulatory Phonetics Informed Controllable Expressive Speech Synthesis**|Zehua Kcriss Li et.al.|[2406.10514](http://arxiv.org/abs/2406.10514)|null|\n", "2406.10422": "|**2024-06-14**|**Phoneme Discretized Saliency Maps for Explainable Detection 
of AI-Generated Voice**|Shubham Gupta et.al.|[2406.10422](http://arxiv.org/abs/2406.10422)|null|\n", "2406.10056": "|**2024-06-14**|**UniAudio 1.5: Large Language Model-driven Audio Codec is A Few-shot Audio Task Learner**|Dongchao Yang et.al.|[2406.10056](http://arxiv.org/abs/2406.10056)|**[link](https://github.com/yangdongchao/llm-codec)**|\n", "2406.09869": "|**2024-06-14**|**MMM: Multi-Layer Multi-Residual Multi-Stream Discrete Speech Representation from Self-supervised Learning Model**|Jiatong Shi et.al.|[2406.09869](http://arxiv.org/abs/2406.09869)|null|\n", "2406.08989": "|**2024-06-13**|**ToneUnit: A Speech Discretization Approach for Tonal Language Speech Synthesis**|Dehua Tao et.al.|[2406.08989](http://arxiv.org/abs/2406.08989)|null|\n", "2406.08820": "|**2024-06-13**|**DisfluencySpeech -- Single-Speaker Conversational Speech Dataset with Paralanguage**|Kyra Wang et.al.|[2406.08820](http://arxiv.org/abs/2406.08820)|null|\n", "2406.08812": "|**2024-06-13**|**Generating Speakers by Prompting Listener Impressions for Pre-trained Multi-Speaker Text-to-Speech Systems**|Zhengyang Chen et.al.|[2406.08812](http://arxiv.org/abs/2406.08812)|null|\n", "2406.08802": "|**2024-06-13**|**DubWise: Video-Guided Speech Duration Control in Multimodal LLM-based Text-to-Speech for Dubbing**|Neha Sahipjohn et.al.|[2406.08802](http://arxiv.org/abs/2406.08802)|null|\n", "2406.08568": "|**2024-06-12**|**Training Data Augmentation for Dysarthric Automatic Speech Recognition by Text-to-Dysarthric-Speech Synthesis**|Wing-Zin Leung et.al.|[2406.08568](http://arxiv.org/abs/2406.08568)|null|\n", "2406.08416": "|**2024-06-20**|**TokSing: Singing Voice Synthesis based on Discrete Tokens**|Yuning Wu et.al.|[2406.08416](http://arxiv.org/abs/2406.08416)|null|\n", "2406.08196": "|**2024-06-12**|**FreeV: Free Lunch For Vocoders Through Pseudo Inversed Mel Filter**|Yuanjun Lv et.al.|[2406.08196](http://arxiv.org/abs/2406.08196)|**[link](https://github.com/bakerbunker/freev)**|\n", 
"2406.08111": "|**2024-06-12**|**Audio-conditioned phonemic and prosodic annotation for building text-to-speech models from unlabeled speech data**|Yuma Shirahata et.al.|[2406.08111](http://arxiv.org/abs/2406.08111)|null|\n", "2406.08076": "|**2024-06-12**|**VECL-TTS: Voice identity and Emotional style controllable Cross-Lingual Text-to-Speech**|Ashishkumar Gudmalwar et.al.|[2406.08076](http://arxiv.org/abs/2406.08076)|null|\n", "2406.07969": "|**2024-06-12**|**LibriTTS-P: A Corpus with Speaking Style and Speaker Identity Prompts for Text-to-Speech and Style Captioning**|Masaya Kawamura et.al.|[2406.07969](http://arxiv.org/abs/2406.07969)|**[link](https://github.com/line/libritts-p)**|\n", "2406.07855": "|**2024-06-12**|**VALL-E R: Robust and Efficient Zero-Shot Text-to-Speech Synthesis via Monotonic Alignment**|Bing Han et.al.|[2406.07855](http://arxiv.org/abs/2406.07855)|null|\n", "2406.07803": "|**2024-06-12**|**EmoSphere-TTS: Emotional Style and Intensity Modeling via Spherical Emotion Vector for Controllable Emotional Text-to-Speech**|Deok-Hyeon Cho et.al.|[2406.07803](http://arxiv.org/abs/2406.07803)|**[link](https://github.com/Choddeok/EmoSphere-TTS)**|\n", "2406.07801": "|**2024-06-12**|**PolySpeech: Exploring Unified Multitask Speech Models for Competitiveness with Single-task Models**|Runyan Yang et.al.|[2406.07801](http://arxiv.org/abs/2406.07801)|null|\n", "2406.07725": "|**2024-06-11**|**The Interspeech 2024 Challenge on Speech Processing Using Discrete Units**|Xuankai Chang et.al.|[2406.07725](http://arxiv.org/abs/2406.07725)|null|\n", "2406.07289": "|**2024-06-11**|**Can We Achieve High-quality Direct Speech-to-Speech Translation without Parallel Speech Data?**|Qingkai Fang et.al.|[2406.07289](http://arxiv.org/abs/2406.07289)|null|\n", "2406.07237": "|**2024-06-11**|**CodecFake: Enhancing Anti-Spoofing Models Against Deepfake Audios from Codec-Based Speech Synthesis Systems**|Haibin Wu et.al.|[2406.07237](http://arxiv.org/abs/2406.07237)|null|\n", 
"2406.06979": "|**2024-06-11**|**AudioMarkBench: Benchmarking Robustness of Audio Watermarking**|Hongbin Liu et.al.|[2406.06979](http://arxiv.org/abs/2406.06979)|**[link](https://github.com/moyangkuo/audiomarkbench)**|\n", "2406.06406": "|**2024-06-11**|**Controlling Emotion in Text-to-Speech with Natural Language Prompts**|Thomas Bott et.al.|[2406.06406](http://arxiv.org/abs/2406.06406)|**[link](https://github.com/digitalphonetics/ims-toucan)**|\n", "2406.06403": "|**2024-06-10**|**Meta Learning Text-to-Speech Synthesis in over 7000 Languages**|Florian Lux et.al.|[2406.06403](http://arxiv.org/abs/2406.06403)|**[link](https://github.com/digitalphonetics/ims-toucan)**|\n", "2406.06111": "|**2024-06-10**|**JenGAN: Stacked Shifted Filters in GAN-Based Speech Synthesis**|Hyunjae Cho et.al.|[2406.06111](http://arxiv.org/abs/2406.06111)|null|\n", "2406.05965": "|**2024-06-10**|**MakeSinger: A Semi-Supervised Training Method for Data-Efficient Singing Voice Synthesis via Classifier-free Diffusion Guidance**|Semin Kim et.al.|[2406.05965](http://arxiv.org/abs/2406.05965)|null|\n", "2406.05763": "|**2024-06-19**|**WenetSpeech4TTS: A 12,800-hour Mandarin TTS Corpus for Large Speech Generation Model Benchmark**|Linhan Ma et.al.|[2406.05763](http://arxiv.org/abs/2406.05763)|**[link](https://github.com/dukGuo/valle-audiodec)**|\n", "2406.05699": "|**2024-06-09**|**An Investigation of Noise Robustness for Flow-Matching-Based Zero-Shot TTS**|Xiaofei Wang et.al.|[2406.05699](http://arxiv.org/abs/2406.05699)|null|\n", "2406.05681": "|**2024-06-11**|**Towards Expressive Zero-Shot Speech Synthesis with Hierarchical Prosody Modeling**|Yuepeng Jiang et.al.|[2406.05681](http://arxiv.org/abs/2406.05681)|null|\n", "2406.05672": "|**2024-06-12**|**Text-aware and Context-aware Expressive Audiobook Speech Synthesis**|Dake Guo et.al.|[2406.05672](http://arxiv.org/abs/2406.05672)|null|\n", "2408.06906": "|**2024-08-13**|**VNet: A GAN-based Multi-Tier Discriminator Network for Speech Synthesis 
Vocoders**|Yubing Cao et.al.|[2408.06906](http://arxiv.org/abs/2408.06906)|null|\n", "2408.06858": "|**2024-08-13**|**SaSLaW: Dialogue Speech Corpus with Audio-visual Egocentric Information Toward Environment-adaptive Dialogue Speech Synthesis**|Osamu Take et.al.|[2408.06858](http://arxiv.org/abs/2408.06858)|**[link](https://github.com/sarulab-speech/saslaw)**|\n", "2408.06827": "|**2024-08-13**|**PRESENT: Zero-Shot Text-to-Prosody Control**|Perry Lam et.al.|[2408.06827](http://arxiv.org/abs/2408.06827)|**[link](https://github.com/iamanigeeit/present)**|\n", "2408.07547": "|**2024-08-14**|**PeriodWave: Multi-Period Flow Matching for High-Fidelity Waveform Generation**|Sang-Hoon Lee et.al.|[2408.07547](http://arxiv.org/abs/2408.07547)|**[link](https://github.com/sh-lee-prml/periodwave)**|\n", "2408.07414": "|**2024-08-14**|**WavLM model ensemble for audio deepfake detection**|David Combei et.al.|[2408.07414](http://arxiv.org/abs/2408.07414)|null|\n", "2408.09215": "|**2024-08-17**|**Generating Data with Text-to-Speech and Large-Language Models for Conversational Speech Recognition**|Samuele Cornell et.al.|[2408.09215](http://arxiv.org/abs/2408.09215)|**[link](https://github.com/popcornell/ASRLightningFT)**|\n", "2408.10852": "|**2024-08-20**|**EELE: Exploring Efficient and Extensible LoRA Integration in Emotional Text-to-Speech**|Xin Qi et.al.|[2408.10852](http://arxiv.org/abs/2408.10852)|null|\n", "2408.10771": "|**2024-08-20**|**SSL-TTS: Leveraging Self-Supervised Embeddings and kNN Retrieval for Zero-Shot Multi-speaker TTS**|Karl El Hajal et.al.|[2408.10771](http://arxiv.org/abs/2408.10771)|null|\n", "2408.10549": "|**2024-08-20**|**AI-Based IVR**|Gassyrbek Kosherbay et.al.|[2408.10549](http://arxiv.org/abs/2408.10549)|null|\n", "2408.10463": "|**2024-08-20**|**Adversarial training of Keyword Spotting to Minimize TTS Data Overfitting**|Hyun Jin Park et.al.|[2408.10463](http://arxiv.org/abs/2408.10463)|null|\n", "2408.10207": "|**2024-07-01**|**A Comprehensive 
Survey on Diffusion Models and Their Applications**|Md Manjurul Ahsan et.al.|[2408.10207](http://arxiv.org/abs/2408.10207)|null|\n", "2408.12430": "|**2024-08-22**|**Positional Description for Numerical Normalization**|Deepanshu Gupta et.al.|[2408.12430](http://arxiv.org/abs/2408.12430)|null|\n", "2408.12170": "|**2024-08-22**|**VoiceX: A Text-To-Speech Framework for Custom Voices**|Silvan Mertes et.al.|[2408.12170](http://arxiv.org/abs/2408.12170)|null|\n", "2408.11849": "|**2024-08-13**|**Style-Talker: Finetuning Audio Language Model and Style-Based Text-to-Speech Model for Fast Spoken Dialogue Generation**|Yinghao Aaron Li et.al.|[2408.11849](http://arxiv.org/abs/2408.11849)|null|\n", "2408.13240": "|**2024-08-23**|**Which Prosodic Features Matter Most for Pragmatics?**|Nigel G. Ward et.al.|[2408.13240](http://arxiv.org/abs/2408.13240)|null|\n", "2408.14423": "|**2024-08-27**|**DualSpeech: Enhancing Speaker-Fidelity and Text-Intelligibility Through Dual Classifier-Free Guidance**|Jinhyeok Yang et.al.|[2408.14423](http://arxiv.org/abs/2408.14423)|null|\n", "2408.13970": "|**2024-08-26**|**Anonymization of Voices in Spaces for Civic Dialogue: Measuring Impact on Empathy, Trust, and Feeling Heard**|Wonjune Kang et.al.|[2408.13970](http://arxiv.org/abs/2408.13970)|null|\n", "2408.13893": "|**2024-08-28**|**SimpleSpeech 2: Towards Simple and Efficient Text-to-Speech with Flow-based Scalar Latent Transformer Diffusion Models**|Dongchao Yang et.al.|[2408.13893](http://arxiv.org/abs/2408.13893)|null|\n", "2408.13608": "|**2024-08-24**|**SpeechCraft: A Fine-grained Expressive Speech Dataset with Natural Language Description**|Zeyu Jin et.al.|[2408.13608](http://arxiv.org/abs/2408.13608)|**[link](https://github.com/thuhcsi/speechcraft)**|\n", "2408.14887": "|**2024-08-27**|**Literary and Colloquial Dialect Identification for Tamil using Acoustic Features**|M. 
Nanmalar et.al.|[2408.14887](http://arxiv.org/abs/2408.14887)|null|\n", "2408.14739": "|**2024-08-28**|**VoiceTailor: Lightweight Plug-In Adapter for Diffusion-Based Personalized Text-to-Speech**|Heeseung Kim et.al.|[2408.14739](http://arxiv.org/abs/2408.14739)|null|\n", "2408.14713": "|**2024-08-27**|**StyleSpeech: Parameter-efficient Fine Tuning for Pre-trained Controllable Text-to-Speech**|Haowei Lou et.al.|[2408.14713](http://arxiv.org/abs/2408.14713)|null|\n", "2408.15916": "|**2024-08-28**|**Multi-modal Adversarial Training for Zero-Shot Voice Cloning**|John Janiczek et.al.|[2408.15916](http://arxiv.org/abs/2408.15916)|null|\n", "2408.15775": "|**2024-08-29**|**Easy, Interpretable, Effective: openSMILE for voice deepfake detection**|Octavian Pascu et.al.|[2408.15775](http://arxiv.org/abs/2408.15775)|null|\n", "2408.15676": "|**2024-08-28**|**VoxInstruct: Expressive Human Instruction-to-Speech Generation with Unified Multilingual Codec Language Modelling**|Yixuan Zhou et.al.|[2408.15676](http://arxiv.org/abs/2408.15676)|null|\n", "2408.16725": "|**2024-08-30**|**Mini-Omni: Language Models Can Hear, Talk While Thinking in Streaming**|Zhifei Xie et.al.|[2408.16725](http://arxiv.org/abs/2408.16725)|**[link](https://github.com/gpt-omni/mini-omni)**|\n", "2408.16546": "|**2024-08-29**|**RAVE for Speech: Efficient Voice Conversion at High Sampling Rates**|Anders R. 
Bargum et.al.|[2408.16546](http://arxiv.org/abs/2408.16546)|null|\n", "2408.16373": "|**2024-08-29**|**Enabling Beam Search for Language Model-Based Text-to-Speech Synthesis**|Zehai Tu et.al.|[2408.16373](http://arxiv.org/abs/2408.16373)|null|\n"}} \ No newline at end of file diff --git a/docs/asr-arxiv-daily.json b/docs/asr-arxiv-daily.json index 70346a2..a35de80 100644 --- a/docs/asr-arxiv-daily.json +++ b/docs/asr-arxiv-daily.json @@ -1 +1 @@ -{"ASR": {"2408.00624": "|**2024-08-01**|**SynesLM: A Unified Approach for Audio-visual Speech Recognition and Translation via Language Model and Synthetic Data**|Yichen Lu et.al.|[2408.00624](http://arxiv.org/abs/2408.00624)|**[link](https://github.com/espnet/espnet)**|\n", "2408.00205": "|**2024-08-01**|**Sentence-wise Speech Summarization: Task, Datasets, and End-to-End Modeling with LM Knowledge Distillation**|Kohei Matsuura et.al.|[2408.00205](http://arxiv.org/abs/2408.00205)|null|\n", "2407.21783": "|**2024-08-15**|**The Llama 3 Herd of Models**|Abhimanyu Dubey et.al.|[2407.21783](http://arxiv.org/abs/2407.21783)|null|\n", "2407.21476": "|**2024-07-31**|**On the Problem of Text-To-Speech Model Selection for Synthetic Data Generation in Automatic Speech Recognition**|Nick Rossenbach et.al.|[2407.21476](http://arxiv.org/abs/2407.21476)|null|\n", "2407.21414": "|**2024-07-31**|**Towards interfacing large language models with ASR systems using confidence measures and prompting**|Maryam Naderi et.al.|[2407.21414](http://arxiv.org/abs/2407.21414)|null|\n", "2407.21211": "|**2024-07-30**|**Self-Supervised Models in Automatic Whispered Speech Recognition**|Aref Farhadipour et.al.|[2407.21211](http://arxiv.org/abs/2407.21211)|null|\n", "2407.21066": "|**2024-07-28**|**ELP-Adapters: Parameter Efficient Adapter Tuning for Various Speech Processing Tasks**|Nakamasa Inoue et.al.|[2407.21066](http://arxiv.org/abs/2407.21066)|null|\n", "2407.21061": "|**2024-07-26**|**Improving noisy student training for low-resource languages in 
End-to-End ASR using CycleGAN and inter-domain losses**|Chia-Yu Li et.al.|[2407.21061](http://arxiv.org/abs/2407.21061)|null|\n", "2407.18581": "|**2024-08-07**|**Dynamic Language Group-Based MoE: Enhancing Code-Switching Speech Recognition with Hierarchical Routing**|Hukai Huang et.al.|[2407.18581](http://arxiv.org/abs/2407.18581)|**[link](https://github.com/kaihuhuang/language-group)**|\n", "2407.18571": "|**2024-07-29**|**Speech Bandwidth Expansion Via High Fidelity Generative Adversarial Networks**|Mahmoud Salhab et.al.|[2407.18571](http://arxiv.org/abs/2407.18571)|null|\n", "2407.18461": "|**2024-07-26**|**Enhancing Dysarthric Speech Recognition for Unseen Speakers via Prototype-Based Adaptation**|Shiyao Wang et.al.|[2407.18461](http://arxiv.org/abs/2407.18461)|**[link](https://github.com/nku-hlt/pb-dsr)**|\n", "2407.17997": "|**2024-07-25**|**On the Effect of Purely Synthetic Training Data for Different Automatic Speech Recognition Architectures**|Nick Rossenbach et.al.|[2407.17997](http://arxiv.org/abs/2407.17997)|null|\n", "2407.17874": "|**2024-07-25**|**Improving Domain-Specific ASR with LLM-Generated Contextual Descriptions**|Jiwon Suh et.al.|[2407.17874](http://arxiv.org/abs/2407.17874)|null|\n", "2407.17852": "|**2024-07-25**|**Scaling A Simple Approach to Zero-Shot Speech Recognition**|Jinming Zhao et.al.|[2407.17852](http://arxiv.org/abs/2407.17852)|**[link](https://github.com/facebookresearch/fairseq)**|\n", "2407.17605": "|**2024-07-24**|**Coupling Speech Encoders with Downstream Text Models**|Ciprian Chelba et.al.|[2407.17605](http://arxiv.org/abs/2407.17605)|null|\n", "2407.17160": "|**2024-07-24**|**A Comparative Analysis of Bilingual and Trilingual Wav2Vec Models for Automatic Speech Recognition in Multilingual Oral History Archives**|Jan Lehe\u010dka et.al.|[2407.17160](http://arxiv.org/abs/2407.17160)|null|\n", "2407.16537": "|**2024-07-23**|**Quantifying the Role of Textual Predictability in Automatic Speech Recognition**|Sean Robertson 
et.al.|[2407.16537](http://arxiv.org/abs/2407.16537)|null|\n", "2407.16447": "|**2024-07-23**|**The CHiME-8 DASR Challenge for Generalizable and Array Agnostic Distant Automatic Speech Recognition and Diarization**|Samuele Cornell et.al.|[2407.16447](http://arxiv.org/abs/2407.16447)|null|\n", "2407.16370": "|**2024-07-23**|**Evolutionary Prompt Design for LLM-Based Post-ASR Error Correction**|Rithik Sachdev et.al.|[2407.16370](http://arxiv.org/abs/2407.16370)|**[link](https://github.com/rithiksachdev/PostASR-Correction-SLT2024)**|\n", "2407.15835": "|**2024-07-22**|**dMel: Speech Tokenization made Simple**|He Bai et.al.|[2407.15835](http://arxiv.org/abs/2407.15835)|null|\n", "2407.15749": "|**2024-07-22**|**Robustness of Speech Separation Models for Similar-pitch Speakers**|Bunlong Lay et.al.|[2407.15749](http://arxiv.org/abs/2407.15749)|null|\n", "2407.15300": "|**2024-07-22**|**SELM: Enhancing Speech Emotion Recognition for Out-of-Domain Scenarios**|Hazim Bukhari et.al.|[2407.15300](http://arxiv.org/abs/2407.15300)|null|\n", "2407.14573": "|**2024-08-24**|**Trading Devil Final: Backdoor attack via Stock market and Bayesian Optimization**|Orson Mengara et.al.|[2407.14573](http://arxiv.org/abs/2407.14573)|null|\n", "2407.14021": "|**2024-07-19**|**GE2E-AC: Generalized End-to-End Loss Training for Accent Classification**|Chihiro Watanabe et.al.|[2407.14021](http://arxiv.org/abs/2407.14021)|null|\n", "2407.13982": "|**2024-07-19**|**Reexamining Racial Disparities in Automatic Speech Recognition Performance: The Role of Confounding by Provenance**|Changye Li et.al.|[2407.13982](http://arxiv.org/abs/2407.13982)|null|\n", "2408.00005": "|**2024-07-18**|**Framework for Curating Speech Datasets and Evaluating ASR Systems: A Case Study for Polish**|Micha\u0142 Junczyk et.al.|[2408.00005](http://arxiv.org/abs/2408.00005)|**[link](https://github.com/goodmike31/pl-asr-bigos-tools)**|\n", "2408.00004": "|**2024-07-18**|**Handling Numeric Expressions in Automatic Speech 
Recognition**|Christian Huber et.al.|[2408.00004](http://arxiv.org/abs/2408.00004)|null|\n", "2407.13300": "|**2024-07-18**|**Robust ASR Error Correction with Conservative Data Filtering**|Takuma Udagawa et.al.|[2407.13300](http://arxiv.org/abs/2407.13300)|null|\n", "2407.13292": "|**2024-07-18**|**Low-Resourced Speech Recognition for Iu Mien Language via Weakly-Supervised Phoneme-based Multilingual Pre-training**|Lukuan Dong et.al.|[2407.13292](http://arxiv.org/abs/2407.13292)|null|\n", "2407.13266": "|**2024-07-18**|**How Private is Low-Frequency Speech Audio in the Wild? An Analysis of Verbal Intelligibility by Humans and Machines**|Ailin Liu et.al.|[2407.13266](http://arxiv.org/abs/2407.13266)|null|\n", "2407.13142": "|**2024-07-18**|**A light-weight and efficient punctuation and word casing prediction model for on-device streaming ASR**|Jian You et.al.|[2407.13142](http://arxiv.org/abs/2407.13142)|null|\n", "2407.12389": "|**2024-07-17**|**Morphosyntactic Analysis for CHILDES**|Houjun Liu et.al.|[2407.12389](http://arxiv.org/abs/2407.12389)|null|\n", "2407.12240": "|**2024-07-17**|**Adaptive Cascading Network for Continual Test-Time Adaptation**|Kien X. 
Nguyen et.al.|[2407.12240](http://arxiv.org/abs/2407.12240)|null|\n", "2407.12094": "|**2024-07-16**|**Identifying Speakers in Dialogue Transcripts: A Text-based Approach Using Pretrained Language Models**|Minh Nguyen et.al.|[2407.12094](http://arxiv.org/abs/2407.12094)|**[link](https://github.com/adobe-research/speaker-identification)**|\n", "2407.11828": "|**2024-07-17**|**Vibravox: A Dataset of French Speech Captured with Body-conduction Audio Sensors**|Julien Hauret et.al.|[2407.11828](http://arxiv.org/abs/2407.11828)|**[link](https://github.com/jhauret/vibravox)**|\n", "2407.11641": "|**2024-07-16**|**Investigating the Effect of Label Topology and Training Criterion on ASR Performance and Alignment Quality**|Tina Raissi et.al.|[2407.11641](http://arxiv.org/abs/2407.11641)|null|\n", "2407.11516": "|**2024-07-16**|**The VoicePrivacy 2022 Challenge: Progress and Perspectives in Voice Anonymisation**|Michele Panariello et.al.|[2407.11516](http://arxiv.org/abs/2407.11516)|null|\n", "2407.11345": "|**2024-07-16**|**Beyond Binary: Multiclass Paraphasia Detection with Generative Pretrained Transformers and End-to-End Models**|Matthew Perez et.al.|[2407.11345](http://arxiv.org/abs/2407.11345)|null|\n", "2407.10603": "|**2024-07-15**|**Leave No Knowledge Behind During Knowledge Distillation: Towards Practical and Effective Knowledge Distillation for Code-Switching ASR Using Realistic Data**|Liang-Hsuan Tseng et.al.|[2407.10603](http://arxiv.org/abs/2407.10603)|null|\n", "2407.10303": "|**2024-07-14**|**Improving Neural Biasing for Contextual Speech Recognition by Early Context Injection and Text Perturbation**|Ruizhe Huang et.al.|[2407.10303](http://arxiv.org/abs/2407.10303)|null|\n", "2407.10255": "|**2024-07-14**|**CUSIDE-T: Chunking, Simulating Future and Decoding for Transducer based Streaming ASR**|Wenbo Zhao et.al.|[2407.10255](http://arxiv.org/abs/2407.10255)|null|\n", "2407.10118": "|**2024-07-14**|**Textless Dependency Parsing by Labeled Sequence 
Prediction**|Shunsuke Kando et.al.|[2407.10118](http://arxiv.org/abs/2407.10118)|**[link](https://github.com/mynlp/speechparser)**|\n", "2407.10048": "|**2024-07-14**|**Whisper-SV: Adapting Whisper for Low-data-resource Speaker Verification**|Li Zhang et.al.|[2407.10048](http://arxiv.org/abs/2407.10048)|null|\n", "2407.09849": "|**2024-07-13**|**Text-Based Detection of On-Hold Scripts in Contact Center Calls**|Dmitrii Galimzianov et.al.|[2407.09849](http://arxiv.org/abs/2407.09849)|**[link](https://github.com/gal-dmitry/HOLD_DETECTION_PUBLIC)**|\n", "2407.09817": "|**2024-08-24**|**Empowering Whisper as a Joint Multi-Talker and Target-Talker Speech Recognition System**|Lingwei Meng et.al.|[2407.09817](http://arxiv.org/abs/2407.09817)|**[link](https://github.com/LingweiMeng/Whisper-Sidecar)**|\n", "2407.09807": "|**2024-07-13**|**A Streaming Multi-Channel End-to-End Speech Recognition System with Realistic Evaluations**|Xiangzhu Kong et.al.|[2407.09807](http://arxiv.org/abs/2407.09807)|**[link](https://github.com/thu-spmi/cat)**|\n", "2407.09732": "|**2024-07-13**|**Speech Slytherin: Examining the Performance and Efficiency of Mamba for Speech Separation, Recognition, and Synthesis**|Xilin Jiang et.al.|[2407.09732](http://arxiv.org/abs/2407.09732)|**[link](https://github.com/xi-j/Mamba-TasNet)**|\n", "2407.08618": "|**2024-08-12**|**Tamil Language Computing: the Present and the Future**|Kengatharaiyer Sarveswaran et.al.|[2407.08618](http://arxiv.org/abs/2407.08618)|null|\n", "2407.08658": "|**2024-07-10**|**Evaluating Voice Command Pipelines for Drone Control: From STT and LLM to Direct Classification and Siamese Networks**|Lucca Emmanuel Pineli Sim\u00f5es et.al.|[2407.08658](http://arxiv.org/abs/2407.08658)|null|\n", "2407.07566": "|**2024-07-10**|**HebDB: a Weakly Supervised Dataset for Hebrew Speech Processing**|Arnon Turetzky et.al.|[2407.07566](http://arxiv.org/abs/2407.07566)|null|\n", "2407.18930": "|**2024-07-10**|**Dynamic Encoder Size Based on Data-Driven 
Layer-wise Pruning for Speech Recognition**|Jingjing Xu et.al.|[2407.18930](http://arxiv.org/abs/2407.18930)|null|\n", "2407.17416": "|**2024-07-10**|**Explaining Spectrograms in Machine Learning: A Study on Neural Networks for Speech Classification**|Jesin James et.al.|[2407.17416](http://arxiv.org/abs/2407.17416)|null|\n", "2407.06606": "|**2024-07-09**|**Tailored Design of Audio-Visual Speech Recognition Models using Branchformers**|David Gimeno-G\u00f3mez et.al.|[2407.06606](http://arxiv.org/abs/2407.06606)|**[link](https://github.com/david-gimeno/tailored-avsr)**|\n", "2407.06310": "|**2024-07-08**|**Homogeneous Speaker Features for On-the-Fly Dysarthric and Elderly Speaker Adaptation**|Mengzhe Geng et.al.|[2407.06310](http://arxiv.org/abs/2407.06310)|null|\n", "2407.18332": "|**2024-07-08**|**Analyzing Speech Unit Selection for Textless Speech-to-Speech Translation**|Jarod Duret et.al.|[2407.18332](http://arxiv.org/abs/2407.18332)|null|\n", "2407.05407": "|**2024-07-09**|**CosyVoice: A Scalable Multilingual Zero-shot Text-to-speech Synthesizer based on Supervised Semantic Tokens**|Zhihao Du et.al.|[2407.05407](http://arxiv.org/abs/2407.05407)|null|\n", "2407.14525": "|**2024-07-07**|**Morse Code-Enabled Speech Recognition for Individuals with Visual and Hearing Impairments**|Ritabrata Roy Choudhury et.al.|[2407.14525](http://arxiv.org/abs/2407.14525)|null|\n", "2407.04675": "|**2024-07-10**|**Seed-ASR: Understanding Diverse Speech and Contexts with LLM-based Speech Recognition**|Ye Bai et.al.|[2407.04675](http://arxiv.org/abs/2407.04675)|null|\n", "2407.04662": "|**2024-07-05**|**Multitaper mel-spectrograms for keyword spotting**|Douglas Baptista de Souza et.al.|[2407.04662](http://arxiv.org/abs/2407.04662)|null|\n", "2407.04652": "|**2024-07-05**|**Pretraining End-to-End Keyword Search with Automatically Discovered Acoustic Units**|Bolaji Yusuf et.al.|[2407.04652](http://arxiv.org/abs/2407.04652)|**[link](https://github.com/beer-asr/beer)**|\n", 
"2407.04641": "|**2024-07-05**|**Speculative Speech Recognition by Audio-Prefixed Low-Rank Adaptation of Language Models**|Bolaji Yusuf et.al.|[2407.04641](http://arxiv.org/abs/2407.04641)|null|\n", "2407.04601": "|**2024-07-05**|**Written Term Detection Improves Spoken Term Detection**|Bolaji Yusuf et.al.|[2407.04601](http://arxiv.org/abs/2407.04601)|**[link](https://github.com/bolajiy/golden-retriever)**|\n", "2407.04533": "|**2024-07-09**|**Performance Analysis of Speech Encoders for Low-Resource SLU and ASR in Tunisian Dialect**|Salima Mdhaffar et.al.|[2407.04533](http://arxiv.org/abs/2407.04533)|**[link](https://github.com/speechbrain/speechbrain)**|\n", "2407.04482": "|**2024-07-05**|**Controlling Whisper: Universal Acoustic Adversarial Attacks to Control Speech Foundation Models**|Vyas Raina et.al.|[2407.04482](http://arxiv.org/abs/2407.04482)|null|\n", "2407.04439": "|**2024-07-05**|**XLSR-Transducer: Streaming ASR for Self-Supervised Pretrained Models**|Shashi Kumar et.al.|[2407.04439](http://arxiv.org/abs/2407.04439)|null|\n", "2407.04368": "|**2024-07-05**|**Romanization Encoding For Multilingual ASR**|Wen Ding et.al.|[2407.04368](http://arxiv.org/abs/2407.04368)|null|\n", "2407.04280": "|**2024-07-05**|**LearnerVoice: A Dataset of Non-Native English Learners' Spontaneous Speech**|Haechan Kim et.al.|[2407.04280](http://arxiv.org/abs/2407.04280)|null|\n", "2407.04219": "|**2024-07-05**|**Semi-supervised Learning for Code-Switching ASR with Large Language Model Filter**|Yu Xi et.al.|[2407.04219](http://arxiv.org/abs/2407.04219)|null|\n", "2407.04051": "|**2024-07-11**|**FunAudioLLM: Voice Understanding and Generation Foundation Models for Natural Interaction Between Humans and LLMs**|Keyu An et.al.|[2407.04051](http://arxiv.org/abs/2407.04051)|**[link](https://github.com/FunAudioLLM/SenseVoice)**|\n", "2407.04047": "|**2024-07-04**|**Improving Accented Speech Recognition using Data Augmentation based on Unsupervised Text-to-Speech Synthesis**|Cong-Thanh Do 
et.al.|[2407.04047](http://arxiv.org/abs/2407.04047)|null|\n", "2407.03966": "|**2024-07-04**|**Serialized Output Training by Learned Dominance**|Ying Shi et.al.|[2407.03966](http://arxiv.org/abs/2407.03966)|null|\n", "2407.03809": "|**2024-07-04**|**Finetuning End-to-End Models for Estonian Conversational Spoken Language Translation**|Tiia Sildam et.al.|[2407.03809](http://arxiv.org/abs/2407.03809)|null|\n", "2407.03734": "|**2024-07-04**|**Improving Self-supervised Pre-training using Accent-Specific Codebooks**|Darshan Prabhu et.al.|[2407.03734](http://arxiv.org/abs/2407.03734)|**[link](https://github.com/csalt-research/accented-codebooks-asr)**|\n", "2407.03718": "|**2024-07-24**|**Multi-Convformer: Extending Conformer with Multiple Convolution Kernels**|Darshan Prabhu et.al.|[2407.03718](http://arxiv.org/abs/2407.03718)|**[link](https://github.com/espnet/espnet)**|\n", "2407.03563": "|**2024-07-04**|**Learning Video Temporal Dynamics with Cross-Modal Attention for Robust Audio-Visual Speech Recognition**|Sungnyun Kim et.al.|[2407.03563](http://arxiv.org/abs/2407.03563)|null|\n", "2407.03495": "|**2024-07-03**|**Codec-ASR: Training Performant Automatic Speech Recognition Systems with Discrete Speech Representations**|Kunal Dhawan et.al.|[2407.03495](http://arxiv.org/abs/2407.03495)|null|\n", "2407.03440": "|**2024-07-03**|**Advanced Framework for Animal Sound Classification With Features Optimization**|Qiang Yang et.al.|[2407.03440](http://arxiv.org/abs/2407.03440)|null|\n", "2407.03026": "|**2024-07-03**|**Qifusion-Net: Layer-adapted Stream/Non-stream Model for End-to-End Multi-Accent Speech Recognition**|Jinming Chen et.al.|[2407.03026](http://arxiv.org/abs/2407.03026)|null|\n", "2407.13782": "|**2024-07-03**|**Self-supervised ASR Models and Features For Dysarthric and Elderly Speech Recognition**|Shujie Hu et.al.|[2407.13782](http://arxiv.org/abs/2407.13782)|null|\n", "2407.02052": "|**2024-07-02**|**The USTC-NERCSLIP Systems for The ICMC-ASR 
Challenge**|Minghui Wu et.al.|[2407.02052](http://arxiv.org/abs/2407.02052)|null|\n", "2407.02543": "|**2024-07-02**|**Towards the Next Frontier in Speech Representation Learning Using Disentanglement**|Varun Krishna et.al.|[2407.02543](http://arxiv.org/abs/2407.02543)|null|\n", "2407.01909": "|**2024-07-02**|**Pinyin Regularization in Error Correction for Chinese Speech Recognition with Large Language Models**|Zhiyuan Tang et.al.|[2407.01909](http://arxiv.org/abs/2407.01909)|**[link](https://github.com/tzyll/ChineseHP)**|\n", "2407.17477": "|**2024-07-30**|**Toward Automated Detection of Biased Social Signals from the Content of Clinical Conversations**|Feng Chen et.al.|[2407.17477](http://arxiv.org/abs/2407.17477)|null|\n", "2407.00756": "|**2024-06-30**|**Less Forgetting for Better Generalization: Exploring Continual-learning Fine-tuning Methods for Speech Self-supervised Representations**|Salah Zaiem et.al.|[2407.00756](http://arxiv.org/abs/2407.00756)|null|\n", "2407.00518": "|**2024-06-29**|**When Robots Get Chatty: Grounding Multimodal Human-Robot Conversation and Collaboration**|Philipp Allgeuer et.al.|[2407.00518](http://arxiv.org/abs/2407.00518)|null|\n", "2407.12817": "|**2024-06-29**|**Error Correction by Paying Attention to Both Acoustic and Confidence References for Automatic Speech Recognition**|Yuchun Shu et.al.|[2407.12817](http://arxiv.org/abs/2407.12817)|null|\n", "2407.00463": "|**2024-07-18**|**Open-Source Conversational AI with SpeechBrain 1.0**|Mirco Ravanelli et.al.|[2407.00463](http://arxiv.org/abs/2407.00463)|null|\n", "2407.12029": "|**2024-06-29**|**A Quality-Aware Voltage Overscaling Framework to Improve the Energy Efficiency and Lifetime of TPUs based on Statistical Error Modeling**|Alireza Senobari et.al.|[2407.12029](http://arxiv.org/abs/2407.12029)|null|\n", "2407.12028": "|**2024-06-28**|**TreeSeg: Hierarchical Topic Segmentation of Large Transcripts**|Dimitrios C. 
Gklezakos et.al.|[2407.12028](http://arxiv.org/abs/2407.12028)|null|\n", "2406.19706": "|**2024-06-28**|**SAML: Speaker Adaptive Mixture of LoRA Experts for End-to-End ASR**|Qiuming Zhao et.al.|[2406.19706](http://arxiv.org/abs/2406.19706)|null|\n", "2406.19674": "|**2024-06-28**|**Less is More: Accurate Speech Recognition & Translation without Web-Scale Data**|Krishna C. Puvvada et.al.|[2406.19674](http://arxiv.org/abs/2406.19674)|null|\n", "2406.19564": "|**2024-06-27**|**Voices Unheard: NLP Resources and Models for Yor\u00f9b\u00e1 Regional Dialects**|Orevaoghene Ahia et.al.|[2406.19564](http://arxiv.org/abs/2406.19564)|**[link](https://github.com/orevaahia/yorulect)**|\n", "2406.19363": "|**2024-06-27**|**Tradition or Innovation: A Comparison of Modern ASR Methods for Forced Alignment**|Rotem Rousso et.al.|[2406.19363](http://arxiv.org/abs/2406.19363)|null|\n", "2406.19311": "|**2024-06-27**|**Zero-Query Adversarial Attack on Black-box Automatic Speech Recognition Systems**|Zheng Fang et.al.|[2406.19311](http://arxiv.org/abs/2406.19311)|null|\n", "2406.18972": "|**2024-06-27**|**Applying LLMs for Rescoring N-best ASR Hypotheses of Casual Conversations: Effects of Domain Adaptation and Context Carry-over**|Atsunori Ogawa et.al.|[2406.18972](http://arxiv.org/abs/2406.18972)|null|\n", "2406.18928": "|**2024-06-27**|**Enhanced ASR Robustness to Packet Loss with a Front-End Adaptation Network**|Yehoshua Dissen et.al.|[2406.18928](http://arxiv.org/abs/2406.18928)|null|\n", "2406.18862": "|**2024-06-27**|**Streaming Decoder-Only Automatic Speech Recognition with Discrete Speech Units: A Pilot Study**|Peikun Chen et.al.|[2406.18862](http://arxiv.org/abs/2406.18862)|**[link](https://github.com/chenpk00/IS2024_stream_decoder_only_asr)**|\n", "2406.18373": "|**2024-06-26**|**Dynamic Data Pruning for Automatic Speech Recognition**|Qiao Xiao et.al.|[2406.18373](http://arxiv.org/abs/2406.18373)|null|\n", "2406.18301": "|**2024-06-26**|**MSR-86K: An Evolving, Multilingual 
Corpus with 86,300 Hours of Transcribed Audio for Speech Recognition Research**|Song Li et.al.|[2406.18301](http://arxiv.org/abs/2406.18301)|null|\n", "2406.18135": "|**2024-06-26**|**Automatic Speech Recognition for Hindi**|Anish Saha et.al.|[2406.18135](http://arxiv.org/abs/2406.18135)|null|\n", "2406.18120": "|**2024-07-12**|**ArzEn-LLM: Code-Switched Egyptian Arabic-English Translation and Speech Recognition Using LLMs**|Ahmed Heakl et.al.|[2406.18120](http://arxiv.org/abs/2406.18120)|**[link](https://github.com/ahmedheakl/arazn-llm)**|\n", "2406.18021": "|**2024-06-26**|**SC-MoE: Switch Conformer Mixture of Experts for Unified Streaming and Non-streaming Code-Switching ASR**|Shuaishuai Ye et.al.|[2406.18021](http://arxiv.org/abs/2406.18021)|null|\n", "2406.17935": "|**2024-06-25**|**Sequential Editing for Lifelong Training of Speech Recognition Models**|Devang Kulshreshtha et.al.|[2406.17935](http://arxiv.org/abs/2406.17935)|null|\n", "2406.17926": "|**2024-06-25**|**FASA: a Flexible and Automatic Speech Aligner for Extracting High-quality Aligned Children Speech Data**|Dancheng Liu et.al.|[2406.17926](http://arxiv.org/abs/2406.17926)|**[link](https://github.com/DanchengLiu/FASA)**|\n", "2406.17618": "|**2024-06-25**|**Towards Building an End-to-End Multilingual Automatic Lyrics Transcription Model**|Jiawen Huang et.al.|[2406.17618](http://arxiv.org/abs/2406.17618)|**[link](https://github.com/jhuang448/MultilingualALT)**|\n", "2406.17614": "|**2024-06-25**|**MSRS: Training Multimodal Speech Recognition Models from Scratch with Sparse Mask Optimization**|Adriana Fernandez-Lopez et.al.|[2406.17614](http://arxiv.org/abs/2406.17614)|null|\n", "2406.17825": "|**2024-06-25**|**Automatic speech recognition for the Nepali language using CNN, bidirectional LSTM and ResNet**|Manish Dhakal et.al.|[2406.17825](http://arxiv.org/abs/2406.17825)|**[link](https://github.com/manishdhakal/asr-nepali-using-cnn-bilstm-resnet)**|\n", "2406.17272": "|**2024-06-25**|**A 
Comprehensive Solution to Connect Speech Encoder and Large Language Model for ASR**|Van Tung Pham et.al.|[2406.17272](http://arxiv.org/abs/2406.17272)|null|\n", "2406.17124": "|**2024-06-24**|**Investigating Confidence Estimation Measures for Speaker Diarization**|Anurag Chowdhury et.al.|[2406.17124](http://arxiv.org/abs/2406.17124)|null|\n", "2406.16808": "|**2024-06-24**|**Exploring the Capability of Mamba in Speech Applications**|Koichi Miyazaki et.al.|[2406.16808](http://arxiv.org/abs/2406.16808)|null|\n", "2406.16777": "|**2024-06-24**|**Blending LLMs into Cascaded Speech Translation: KIT's Offline Speech Translation System for IWSLT 2024**|Sai Koneru et.al.|[2406.16777](http://arxiv.org/abs/2406.16777)|null|\n", "2406.16120": "|**2024-06-23**|**Contextualized End-to-end Automatic Speech Recognition with Intermediate Biasing Loss**|Muhammad Shakeel et.al.|[2406.16120](http://arxiv.org/abs/2406.16120)|null|\n", "2406.16107": "|**2024-08-01**|**Decoder-only Architecture for Streaming End-to-end Speech Recognition**|Emiru Tsunoo et.al.|[2406.16107](http://arxiv.org/abs/2406.16107)|null|\n", "2406.15723": "|**2024-06-22**|**Acoustic Feature Mixup for Balanced Multi-aspect Pronunciation Assessment**|Heejin Do et.al.|[2406.15723](http://arxiv.org/abs/2406.15723)|null|\n", "2406.15668": "|**2024-06-21**|**PI-Whisper: An Adaptive and Incremental ASR Framework for Diverse and Evolving Speaker Characteristics**|Amir Nassereldine et.al.|[2406.15668](http://arxiv.org/abs/2406.15668)|null|\n", "2406.15265": "|**2024-06-21**|**Perception of Phonological Assimilation by Neural Speech Recognition Models**|Charlotte Pouw et.al.|[2406.15265](http://arxiv.org/abs/2406.15265)|null|\n", "2406.14890": "|**2024-06-21**|**InterBiasing: Boost Unseen Word Recognition through Biasing Intermediate Predictions**|Yu Nakagome et.al.|[2406.14890](http://arxiv.org/abs/2406.14890)|null|\n", "2406.14747": "|**2024-06-20**|**An Adapter-Based Unified Model for Multiple Spoken Language Processing 
Tasks**|Varsha Suresh et.al.|[2406.14747](http://arxiv.org/abs/2406.14747)|null|\n", "2406.14294": "|**2024-06-21**|**DASB - Discrete Audio and Speech Benchmark**|Pooneh Mousavi et.al.|[2406.14294](http://arxiv.org/abs/2406.14294)|null|\n", "2406.14266": "|**2024-06-20**|**Intelligent Interface: Enhancing Lecture Engagement with Didactic Activity Summaries**|Anna Wr\u00f3blewska et.al.|[2406.14266](http://arxiv.org/abs/2406.14266)|null|\n", "2406.13842": "|**2024-06-19**|**Joint vs Sequential Speaker-Role Detection and Automatic Speech Recognition for Air-traffic Control**|Alexander Blatt et.al.|[2406.13842](http://arxiv.org/abs/2406.13842)|null|\n", "2406.13502": "|**2024-06-19**|**ManWav: The First Manchu ASR Model**|Jean Seo et.al.|[2406.13502](http://arxiv.org/abs/2406.13502)|null|\n", "2406.13431": "|**2024-06-24**|**Children's Speech Recognition through Discrete Token Enhancement**|Vrunda N. Sukhadia et.al.|[2406.13431](http://arxiv.org/abs/2406.13431)|null|\n", "2406.12699": "|**2024-06-18**|**Bridging the Gap: Integrating Pre-trained Speech Enhancement and Recognition Models for Robust Speech Recognition**|Kuan-Chen Wang et.al.|[2406.12699](http://arxiv.org/abs/2406.12699)|null|\n", "2406.12674": "|**2024-06-18**|**Transcribe, Align and Segment: Creating speech datasets for low-resource languages**|Taras Sereda et.al.|[2406.12674](http://arxiv.org/abs/2406.12674)|null|\n", "2406.12621": "|**2024-06-18**|**Growing Trees on Sounds: Assessing Strategies for End-to-End Dependency Parsing of Speech**|Adrien Pupier et.al.|[2406.12621](http://arxiv.org/abs/2406.12621)|**[link](https://github.com/Pupiera/Growing_tree_on_sound)**|\n", "2406.12611": "|**2024-06-18**|**Rapid Language Adaptation for Multilingual E2E Speech Recognition Using Encoder Prompting**|Yosuke Kashiwagi et.al.|[2406.12611](http://arxiv.org/abs/2406.12611)|null|\n", "2406.12503": "|**2024-06-18**|**Unsupervised Online Continual Learning for Automatic Speech Recognition**|Steven Vander Eeckt 
et.al.|[2406.12503](http://arxiv.org/abs/2406.12503)|**[link](https://github.com/stevenvdeeckt/unsupervised-ocl-for-asr)**|\n", "2406.12387": "|**2024-06-18**|**Performant ASR Models for Medical Entities in Accented Speech**|Tejumade Afonja et.al.|[2406.12387](http://arxiv.org/abs/2406.12387)|null|\n", "2406.12317": "|**2024-06-18**|**Finding Task-specific Subnetworks in Multi-task Spoken Language Understanding Model**|Hayato Futami et.al.|[2406.12317](http://arxiv.org/abs/2406.12317)|null|\n", "2406.12233": "|**2024-06-18**|**SyncVSR: Data-Efficient Visual Speech Recognition with End-to-End Crossmodal Audio Token Synchronization**|Young Jin Ahn et.al.|[2406.12233](http://arxiv.org/abs/2406.12233)|**[link](https://github.com/KAIST-AILab/SyncVSR)**|\n", "2406.11546": "|**2024-06-17**|**GigaSpeech 2: An Evolving, Large-Scale and Multi-domain ASR Corpus for Low-Resource Languages with Automated Crawling, Transcription and Refinement**|Yifan Yang et.al.|[2406.11546](http://arxiv.org/abs/2406.11546)|**[link](https://github.com/SpeechColab/GigaSpeech2)**|\n", "2406.12937": "|**2024-06-17**|**Self-Train Before You Transcribe**|Robert Flynn et.al.|[2406.12937](http://arxiv.org/abs/2406.12937)|**[link](https://github.com/robflynnyh/Self-Train-Before-You-Transcribe)**|\n", "2406.11064": "|**2024-06-16**|**Continual Test-time Adaptation for End-to-end Speech Recognition on Noisy Speech**|Guan-Ting Lin et.al.|[2406.11064](http://arxiv.org/abs/2406.11064)|null|\n", "2406.11037": "|**2024-06-16**|**NAST: Noise Aware Speech Tokenization for Speech Language Models**|Shoval Messica et.al.|[2406.11037](http://arxiv.org/abs/2406.11037)|**[link](https://github.com/ShovalMessica/NAST)**|\n", "2406.11025": "|**2024-06-16**|**Large Language Models for Dysfluency Detection in Stuttered Speech**|Dominik Wagner et.al.|[2406.11025](http://arxiv.org/abs/2406.11025)|null|\n", "2406.11022": "|**2024-06-16**|**Outlier Reduction with Gated Attention for Improved Post-training Quantization in 
Large Sequence-to-sequence Speech Foundation Models**|Dominik Wagner et.al.|[2406.11022](http://arxiv.org/abs/2406.11022)|null|\n", "2406.11016": "|**2024-06-16**|**Optimized Speculative Sampling for GPU Hardware Accelerators**|Dominik Wagner et.al.|[2406.11016](http://arxiv.org/abs/2406.11016)|null|\n", "2406.10993": "|**2024-06-16**|**CoSTA: Code-Switched Speech Translation using Aligned Speech-Text Interleaving**|Bhavani Shankar et.al.|[2406.10993](http://arxiv.org/abs/2406.10993)|null|\n", "2406.10932": "|**2024-06-16**|**Imperceptible Rhythm Backdoor Attacks: Exploring Rhythm Transformation for Embedding Undetectable Vulnerabilities on Speech Recognition**|Wenhan Yao et.al.|[2406.10932](http://arxiv.org/abs/2406.10932)|null|\n", "2406.12931": "|**2024-06-16**|**Automatic Speech Recognition for Biomedical Data in Bengali Language**|Shariar Kabir et.al.|[2406.12931](http://arxiv.org/abs/2406.12931)|null|\n", "2406.10741": "|**2024-06-15**|**Speech Emotion Recognition Using CNN and Its Use Case in Digital Healthcare**|Nishargo Nigar et.al.|[2406.10741](http://arxiv.org/abs/2406.10741)|null|\n", "2406.10719": "|**2024-06-21**|**Trading Devil: Robust backdoor attack via Stochastic investment models and Bayesian approach**|Orson Mengara et.al.|[2406.10719](http://arxiv.org/abs/2406.10719)|null|\n", "2406.10177": "|**2024-06-14**|**Inclusive ASR for Disfluent Speech: Cascaded Large-Scale Self-Supervised Learning with Targeted Fine-Tuning and Data Augmentation**|Dena Mujtaba et.al.|[2406.10177](http://arxiv.org/abs/2406.10177)|null|\n", "2406.10083": "|**2024-06-14**|**On the Evaluation of Speech Foundation Models for Spoken Language Understanding**|Siddhant Arora et.al.|[2406.10083](http://arxiv.org/abs/2406.10083)|null|\n", "2406.10082": "|**2024-06-14**|**Whisper-Flamingo: Integrating Visual Features into Whisper for Audio-Visual Speech Recognition and Translation**|Andrew Rouditchenko 
et.al.|[2406.10082](http://arxiv.org/abs/2406.10082)|**[link](https://github.com/roudimit/whisper-flamingo)**|\n", "2406.10052": "|**2024-06-14**|**Simul-Whisper: Attention-Guided Streaming Whisper with Truncation Detection**|Haoyu Wang et.al.|[2406.10052](http://arxiv.org/abs/2406.10052)|**[link](https://github.com/backspacetg/simul_whisper)**|\n", "2406.09999": "|**2024-06-14**|**ROAR: Reinforcing Original to Augmented Data Ratio Dynamics for Wav2Vec2.0 Based ASR**|Vishwanath Pratap Singh et.al.|[2406.09999](http://arxiv.org/abs/2406.09999)|null|\n", "2406.10313": "|**2024-06-14**|**CNVSRC 2023: The First Chinese Continuous Visual Speech Recognition Challenge**|Chen Chen et.al.|[2406.10313](http://arxiv.org/abs/2406.10313)|null|\n", "2406.09950": "|**2024-06-14**|**An efficient text augmentation approach for contextualized Mandarin speech recognition**|Naijun Zheng et.al.|[2406.09950](http://arxiv.org/abs/2406.09950)|null|\n", "2406.09873": "|**2024-06-14**|**Perceiver-Prompt: Flexible Speaker Adaptation in Whisper for Chinese Disordered Speech Recognition**|Yicong Jiang et.al.|[2406.09873](http://arxiv.org/abs/2406.09873)|null|\n", "2406.09869": "|**2024-06-14**|**MMM: Multi-Layer Multi-Residual Multi-Stream Discrete Speech Representation from Self-supervised Learning Model**|Jiatong Shi et.al.|[2406.09869](http://arxiv.org/abs/2406.09869)|null|\n", "2406.09676": "|**2024-06-14**|**Optimizing Byte-level Representation for End-to-end ASR**|Roger Hsiao et.al.|[2406.09676](http://arxiv.org/abs/2406.09676)|null|\n", "2406.09662": "|**2024-06-14**|**Learning Language Structures through Grounding**|Freda Shi et.al.|[2406.09662](http://arxiv.org/abs/2406.09662)|null|\n", "2406.09618": "|**2024-06-13**|**Multi-Modal Retrieval For Large Language Model Based Speech Recognition**|Jari Kolehmainen et.al.|[2406.09618](http://arxiv.org/abs/2406.09618)|null|\n", "2406.09569": "|**2024-06-13**|**Speech ReaLLM -- Real-time Streaming Speech Recognition with Multimodal LLMs by 
Teaching the Flow of Time**|Frank Seide et.al.|[2406.09569](http://arxiv.org/abs/2406.09569)|null|\n", "2406.09494": "|**2024-06-13**|**The Second DISPLACE Challenge : DIarization of SPeaker and LAnguage in Conversational Environments**|Shareef Babu Kalluri et.al.|[2406.09494](http://arxiv.org/abs/2406.09494)|null|\n", "2406.09202": "|**2024-06-13**|**Language Complexity and Speech Recognition Accuracy: Orthographic Complexity Hurts, Phonological Complexity Doesn't**|Chihiro Taguchi et.al.|[2406.09202](http://arxiv.org/abs/2406.09202)|**[link](https://github.com/ctaguchi/asrcomplexity)**|\n", "2406.09153": "|**2024-06-13**|**LASER: Learning by Aligning Self-supervised Representations of Speech for Improving Content-related Tasks**|Amit Meghanani et.al.|[2406.09153](http://arxiv.org/abs/2406.09153)|**[link](https://github.com/Trikaldarshi/LASER)**|\n", "2406.08914": "|**2024-06-13**|**Transcription-Free Fine-Tuning of Speech Separation Models for Noisy and Reverberant Multi-Speaker Automatic Speech Recognition**|William Ravenscroft et.al.|[2406.08914](http://arxiv.org/abs/2406.08914)|null|\n", "2406.08904": "|**2024-06-13**|**AdaPTwin: Low-Cost Adaptive Compression of Product Twins in Transformers**|Emil Biju et.al.|[2406.08904](http://arxiv.org/abs/2406.08904)|null|\n", "2406.08641": "|**2024-06-12**|**ML-SUPERB 2.0: Benchmarking Multilingual Speech Models Across Modeling Constraints, Languages, and Datasets**|Jiatong Shi et.al.|[2406.08641](http://arxiv.org/abs/2406.08641)|null|\n", "2406.08396": "|**2024-06-12**|**Neural Blind Source Separation and Diarization for Distant Speech Recognition**|Yoshiaki Bando et.al.|[2406.08396](http://arxiv.org/abs/2406.08396)|null|\n", "2406.08380": "|**2024-06-12**|**Towards Unsupervised Speech Recognition Without Pronunciation Models**|Junrui Ni et.al.|[2406.08380](http://arxiv.org/abs/2406.08380)|null|\n", "2406.08353": "|**2024-06-12**|**Speech Emotion Recognition with ASR Transcripts: A Comprehensive Study on Word Error Rate 
and Fusion Techniques**|Yuanchao Li et.al.|[2406.08353](http://arxiv.org/abs/2406.08353)|**[link](https://github.com/yc-li20/SER-on-WER-and-Fusion)**|\n", "2406.08266": "|**2024-06-13**|**Refining Self-Supervised Learnt Speech Representation using Brain Activations**|Hengyu Li et.al.|[2406.08266](http://arxiv.org/abs/2406.08266)|null|\n", "2406.08207": "|**2024-06-12**|**Transformer-based Model for ASR N-Best Rescoring and Rewriting**|Iwen E. Kang et.al.|[2406.08207](http://arxiv.org/abs/2406.08207)|null|\n", "2406.08111": "|**2024-06-12**|**Audio-conditioned phonemic and prosodic annotation for building text-to-speech models from unlabeled speech data**|Yuma Shirahata et.al.|[2406.08111](http://arxiv.org/abs/2406.08111)|null|\n", "2406.10284": "|**2024-06-12**|**Improving child speech recognition with augmented child-like speech**|Yuanyuan Zhang et.al.|[2406.10284](http://arxiv.org/abs/2406.10284)|null|\n", "2406.07914": "|**2024-06-14**|**Can Large Language Models Understand Spatial Audio?**|Changli Tang et.al.|[2406.07914](http://arxiv.org/abs/2406.07914)|null|\n", "2406.07909": "|**2024-06-12**|**Guiding Frame-Level CTC Alignments Using Self-knowledge Distillation**|Eungbeom Kim et.al.|[2406.07909](http://arxiv.org/abs/2406.07909)|null|\n", "2406.07846": "|**2024-06-12**|**DualVC 3: Leveraging Language Model Generated Pseudo Context for End-to-end Low Latency Streaming Voice Conversion**|Ziqian Ning et.al.|[2406.07846](http://arxiv.org/abs/2406.07846)|null|\n", "2406.07842": "|**2024-06-12**|**Dual-Pipeline with Low-Rank Adaptation for New Language Integration in Multilingual ASR**|Yerbolat Khassanov et.al.|[2406.07842](http://arxiv.org/abs/2406.07842)|null|\n", "2406.07823": "|**2024-06-12**|**PRoDeliberation: Parallel Robust Deliberation for End-to-End Spoken Language Understanding**|Trang Le et.al.|[2406.07823](http://arxiv.org/abs/2406.07823)|null|\n", "2406.07801": "|**2024-06-12**|**PolySpeech: Exploring Unified Multitask Speech Models for Competitiveness 
with Single-task Models**|Runyan Yang et.al.|[2406.07801](http://arxiv.org/abs/2406.07801)|null|\n", "2406.09443": "|**2024-06-12**|**Comparative Analysis of Personalized Voice Activity Detection Systems: Assessing Real-World Effectiveness**|Satyam Kumar et.al.|[2406.09443](http://arxiv.org/abs/2406.09443)|null|\n", "2406.07725": "|**2024-06-11**|**The Interspeech 2024 Challenge on Speech Processing Using Discrete Units**|Xuankai Chang et.al.|[2406.07725](http://arxiv.org/abs/2406.07725)|null|\n", "2406.07256": "|**2024-06-11**|**AS-70: A Mandarin stuttered speech dataset for automatic speech recognition and stuttering event detection**|Rong Gong et.al.|[2406.07256](http://arxiv.org/abs/2406.07256)|null|\n", "2406.07589": "|**2024-06-11**|**Tag and correct: high precision post-editing approach to correction of speech recognition errors**|Tomasz Zi\u0119tkiewicz et.al.|[2406.07589](http://arxiv.org/abs/2406.07589)|null|\n", "2406.07096": "|**2024-06-11**|**Fast Context-Biasing for CTC and Transducer ASR models with CTC-based Word Spotter**|Andrei Andrusenko et.al.|[2406.07096](http://arxiv.org/abs/2406.07096)|null|\n", "2406.07090": "|**2024-07-29**|**Spoken Language Corpora Augmentation with Domain-Specific Voice-Cloned Speech**|Mateusz Czy\u017cnikiewicz et.al.|[2406.07090](http://arxiv.org/abs/2406.07090)|null|\n", "2406.07060": "|**2024-06-11**|**Reading Miscue Detection in Primary School through Automatic Speech Recognition**|Lingyun Gao et.al.|[2406.07060](http://arxiv.org/abs/2406.07060)|null|\n", "2406.06729": "|**2024-06-10**|**Synthetic Query Generation using Large Language Models for Virtual Assistants**|Sonal Sannigrahi et.al.|[2406.06729](http://arxiv.org/abs/2406.06729)|null|\n", "2406.06664": "|**2024-06-13**|**ASTRA: Aligning Speech and Text Representations for Asr without Sampling**|Neeraj Gaur et.al.|[2406.06664](http://arxiv.org/abs/2406.06664)|null|\n", "2406.06329": "|**2024-06-10**|**A Parameter-efficient Language Extension Framework for 
Multilingual ASR**|Wei Liu et.al.|[2406.06329](http://arxiv.org/abs/2406.06329)|null|\n", "2406.05968": "|**2024-06-10**|**Prompting Large Language Models with Audio for General-Purpose Speech Summarization**|Wonjune Kang et.al.|[2406.05968](http://arxiv.org/abs/2406.05968)|**[link](https://github.com/wonjune-kang/llm-speech-summarization)**|\n", "2406.05806": "|**2024-07-18**|**Do Prompts Really Prompt? Exploring the Prompt Understanding Capability of Whisper**|Chih-Kai Yang et.al.|[2406.05806](http://arxiv.org/abs/2406.05806)|null|\n", "2406.05784": "|**2024-07-20**|**Optimizing Multi-Stuttered Speech Classification: Leveraging Whisper's Encoder for Efficient Parameter Reduction in Automated Assessment**|Huma Ameer et.al.|[2406.05784](http://arxiv.org/abs/2406.05784)|null|\n", "2406.05661": "|**2024-06-09**|**MS-HuBERT: Mitigating Pre-training and Inference Mismatch in Masked Language Modelling methods for learning Speech Representations**|Hemant Yadav et.al.|[2406.05661](http://arxiv.org/abs/2406.05661)|null|\n", "2406.04927": "|**2024-06-07**|**LLM-based speaker diarization correction: A generalizable approach**|Georgios Efstathiadis et.al.|[2406.04927](http://arxiv.org/abs/2406.04927)|**[link](https://github.com/GeorgeEfstathiadis/LLM-Diarize-ASR-Agnostic)**|\n", "2406.04791": "|**2024-07-02**|**Speaker-Smoothed kNN Speaker Adaptation for End-to-End ASR**|Shaojun Li et.al.|[2406.04791](http://arxiv.org/abs/2406.04791)|null|\n", "2406.06619": "|**2024-06-07**|**LoRA-Whisper: Parameter-Efficient and Extensible Multilingual ASR**|Zheshu Song et.al.|[2406.06619](http://arxiv.org/abs/2406.06619)|null|\n", "2406.04595": "|**2024-06-07**|**Pitch-Aware RNN-T for Mandarin Chinese Mispronunciation Detection and Diagnosis**|Xintong Wang et.al.|[2406.04595](http://arxiv.org/abs/2406.04595)|null|\n", "2406.04552": "|**2024-06-06**|**Flexible Multichannel Speech Enhancement for Noise-Robust Frontend**|Ante Juki\u0107 
et.al.|[2406.04552](http://arxiv.org/abs/2406.04552)|null|\n", "2406.04541": "|**2024-06-06**|**Label-Synchronous Neural Transducer for E2E Simultaneous Speech Translation**|Keqi Deng et.al.|[2406.04541](http://arxiv.org/abs/2406.04541)|**[link](https://github.com/D-Keqi/LS-Transducer-SST)**|\n", "2406.04512": "|**2024-06-06**|**To Distill or Not to Distill? On the Robustness of Robust Knowledge Distillation**|Abdul Waheed et.al.|[2406.04512](http://arxiv.org/abs/2406.04512)|null|\n", "2406.04432": "|**2024-06-06**|**LipGER: Visually-Conditioned Generative Error Correction for Robust Automatic Speech Recognition**|Sreyan Ghosh et.al.|[2406.04432](http://arxiv.org/abs/2406.04432)|**[link](https://github.com/sreyan88/lipger)**|\n", "2406.04269": "|**2024-06-06**|**Beyond Performance Plateaus: A Comprehensive Study on Scalability in Speech Enhancement**|Wangyou Zhang et.al.|[2406.04269](http://arxiv.org/abs/2406.04269)|**[link](https://github.com/emrys365/se-scaling)**|\n", "2406.04240": "|**2024-07-02**|**Hypernetworks for Personalizing ASR to Atypical Speech**|Max M\u00fcller-Eberstein et.al.|[2406.04240](http://arxiv.org/abs/2406.04240)|null|\n", "2406.04123": "|**2024-06-06**|**Helsinki Speech Challenge 2024**|Martin Ludvigsen et.al.|[2406.04123](http://arxiv.org/abs/2406.04123)|null|\n", "2406.03872": "|**2024-06-06**|**BLSP-Emo: Towards Empathetic Large Speech-Language Models**|Chen Wang et.al.|[2406.03872](http://arxiv.org/abs/2406.03872)|**[link](https://github.com/cwang621/blsp-emo)**|\n", "2406.03814": "|**2024-06-14**|**Improving Zero-Shot Chinese-English Code-Switching ASR with kNN-CTC and Gated Monolingual Datastores**|Jiaming Zhou et.al.|[2406.03814](http://arxiv.org/abs/2406.03814)|null|\n", "2406.03791": "|**2024-06-06**|**Speed of Light Exact Greedy Decoding for RNN-T Speech Recognition Models on GPU**|Daniel Galvez et.al.|[2406.03791](http://arxiv.org/abs/2406.03791)|null|\n", "2406.03274": "|**2024-06-11**|**Enhancing CTC-based speech recognition 
with diverse modeling units**|Shiyi Han et.al.|[2406.03274](http://arxiv.org/abs/2406.03274)|null|\n", "2406.03235": "|**2024-06-05**|**Error-preserving Automatic Speech Recognition of Young English Learners' Language**|Janick Michot et.al.|[2406.03235](http://arxiv.org/abs/2406.03235)|**[link](https://github.com/mict-zhaw/chall_e2e_stt)**|\n", "2406.03049": "|**2024-06-05**|**StreamSpeech: Simultaneous Speech-to-Speech Translation with Multi-task Learning**|Shaolei Zhang et.al.|[2406.03049](http://arxiv.org/abs/2406.03049)|**[link](https://github.com/ictnlp/streamspeech)**|\n", "2406.02950": "|**2024-06-05**|**4D ASR: Joint Beam Search Integrating CTC, Attention, Transducer, and Mask Predict Decoders**|Yui Sudo et.al.|[2406.02950](http://arxiv.org/abs/2406.02950)|null|\n", "2406.02925": "|**2024-06-15**|**Task Arithmetic can Mitigate Synthetic-to-Real Gap in Automatic Speech Recognition**|Hsuan Su et.al.|[2406.02925](http://arxiv.org/abs/2406.02925)|null|\n", "2406.02921": "|**2024-06-11**|**Text Injection for Neural Contextual Biasing**|Zhong Meng et.al.|[2406.02921](http://arxiv.org/abs/2406.02921)|null|\n", "2406.06582": "|**2024-06-25**|**Discrete Multimodal Transformers with a Pretrained Large Language Model for Mixed-Supervision Speech Processing**|Viet Anh Trinh et.al.|[2406.06582](http://arxiv.org/abs/2406.06582)|null|\n", "2406.02649": "|**2024-06-04**|**Keyword-Guided Adaptation of Automatic Speech Recognition**|Aviv Shamsian et.al.|[2406.02649](http://arxiv.org/abs/2406.02649)|null|\n", "2406.02166": "|**2024-06-04**|**Whistle: Data-Efficient Multilingual and Crosslingual Speech Recognition via Weakly Phonetic Supervision**|Saierdaer Yusuyin et.al.|[2406.02166](http://arxiv.org/abs/2406.02166)|**[link](https://github.com/thu-spmi/cat)**|\n", "2406.02004": "|**2024-06-05**|**Efficiently Train ASR Models that Memorize Less and Perform Better with Per-core Clipping**|Lun Wang et.al.|[2406.02004](http://arxiv.org/abs/2406.02004)|null|\n", "2406.01446": 
"|**2024-06-03**|**Enabling ASR for Low-Resource Languages: A Comprehensive Dataset Creation Approach**|Ara Yeroyan et.al.|[2406.01446](http://arxiv.org/abs/2406.01446)|null|\n", "2406.01314": "|**2024-06-03**|**Compute-Efficient Medical Image Classification with Softmax-Free Transformers and Sequence Normalization**|Firas Khader et.al.|[2406.01314](http://arxiv.org/abs/2406.01314)|null|\n", "2406.00899": "|**2024-06-02**|**YODAS: Youtube-Oriented Dataset for Audio and Speech**|Xinjian Li et.al.|[2406.00899](http://arxiv.org/abs/2406.00899)|null|\n", "2406.00522": "|**2024-06-01**|**Wav2Prompt: End-to-End Speech Prompt Generation and Tuning For LLM in Zero and Few-shot Learning**|Keqi Deng et.al.|[2406.00522](http://arxiv.org/abs/2406.00522)|null|\n", "2407.11982": "|**2024-05-31**|**Open the Data! Chuvash Datasets**|Nikolay Plotnikov et.al.|[2407.11982](http://arxiv.org/abs/2407.11982)|null|\n", "2405.18669": "|**2024-05-31**|**Zipper: A Multi-Tower Decoder Architecture for Fusing Modalities**|Vicky Zayats et.al.|[2405.18669](http://arxiv.org/abs/2405.18669)|null|\n", "2405.18537": "|**2024-05-28**|**Augmented Conversation with Embedded Speech-Driven On-the-Fly Referencing in AR**|Shivesh Jadon et.al.|[2405.18537](http://arxiv.org/abs/2405.18537)|null|\n", "2405.18346": "|**2024-05-28**|**Intelligent Clinical Documentation: Harnessing Generative AI for Patient-Centric Clinical Note Generation**|Anjanava Biswas et.al.|[2405.18346](http://arxiv.org/abs/2405.18346)|null|\n", "2405.17874": "|**2024-05-28**|**NUTS, NARS, and Speech**|D. 
van der Sluis et.al.|[2405.17874](http://arxiv.org/abs/2405.17874)|null|\n", "2405.17809": "|**2024-05-28**|**TransVIP: Speech to Speech Translation System with Voice and Isochrony Preservation**|Chenyang Le et.al.|[2405.17809](http://arxiv.org/abs/2405.17809)|null|\n", "2405.17376": "|**2024-05-27**|**Federating Dynamic Models using Early-Exit Architectures for Automatic Speech Recognition on Heterogeneous Clients**|Mohamed Nabih Ali et.al.|[2405.17376](http://arxiv.org/abs/2405.17376)|null|\n", "2405.17250": "|**2024-05-27**|**\"Pass the butter\": A study on desktop-classic multitasking robotic arm based on advanced YOLOv7 and BERT**|Haohua Que et.al.|[2405.17250](http://arxiv.org/abs/2405.17250)|null|\n", "2406.00038": "|**2024-05-27**|**ViSpeR: Multilingual Audio-Visual Speech Recognition**|Sanath Narayan et.al.|[2406.00038](http://arxiv.org/abs/2406.00038)|null|\n", "2405.16952": "|**2024-05-27**|**A Variance-Preserving Interpolation Approach for Diffusion Models with Applications to Single Channel Speech Enhancement and Recognition**|Zilu Guo et.al.|[2405.16952](http://arxiv.org/abs/2405.16952)|**[link](https://github.com/zelokuo/VPIDM)**|\n", "2405.15216": "|**2024-05-24**|**Denoising LM: Pushing the Limits of Error Correction Models for Speech Recognition**|Zijin Gu et.al.|[2405.15216](http://arxiv.org/abs/2405.15216)|null|\n", "2405.15097": "|**2024-05-23**|**Contrastive and Consistency Learning for Neural Noisy-Channel Model in Spoken Language Understanding**|Suyoung Kim et.al.|[2405.15097](http://arxiv.org/abs/2405.15097)|**[link](https://github.com/syoung7388/ccl)**|\n", "2405.14259": "|**2024-06-02**|**Let's Fuse Step by Step: A Generative Fusion Decoding Algorithm with LLMs for Multi-modal Text Recognition**|Chan-Jan Hsu et.al.|[2405.14259](http://arxiv.org/abs/2405.14259)|**[link](https://github.com/mtkresearch/generative-fusion-decoding)**|\n", "2405.14161": "|**2024-05-23**|**Self-Taught Recognizer: Toward Unsupervised Adaptation for Speech 
Foundation Models**|Yuchen Hu et.al.|[2405.14161](http://arxiv.org/abs/2405.14161)|**[link](https://github.com/yuchen005/star-adapt)**|\n", "2405.14093": "|**2024-05-23**|**A Survey on Vision-Language-Action Models for Embodied AI**|Yueen Ma et.al.|[2405.14093](http://arxiv.org/abs/2405.14093)|null|\n", "2405.13903": "|**2024-05-22**|**ST-Gait++: Leveraging spatio-temporal convolutions for gait-based emotion recognition on videos**|Maria Lu\u00edsa Lima et.al.|[2405.13903](http://arxiv.org/abs/2405.13903)|null|\n", "2405.13514": "|**2024-05-22**|**Joint Optimization of Streaming and Non-Streaming Automatic Speech Recognition with Multi-Decoder and Knowledge Distillation**|Muhammad Shakeel et.al.|[2405.13514](http://arxiv.org/abs/2405.13514)|null|\n", "2405.13477": "|**2024-05-22**|**A Near-Real-Time Processing Ego Speech Filtering Pipeline Designed for Speech Interruption During Human-Robot Interaction**|Yue Li et.al.|[2405.13477](http://arxiv.org/abs/2405.13477)|null|\n", "2405.13379": "|**2024-05-22**|**You don't understand me!: Comparing ASR results for L1 and L2 speakers of Swedish**|Ronald Cumbal et.al.|[2405.13379](http://arxiv.org/abs/2405.13379)|null|\n", "2405.13344": "|**2024-05-22**|**Contextualized Automatic Speech Recognition with Dynamic Vocabulary**|Yui Sudo et.al.|[2405.13344](http://arxiv.org/abs/2405.13344)|null|\n", "2405.13166": "|**2024-05-28**|**FairLENS: Assessing Fairness in Law Enforcement Speech Recognition**|Yicheng Wang et.al.|[2405.13166](http://arxiv.org/abs/2405.13166)|null|\n", "2405.13162": "|**2024-05-21**|**Non-autoregressive real-time Accent Conversion model with voice cloning**|Vladimir Nechaev et.al.|[2405.13162](http://arxiv.org/abs/2405.13162)|null|\n", "2405.12815": "|**2024-05-21**|**Could a Computer Architect Understand our Brain?**|Valentin Puente-Varona et.al.|[2405.12815](http://arxiv.org/abs/2405.12815)|null|\n", "2405.12609": "|**2024-07-01**|**Mamba in Speech: Towards an Alternative to Self-Attention**|Xiangyu Zhang 
et.al.|[2405.12609](http://arxiv.org/abs/2405.12609)|null|\n", "2405.12018": "|**2024-05-20**|**Continuous Sign Language Recognition with Adapted Conformer via Unsupervised Pretraining**|Neena Aloysius et.al.|[2405.12018](http://arxiv.org/abs/2405.12018)|null|\n", "2405.11078": "|**2024-05-17**|**Acoustic modeling for Overlapping Speech Recognition: JHU Chime-5 Challenge System**|Vimal Manohar et.al.|[2405.11078](http://arxiv.org/abs/2405.11078)|**[link](https://github.com/fgnt/nara_wpe)**|\n", "2405.10025": "|**2024-05-16**|**Listen Again and Choose the Right Answer: A New Paradigm for Automatic Speech Recognition with Large Language Models**|Yuchen Hu et.al.|[2405.10025](http://arxiv.org/abs/2405.10025)|null|\n", "2405.09708": "|**2024-05-15**|**No More Mumbles: Enhancing Robot Intelligibility through Speech Adaptation**|Qiaoqiao Ren et.al.|[2405.09708](http://arxiv.org/abs/2405.09708)|**[link](https://github.com/qiaoqiao2323/robot-speech-intelligibility)**|\n", "2405.09470": "|**2024-05-15**|**Towards Evaluating the Robustness of Automatic Speech Recognition Systems via Audio Style Transfer**|Weifei Jin et.al.|[2405.09470](http://arxiv.org/abs/2405.09470)|null|\n", "2405.13018": "|**2024-05-15**|**Continued Pretraining for Domain Adaptation of Wav2vec2.0 in Automatic Speech Recognition for Elementary Math Classroom Settings**|Ahmed Adel Attia et.al.|[2405.13018](http://arxiv.org/abs/2405.13018)|null|\n", "2405.19342": "|**2024-05-14**|**Sonos Voice Control Bias Assessment Dataset: A Methodology for Demographic Bias Assessment in Voice Assistants**|Chlo\u00e9 Sekkat et.al.|[2405.19342](http://arxiv.org/abs/2405.19342)|null|\n", "2405.08402": "|**2024-05-14**|**Investigating the 'Autoencoder Behavior' in Speech Self-Supervised Models: a focus on HuBERT's Pretraining**|Valentin Vielzeuf et.al.|[2405.08402](http://arxiv.org/abs/2405.08402)|null|\n", "2405.08295": "|**2024-05-31**|**SpeechVerse: A Large-scale Generalizable Audio Language Model**|Nilaksh Das 
et.al.|[2405.08295](http://arxiv.org/abs/2405.08295)|null|\n", "2405.07442": "|**2024-06-07**|**Rene: A Pre-trained Multi-modal Architecture for Auscultation of Respiratory Diseases**|Pengfei Zhang et.al.|[2405.07442](http://arxiv.org/abs/2405.07442)|**[link](https://github.com/zpforlove/rene)**|\n", "2405.07354": "|**2024-05-12**|**SoccerNet-Echoes: A Soccer Game Audio Commentary Dataset**|Sushant Gautam et.al.|[2405.07354](http://arxiv.org/abs/2405.07354)|**[link](https://github.com/SoccerNet/sn-echoes)**|\n", "2405.13001": "|**2024-05-12**|**Large Language Models for Education: A Survey**|Hanyi Xu et.al.|[2405.13001](http://arxiv.org/abs/2405.13001)|null|\n", "2405.06368": "|**2024-07-22**|**DP-DyLoRA: Fine-Tuning Transformer-Based Models On-Device under Differentially Private Federated Learning using Dynamic Low-Rank Adaptation**|Jie Xu et.al.|[2405.06368](http://arxiv.org/abs/2405.06368)|null|\n", "2405.06150": "|**2024-05-10**|**Lost in Transcription: Identifying and Quantifying the Accuracy Biases of Automatic Speech Recognition Systems Against Disfluent Speech**|Dena Mujtaba et.al.|[2405.06150](http://arxiv.org/abs/2405.06150)|null|\n", "2405.06134": "|**2024-07-17**|**Muting Whisper: A Universal Acoustic Adversarial Attack on Speech Foundation Models**|Vyas Raina et.al.|[2405.06134](http://arxiv.org/abs/2405.06134)|**[link](https://github.com/rainavyas/prepend_acoustic_attack)**|\n", "2405.05498": "|**2024-05-09**|**The RoyalFlush Automatic Speech Diarization and Recognition System for In-Car Multi-Channel Automatic Speech Recognition Challenge**|Jingguang Tian et.al.|[2405.05498](http://arxiv.org/abs/2405.05498)|null|\n", "2405.04296": "|**2024-05-07**|**Open Implementation and Study of BEST-RQ for Speech Processing**|Ryan Whetten et.al.|[2405.04296](http://arxiv.org/abs/2405.04296)|**[link](https://github.com/speechbrain/speechbrain)**|\n", "2405.03484": "|**2024-05-06**|**Whispy: Adapting STT Whisper Models to Real-Time Environments**|Antonio Bevilacqua 
et.al.|[2405.03484](http://arxiv.org/abs/2405.03484)|null|\n", "2405.03152": "|**2024-05-06**|**MMGER: Multi-modal and Multi-granularity Generative Error Correction with LLM for Joint Accent and Speech Recognition**|Bingshen Mu et.al.|[2405.03152](http://arxiv.org/abs/2405.03152)|null|\n", "2405.02995": "|**2024-05-11**|**Analysis about Theoretical Foundations for Method to Enhancing ASR Performance using OCR Word Frequency Differences**|Kyudan Jung et.al.|[2405.02995](http://arxiv.org/abs/2405.02995)|null|\n", "2405.02578": "|**2024-05-04**|**Mixat: A Data Set of Bilingual Emirati-English Speech**|Maryam Al Ali et.al.|[2405.02578](http://arxiv.org/abs/2405.02578)|**[link](https://github.com/mbzuai-nlp/mixat)**|\n", "2406.02566": "|**2024-05-03**|**Combining X-Vectors and Bayesian Batch Active Learning: Two-Stage Active Learning Pipeline for Speech Recognition**|Ognjen Kundacina et.al.|[2406.02566](http://arxiv.org/abs/2406.02566)|null|\n", "2405.02132": "|**2024-05-06**|**Unveiling the Potential of LLM-Based ASR on Chinese Open-Source Datasets**|Xuelong Geng et.al.|[2405.02132](http://arxiv.org/abs/2405.02132)|null|\n", "2406.02565": "|**2024-05-02**|**Sequence-to-sequence models in peer-to-peer learning: A practical application**|Robert \u0160ajina et.al.|[2406.02565](http://arxiv.org/abs/2406.02565)|null|\n", "2405.01293": "|**2024-05-02**|**Low-resource speech recognition and dialect identification of Irish in a multi-task framework**|Liam Lonergan et.al.|[2405.01293](http://arxiv.org/abs/2405.01293)|null|\n", "2405.01207": "|**2024-05-02**|**Improving Membership Inference in ASR Model Auditing with Perturbed Loss Features**|Francisco Teixeira et.al.|[2405.01207](http://arxiv.org/abs/2405.01207)|null|\n", "2405.01004": "|**2024-05-02**|**Deep Learning Models in Speech Recognition: Measuring GPU Energy Consumption, Impact of Noise and Model Quantization for Edge Deployment**|Aditya Chakravarty 
et.al.|[2405.01004](http://arxiv.org/abs/2405.01004)|**[link](https://github.com/zzadiues3338/asr-energy-jetson)**|\n", "2405.00966": "|**2024-05-02**|**Efficient Compression of Multitask Multilingual Speech Models**|Thomas Palmeira Ferraz et.al.|[2405.00966](http://arxiv.org/abs/2405.00966)|null|\n", "2405.01601": "|**2024-05-01**|**Efficient Sample-Specific Encoder Perturbations**|Yassir Fathullah et.al.|[2405.01601](http://arxiv.org/abs/2405.01601)|null|\n", "2405.00307": "|**2024-05-01**|**Active Learning with Task Adaptation Pre-training for Speech Emotion Recognition**|Dongyuan Li et.al.|[2405.00307](http://arxiv.org/abs/2405.00307)|null|\n", "2405.00223": "|**2024-07-24**|**Confides: A Visual Analytics Solution for Automated Speech Recognition Analysis and Exploration**|Sunwoo Ha et.al.|[2405.00223](http://arxiv.org/abs/2405.00223)|null|\n", "2404.19310": "|**2024-05-09**|**Does Whisper understand Swiss German? An automatic, qualitative, and human evaluation**|Eyal Liron Dolev et.al.|[2404.19310](http://arxiv.org/abs/2404.19310)|null|\n", "2404.19214": "|**2024-04-30**|**EfficientASR: Speech Recognition Network Compression via Attention Redundancy and Chunk-Level FFN Optimization**|Jianzong Wang et.al.|[2404.19214](http://arxiv.org/abs/2404.19214)|null|\n", "2404.18739": "|**2024-04-29**|**Towards Dog Bark Decoding: Leveraging Human Speech Processing for Automated Bark Classification**|Artem Abzaliev et.al.|[2404.18739](http://arxiv.org/abs/2404.18739)|null|\n", "2406.02563": "|**2024-04-29**|**A cost minimization approach to fix the vocabulary size in a tokenizer for an End-to-End ASR system**|Sunil Kumar Kopparapu et.al.|[2406.02563](http://arxiv.org/abs/2406.02563)|null|\n", "2404.17394": "|**2024-04-26**|**Child Speech Recognition in Human-Robot Interaction: Problem Solved?**|Ruben Janssens et.al.|[2404.17394](http://arxiv.org/abs/2404.17394)|null|\n", "2404.16743": "|**2024-04-26**|**Automatic Speech Recognition System-Independent Word Error Rate 
Estimation**|Chanho Park et.al.|[2404.16743](http://arxiv.org/abs/2404.16743)|null|\n", "2404.16547": "|**2024-04-25**|**Developing Acoustic Models for Automatic Speech Recognition in Swedish**|Giampiero Salvi et.al.|[2404.16547](http://arxiv.org/abs/2404.16547)|null|\n", "2404.16407": "|**2024-04-25**|**U2++ MoE: Scaling 4.7x parameters with minimal impact on RTF**|Xingchen Song et.al.|[2404.16407](http://arxiv.org/abs/2404.16407)|null|\n", "2404.16112": "|**2024-04-24**|**Mamba-360: Survey of State Space Models as Transformer Alternative for Long Sequence Modelling: Methods, Applications, and Challenges**|Badri Narayana Patro et.al.|[2404.16112](http://arxiv.org/abs/2404.16112)|**[link](https://github.com/badripatro/mamba360)**|\n", "2406.02562": "|**2024-04-24**|**Gated Low-rank Adaptation for personalized Code-Switching Automatic Speech Recognition on the low-spec devices**|Gwantae Kim et.al.|[2406.02562](http://arxiv.org/abs/2406.02562)|null|\n", "2404.15501": "|**2024-04-23**|**Killkan: The Automatic Speech Recognition Dataset for Kichwa with Morphosyntactic Information**|Chihiro Taguchi et.al.|[2404.15501](http://arxiv.org/abs/2404.15501)|**[link](https://github.com/ctaguchi/killkan)**|\n", "2406.02561": "|**2024-04-23**|**Breaking Walls: Pioneering Automatic Speech Recognition for Central Kurdish: End-to-End Transformer Paradigm**|Abdulhady Abas Abdullah et.al.|[2406.02561](http://arxiv.org/abs/2406.02561)|null|\n", "2404.14860": "|**2024-04-23**|**Rethinking Processing Distortions: Disentangling the Impact of Speech Enhancement Errors on Speech Recognition Performance**|Tsubasa Ochiai et.al.|[2404.14860](http://arxiv.org/abs/2404.14860)|null|\n", "2404.14605": "|**2024-04-22**|**Assessment of Sign Language-Based versus Touch-Based Input for Deaf Users Interacting with Intelligent Personal Assistants**|Nina Tran et.al.|[2404.14605](http://arxiv.org/abs/2404.14605)|null|\n", "2406.02560": "|**2024-07-18**|**Less Peaky and More Accurate CTC Forced Alignment 
by Label Priors**|Ruizhe Huang et.al.|[2406.02560](http://arxiv.org/abs/2406.02560)|**[link](https://github.com/huangruizhe/audio)**|\n", "2404.14024": "|**2024-04-22**|**Exploring neural oscillations during speech perception via surrogate gradient spiking neural networks**|Alexandre Bittar et.al.|[2404.14024](http://arxiv.org/abs/2404.14024)|null|\n", "2404.13362": "|**2024-04-20**|**Semantically Corrected Amharic Automatic Speech Recognition**|Samuael Adnew et.al.|[2404.13362](http://arxiv.org/abs/2404.13362)|**[link](https://github.com/samuael/postprocessed_geez_asr)**|\n", "2404.12888": "|**2024-04-19**|**Learn2Talk: 3D Talking Face Learns from 2D Talking Face**|Yixiang Zhuang et.al.|[2404.12888](http://arxiv.org/abs/2404.12888)|null|\n", "2404.12628": "|**2024-04-19**|**Efficient infusion of self-supervised representations in Automatic Speech Recognition**|Darshan Prabhu et.al.|[2404.12628](http://arxiv.org/abs/2404.12628)|null|\n", "2404.15168": "|**2024-04-18**|**Artificial Neural Networks to Recognize Speakers Division from Continuous Bengali Speech**|Hasmot Ali et.al.|[2404.15168](http://arxiv.org/abs/2404.15168)|null|\n", "2404.10922": "|**2024-04-16**|**Teaching a Multilingual Large Language Model to Understand Multilingual Speech via Multi-Instructional Training**|Pavel Denisov et.al.|[2404.10922](http://arxiv.org/abs/2404.10922)|**[link](https://github.com/akreal/bloomzmms)**|\n", "2404.09841": "|**2024-04-16**|**Anatomy of Industrial Scale Multilingual ASR**|Francis McCann Ramirez et.al.|[2404.09841](http://arxiv.org/abs/2404.09841)|null|\n", "2404.09754": "|**2024-04-15**|**Resilience of Large Language Models for Noisy Instructions**|Bin Wang et.al.|[2404.09754](http://arxiv.org/abs/2404.09754)|null|\n", "2406.09425": "|**2024-04-13**|**SGPRS: Seamless GPU Partitioning Real-Time Scheduler for Periodic Deep Learning Workloads**|Amir Fakhim Babaei et.al.|[2406.09425](http://arxiv.org/abs/2406.09425)|null|\n", "2404.08424": "|**2024-04-12**|**Comparing 
Apples to Oranges: LLM-powered Multimodal Intention Prediction in an Object Categorization Task**|Hassan Ali et.al.|[2404.08424](http://arxiv.org/abs/2404.08424)|null|\n", "2404.08368": "|**2024-07-26**|**Automatic Speech Recognition Advancements for Indigenous Languages of the Americas**|Monica Romero et.al.|[2404.08368](http://arxiv.org/abs/2404.08368)|null|\n", "2404.07575": "|**2024-04-12**|**An Effective Automated Speaking Assessment Approach to Mitigating Data Scarcity and Imbalanced Distribution**|Tien-Hong Lo et.al.|[2404.07575](http://arxiv.org/abs/2404.07575)|null|\n", "2404.07341": "|**2024-04-12**|**Conformer-1: Robust ASR via Large-Scale Semisupervised Bootstrapping**|Kevin Zhang et.al.|[2404.07341](http://arxiv.org/abs/2404.07341)|null|\n", "2404.08011": "|**2024-04-10**|**An inclusive review on deep learning techniques and their scope in handwriting recognition**|Sukhdeep Singh et.al.|[2404.08011](http://arxiv.org/abs/2404.08011)|null|\n", "2404.06079": "|**2024-04-10**|**The X-LANCE Technical Report for Interspeech 2024 Speech Processing Using Discrete Speech Unit Challenge**|Yiwei Guo et.al.|[2404.06079](http://arxiv.org/abs/2404.06079)|null|\n", "2404.05659": "|**2024-05-28**|**VietMed: A Dataset and Benchmark for Automatic Speech Recognition of Vietnamese in the Medical Domain**|Khai Le-Duc et.al.|[2404.05659](http://arxiv.org/abs/2404.05659)|**[link](https://github.com/leduckhai/multimed)**|\n", "2404.04769": "|**2024-04-07**|**Safeguarding Voice Privacy: Harnessing Near-Ultrasonic Interference To Protect Against Unauthorized Audio Recording**|Forrest McKee et.al.|[2404.04769](http://arxiv.org/abs/2404.04769)|null|\n", "2404.04295": "|**2024-04-04**|**Transducers with Pronunciation-aware Embeddings for Automatic Speech Recognition**|Hainan Xu et.al.|[2404.04295](http://arxiv.org/abs/2404.04295)|null|\n", "2404.03073": "|**2024-04-03**|**Mai Ho'om\u0101una i ka 'Ai: Language Models Improve Automatic Speech Recognition in Hawaiian**|Kaavya 
Chaparala et.al.|[2404.03073](http://arxiv.org/abs/2404.03073)|null|\n", "2404.02408": "|**2024-04-03**|**CMULAB: An Open-Source Framework for Training and Deployment of Natural Language Processing Models**|Zaid Sheikh et.al.|[2404.02408](http://arxiv.org/abs/2404.02408)|**[link](https://github.com/neulab/cmulab)**|\n", "2404.02098": "|**2024-04-02**|**BRAVEn: Improving Self-Supervised Pre-training for Visual and Auditory Speech Recognition**|Alexandros Haliassos et.al.|[2404.02098](http://arxiv.org/abs/2404.02098)|**[link](https://github.com/ahaliassos/raven)**|\n", "2404.02052": "|**2024-04-02**|**Noise Masking Attacks and Defenses for Pretrained Speech Models**|Matthew Jagielski et.al.|[2404.02052](http://arxiv.org/abs/2404.02052)|null|\n", "2404.01991": "|**2024-04-02**|**Kallaama: A Transcribed Speech Dataset about Agriculture in the Three Most Widely Spoken Languages in Senegal**|Elodie Gauthier et.al.|[2404.01991](http://arxiv.org/abs/2404.01991)|**[link](https://github.com/gauthelo/kallaama-speech-dataset)**|\n", "2404.01737": "|**2024-04-02**|**Transfer Learning from Whisper for Microscopic Intelligibility Prediction**|Paul Best et.al.|[2404.01737](http://arxiv.org/abs/2404.01737)|null|\n", "2404.07226": "|**2024-03-31**|**Houston we have a Divergence: A Subgroup Performance Analysis of ASR Models**|Alkis Koudounas et.al.|[2404.07226](http://arxiv.org/abs/2404.07226)|null|\n", "2403.20262": "|**2024-07-22**|**ELITR-Bench: A Meeting Assistant Benchmark for Long-Context Language Models**|Thibaut Thonet et.al.|[2403.20262](http://arxiv.org/abs/2403.20262)|**[link](https://github.com/utter-project/elitr-bench)**|\n", "2403.19822": "|**2024-03-28**|**Multi-Stage Multi-Modal Pre-Training for Automatic Speech Recognition**|Yash Jain et.al.|[2403.19822](http://arxiv.org/abs/2403.19822)|null|\n", "2403.19224": "|**2024-03-28**|**Emotion Neural Transducer for Fine-Grained Speech Emotion Recognition**|Siyuan Shen 
et.al.|[2403.19224](http://arxiv.org/abs/2403.19224)|**[link](https://github.com/ecnu-cross-innovation-lab/ent)**|\n", "2403.19207": "|**2024-03-28**|**LV-CTC: Non-autoregressive ASR with CTC and latent variable models**|Yuya Fujita et.al.|[2403.19207](http://arxiv.org/abs/2403.19207)|null|\n", "2403.18721": "|**2024-06-04**|**PhysicsAssistant: An LLM-Powered Interactive Learning Robot for Physics Lab Investigations**|Ehsan Latif et.al.|[2403.18721](http://arxiv.org/abs/2403.18721)|null|\n", "2406.02555": "|**2024-03-27**|**PhoWhisper: Automatic Speech Recognition for Vietnamese**|Thanh-Thien Le et.al.|[2406.02555](http://arxiv.org/abs/2406.02555)|**[link](https://github.com/vinairesearch/phowhisper)**|\n", "2403.18182": "|**2024-03-27**|**ZAEBUC-Spoken: A Multilingual Multidialectal Arabic-English Speech Corpus**|Injy Hamed et.al.|[2403.18182](http://arxiv.org/abs/2403.18182)|null|\n", "2403.17645": "|**2024-04-11**|**DANCER: Entity Description Augmented Named Entity Corrector for Automatic Speech Recognition**|Yi-Cheng Wang et.al.|[2403.17645](http://arxiv.org/abs/2403.17645)|null|\n", "2403.17363": "|**2024-03-26**|**Extracting Biomedical Entities from Noisy Audio Transcripts**|Nima Ebadi et.al.|[2403.17363](http://arxiv.org/abs/2403.17363)|null|\n", "2403.19709": "|**2024-03-25**|**Hierarchical Recurrent Adapters for Efficient Multi-Task Adaptation of Large Speech Models**|Tsendsuren Munkhdalai et.al.|[2403.19709](http://arxiv.org/abs/2403.19709)|null|\n", "2403.16655": "|**2024-03-25**|**Grammatical vs Spelling Error Correction: An Investigation into the Responsiveness of Transformer-based Language Models using BART and MarianMT**|Rohit Raju et.al.|[2403.16655](http://arxiv.org/abs/2403.16655)|null|\n", "2403.15510": "|**2024-03-22**|**Privacy-Preserving End-to-End Spoken Language Understanding**|Yinggui Wang et.al.|[2403.15510](http://arxiv.org/abs/2403.15510)|null|\n", "2403.14438": "|**2024-03-26**|**A Multimodal Approach to Device-Directed Speech Detection 
with Large Language Models**|Dominik Wagner et.al.|[2403.14438](http://arxiv.org/abs/2403.14438)|null|\n", "2403.14402": "|**2024-03-21**|**XLAVS-R: Cross-Lingual Audio-Visual Speech Representation Learning for Noise-Robust Speech Perception**|HyoJung Han et.al.|[2403.14402](http://arxiv.org/abs/2403.14402)|null|\n", "2403.14168": "|**2024-06-04**|**M$^3$AV: A Multimodal, Multigenre, and Multipurpose Audio-Visual Academic Lecture Dataset**|Zhe Chen et.al.|[2403.14168](http://arxiv.org/abs/2403.14168)|null|\n", "2403.13960": "|**2024-03-20**|**Open Access NAO (OAN): a ROS2-based software framework for HRI applications with the NAO robot**|Antonio Bono et.al.|[2403.13960](http://arxiv.org/abs/2403.13960)|null|\n", "2403.13465": "|**2024-03-20**|**BanglaNum -- A Public Dataset for Bengali Digit Recognition from Speech**|Mir Sayeed Mohammad et.al.|[2403.13465](http://arxiv.org/abs/2403.13465)|null|\n", "2403.13423": "|**2024-03-20**|**Advanced Long-Content Speech Recognition With Factorized Neural Transducer**|Xun Gong et.al.|[2403.13423](http://arxiv.org/abs/2403.13423)|null|\n", "2403.15469": "|**2024-03-20**|**Isometric Neural Machine Translation using Phoneme Count Ratio Reward-based Reinforcement Learning**|Shivam Ratnakant Mhaskar et.al.|[2403.15469](http://arxiv.org/abs/2403.15469)|null|\n", "2403.12821": "|**2024-03-21**|**FlowerFormer: Empowering Neural Architecture Encoding using a Flow-aware Graph Transformer**|Dongyeong Hwang et.al.|[2403.12821](http://arxiv.org/abs/2403.12821)|**[link](https://github.com/y0ngjaenius/cvpr2024_flowerformer)**|\n", "2403.12477": "|**2024-03-19**|**Real-time Speech Extraction Using Spatially Regularized Independent Low-rank Matrix Analysis and Rank-constrained Spatial Covariance Matrix Estimation**|Yuto Ishikawa et.al.|[2403.12477](http://arxiv.org/abs/2403.12477)|null|\n", "2403.12273": "|**2024-03-18**|**Multimodal Human-Autonomous Agents Interaction Using Pre-Trained Language and Visual Foundation Models**|Linus Nwankwo 
et.al.|[2403.12273](http://arxiv.org/abs/2403.12273)|null|\n", "2403.11578": "|**2024-03-18**|**AdaMER-CTC: Connectionist Temporal Classification with Adaptive Maximum Entropy Regularization for Automatic Speech Recognition**|SooHwan Eom et.al.|[2403.11578](http://arxiv.org/abs/2403.11578)|null|\n", "2403.15442": "|**2024-07-21**|**Artificial Intelligence for Cochlear Implants: Review of Strategies, Challenges, and Perspectives**|Billel Essaid et.al.|[2403.15442](http://arxiv.org/abs/2403.15442)|null|\n", "2403.10961": "|**2024-03-16**|**Energy-Based Models with Applications to Speech and Language Processing**|Zhijian Ou et.al.|[2403.10961](http://arxiv.org/abs/2403.10961)|null|\n", "2403.10937": "|**2024-03-16**|**Initial Decoding with Minimally Augmented Language Model for Improved Lattice Rescoring in Low Resource ASR**|Savitha Murthy et.al.|[2403.10937](http://arxiv.org/abs/2403.10937)|null|\n", "2403.10420": "|**2024-03-15**|**Neural Networks Hear You Loud And Clear: Hearing Loss Compensation Using Deep Neural Networks**|Peter Leer et.al.|[2403.10420](http://arxiv.org/abs/2403.10420)|null|\n", "2403.09753": "|**2024-03-14**|**SpokeN-100: A Cross-Lingual Benchmarking Dataset for The Classification of Spoken Numbers in Different Languages**|Ren\u00e9 Groh et.al.|[2403.09753](http://arxiv.org/abs/2403.09753)|**[link](https://github.com/ankilab/spoken-100)**|\n", "2403.09298": "|**2024-03-14**|**More than words: Advancements and challenges in speech recognition for singing**|Anna Kruspe et.al.|[2403.09298](http://arxiv.org/abs/2403.09298)|null|\n", "2405.12983": "|**2024-03-14**|**Multilingual Audio-Visual Speech Recognition with Hybrid CTC/RNN-T Fast Conformer**|Maxime Burchi et.al.|[2405.12983](http://arxiv.org/abs/2405.12983)|null|\n", "2403.08258": "|**2024-05-21**|**Skipformer: A Skip-and-Recover Strategy for Efficient Speech Recognition**|Wenjing Zhu et.al.|[2403.08258](http://arxiv.org/abs/2403.08258)|null|\n", "2403.08196": "|**2024-03-13**|**SpeechColab 
Leaderboard: An Open-Source Platform for Automatic Speech Recognition Evaluation**|Jiayu Du et.al.|[2403.08196](http://arxiv.org/abs/2403.08196)|**[link](https://github.com/speechcolab/leaderboard)**|\n", "2403.08187": "|**2024-03-13**|**Automatic Speech Recognition (ASR) for the Diagnosis of pronunciation of Speech Sound Disorders in Korean children**|Taekyung Ahn et.al.|[2403.08187](http://arxiv.org/abs/2403.08187)|null|\n", "2403.08011": "|**2024-03-12**|**Gujarati-English Code-Switching Speech Recognition using ensemble prediction of spoken language**|Yash Sharma et.al.|[2403.08011](http://arxiv.org/abs/2403.08011)|null|\n", "2403.07767": "|**2024-03-12**|**Beyond the Labels: Unveiling Text-Dependency in Paralinguistic Speech Recognition Datasets**|Jan Pe\u0161\u00e1n et.al.|[2403.07767](http://arxiv.org/abs/2403.07767)|null|\n", "2403.07947": "|**2024-03-11**|**The evaluation of a code-switched Sepedi-English automatic speech recognition system**|Amanda Phaladi et.al.|[2403.07947](http://arxiv.org/abs/2403.07947)|null|\n", "2403.06734": "|**2024-03-11**|**Real-Time Multimodal Cognitive Assistant for Emergency Medical Services**|Keshara Weerasinghe et.al.|[2403.06734](http://arxiv.org/abs/2403.06734)|**[link](https://github.com/uva-dsa/ems-pipeline)**|\n", "2403.06387": "|**2024-03-11**|**Towards Decoupling Frontend Enhancement and Backend Recognition in Monaural Robust ASR**|Yufeng Yang et.al.|[2403.06387](http://arxiv.org/abs/2403.06387)|null|\n", "2403.06260": "|**2024-03-10**|**SCORE: Self-supervised Correspondence Fine-tuning for Improved Content Representations**|Amit Meghanani et.al.|[2403.06260](http://arxiv.org/abs/2403.06260)|**[link](https://github.com/trikaldarshi/score_finetuning)**|\n", "2403.05887": "|**2024-03-09**|**Aligning Speech to Languages to Enhance Code-switching Speech Recognition**|Hexin Liu et.al.|[2403.05887](http://arxiv.org/abs/2403.05887)|null|\n", "2403.07937": "|**2024-03-08**|**Speech Robust Bench: A Robustness Benchmark For 
Speech Recognition**|Muhammad A. Shah et.al.|[2403.07937](http://arxiv.org/abs/2403.07937)|null|\n", "2403.04445": "|**2024-03-07**|**Classist Tools: Social Class Correlates with Performance in NLP**|Amanda Cercas Curry et.al.|[2403.04445](http://arxiv.org/abs/2403.04445)|null|\n", "2403.04280": "|**2024-05-30**|**A New Benchmark for Evaluating Automatic Speech Recognition in the Arabic Call Domain**|Qusai Abo Obaidah et.al.|[2403.04280](http://arxiv.org/abs/2403.04280)|null|\n", "2403.04245": "|**2024-03-07**|**A Study of Dropout-Induced Modality Bias on Robustness to Missing Video Frames for Audio-Visual Speech Recognition**|Yusheng Dai et.al.|[2403.04245](http://arxiv.org/abs/2403.04245)|**[link](https://github.com/dalision/modalbiasavsr)**|\n", "2403.03538": "|**2024-03-06**|**RADIA -- Radio Advertisement Detection with Intelligent Analytics**|Jorge \u00c1lvarez et.al.|[2403.03538](http://arxiv.org/abs/2403.03538)|null|\n", "2403.03522": "|**2024-03-13**|**Non-verbal information in spontaneous speech -- towards a new framework of analysis**|Tirza Biron et.al.|[2403.03522](http://arxiv.org/abs/2403.03522)|null|\n", "2403.02938": "|**2024-03-05**|**AIx Speed: Playback Speed Optimization Using Listening Comprehension of Speech Recognition Models**|Kazuki Kawamura et.al.|[2403.02938](http://arxiv.org/abs/2403.02938)|null|\n", "2403.02288": "|**2024-03-04**|**PixIT: Joint Training of Speaker Diarization and Speech Separation from Real-world Multi-speaker Recordings**|Joonas Kalda et.al.|[2403.02288](http://arxiv.org/abs/2403.02288)|**[link](https://github.com/joonaskalda/pixit)**|\n", "2403.02173": "|**2024-03-04**|**What has LeBenchmark Learnt about French Syntax?**|Zdravko Dugonji\u0107 et.al.|[2403.02173](http://arxiv.org/abs/2403.02173)|null|\n", "2403.02010": "|**2024-03-04**|**SA-SOT: Speaker-Aware Serialized Output Training for Multi-Talker ASR**|Zhiyun Fan et.al.|[2403.02010](http://arxiv.org/abs/2403.02010)|null|\n", "2403.01983": 
"|**2024-03-04**|**Language and Speech Technology for Central Kurdish Varieties**|Sina Ahmadi et.al.|[2403.01983](http://arxiv.org/abs/2403.01983)|**[link](https://github.com/sinaahmadi/cordi)**|\n", "2403.18843": "|**2024-03-04**|**JEP-KD: Joint-Embedding Predictive Architecture Based Knowledge Distillation for Visual Speech Recognition**|Chang Sun et.al.|[2403.18843](http://arxiv.org/abs/2403.18843)|null|\n", "2403.01369": "|**2024-03-03**|**A Closer Look at Wav2Vec2 Embeddings for On-Device Single-Channel Speech Enhancement**|Ravi Shankar et.al.|[2403.01369](http://arxiv.org/abs/2403.01369)|null|\n", "2403.05583": "|**2024-03-02**|**A Cross-Modal Approach to Silent Speech with LLM-Enhanced Recognition**|Tyler Benster et.al.|[2403.05583](http://arxiv.org/abs/2403.05583)|**[link](https://github.com/tbenst/silent_speech)**|\n", "2403.01255": "|**2024-04-18**|**Automatic Speech Recognition using Advanced Deep Learning Approaches: A survey**|Hamza Kheddar et.al.|[2403.01255](http://arxiv.org/abs/2403.01255)|null|\n", "2403.00370": "|**2024-03-01**|**Post-decoder Biasing for End-to-End Speech Recognition of Multi-turn Medical Interview**|Heyang Liu et.al.|[2403.00370](http://arxiv.org/abs/2403.00370)|null|\n", "2402.19443": "|**2024-02-29**|**Probing the Information Encoded in Neural-based Acoustic Models of Automatic Speech Recognition Systems**|Quentin Raymondaud et.al.|[2402.19443](http://arxiv.org/abs/2402.19443)|null|\n", "2402.18923": "|**2024-02-29**|**Inappropriate Pause Detection In Dysarthric Speech Using Large-Scale Speech Recognition**|Jeehyun Lee et.al.|[2402.18923](http://arxiv.org/abs/2402.18923)|null|\n", "2402.18275": "|**2024-06-04**|**Exploration of Adapter for Noise Robust Automatic Speech Recognition**|Hao Shi et.al.|[2402.18275](http://arxiv.org/abs/2402.18275)|null|\n", "2402.17954": "|**2024-06-19**|**Twists, Humps, and Pebbles: Multilingual Speech Recognition Models Exhibit Gender Performance Gaps**|Giuseppe Attanasio 
et.al.|[2402.17954](http://arxiv.org/abs/2402.17954)|**[link](https://github.com/g8a9/multilingual-asr-gender-gap)**|\n", "2402.17189": "|**2024-02-27**|**An Effective Mixture-Of-Experts Approach For Code-Switching Speech Recognition Leveraging Encoder Disentanglement**|Tzu-Ting Yang et.al.|[2402.17189](http://arxiv.org/abs/2402.17189)|null|\n", "2402.17184": "|**2024-02-27**|**Extreme Encoder Output Frame Rate Reduction: Improving Computational Latencies of Large End-to-End Models**|Rohit Prabhavalkar et.al.|[2402.17184](http://arxiv.org/abs/2402.17184)|null|\n", "2402.15733": "|**2024-04-01**|**ArEEG_Chars: Dataset for Envisioned Speech Recognition using EEG for Arabic Characters**|Hazem Darwish et.al.|[2402.15733](http://arxiv.org/abs/2402.15733)|null|\n", "2402.15151": "|**2024-05-14**|**Where Visual Speech Meets Language: VSP-LLM Framework for Efficient and Context-Aware Visual Speech Processing**|Jeong Hun Yeo et.al.|[2402.15151](http://arxiv.org/abs/2402.15151)|**[link](https://github.com/sally-sh/vsp-llm)**|\n", "2402.14563": "|**2024-02-22**|**Wizard of Oz Experimentation for Language Technology Applications: Challenges and Tools**|Stephan Schl\u00f6gl et.al.|[2402.14563](http://arxiv.org/abs/2402.14563)|null|\n", "2402.14888": "|**2024-02-22**|**Efficient data selection employing Semantic Similarity-based Graph Structures for model training**|Roxana Petcu et.al.|[2402.14888](http://arxiv.org/abs/2402.14888)|null|\n", "2402.14185": "|**2024-02-22**|**HINT: High-quality INPainting Transformer with Mask-Aware Encoding and Enhanced Attention**|Shuang Chen et.al.|[2402.14185](http://arxiv.org/abs/2402.14185)|**[link](https://github.com/chrischen1023/hint)**|\n", "2402.13687": "|**2024-02-21**|**An Augmented Lagrangian Method for Training Recurrent Neural Networks**|Yue Wang et.al.|[2402.13687](http://arxiv.org/abs/2402.13687)|null|\n", "2402.13511": "|**2024-02-22**|**Mel-FullSubNet: Mel-Spectrogram Enhancement for Improving Both Speech Quality and ASR**|Rui 
Zhou et.al.|[2402.13511](http://arxiv.org/abs/2402.13511)|null|\n", "2402.13208": "|**2024-02-20**|**How do Hyenas deal with Human Speech? Speech Recognition and Translation with ConfHyena**|Marco Gaido et.al.|[2402.13208](http://arxiv.org/abs/2402.13208)|**[link](https://github.com/hlt-mt/fbk-fairseq)**|\n", "2402.13076": "|**2024-02-20**|**Not All Weights Are Created Equal: Enhancing Energy Efficiency in On-Device Streaming Speech Recognition**|Yang Li et.al.|[2402.13076](http://arxiv.org/abs/2402.13076)|null|\n", "2402.13004": "|**2024-02-20**|**Comparison of Conventional Hybrid and CTC/Attention Decoders for Continuous Visual Speech Recognition**|David Gimeno-G\u00f3mez et.al.|[2402.13004](http://arxiv.org/abs/2402.13004)|null|\n", "2402.12654": "|**2024-06-16**|**OWSM-CTC: An Open Encoder-Only Speech Foundation Model for Speech Recognition, Translation, and Language Identification**|Yifan Peng et.al.|[2402.12654](http://arxiv.org/abs/2402.12654)|null|\n", "2402.11954": "|**2024-02-19**|**Multimodal Emotion Recognition from Raw Audio with Sinc-convolution**|Xiaohui Zhang et.al.|[2402.11954](http://arxiv.org/abs/2402.11954)|null|\n", "2402.11571": "|**2024-02-18**|**Ain't Misbehavin' -- Using LLMs to Generate Expressive Robot Behavior in Conversations with the Tabletop Robot Haru**|Zining Wang et.al.|[2402.11571](http://arxiv.org/abs/2402.11571)|null|\n", "2402.11520": "|**2024-02-18**|**Cross-Attention Fusion of Visual and Geometric Features for Large Vocabulary Arabic Lipreading**|Samar Daou et.al.|[2402.11520](http://arxiv.org/abs/2402.11520)|null|\n", "2402.09797": "|**2024-02-15**|**A cross-talk robust multichannel VAD model for multiparty agent interactions trained using synthetic re-recordings**|Hyewon Han et.al.|[2402.09797](http://arxiv.org/abs/2402.09797)|null|\n", "2402.08932": "|**2024-02-14**|**Listening to Multi-talker Conversations: Modular and End-to-end Perspectives**|Desh Raj et.al.|[2402.08932](http://arxiv.org/abs/2402.08932)|null|\n", 
"2402.08898": "|**2024-02-14**|**UniEnc-CASSNAT: An Encoder-only Non-autoregressive ASR for Speech SSL Models**|Ruchao Fan et.al.|[2402.08898](http://arxiv.org/abs/2402.08898)|null|\n", "2402.08846": "|**2024-02-13**|**An Embarrassingly Simple Approach for LLM with Strong ASR Capacity**|Ziyang Ma et.al.|[2402.08846](http://arxiv.org/abs/2402.08846)|**[link](https://github.com/X-LANCE/SLAM-LLM)**|\n", "2402.08788": "|**2024-02-13**|**Syllable based DNN-HMM Cantonese Speech to Text System**|Timothy Wong et.al.|[2402.08788](http://arxiv.org/abs/2402.08788)|null|\n", "2402.08021": "|**2024-05-03**|**Careless Whisper: Speech-to-Text Hallucination Harms**|Allison Koenecke et.al.|[2402.08021](http://arxiv.org/abs/2402.08021)|**[link](https://github.com/koenecke/hallucination_harms)**|\n", "2402.07729": "|**2024-07-26**|**AIR-Bench: Benchmarking Large Audio-Language Models via Generative Comprehension**|Qian Yang et.al.|[2402.07729](http://arxiv.org/abs/2402.07729)|**[link](https://github.com/ofa-sys/air-bench)**|\n", "2402.07658": "|**2024-02-12**|**The Sound of Healthcare: Improving Medical Transcription ASR Accuracy with Large Language Models**|Ayo Adedeji et.al.|[2402.07658](http://arxiv.org/abs/2402.07658)|null|\n", "2402.07513": "|**2024-02-12**|**The Balancing Act: Unmasking and Alleviating ASR Biases in Portuguese**|Ajinkya Kulkarni et.al.|[2402.07513](http://arxiv.org/abs/2402.07513)|null|\n", "2402.07431": "|**2024-02-13**|**SALAD: Smart AI Language Assistant Daily**|Ragib Amin Nihal et.al.|[2402.07431](http://arxiv.org/abs/2402.07431)|null|\n", "2402.07095": "|**2024-02-11**|**Does ChatGPT and Whisper Make Humanoid Robots More Relatable?**|Xiaohui Chen et.al.|[2402.07095](http://arxiv.org/abs/2402.07095)|null|\n", "2402.06966": "|**2024-02-10**|**DeepCover: Advancing RNN Test Coverage and Online Error Prediction using State Machine Extraction**|Pouria Golshanrad 
et.al.|[2402.06966](http://arxiv.org/abs/2402.06966)|**[link](https://github.com/pouriagr/deep-cover)**|\n", "2402.06923": "|**2024-02-10**|**CochCeps-Augment: A Novel Self-Supervised Contrastive Learning Using Cochlear Cepstrum-based Masking for Speech Emotion Recognition**|Ioannis Ziogas et.al.|[2402.06923](http://arxiv.org/abs/2402.06923)|null|\n", "2402.06592": "|**2024-02-09**|**Self-consistent context aware conformer transducer for speech recognition**|Konstantin Kolokolov et.al.|[2402.06592](http://arxiv.org/abs/2402.06592)|null|\n", "2402.05706": "|**2024-02-08**|**Unified Speech-Text Pretraining for Spoken Dialog Modeling**|Heeseung Kim et.al.|[2402.05706](http://arxiv.org/abs/2402.05706)|null|\n", "2402.05457": "|**2024-02-08**|**It's Never Too Late: Fusing Acoustic Information into Large Language Models for Automatic Speech Recognition**|Chen Chen et.al.|[2402.05457](http://arxiv.org/abs/2402.05457)|null|\n", "2402.04805": "|**2024-02-07**|**Progressive unsupervised domain adaptation for ASR using ensemble models and multi-stage training**|Rehan Ahmad et.al.|[2402.04805](http://arxiv.org/abs/2402.04805)|null|\n", "2402.03988": "|**2024-05-28**|**REBORN: Reinforcement-Learned Boundary Segmentation with Iterative Training for Unsupervised ASR**|Liang-Hsuan Tseng et.al.|[2402.03988](http://arxiv.org/abs/2402.03988)|**[link](https://github.com/andybi7676/reborn-uasr)**|\n", "2402.03519": "|**2024-02-05**|**Resolving Transcription Ambiguity in Spanish: A Hybrid Acoustic-Lexical System for Punctuation Restoration**|Xiliang Zhu et.al.|[2402.03519](http://arxiv.org/abs/2402.03519)|null|\n", "2402.03050": "|**2024-02-05**|**A Comprehensive Study of the Current State-of-the-Art in Nepali Automatic Speech Recognition Systems**|Rupak Raj Ghimire et.al.|[2402.03050](http://arxiv.org/abs/2402.03050)|null|\n", "2402.02302": "|**2024-02-03**|**Predicting positive transfer for improved low-resource speech recognition using acoustic pseudo-tokens**|Nay San 
et.al.|[2402.02302](http://arxiv.org/abs/2402.02302)|null|\n", "2402.01931": "|**2024-02-02**|**Digits micro-model for accurate and secure transactions**|Chirag Chhablani et.al.|[2402.01931](http://arxiv.org/abs/2402.01931)|null|\n", "2402.01917": "|**2024-02-02**|**Whispering in Norwegian: Navigating Orthographic and Dialectic Challenges**|Per E Kummervold et.al.|[2402.01917](http://arxiv.org/abs/2402.01917)|null|\n", "2402.01172": "|**2024-02-02**|**Streaming Sequence Transduction through Dynamic Compression**|Weiting Tan et.al.|[2402.01172](http://arxiv.org/abs/2402.01172)|**[link](https://github.com/steventan0110/star)**|\n", "2402.01152": "|**2024-02-05**|**AccentFold: A Journey through African Accents for Zero-Shot ASR Adaptation to Target Accents**|Abraham Toluwase Owodunni et.al.|[2402.01152](http://arxiv.org/abs/2402.01152)|null|\n", "2402.01778": "|**2024-02-01**|**Introduction to speech recognition**|Gabriel Dauphin et.al.|[2402.01778](http://arxiv.org/abs/2402.01778)|null|\n", "2402.00632": "|**2024-02-01**|**Prosody in Cascade and Direct Speech-to-Text Translation: a case study on Korean Wh-Phrases**|Giulio Zhou et.al.|[2402.00632](http://arxiv.org/abs/2402.00632)|null|\n", "2402.00235": "|**2024-01-31**|**Exploring the limits of decoder-only models trained on public speech recognition corpora**|Ankit Gupta et.al.|[2402.00235](http://arxiv.org/abs/2402.00235)|null|\n", "2401.18045": "|**2024-01-31**|**SpeechComposer: Unifying Multiple Speech Tasks with Prompt Composition**|Yihan Wu et.al.|[2401.18045](http://arxiv.org/abs/2401.18045)|null|\n", "2401.17604": "|**2024-02-08**|**Computation and Parameter Efficient Multi-Modal Fusion Transformer for Cued Speech Recognition**|Lei Liu et.al.|[2401.17604](http://arxiv.org/abs/2401.17604)|null|\n", "2401.16658": "|**2024-06-16**|**OWSM v3.1: Better and Faster Open Whisper-Style Speech Models based on E-Branchformer**|Yifan Peng et.al.|[2401.16658](http://arxiv.org/abs/2401.16658)|null|\n", "2401.15704": 
"|**2024-01-28**|**Phoneme-Based Proactive Anti-Eavesdropping with Controlled Recording Privilege**|Peng Huang et.al.|[2401.15704](http://arxiv.org/abs/2401.15704)|null|\n", "2401.15676": "|**2024-01-28**|**On Speaker Attribution with SURT**|Desh Raj et.al.|[2401.15676](http://arxiv.org/abs/2401.15676)|**[link](https://github.com/k2-fsa/icefall)**|\n", "2401.15532": "|**2024-01-28**|**Byte Pair Encoding Is All You Need For Automatic Bengali Speech Recognition**|Ahnaf Mozib Samin et.al.|[2401.15532](http://arxiv.org/abs/2401.15532)|null|\n", "2401.15385": "|**2024-01-27**|**Towards Event Extraction from Speech with Contextual Clues**|Jingqi Kang et.al.|[2401.15385](http://arxiv.org/abs/2401.15385)|**[link](https://github.com/jodie-kang/speechee)**|\n", "2401.14890": "|**2024-01-26**|**Comparison of parameters of vowel sounds of russian and english languages**|V. I. Fedoseev et.al.|[2401.14890](http://arxiv.org/abs/2401.14890)|null|\n", "2401.14625": "|**2024-01-26**|**Toward Practical Automatic Speech Recognition and Post-Processing: a Call for Explainable Error Benchmark Guideline**|Seonmin Koo et.al.|[2401.14625](http://arxiv.org/abs/2401.14625)|null|\n", "2401.14185": "|**2024-01-25**|**TDFNet: An Efficient Audio-Visual Speech Separation Model with Top-down Fusion**|Samuel Pegg et.al.|[2401.14185](http://arxiv.org/abs/2401.14185)|**[link](https://github.com/spkgyk/TDFNet)**|\n", "2401.13575": "|**2024-01-24**|**CNN architecture extraction on edge GPU**|Peter Horvath et.al.|[2401.13575](http://arxiv.org/abs/2401.13575)|null|\n", "2401.13463": "|**2024-03-18**|**SpeechDPR: End-to-End Spoken Passage Retrieval for Open-Domain Spoken Question Answering**|Chyi-Jiunn Lin et.al.|[2401.13463](http://arxiv.org/abs/2401.13463)|null|\n", "2401.13260": "|**2024-05-28**|**MF-AED-AEC: Speech Emotion Recognition by Leveraging Multimodal Fusion, Asr Error Detection, and Asr Error Correction**|Jiajun He et.al.|[2401.13260](http://arxiv.org/abs/2401.13260)|null|\n", "2401.13146": 
"|**2024-01-23**|**Locality enhanced dynamic biasing and sampling strategies for contextual ASR**|Md Asif Jalal et.al.|[2401.13146](http://arxiv.org/abs/2401.13146)|null|\n", "2401.12789": "|**2024-01-23**|**Multilingual and Fully Non-Autoregressive ASR with Large Language Model Fusion: A Comprehensive Study**|W. Ronny Huang et.al.|[2401.12789](http://arxiv.org/abs/2401.12789)|null|\n", "2401.12085": "|**2024-01-22**|**Consistency Based Unsupervised Self-training For ASR Personalisation**|Jisi Zhang et.al.|[2401.12085](http://arxiv.org/abs/2401.12085)|null|\n", "2401.11983": "|**2024-01-22**|**Lightweight Protection for Privacy in Offloaded Speech Understanding**|Dongqi Cai et.al.|[2401.11983](http://arxiv.org/abs/2401.11983)|null|\n", "2401.11700": "|**2024-01-22**|**Keep Decoding Parallel with Effective Knowledge Distillation from Language Models to End-to-end Speech Recognisers**|Michael Hentschel et.al.|[2401.11700](http://arxiv.org/abs/2401.11700)|null|\n", "2401.11382": "|**2024-06-06**|**Using Large Language Model for End-to-End Chinese ASR and NER**|Yuang Li et.al.|[2401.11382](http://arxiv.org/abs/2401.11382)|null|\n", "2401.11268": "|**2024-02-02**|**Word-Level ASR Quality Estimation for Efficient Corpus Sampling and Post-Editing through Analyzing Attentions of a Reference-Free Metric**|Golara Javadi et.al.|[2401.11268](http://arxiv.org/abs/2401.11268)|**[link](https://github.com/aixplain/NoRefER)**|\n", "2401.11132": "|**2024-01-20**|**ConceptThread: Visualizing Threaded Concepts in MOOC Videos**|Zhiguang Zhou et.al.|[2401.11132](http://arxiv.org/abs/2401.11132)|null|\n", "2401.10449": "|**2024-01-19**|**Contextualized Automatic Speech Recognition with Attention-Based Bias Phrase Boosted Beam Search**|Yui Sudo et.al.|[2401.10449](http://arxiv.org/abs/2401.10449)|null|\n", "2401.10447": "|**2024-01-19**|**Investigating Training Strategies and Model Robustness of Low-Rank Adaptation for Language Modeling in Speech Recognition**|Yu Yu 
et.al.|[2401.10447](http://arxiv.org/abs/2401.10447)|null|\n", "2401.10446": "|**2024-01-19**|**Large Language Models are Efficient Learners of Noise-Robust Speech Recognition**|Yuchen Hu et.al.|[2401.10446](http://arxiv.org/abs/2401.10446)|**[link](https://github.com/yuchen005/robustger)**|\n", "2401.10411": "|**2024-01-18**|**AGADIR: Towards Array-Geometry Agnostic Directional Speech Recognition**|Ju Lin et.al.|[2401.10411](http://arxiv.org/abs/2401.10411)|null|\n", "2401.10070": "|**2024-01-18**|**Communication-Efficient Personalized Federated Learning for Speech-to-Text Tasks**|Yichao Du et.al.|[2401.10070](http://arxiv.org/abs/2401.10070)|null|\n", "2401.09802": "|**2024-07-18**|**Efficient Training for Multilingual Visual Speech Recognition: Pre-training with Discretized Visual Speech Representation**|Minsu Kim et.al.|[2401.09802](http://arxiv.org/abs/2401.09802)|null|\n", "2401.09759": "|**2024-07-02**|**SlideAVSR: A Dataset of Paper Explanation Videos for Audio-Visual Speech Recognition**|Hao Wang et.al.|[2401.09759](http://arxiv.org/abs/2401.09759)|null|\n", "2401.09315": "|**2024-01-17**|**On Speech Pre-emphasis as a Simple and Inexpensive Method to Boost Speech Enhancement**|Iv\u00e1n L\u00f3pez-Espejo et.al.|[2401.09315](http://arxiv.org/abs/2401.09315)|null|\n", "2401.08916": "|**2024-01-17**|**Two-pass Endpoint Detection for Speech Recognition**|Anirudh Raju et.al.|[2401.08916](http://arxiv.org/abs/2401.08916)|null|\n", "2401.08887": "|**2024-01-16**|**NOTSOFAR-1 Challenge: New Datasets, Baseline, and Tasks for Distant Meeting Transcription**|Alon Vinnikov et.al.|[2401.08887](http://arxiv.org/abs/2401.08887)|null|\n", "2401.08835": "|**2024-01-16**|**Improving ASR Contextual Biasing with Guided Attention**|Jiyang Tang et.al.|[2401.08835](http://arxiv.org/abs/2401.08835)|null|\n", "2401.08833": "|**2024-01-16**|**Revisiting Self-supervised Learning of Speech Representation from a Mutual Information Perspective**|Alexander H. 
Liu et.al.|[2401.08833](http://arxiv.org/abs/2401.08833)|null|\n", "2401.08052": "|**2024-03-01**|**Multi-Input Multi-Output Target-Speaker Voice Activity Detection For Unified, Flexible, and Robust Audio-Visual Speaker Diarization**|Ming Cheng et.al.|[2401.08052](http://arxiv.org/abs/2401.08052)|null|\n", "2401.07957": "|**2024-01-15**|**Machine Perceptual Quality: Evaluating the Impact of Severe Lossy Compression on Audio and Image Models**|Dan Jacobellis et.al.|[2401.07957](http://arxiv.org/abs/2401.07957)|**[link](https://github.com/danjacobellis/mpq)**|\n", "2401.07575": "|**2024-07-24**|**Cascaded Cross-Modal Transformer for Audio-Textual Classification**|Nicolae-Catalin Ristea et.al.|[2401.07575](http://arxiv.org/abs/2401.07575)|**[link](https://github.com/ristea/ccmt)**|\n", "2401.07506": "|**2024-01-15**|**SeMaScore : a new evaluation metric for automatic speech recognition tasks**|Zitha Sasindran et.al.|[2401.07506](http://arxiv.org/abs/2401.07506)|null|\n", "2401.07360": "|**2024-01-14**|**Promptformer: Prompted Conformer Transducer for ASR**|Sergio Duarte-Torres et.al.|[2401.07360](http://arxiv.org/abs/2401.07360)|null|\n", "2401.06980": "|**2024-01-13**|**Joint Unsupervised and Supervised Training for Automatic Speech Recognition via Bilevel Optimization**|A F M Saif et.al.|[2401.06980](http://arxiv.org/abs/2401.06980)|**[link](https://github.com/afmsaif/joint-unsupervised-and-supervised-training-for-automatic-speech-recognition-via-bilevel-optimization)**|\n", "2401.09354": "|**2024-01-12**|**Transcending Controlled Environments Assessing the Transferability of ASRRobust NLU Models to Real-World Applications**|Hania Khan et.al.|[2401.09354](http://arxiv.org/abs/2401.09354)|null|\n", "2401.06588": "|**2024-01-12**|**Dynamic Behaviour of Connectionist Speech Recognition with Strong Latency Constraints**|Giampiero Salvi et.al.|[2401.06588](http://arxiv.org/abs/2401.06588)|null|\n", "2401.06832": "|**2024-01-12**|**XLS-R Deep Learning Model for 
Multilingual ASR on Low-Resource Languages: Indonesian, Javanese, and Sundanese**|Panji Arisaputra et.al.|[2401.06832](http://arxiv.org/abs/2401.06832)|null|\n", "2401.06390": "|**2024-01-12**|**LCB-net: Long-Context Biasing for Audio-Visual Speech Recognition**|Fan Yu et.al.|[2401.06390](http://arxiv.org/abs/2401.06390)|**[link](https://github.com/alibaba-damo-academy/FunASR)**|\n", "2401.05689": "|**2024-01-11**|**UCorrect: An Unsupervised Framework for Automatic Speech Recognition Error Correction**|Jiaxin Guo et.al.|[2401.05689](http://arxiv.org/abs/2401.05689)|null|\n", "2401.06183": "|**2024-01-11**|**End to end Hindi to English speech conversion using Bark, mBART and a finetuned XLSR Wav2Vec2**|Aniket Tathe et.al.|[2401.06183](http://arxiv.org/abs/2401.06183)|null|\n", "2401.05551": "|**2024-01-10**|**Useful Blunders: Can Automated Speech Recognition Errors Improve Downstream Dementia Classification?**|Changye Li et.al.|[2401.05551](http://arxiv.org/abs/2401.05551)|null|\n", "2401.05336": "|**2024-01-10**|**Towards Online Sign Language Recognition and Translation**|Ronglai Zuo et.al.|[2401.05336](http://arxiv.org/abs/2401.05336)|**[link](https://github.com/FangyunWei/SLRT)**|\n", "2401.04482": "|**2024-07-17**|**Continuously Learning New Words in Automatic Speech Recognition**|Christian Huber et.al.|[2401.04482](http://arxiv.org/abs/2401.04482)|null|\n", "2401.04235": "|**2024-01-08**|**High-precision Voice Search Query Correction via Retrievable Speech-text Embedings**|Christopher Li et.al.|[2401.04235](http://arxiv.org/abs/2401.04235)|null|\n", "2401.04152": "|**2024-07-22**|**Cross-Speaker Encoding Network for Multi-Talker Speech Recognition**|Jiawen Kang et.al.|[2401.04152](http://arxiv.org/abs/2401.04152)|**[link](https://github.com/kjw11/csenet-asr)**|\n", "2401.03936": "|**2024-01-08**|**Exploratory Evaluation of Speech Content Masking**|Jennifer Williams et.al.|[2401.03936](http://arxiv.org/abs/2401.03936)|null|\n", "2401.03697": 
"|**2024-03-07**|**An audio-quality-based multi-strategy approach for target speaker extraction in the MISP 2023 Challenge**|Runduo Han et.al.|[2401.03697](http://arxiv.org/abs/2401.03697)|null|\n", "2401.03689": "|**2024-06-10**|**LUPET: Incorporating Hierarchical Information Path into Multilingual ASR**|Wei Liu et.al.|[2401.03689](http://arxiv.org/abs/2401.03689)|null|\n", "2401.03687": "|**2024-01-08**|**BS-PLCNet: Band-split Packet Loss Concealment Network with Multi-task Learning Framework and Multi-discriminators**|Zihan Zhang et.al.|[2401.03687](http://arxiv.org/abs/2401.03687)|null|\n", "2401.03506": "|**2024-07-22**|**DiarizationLM: Speaker Diarization Post-Processing with Large Language Models**|Quan Wang et.al.|[2401.03506](http://arxiv.org/abs/2401.03506)|**[link](https://github.com/google/speaker-id)**|\n", "2401.06788": "|**2024-02-29**|**The NPU-ASLP-LiAuto System Description for Visual Speech Recognition in CNVSRC 2023**|He Wang et.al.|[2401.06788](http://arxiv.org/abs/2401.06788)|**[link](https://github.com/mkt-dataoceanai/cnvsrc2023baseline)**|\n", "2401.03473": "|**2024-02-21**|**ICMC-ASR: The ICASSP 2024 In-Car Multi-Channel Automatic Speech Recognition Challenge**|He Wang et.al.|[2401.03473](http://arxiv.org/abs/2401.03473)|null|\n", "2401.03468": "|**2024-01-07**|**Multichannel AV-wav2vec2: A Framework for Learning Multichannel Multi-Modal Speech Representation**|Qiushi Zhu et.al.|[2401.03468](http://arxiv.org/abs/2401.03468)|**[link](https://github.com/zqs01/multi-channel-wav2vec2)**|\n", "2401.03424": "|**2024-04-08**|**MLCA-AVSR: Multi-Layer Cross Attention Fusion based Audio-Visual Speech Recognition**|He Wang et.al.|[2401.03424](http://arxiv.org/abs/2401.03424)|null|\n", "2401.03251": "|**2024-01-06**|**TeLeS: Temporal Lexeme Similarity Score to Estimate Confidence in End-to-End ASR**|Nagarathna Ravi et.al.|[2401.03251](http://arxiv.org/abs/2401.03251)|**[link](https://github.com/madhavlab/2023_teles_wlc)**|\n", "2401.03175": 
"|**2024-01-06**|**Part-of-Speech Tagger for Bodo Language using Deep Learning approach**|Dhrubajyoti Pathak et.al.|[2401.03175](http://arxiv.org/abs/2401.03175)|null|\n", "2401.02921": "|**2024-01-05**|**Towards ASR Robust Spoken Language Understanding Through In-Context Learning With Word Confusion Networks**|Kevin Everson et.al.|[2401.02921](http://arxiv.org/abs/2401.02921)|null|\n", "2401.02890": "|**2024-01-05**|**Nonlinear functional regression by functional deep neural network with kernel embedding**|Zhongjie Shi et.al.|[2401.02890](http://arxiv.org/abs/2401.02890)|null|\n", "2401.02673": "|**2024-01-05**|**A unified multichannel far-field speech recognition system: combining neural beamforming with attention based end-to-end model**|Dongdi Zhao et.al.|[2401.02673](http://arxiv.org/abs/2401.02673)|null|\n", "2401.02417": "|**2024-01-04**|**Task Oriented Dialogue as a Catalyst for Self-Supervised Automatic Speech Recognition**|David M. Chan et.al.|[2401.02417](http://arxiv.org/abs/2401.02417)|**[link](https://github.com/amazon-science/amazon-od3)**|\n", "2402.10218": "|**2024-01-04**|**AntiDeepFake: AI for Deep Fake Speech Recognition**|Enkhtogtokh Togootogtokh et.al.|[2402.10218](http://arxiv.org/abs/2402.10218)|null|\n", "2401.02046": "|**2024-01-04**|**CTC Blank Triggered Dynamic Layer-Skipping for Efficient CTC-based Speech Recognition**|Junfeng Hou et.al.|[2401.02046](http://arxiv.org/abs/2401.02046)|null|\n", "2401.01572": "|**2024-01-03**|**Hallucinations in Neural Automatic Speech Recognition: Identifying Errors and Hallucinatory Models**|Rita Frieske et.al.|[2401.01572](http://arxiv.org/abs/2401.01572)|null|\n", "2401.01537": "|**2024-06-04**|**The Art of Deception: Robust Backdoor Attack using Dynamic Stacking of Triggers**|Orson Mengara et.al.|[2401.01537](http://arxiv.org/abs/2401.01537)|null|\n", "2401.00662": "|**2024-01-01**|**Enhancing Pre-trained ASR System Fine-tuning for Dysarthric Speech Recognition using Adversarial Data 
Augmentation**|Huimeng Wang et.al.|[2401.00662](http://arxiv.org/abs/2401.00662)|null|\n", "2312.17279": "|**2024-05-02**|**Stateful Conformer with Cache-based Inference for Streaming Automatic Speech Recognition**|Vahid Noroozi et.al.|[2312.17279](http://arxiv.org/abs/2312.17279)|null|\n", "2312.16002": "|**2023-12-26**|**The NUS-HLT System for ICASSP2024 ICMC-ASR Grand Challenge**|Meng Ge et.al.|[2312.16002](http://arxiv.org/abs/2312.16002)|null|\n", "2312.15922": "|**2023-12-26**|**Towards Probing Contact Center Large Language Models**|Varun Nathan et.al.|[2312.15922](http://arxiv.org/abs/2312.15922)|null|\n", "2312.15499": "|**2023-12-24**|**Exploring data augmentation in bias mitigation against non-native-accented speech**|Yuanyuan Zhang et.al.|[2312.15499](http://arxiv.org/abs/2312.15499)|null|\n", "2312.14609": "|**2023-12-22**|**BLSTM-Based Confidence Estimation for End-to-End Speech Recognition**|Atsunori Ogawa et.al.|[2312.14609](http://arxiv.org/abs/2312.14609)|null|\n", "2312.14378": "|**2024-02-09**|**Multimodal Attention Merging for Improved Speech Recognition and Audio Event Classification**|Anirudh S. 
Sundar et.al.|[2312.14378](http://arxiv.org/abs/2312.14378)|null|\n", "2312.14055": "|**2024-07-22**|**Multi-Sentence Grounding for Long-term Instructional Video**|Zeqian Li et.al.|[2312.14055](http://arxiv.org/abs/2312.14055)|null|\n", "2312.14020": "|**2023-12-21**|**BANSpEmo: A Bangla Emotional Speech Recognition Dataset**|Md Gulzar Hussain et.al.|[2312.14020](http://arxiv.org/abs/2312.14020)|null|\n", "2312.13873": "|**2023-12-21**|**Self-Supervised Adaptive AV Fusion Module for Pre-Trained ASR Models**|Christopher Simic et.al.|[2312.13873](http://arxiv.org/abs/2312.13873)|null|\n", "2312.13560": "|**2024-02-03**|**kNN-CTC: Enhancing ASR via Retrieval of CTC Pseudo Labels**|Jiaming Zhou et.al.|[2312.13560](http://arxiv.org/abs/2312.13560)|**[link](https://github.com/nku-hlt/knn-ctc)**|\n", "2408.02582": "|**2024-08-05**|**Clustering and Mining Accented Speech for Inclusive and Fair Speech Recognition**|Jaeyoung Kim et.al.|[2408.02582](http://arxiv.org/abs/2408.02582)|null|\n", "2408.02369": "|**2024-08-08**|**The NPU-ASLP System Description for Visual Speech Recognition in CNVSRC 2024**|He Wang et.al.|[2408.02369](http://arxiv.org/abs/2408.02369)|**[link](https://gitlab.com/csltstu/sunine)**|\n", "2408.02178": "|**2024-08-05**|**StreamVoice+: Evolving into End-to-end Streaming Zero-shot Voice Conversion**|Zhichao Wang et.al.|[2408.02178](http://arxiv.org/abs/2408.02178)|null|\n", "2408.01808": "|**2024-08-03**|**ALIF: Low-Cost Adversarial Audio Attacks on Black-Box Speech Platforms using Linguistic Features**|Peng Cheng et.al.|[2408.01808](http://arxiv.org/abs/2408.01808)|**[link](https://github.com/TASER2023/TASER)**|\n", "2408.02978": "|**2024-08-06**|**ASR-enhanced Multimodal Representation Learning for Cross-Domain Product Retrieval**|Ruixiang Zhao et.al.|[2408.02978](http://arxiv.org/abs/2408.02978)|null|\n", "2408.02945": "|**2024-08-06**|**Self-Supervised Learning for Multi-Channel Neural Transducer**|Atsushi Kojima 
et.al.|[2408.02945](http://arxiv.org/abs/2408.02945)|null|\n", "2408.04325": "|**2024-08-08**|**HydraFormer: One Encoder For All Subsampling Rates**|Yaoxun Xu et.al.|[2408.04325](http://arxiv.org/abs/2408.04325)|**[link](https://github.com/hydraformer/hydraformer)**|\n", "2408.04306": "|**2024-08-08**|**Preserving spoken content in voice anonymisation with character-level vocoder conditioning**|Michele Panariello et.al.|[2408.04306](http://arxiv.org/abs/2408.04306)|**[link](https://github.com/m-pana/spk_anon_nac_lm)**|\n", "2408.04174": "|**2024-08-08**|**wav2graph: A Framework for Supervised Learning Knowledge Graph from Speech**|Khai Le-Duc et.al.|[2408.04174](http://arxiv.org/abs/2408.04174)|**[link](https://github.com/leduckhai/wav2graph)**|\n", "2408.03979": "|**2024-08-07**|**Speaker Adaptation for Quantised End-to-End ASR Models**|Qiuming Zhao et.al.|[2408.03979](http://arxiv.org/abs/2408.03979)|null|\n", "2408.05101": "|**2024-08-09**|**MooER: LLM-based Speech Recognition and Translation Models from Moore Threads**|Junhao Xu et.al.|[2408.05101](http://arxiv.org/abs/2408.05101)|**[link](https://github.com/moorethreads/mooer)**|\n", "2408.06264": "|**2024-08-12**|**Audio Enhancement for Computer Audition -- An Iterative Training Paradigm Using Sample Importance**|Manuel Milling et.al.|[2408.06264](http://arxiv.org/abs/2408.06264)|null|\n", "2408.06043": "|**2024-08-12**|**Enhancing Dialogue Speech Recognition with Robust Contextual Awareness via Noise Representation Learning**|Wonjun Lee et.al.|[2408.06043](http://arxiv.org/abs/2408.06043)|null|\n", "2408.05769": "|**2024-08-11**|**LI-TTA: Language Informed Test-Time Adaptation for Automatic Speech Recognition**|Eunseop Yoon et.al.|[2408.05769](http://arxiv.org/abs/2408.05769)|null|\n", "2408.05758": "|**2024-08-11**|**VQ-CTAP: Cross-Modal Fine-Grained Sequence Representation Learning for Speech Processing**|Chunyu Qiang et.al.|[2408.05758](http://arxiv.org/abs/2408.05758)|null|\n", "2408.05554": 
"|**2024-08-10**|**Improving Whisper's Recognition Performance for Under-Represented Language Kazakh Leveraging Unpaired Speech and Text**|Jinpeng Li et.al.|[2408.05554](http://arxiv.org/abs/2408.05554)|null|\n", "2408.06484": "|**2024-08-12**|**Cross-Lingual Conversational Speech Summarization with Large Language Models**|Max Nelson et.al.|[2408.06484](http://arxiv.org/abs/2408.06484)|null|\n", "2408.07388": "|**2024-08-14**|**DPSNN: Spiking Neural Network for Low-Latency Streaming Speech Enhancement**|Tao Sun et.al.|[2408.07388](http://arxiv.org/abs/2408.07388)|null|\n", "2408.08027": "|**2024-08-15**|**Enhancing Large Language Model-based Speech Recognition by Contextualization for Rare and Ambiguous Words**|Kento Nozawa et.al.|[2408.08027](http://arxiv.org/abs/2408.08027)|null|\n", "2408.07851": "|**2024-08-14**|**SER Evals: In-domain and Out-of-domain Benchmarking for Speech Emotion Recognition**|Mohamed Osman et.al.|[2408.07851](http://arxiv.org/abs/2408.07851)|**[link](https://github.com/spaghettiSystems/serval)**|\n", "2408.07081": "|**2024-08-16**|**MathBridge: A Large Corpus Dataset for Translating Spoken Mathematical Expressions into $LaTeX$ Formulas for Improved Readability**|Kyudan Jung et.al.|[2408.07081](http://arxiv.org/abs/2408.07081)|null|\n", "2408.09688": "|**2024-08-19**|**Recording for Eyes, Not Echoing to Ears: Contextualized Spoken-to-Written Conversion of ASR Transcripts**|Jiaqing Liu et.al.|[2408.09688](http://arxiv.org/abs/2408.09688)|null|\n", "2408.09491": "|**2024-08-18**|**A Transcription Prompt-based Efficient Audio Large Language Model for Robust Speech Recognition**|Yangze Li et.al.|[2408.09491](http://arxiv.org/abs/2408.09491)|null|\n", "2408.09215": "|**2024-08-17**|**Generating Data with Text-to-Speech and Large-Language Models for Conversational Speech Recognition**|Samuele Cornell et.al.|[2408.09215](http://arxiv.org/abs/2408.09215)|**[link](https://github.com/popcornell/ASRLightningFT)**|\n", "2408.10524": 
"|**2024-08-20**|**XCB: an effective contextual biasing approach to bias cross-lingual phrases in speech recognition**|Xucheng Wan et.al.|[2408.10524](http://arxiv.org/abs/2408.10524)|null|\n", "2408.11804": "|**2024-08-21**|**Approaching Deep Learning through the Spectral Dynamics of Weights**|David Yunis et.al.|[2408.11804](http://arxiv.org/abs/2408.11804)|**[link](https://github.com/dyunis/spectral_dynamics)**|\n", "2408.11258": "|**2024-08-21**|**Improving Speech Recognition Error Prediction for Modern and Off-the-shelf Speech Recognizers**|Prashant Serai et.al.|[2408.11258](http://arxiv.org/abs/2408.11258)|null|\n", "2408.12500": "|**2024-08-22**|**WhisperMask: A Noise Suppressive Mask-Type Microphone for Whisper Speech**|Hirotaka Hiraki et.al.|[2408.12500](http://arxiv.org/abs/2408.12500)|null|\n", "2408.12430": "|**2024-08-22**|**Positional Description for Numerical Normalization**|Deepanshu Gupta et.al.|[2408.12430](http://arxiv.org/abs/2408.12430)|null|\n", "2408.12279": "|**2024-08-22**|**Developing vocal system impaired patient-aimed voice quality assessment approach using ASR representation-included multiple features**|Shaoxiang Dang et.al.|[2408.12279](http://arxiv.org/abs/2408.12279)|null|\n", "2408.11940": "|**2024-08-21**|**The State of Commercial Automatic French Legal Speech Recognition Systems and their Impact on Court Reporters et al**|Nicolad Garneau et.al.|[2408.11940](http://arxiv.org/abs/2408.11940)|null|\n", "2408.11873": "|**2024-08-19**|**Parameter-Efficient Transfer Learning under Federated Learning for Automatic Speech Recognition**|Xuan Kan et.al.|[2408.11873](http://arxiv.org/abs/2408.11873)|null|\n", "2408.11849": "|**2024-08-13**|**Style-Talker: Finetuning Audio Language Model and Style-Based Text-to-Speech Model for Fast Spoken Dialogue Generation**|Yinghao Aaron Li et.al.|[2408.11849](http://arxiv.org/abs/2408.11849)|null|\n", "2408.13106": "|**2024-08-28**|**NEST: Self-supervised Fast Conformer as All-purpose Seasoning to Speech 
Processing Tasks**|He Huang et.al.|[2408.13106](http://arxiv.org/abs/2408.13106)|null|\n", "2408.13008": "|**2024-08-23**|**Focused Discriminative Training For Streaming CTC-Trained Automatic Speech Recognition Models**|Adnan Haider et.al.|[2408.13008](http://arxiv.org/abs/2408.13008)|null|\n", "2408.12734": "|**2024-08-22**|**Towards measuring fairness in speech recognition: Fair-Speech dataset**|Irina-Elena Veliche et.al.|[2408.12734](http://arxiv.org/abs/2408.12734)|null|\n", "2408.14418": "|**2024-08-26**|**MEDSAGE: Enhancing Robustness of Medical Dialogue Summarization to ASR Errors with LLM-generated Synthetic Dialogues**|Kuluhan Binici et.al.|[2408.14418](http://arxiv.org/abs/2408.14418)|null|\n", "2408.14262": "|**2024-08-26**|**Self-supervised Speech Representations Still Struggle with African American Vernacular English**|Kalvin Chang et.al.|[2408.14262](http://arxiv.org/abs/2408.14262)|**[link](https://github.com/cmu-llab/s3m-aave)**|\n", "2408.14082": "|**2024-08-26**|**Automatic recognition and detection of aphasic natural speech**|Mara Barberis et.al.|[2408.14082](http://arxiv.org/abs/2408.14082)|null|\n", "2408.13996": "|**2024-08-28**|**Research Advances and New Paradigms for Biology-inspired Spiking Neural Networks**|Tianyu Zheng et.al.|[2408.13996](http://arxiv.org/abs/2408.13996)|null|\n", "2408.13739": "|**2024-08-25**|**Literary and Colloquial Tamil Dialect Identification**|M. Nanmalar et.al.|[2408.13739](http://arxiv.org/abs/2408.13739)|null|\n", "2408.13644": "|**2024-08-24**|**Studying the Effect of Audio Filters in Pre-Trained Models for Environmental Sound Classification**|Aditya Dawn et.al.|[2408.13644](http://arxiv.org/abs/2408.13644)|null|\n", "2408.14991": "|**2024-08-27**|**Speech Recognition Transformers: Topological-lingualism Perspective**|Shruti Singh et.al.|[2408.14991](http://arxiv.org/abs/2408.14991)|null|\n", "2408.14887": "|**2024-08-27**|**Literary and Colloquial Dialect Identification for Tamil using Acoustic Features**|M. 
Nanmalar et.al.|[2408.14887](http://arxiv.org/abs/2408.14887)|null|\n", "2408.15616": "|**2024-08-28**|**Beyond Levenshtein: Leveraging Multiple Algorithms for Robust Word Error Rate Computations And Granular Error Classifications**|Korbinian Kuhn et.al.|[2408.15616](http://arxiv.org/abs/2408.15616)|**[link](https://github.com/shuffle-project/beyond-levenshtein)**|\n", "2408.15585": "|**2024-08-28**|**Whisper-PMFA: Partial Multi-Scale Feature Aggregation for Speaker Verification using Whisper Models**|Yiyang Zhao et.al.|[2408.15585](http://arxiv.org/abs/2408.15585)|null|\n", "2408.16589": "|**2024-08-29**|**CrisperWhisper: Accurate Timestamps on Verbatim Speech Transcriptions**|Laurin Wagner et.al.|[2408.16589](http://arxiv.org/abs/2408.16589)|null|\n", "2408.16564": "|**2024-08-29**|**Human-Inspired Audio-Visual Speech Recognition: Spike Activity, Cueing Interaction and Causal Processing**|Qianhui Liu et.al.|[2408.16564](http://arxiv.org/abs/2408.16564)|null|\n", "2408.16287": "|**2024-08-29**|**Measuring the Accuracy of Automatic Speech Recognition Solutions**|Korbinian Kuhn et.al.|[2408.16287](http://arxiv.org/abs/2408.16287)|**[link](https://github.com/shuffle-project/asr-comparison)**|\n", "2408.16204": "|**2024-08-29**|**Revisit Micro-batch Clipping: Adaptive Data Pruning via Gradient Manipulation**|Lun Wang et.al.|[2408.16204](http://arxiv.org/abs/2408.16204)|null|\n", "2408.16180": "|**2024-08-29**|**Benchmarking Japanese Speech Recognition on ASR-LLM Setups with Multi-Pass Augmented Generative Error Correction**|Yuka Ko et.al.|[2408.16180](http://arxiv.org/abs/2408.16180)|null|\n"}, "TTS": {"2408.06227": "|**2024-08-12**|**FLEURS-R: A Restored Multilingual Speech Corpus for Generation Tasks**|Min Ma et.al.|[2408.06227](http://arxiv.org/abs/2408.06227)|null|\n", "2408.05758": "|**2024-08-11**|**VQ-CTAP: Cross-Modal Fine-Grained Sequence Representation Learning for Speech Processing**|Chunyu Qiang 
et.al.|[2408.05758](http://arxiv.org/abs/2408.05758)|null|\n", "2408.03887": "|**2024-08-06**|**Central Kurdish Text-to-Speech Synthesis with Novel End-to-End Transformer Training**|Hawraz A. Ahmad et.al.|[2408.03887](http://arxiv.org/abs/2408.03887)|null|\n", "2408.01808": "|**2024-08-03**|**ALIF: Low-Cost Adversarial Audio Attacks on Black-Box Speech Platforms using Linguistic Features**|Peng Cheng et.al.|[2408.01808](http://arxiv.org/abs/2408.01808)|**[link](https://github.com/TASER2023/TASER)**|\n", "2408.00284": "|**2024-08-01**|**Bailing-TTS: Chinese Dialectal Speech Synthesis Towards Human-like Spontaneous Representation**|Xinhan Di et.al.|[2408.00284](http://arxiv.org/abs/2408.00284)|null|\n", "2407.21491": "|**2024-08-01**|**Generative Expressive Conversational Speech Synthesis**|Rui Liu et.al.|[2407.21491](http://arxiv.org/abs/2407.21491)|**[link](https://github.com/ai-s2-lab/gpt-talker)**|\n", "2407.21476": "|**2024-07-31**|**On the Problem of Text-To-Speech Model Selection for Synthetic Data Generation in Automatic Speech Recognition**|Nick Rossenbach et.al.|[2407.21476](http://arxiv.org/abs/2407.21476)|null|\n", "2407.18571": "|**2024-07-29**|**Speech Bandwidth Expansion Via High Fidelity Generative Adversarial Networks**|Mahmoud Salhab et.al.|[2407.18571](http://arxiv.org/abs/2407.18571)|null|\n", "2407.18541": "|**2024-07-26**|**Towards Improving NAM-to-Speech Synthesis Intelligibility using Self-Supervised Speech Models**|Neil Shah et.al.|[2407.18541](http://arxiv.org/abs/2407.18541)|null|\n", "2407.18505": "|**2024-07-26**|**VoxSim: A perceptual voice similarity dataset**|Junseok Ahn et.al.|[2407.18505](http://arxiv.org/abs/2407.18505)|null|\n", "2407.17997": "|**2024-07-25**|**On the Effect of Purely Synthetic Training Data for Different Automatic Speech Recognition Architectures**|Nick Rossenbach et.al.|[2407.17997](http://arxiv.org/abs/2407.17997)|null|\n", "2407.17167": "|**2024-07-24**|**Zero-Shot vs. 
Few-Shot Multi-Speaker TTS Using Pre-trained Czech SpeechT5 Model**|Jan Lehe\u010dka et.al.|[2407.17167](http://arxiv.org/abs/2407.17167)|null|\n", "2407.16840": "|**2024-07-23**|**Synth4Kws: Synthesized Speech for User Defined Keyword Spotting in Low Resource Environments**|Pai Zhu et.al.|[2407.16840](http://arxiv.org/abs/2407.16840)|null|\n", "2407.15835": "|**2024-07-22**|**dMel: Speech Tokenization made Simple**|He Bai et.al.|[2407.15835](http://arxiv.org/abs/2407.15835)|null|\n", "2407.15188": "|**2024-07-21**|**Overview of Speaker Modeling and Its Applications: From the Lens of Deep Speaker Representation Learning**|Shuai Wang et.al.|[2407.15188](http://arxiv.org/abs/2407.15188)|null|\n", "2407.14212": "|**2024-07-19**|**Braille-to-Speech Generator: Audio Generation Based on Joint Fine-Tuning of CLIP and Fastspeech2**|Chun Xu et.al.|[2407.14212](http://arxiv.org/abs/2407.14212)|null|\n", "2407.14056": "|**2024-07-19**|**Rasa: Building Expressive Speech Synthesis Systems for Indian Languages in Low-resource Settings**|Praveen Srinivasa Varadhan et.al.|[2407.14056](http://arxiv.org/abs/2407.14056)|**[link](https://github.com/AI4Bharat/Rasa)**|\n", "2407.14006": "|**2024-07-19**|**MSceneSpeech: A Multi-Scene Speech Dataset For Expressive Speech Synthesis**|Qian Yang et.al.|[2407.14006](http://arxiv.org/abs/2407.14006)|null|\n", "2407.13509": "|**2024-07-18**|**Spontaneous Style Text-to-Speech Synthesis with Controllable Spontaneous Behaviors Based on Language Models**|Weiqin Li et.al.|[2407.13509](http://arxiv.org/abs/2407.13509)|null|\n", "2408.00004": "|**2024-07-18**|**Handling Numeric Expressions in Automatic Speech Recognition**|Christian Huber et.al.|[2408.00004](http://arxiv.org/abs/2408.00004)|null|\n", "2407.12707": "|**2024-07-22**|**TTSDS -- Text-to-Speech Distribution Score**|Christoph Minixhofer et.al.|[2407.12707](http://arxiv.org/abs/2407.12707)|**[link](https://github.com/ttsds/ttsds)**|\n", "2408.00788": "|**2024-07-17**|**SpikeVoice: 
High-Quality Text-to-Speech Via Efficient Spiking Neural Network**|Kexin Wang et.al.|[2408.00788](http://arxiv.org/abs/2408.00788)|null|\n", "2407.12229": "|**2024-07-17**|**Laugh Now Cry Later: Controlling Time-Varying Emotional States of Flow-Matching-Based Zero-Shot Text-to-Speech**|Haibin Wu et.al.|[2407.12229](http://arxiv.org/abs/2407.12229)|null|\n", "2407.12206": "|**2024-07-16**|**A Language Modeling Approach to Diacritic-Free Hebrew TTS**|Amit Roth et.al.|[2407.12206](http://arxiv.org/abs/2407.12206)|null|\n", "2407.09732": "|**2024-07-13**|**Speech Slytherin: Examining the Performance and Efficiency of Mamba for Speech Separation, Recognition, and Synthesis**|Xilin Jiang et.al.|[2407.09732](http://arxiv.org/abs/2407.09732)|**[link](https://github.com/xi-j/Mamba-TasNet)**|\n", "2407.09370": "|**2024-07-17**|**Learning High-Frequency Functions Made Easy with Sinusoidal Positional Encoding**|Chuanhao Sun et.al.|[2407.09370](http://arxiv.org/abs/2407.09370)|**[link](https://github.com/zhyuan11/SPE)**|\n", "2407.08551": "|**2024-07-11**|**Autoregressive Speech Synthesis without Vector Quantization**|Lingwei Meng et.al.|[2407.08551](http://arxiv.org/abs/2407.08551)|null|\n", "2407.08248": "|**2024-07-11**|**Toward accessible comics for blind and low vision readers**|Christophe Rigaud et.al.|[2407.08248](http://arxiv.org/abs/2407.08248)|null|\n", "2407.08016": "|**2024-07-10**|**Source Tracing of Audio Deepfake Systems**|Nicholas Klein et.al.|[2407.08016](http://arxiv.org/abs/2407.08016)|null|\n", "2407.18332": "|**2024-07-08**|**Analyzing Speech Unit Selection for Textless Speech-to-Speech Translation**|Jarod Duret et.al.|[2407.18332](http://arxiv.org/abs/2407.18332)|null|\n", "2407.05471": "|**2024-07-07**|**Fine-Grained and Interpretable Neural Speech Editing**|Max Morrison et.al.|[2407.05471](http://arxiv.org/abs/2407.05471)|**[link](https://github.com/maxrmorrison/torbi)**|\n", "2407.05421": "|**2024-07-07**|**ASRRL-TTS: Agile Speaker Representation 
Reinforcement Learning for Text-to-Speech Speaker Adaptation**|Ruibo Fu et.al.|[2407.05421](http://arxiv.org/abs/2407.05421)|null|\n", "2407.05407": "|**2024-07-09**|**CosyVoice: A Scalable Multilingual Zero-shot Text-to-speech Synthesizer based on Supervised Semantic Tokens**|Zhihao Du et.al.|[2407.05407](http://arxiv.org/abs/2407.05407)|null|\n", "2407.04575": "|**2024-07-05**|**FA-GAN: Artifacts-free and Phase-aware High-fidelity GAN-based Vocoder**|Rubing Shen et.al.|[2407.04575](http://arxiv.org/abs/2407.04575)|null|\n", "2407.04291": "|**2024-07-05**|**We Need Variations in Speech Synthesis: Sub-center Modelling for Speaker Embeddings**|Ismail Rasim Ulgen et.al.|[2407.04291](http://arxiv.org/abs/2407.04291)|null|\n", "2407.04047": "|**2024-07-04**|**Improving Accented Speech Recognition using Data Augmentation based on Unsupervised Text-to-Speech Synthesis**|Cong-Thanh Do et.al.|[2407.04047](http://arxiv.org/abs/2407.04047)|null|\n", "2407.04034": "|**2024-07-04**|**Optimizing a-DCF for Spoofing-Robust Speaker Verification**|O\u011fuzhan Kurnaz et.al.|[2407.04034](http://arxiv.org/abs/2407.04034)|null|\n", "2407.03892": "|**2024-07-04**|**On the Effectiveness of Acoustic BPE in Decoder-Only TTS**|Bohan Li et.al.|[2407.03892](http://arxiv.org/abs/2407.03892)|null|\n", "2407.03236": "|**2024-07-14**|**CATT: Character-based Arabic Tashkeel Transformer**|Faris Alasmary et.al.|[2407.03236](http://arxiv.org/abs/2407.03236)|**[link](https://github.com/abjadai/catt)**|\n", "2407.02937": "|**2024-07-03**|**Probing the Feasibility of Multilingual Speaker Anonymization**|Sarina Meyer et.al.|[2407.02937](http://arxiv.org/abs/2407.02937)|**[link](https://github.com/digitalphonetics/speaker-anonymization)**|\n", "2407.02243": "|**2024-07-02**|**Robust Zero-Shot Text-to-Speech Synthesis with Reverse Inference Optimization**|Yuchen Hu et.al.|[2407.02243](http://arxiv.org/abs/2407.02243)|null|\n", "2407.01927": "|**2024-07-02**|**TTSlow: Slow Down Text-to-Speech with 
Efficiency Robustness Evaluations**|Xiaoxue Gao et.al.|[2407.01927](http://arxiv.org/abs/2407.01927)|null|\n", "2407.01291": "|**2024-07-01**|**Lightweight Zero-shot Text-to-Speech with Mixture of Adapters**|Kenichi Fujita et.al.|[2407.01291](http://arxiv.org/abs/2407.01291)|null|\n", "2407.12038": "|**2024-07-31**|**ICAGC 2024: Inspirational and Convincing Audio Generation Challenge 2024**|Ruibo Fu et.al.|[2407.12038](http://arxiv.org/abs/2407.12038)|null|\n", "2407.00826": "|**2024-06-30**|**NAIST Simultaneous Speech Translation System for IWSLT 2024**|Yuka Ko et.al.|[2407.00826](http://arxiv.org/abs/2407.00826)|null|\n", "2407.00766": "|**2024-06-30**|**An Attribute Interpolation Method in Speech Synthesis by Model Merging**|Masato Murata et.al.|[2407.00766](http://arxiv.org/abs/2407.00766)|null|\n", "2407.00753": "|**2024-06-30**|**FLY-TTS: Fast, Lightweight and High-Quality End-to-End Text-to-Speech Synthesis**|Yinlin Guo et.al.|[2407.00753](http://arxiv.org/abs/2407.00753)|null|\n", "2407.00463": "|**2024-07-18**|**Open-Source Conversational AI with SpeechBrain 1.0**|Mirco Ravanelli et.al.|[2407.00463](http://arxiv.org/abs/2407.00463)|null|\n", "2406.19243": "|**2024-06-27**|**Application of ASV for Voice Identification after VC and Duration Predictor Improvement in TTS Models**|Borodin Kirill Nikolayevich et.al.|[2406.19243](http://arxiv.org/abs/2406.19243)|null|\n", "2406.19135": "|**2024-06-27**|**DEX-TTS: Diffusion-based EXpressive Text-to-Speech with Style Modeling on Time Variability**|Hyun Joon Park et.al.|[2406.19135](http://arxiv.org/abs/2406.19135)|**[link](https://github.com/winddori2002/dex-tts)**|\n", "2406.18135": "|**2024-06-26**|**Automatic Speech Recognition for Hindi**|Anish Saha et.al.|[2406.18135](http://arxiv.org/abs/2406.18135)|null|\n", "2406.18089": "|**2024-06-26**|**A Study on Synthesizing Expressive Violin Performances: Approaches and Comparisons**|Tzu-Yun Hung et.al.|[2406.18089](http://arxiv.org/abs/2406.18089)|null|\n", 
"2406.18088": "|**2024-06-29**|**LLM-Driven Multimodal Opinion Expression Identification**|Bonian Jia et.al.|[2406.18088](http://arxiv.org/abs/2406.18088)|null|\n", "2406.18009": "|**2024-06-26**|**E2 TTS: Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS**|Sefik Emre Eskimez et.al.|[2406.18009](http://arxiv.org/abs/2406.18009)|null|\n", "2406.17957": "|**2024-06-25**|**Improving Robustness of LLM-based Speech Synthesis by Learning Monotonic Alignment**|Paarth Neekhara et.al.|[2406.17957](http://arxiv.org/abs/2406.17957)|null|\n", "2406.17310": "|**2024-06-25**|**High Fidelity Text-to-Speech Via Discrete Tokens Using Token Transducer and Group Masked Language Model**|Joun Yeop Lee et.al.|[2406.17310](http://arxiv.org/abs/2406.17310)|null|\n", "2406.17257": "|**2024-06-25**|**Leveraging Parameter-Efficient Transfer Learning for Multi-Lingual Text-to-Speech Adaptation**|Yingting Li et.al.|[2406.17257](http://arxiv.org/abs/2406.17257)|null|\n", "2406.16808": "|**2024-06-24**|**Exploring the Capability of Mamba in Speech Applications**|Koichi Miyazaki et.al.|[2406.16808](http://arxiv.org/abs/2406.16808)|null|\n", "2406.16751": "|**2024-07-07**|**Towards Zero-Shot Text-To-Speech for Arabic Dialects**|Khai Duy Doan et.al.|[2406.16751](http://arxiv.org/abs/2406.16751)|null|\n", "2406.16716": "|**2024-06-24**|**One-Class Learning with Adaptive Centroid Shift for Audio Deepfake Detection**|Hyun Myung Kim et.al.|[2406.16716](http://arxiv.org/abs/2406.16716)|null|\n", "2406.17801": "|**2024-06-22**|**A multi-speaker multi-lingual voice cloning system based on vits2 for limmits 2024 challenge**|Xiaopeng Wang et.al.|[2406.17801](http://arxiv.org/abs/2406.17801)|null|\n", "2406.15752": "|**2024-06-22**|**TacoLM: GaTed Attention Equipped Codec Language Model are Efficient Zero-Shot Text to Speech Synthesizers**|Yakun Song et.al.|[2406.15752](http://arxiv.org/abs/2406.15752)|**[link](https://github.com/Ereboas/TacoLM)**|\n", "2406.14890": "|**2024-06-21**|**InterBiasing: 
Boost Unseen Word Recognition through Biasing Intermediate Predictions**|Yu Nakagome et.al.|[2406.14890](http://arxiv.org/abs/2406.14890)|null|\n", "2406.14875": "|**2024-06-21**|**GLOBE: A High-quality English Corpus with Global Accents for Zero-shot Speaker Adaptive Text-to-Speech**|Wenbin Wang et.al.|[2406.14875](http://arxiv.org/abs/2406.14875)|null|\n", "2406.14294": "|**2024-06-21**|**DASB - Discrete Audio and Speech Benchmark**|Pooneh Mousavi et.al.|[2406.14294](http://arxiv.org/abs/2406.14294)|null|\n", "2406.12946": "|**2024-06-18**|**Instruction Data Generation and Unsupervised Adaptation for Speech Language Models**|Vahid Noroozi et.al.|[2406.12946](http://arxiv.org/abs/2406.12946)|null|\n", "2406.12164": "|**2024-07-09**|**A Mel Spectrogram Enhancement Paradigm Based on CWT in Speech Synthesis**|Guoqiang Hu et.al.|[2406.12164](http://arxiv.org/abs/2406.12164)|null|\n", "2406.11727": "|**2024-06-27**|**1000 African Voices: Advancing inclusive multi-speaker multi-accent speech synthesis**|Sewade Ogun et.al.|[2406.11727](http://arxiv.org/abs/2406.11727)|null|\n", "2406.11427": "|**2024-06-17**|**DiTTo-TTS: Efficient and Scalable Zero-Shot Text-to-Speech with Diffusion Transformer**|Keon Lee et.al.|[2406.11427](http://arxiv.org/abs/2406.11427)|null|\n", "2406.11037": "|**2024-06-16**|**NAST: Noise Aware Speech Tokenization for Speech Language Models**|Shoval Messica et.al.|[2406.11037](http://arxiv.org/abs/2406.11037)|**[link](https://github.com/ShovalMessica/NAST)**|\n", "2406.10844": "|**2024-06-16**|**Multi-Scale Accent Modeling with Disentangling for Multi-Speaker Multi-Accent TTS Synthesis**|Xuehao Zhou et.al.|[2406.10844](http://arxiv.org/abs/2406.10844)|null|\n", "2406.10514": "|**2024-06-15**|**GTR-Voice: Articulatory Phonetics Informed Controllable Expressive Speech Synthesis**|Zehua Kcriss Li et.al.|[2406.10514](http://arxiv.org/abs/2406.10514)|null|\n", "2406.10422": "|**2024-06-14**|**Phoneme Discretized Saliency Maps for Explainable Detection 
of AI-Generated Voice**|Shubham Gupta et.al.|[2406.10422](http://arxiv.org/abs/2406.10422)|null|\n", "2406.10056": "|**2024-06-14**|**UniAudio 1.5: Large Language Model-driven Audio Codec is A Few-shot Audio Task Learner**|Dongchao Yang et.al.|[2406.10056](http://arxiv.org/abs/2406.10056)|**[link](https://github.com/yangdongchao/llm-codec)**|\n", "2406.09869": "|**2024-06-14**|**MMM: Multi-Layer Multi-Residual Multi-Stream Discrete Speech Representation from Self-supervised Learning Model**|Jiatong Shi et.al.|[2406.09869](http://arxiv.org/abs/2406.09869)|null|\n", "2406.08989": "|**2024-06-13**|**ToneUnit: A Speech Discretization Approach for Tonal Language Speech Synthesis**|Dehua Tao et.al.|[2406.08989](http://arxiv.org/abs/2406.08989)|null|\n", "2406.08820": "|**2024-06-13**|**DisfluencySpeech -- Single-Speaker Conversational Speech Dataset with Paralanguage**|Kyra Wang et.al.|[2406.08820](http://arxiv.org/abs/2406.08820)|null|\n", "2406.08812": "|**2024-06-13**|**Generating Speakers by Prompting Listener Impressions for Pre-trained Multi-Speaker Text-to-Speech Systems**|Zhengyang Chen et.al.|[2406.08812](http://arxiv.org/abs/2406.08812)|null|\n", "2406.08802": "|**2024-06-13**|**DubWise: Video-Guided Speech Duration Control in Multimodal LLM-based Text-to-Speech for Dubbing**|Neha Sahipjohn et.al.|[2406.08802](http://arxiv.org/abs/2406.08802)|null|\n", "2406.08568": "|**2024-06-12**|**Training Data Augmentation for Dysarthric Automatic Speech Recognition by Text-to-Dysarthric-Speech Synthesis**|Wing-Zin Leung et.al.|[2406.08568](http://arxiv.org/abs/2406.08568)|null|\n", "2406.08416": "|**2024-06-20**|**TokSing: Singing Voice Synthesis based on Discrete Tokens**|Yuning Wu et.al.|[2406.08416](http://arxiv.org/abs/2406.08416)|null|\n", "2406.08196": "|**2024-06-12**|**FreeV: Free Lunch For Vocoders Through Pseudo Inversed Mel Filter**|Yuanjun Lv et.al.|[2406.08196](http://arxiv.org/abs/2406.08196)|**[link](https://github.com/bakerbunker/freev)**|\n", 
"2406.08111": "|**2024-06-12**|**Audio-conditioned phonemic and prosodic annotation for building text-to-speech models from unlabeled speech data**|Yuma Shirahata et.al.|[2406.08111](http://arxiv.org/abs/2406.08111)|null|\n", "2406.08076": "|**2024-06-12**|**VECL-TTS: Voice identity and Emotional style controllable Cross-Lingual Text-to-Speech**|Ashishkumar Gudmalwar et.al.|[2406.08076](http://arxiv.org/abs/2406.08076)|null|\n", "2406.07969": "|**2024-06-12**|**LibriTTS-P: A Corpus with Speaking Style and Speaker Identity Prompts for Text-to-Speech and Style Captioning**|Masaya Kawamura et.al.|[2406.07969](http://arxiv.org/abs/2406.07969)|**[link](https://github.com/line/libritts-p)**|\n", "2406.07855": "|**2024-06-12**|**VALL-E R: Robust and Efficient Zero-Shot Text-to-Speech Synthesis via Monotonic Alignment**|Bing Han et.al.|[2406.07855](http://arxiv.org/abs/2406.07855)|null|\n", "2406.07803": "|**2024-06-12**|**EmoSphere-TTS: Emotional Style and Intensity Modeling via Spherical Emotion Vector for Controllable Emotional Text-to-Speech**|Deok-Hyeon Cho et.al.|[2406.07803](http://arxiv.org/abs/2406.07803)|**[link](https://github.com/Choddeok/EmoSphere-TTS)**|\n", "2406.07801": "|**2024-06-12**|**PolySpeech: Exploring Unified Multitask Speech Models for Competitiveness with Single-task Models**|Runyan Yang et.al.|[2406.07801](http://arxiv.org/abs/2406.07801)|null|\n", "2406.07725": "|**2024-06-11**|**The Interspeech 2024 Challenge on Speech Processing Using Discrete Units**|Xuankai Chang et.al.|[2406.07725](http://arxiv.org/abs/2406.07725)|null|\n", "2406.07289": "|**2024-06-11**|**Can We Achieve High-quality Direct Speech-to-Speech Translation without Parallel Speech Data?**|Qingkai Fang et.al.|[2406.07289](http://arxiv.org/abs/2406.07289)|null|\n", "2406.07237": "|**2024-06-11**|**CodecFake: Enhancing Anti-Spoofing Models Against Deepfake Audios from Codec-Based Speech Synthesis Systems**|Haibin Wu et.al.|[2406.07237](http://arxiv.org/abs/2406.07237)|null|\n", 
"2406.06979": "|**2024-06-11**|**AudioMarkBench: Benchmarking Robustness of Audio Watermarking**|Hongbin Liu et.al.|[2406.06979](http://arxiv.org/abs/2406.06979)|**[link](https://github.com/moyangkuo/audiomarkbench)**|\n", "2406.06406": "|**2024-06-11**|**Controlling Emotion in Text-to-Speech with Natural Language Prompts**|Thomas Bott et.al.|[2406.06406](http://arxiv.org/abs/2406.06406)|**[link](https://github.com/digitalphonetics/ims-toucan)**|\n", "2406.06403": "|**2024-06-10**|**Meta Learning Text-to-Speech Synthesis in over 7000 Languages**|Florian Lux et.al.|[2406.06403](http://arxiv.org/abs/2406.06403)|**[link](https://github.com/digitalphonetics/ims-toucan)**|\n", "2406.06111": "|**2024-06-10**|**JenGAN: Stacked Shifted Filters in GAN-Based Speech Synthesis**|Hyunjae Cho et.al.|[2406.06111](http://arxiv.org/abs/2406.06111)|null|\n", "2406.05965": "|**2024-06-10**|**MakeSinger: A Semi-Supervised Training Method for Data-Efficient Singing Voice Synthesis via Classifier-free Diffusion Guidance**|Semin Kim et.al.|[2406.05965](http://arxiv.org/abs/2406.05965)|null|\n", "2406.05763": "|**2024-06-19**|**WenetSpeech4TTS: A 12,800-hour Mandarin TTS Corpus for Large Speech Generation Model Benchmark**|Linhan Ma et.al.|[2406.05763](http://arxiv.org/abs/2406.05763)|**[link](https://github.com/dukGuo/valle-audiodec)**|\n", "2406.05699": "|**2024-06-09**|**An Investigation of Noise Robustness for Flow-Matching-Based Zero-Shot TTS**|Xiaofei Wang et.al.|[2406.05699](http://arxiv.org/abs/2406.05699)|null|\n", "2406.05681": "|**2024-06-11**|**Towards Expressive Zero-Shot Speech Synthesis with Hierarchical Prosody Modeling**|Yuepeng Jiang et.al.|[2406.05681](http://arxiv.org/abs/2406.05681)|null|\n", "2406.05672": "|**2024-06-12**|**Text-aware and Context-aware Expressive Audiobook Speech Synthesis**|Dake Guo et.al.|[2406.05672](http://arxiv.org/abs/2406.05672)|null|\n", "2408.06906": "|**2024-08-13**|**VNet: A GAN-based Multi-Tier Discriminator Network for Speech Synthesis 
Vocoders**|Yubing Cao et.al.|[2408.06906](http://arxiv.org/abs/2408.06906)|null|\n", "2408.06858": "|**2024-08-13**|**SaSLaW: Dialogue Speech Corpus with Audio-visual Egocentric Information Toward Environment-adaptive Dialogue Speech Synthesis**|Osamu Take et.al.|[2408.06858](http://arxiv.org/abs/2408.06858)|**[link](https://github.com/sarulab-speech/saslaw)**|\n", "2408.06827": "|**2024-08-13**|**PRESENT: Zero-Shot Text-to-Prosody Control**|Perry Lam et.al.|[2408.06827](http://arxiv.org/abs/2408.06827)|**[link](https://github.com/iamanigeeit/present)**|\n", "2408.07547": "|**2024-08-14**|**PeriodWave: Multi-Period Flow Matching for High-Fidelity Waveform Generation**|Sang-Hoon Lee et.al.|[2408.07547](http://arxiv.org/abs/2408.07547)|**[link](https://github.com/sh-lee-prml/periodwave)**|\n", "2408.07414": "|**2024-08-14**|**WavLM model ensemble for audio deepfake detection**|David Combei et.al.|[2408.07414](http://arxiv.org/abs/2408.07414)|null|\n", "2408.09215": "|**2024-08-17**|**Generating Data with Text-to-Speech and Large-Language Models for Conversational Speech Recognition**|Samuele Cornell et.al.|[2408.09215](http://arxiv.org/abs/2408.09215)|**[link](https://github.com/popcornell/ASRLightningFT)**|\n", "2408.10852": "|**2024-08-20**|**EELE: Exploring Efficient and Extensible LoRA Integration in Emotional Text-to-Speech**|Xin Qi et.al.|[2408.10852](http://arxiv.org/abs/2408.10852)|null|\n", "2408.10771": "|**2024-08-20**|**SSL-TTS: Leveraging Self-Supervised Embeddings and kNN Retrieval for Zero-Shot Multi-speaker TTS**|Karl El Hajal et.al.|[2408.10771](http://arxiv.org/abs/2408.10771)|null|\n", "2408.10549": "|**2024-08-20**|**AI-Based IVR**|Gassyrbek Kosherbay et.al.|[2408.10549](http://arxiv.org/abs/2408.10549)|null|\n", "2408.10463": "|**2024-08-20**|**Adversarial training of Keyword Spotting to Minimize TTS Data Overfitting**|Hyun Jin Park et.al.|[2408.10463](http://arxiv.org/abs/2408.10463)|null|\n", "2408.10207": "|**2024-07-01**|**A Comprehensive 
Survey on Diffusion Models and Their Applications**|Md Manjurul Ahsan et.al.|[2408.10207](http://arxiv.org/abs/2408.10207)|null|\n", "2408.12430": "|**2024-08-22**|**Positional Description for Numerical Normalization**|Deepanshu Gupta et.al.|[2408.12430](http://arxiv.org/abs/2408.12430)|null|\n", "2408.12170": "|**2024-08-22**|**VoiceX: A Text-To-Speech Framework for Custom Voices**|Silvan Mertes et.al.|[2408.12170](http://arxiv.org/abs/2408.12170)|null|\n", "2408.11849": "|**2024-08-13**|**Style-Talker: Finetuning Audio Language Model and Style-Based Text-to-Speech Model for Fast Spoken Dialogue Generation**|Yinghao Aaron Li et.al.|[2408.11849](http://arxiv.org/abs/2408.11849)|null|\n", "2408.13240": "|**2024-08-23**|**Which Prosodic Features Matter Most for Pragmatics?**|Nigel G. Ward et.al.|[2408.13240](http://arxiv.org/abs/2408.13240)|null|\n", "2408.14423": "|**2024-08-27**|**DualSpeech: Enhancing Speaker-Fidelity and Text-Intelligibility Through Dual Classifier-Free Guidance**|Jinhyeok Yang et.al.|[2408.14423](http://arxiv.org/abs/2408.14423)|null|\n", "2408.13970": "|**2024-08-26**|**Anonymization of Voices in Spaces for Civic Dialogue: Measuring Impact on Empathy, Trust, and Feeling Heard**|Wonjune Kang et.al.|[2408.13970](http://arxiv.org/abs/2408.13970)|null|\n", "2408.13893": "|**2024-08-28**|**SimpleSpeech 2: Towards Simple and Efficient Text-to-Speech with Flow-based Scalar Latent Transformer Diffusion Models**|Dongchao Yang et.al.|[2408.13893](http://arxiv.org/abs/2408.13893)|null|\n", "2408.13608": "|**2024-08-24**|**SpeechCraft: A Fine-grained Expressive Speech Dataset with Natural Language Description**|Zeyu Jin et.al.|[2408.13608](http://arxiv.org/abs/2408.13608)|**[link](https://github.com/thuhcsi/speechcraft)**|\n", "2408.14887": "|**2024-08-27**|**Literary and Colloquial Dialect Identification for Tamil using Acoustic Features**|M. 
Nanmalar et.al.|[2408.14887](http://arxiv.org/abs/2408.14887)|null|\n", "2408.14739": "|**2024-08-28**|**VoiceTailor: Lightweight Plug-In Adapter for Diffusion-Based Personalized Text-to-Speech**|Heeseung Kim et.al.|[2408.14739](http://arxiv.org/abs/2408.14739)|null|\n", "2408.14713": "|**2024-08-27**|**StyleSpeech: Parameter-efficient Fine Tuning for Pre-trained Controllable Text-to-Speech**|Haowei Lou et.al.|[2408.14713](http://arxiv.org/abs/2408.14713)|null|\n", "2408.15916": "|**2024-08-28**|**Multi-modal Adversarial Training for Zero-Shot Voice Cloning**|John Janiczek et.al.|[2408.15916](http://arxiv.org/abs/2408.15916)|null|\n", "2408.15775": "|**2024-08-29**|**Easy, Interpretable, Effective: openSMILE for voice deepfake detection**|Octavian Pascu et.al.|[2408.15775](http://arxiv.org/abs/2408.15775)|null|\n", "2408.15676": "|**2024-08-28**|**VoxInstruct: Expressive Human Instruction-to-Speech Generation with Unified Multilingual Codec Language Modelling**|Yixuan Zhou et.al.|[2408.15676](http://arxiv.org/abs/2408.15676)|null|\n", "2408.16725": "|**2024-08-29**|**Mini-Omni: Language Models Can Hear, Talk While Thinking in Streaming**|Zhifei Xie et.al.|[2408.16725](http://arxiv.org/abs/2408.16725)|null|\n", "2408.16546": "|**2024-08-29**|**RAVE for Speech: Efficient Voice Conversion at High Sampling Rates**|Anders R. 
Bargum et.al.|[2408.16546](http://arxiv.org/abs/2408.16546)|null|\n", "2408.16373": "|**2024-08-29**|**Enabling Beam Search for Language Model-Based Text-to-Speech Synthesis**|Zehai Tu et.al.|[2408.16373](http://arxiv.org/abs/2408.16373)|null|\n"}} \ No newline at end of file +{"ASR": {"2408.00624": "|**2024-08-01**|**SynesLM: A Unified Approach for Audio-visual Speech Recognition and Translation via Language Model and Synthetic Data**|Yichen Lu et.al.|[2408.00624](http://arxiv.org/abs/2408.00624)|**[link](https://github.com/espnet/espnet)**|\n", "2408.00205": "|**2024-08-01**|**Sentence-wise Speech Summarization: Task, Datasets, and End-to-End Modeling with LM Knowledge Distillation**|Kohei Matsuura et.al.|[2408.00205](http://arxiv.org/abs/2408.00205)|null|\n", "2407.21783": "|**2024-08-15**|**The Llama 3 Herd of Models**|Abhimanyu Dubey et.al.|[2407.21783](http://arxiv.org/abs/2407.21783)|null|\n", "2407.21476": "|**2024-07-31**|**On the Problem of Text-To-Speech Model Selection for Synthetic Data Generation in Automatic Speech Recognition**|Nick Rossenbach et.al.|[2407.21476](http://arxiv.org/abs/2407.21476)|null|\n", "2407.21414": "|**2024-07-31**|**Towards interfacing large language models with ASR systems using confidence measures and prompting**|Maryam Naderi et.al.|[2407.21414](http://arxiv.org/abs/2407.21414)|null|\n", "2407.21211": "|**2024-07-30**|**Self-Supervised Models in Automatic Whispered Speech Recognition**|Aref Farhadipour et.al.|[2407.21211](http://arxiv.org/abs/2407.21211)|null|\n", "2407.21066": "|**2024-07-28**|**ELP-Adapters: Parameter Efficient Adapter Tuning for Various Speech Processing Tasks**|Nakamasa Inoue et.al.|[2407.21066](http://arxiv.org/abs/2407.21066)|null|\n", "2407.21061": "|**2024-07-26**|**Improving noisy student training for low-resource languages in End-to-End ASR using CycleGAN and inter-domain losses**|Chia-Yu Li et.al.|[2407.21061](http://arxiv.org/abs/2407.21061)|null|\n", "2407.18581": "|**2024-08-07**|**Dynamic 
Language Group-Based MoE: Enhancing Code-Switching Speech Recognition with Hierarchical Routing**|Hukai Huang et.al.|[2407.18581](http://arxiv.org/abs/2407.18581)|**[link](https://github.com/kaihuhuang/language-group)**|\n", "2407.18571": "|**2024-07-29**|**Speech Bandwidth Expansion Via High Fidelity Generative Adversarial Networks**|Mahmoud Salhab et.al.|[2407.18571](http://arxiv.org/abs/2407.18571)|null|\n", "2407.18461": "|**2024-07-26**|**Enhancing Dysarthric Speech Recognition for Unseen Speakers via Prototype-Based Adaptation**|Shiyao Wang et.al.|[2407.18461](http://arxiv.org/abs/2407.18461)|**[link](https://github.com/nku-hlt/pb-dsr)**|\n", "2407.17997": "|**2024-07-25**|**On the Effect of Purely Synthetic Training Data for Different Automatic Speech Recognition Architectures**|Nick Rossenbach et.al.|[2407.17997](http://arxiv.org/abs/2407.17997)|null|\n", "2407.17874": "|**2024-07-25**|**Improving Domain-Specific ASR with LLM-Generated Contextual Descriptions**|Jiwon Suh et.al.|[2407.17874](http://arxiv.org/abs/2407.17874)|null|\n", "2407.17852": "|**2024-07-25**|**Scaling A Simple Approach to Zero-Shot Speech Recognition**|Jinming Zhao et.al.|[2407.17852](http://arxiv.org/abs/2407.17852)|**[link](https://github.com/facebookresearch/fairseq)**|\n", "2407.17605": "|**2024-07-24**|**Coupling Speech Encoders with Downstream Text Models**|Ciprian Chelba et.al.|[2407.17605](http://arxiv.org/abs/2407.17605)|null|\n", "2407.17160": "|**2024-07-24**|**A Comparative Analysis of Bilingual and Trilingual Wav2Vec Models for Automatic Speech Recognition in Multilingual Oral History Archives**|Jan Lehe\u010dka et.al.|[2407.17160](http://arxiv.org/abs/2407.17160)|null|\n", "2407.16537": "|**2024-07-23**|**Quantifying the Role of Textual Predictability in Automatic Speech Recognition**|Sean Robertson et.al.|[2407.16537](http://arxiv.org/abs/2407.16537)|null|\n", "2407.16447": "|**2024-07-23**|**The CHiME-8 DASR Challenge for Generalizable and Array Agnostic Distant 
Automatic Speech Recognition and Diarization**|Samuele Cornell et.al.|[2407.16447](http://arxiv.org/abs/2407.16447)|null|\n", "2407.16370": "|**2024-07-23**|**Evolutionary Prompt Design for LLM-Based Post-ASR Error Correction**|Rithik Sachdev et.al.|[2407.16370](http://arxiv.org/abs/2407.16370)|**[link](https://github.com/rithiksachdev/PostASR-Correction-SLT2024)**|\n", "2407.15835": "|**2024-07-22**|**dMel: Speech Tokenization made Simple**|He Bai et.al.|[2407.15835](http://arxiv.org/abs/2407.15835)|null|\n", "2407.15749": "|**2024-07-22**|**Robustness of Speech Separation Models for Similar-pitch Speakers**|Bunlong Lay et.al.|[2407.15749](http://arxiv.org/abs/2407.15749)|null|\n", "2407.15300": "|**2024-07-22**|**SELM: Enhancing Speech Emotion Recognition for Out-of-Domain Scenarios**|Hazim Bukhari et.al.|[2407.15300](http://arxiv.org/abs/2407.15300)|null|\n", "2407.14573": "|**2024-08-24**|**Trading Devil Final: Backdoor attack via Stock market and Bayesian Optimization**|Orson Mengara et.al.|[2407.14573](http://arxiv.org/abs/2407.14573)|null|\n", "2407.14021": "|**2024-07-19**|**GE2E-AC: Generalized End-to-End Loss Training for Accent Classification**|Chihiro Watanabe et.al.|[2407.14021](http://arxiv.org/abs/2407.14021)|null|\n", "2407.13982": "|**2024-07-19**|**Reexamining Racial Disparities in Automatic Speech Recognition Performance: The Role of Confounding by Provenance**|Changye Li et.al.|[2407.13982](http://arxiv.org/abs/2407.13982)|null|\n", "2408.00005": "|**2024-07-18**|**Framework for Curating Speech Datasets and Evaluating ASR Systems: A Case Study for Polish**|Micha\u0142 Junczyk et.al.|[2408.00005](http://arxiv.org/abs/2408.00005)|**[link](https://github.com/goodmike31/pl-asr-bigos-tools)**|\n", "2408.00004": "|**2024-07-18**|**Handling Numeric Expressions in Automatic Speech Recognition**|Christian Huber et.al.|[2408.00004](http://arxiv.org/abs/2408.00004)|null|\n", "2407.13300": "|**2024-07-18**|**Robust ASR Error Correction with Conservative 
Data Filtering**|Takuma Udagawa et.al.|[2407.13300](http://arxiv.org/abs/2407.13300)|null|\n", "2407.13292": "|**2024-07-18**|**Low-Resourced Speech Recognition for Iu Mien Language via Weakly-Supervised Phoneme-based Multilingual Pre-training**|Lukuan Dong et.al.|[2407.13292](http://arxiv.org/abs/2407.13292)|null|\n", "2407.13266": "|**2024-07-18**|**How Private is Low-Frequency Speech Audio in the Wild? An Analysis of Verbal Intelligibility by Humans and Machines**|Ailin Liu et.al.|[2407.13266](http://arxiv.org/abs/2407.13266)|null|\n", "2407.13142": "|**2024-07-18**|**A light-weight and efficient punctuation and word casing prediction model for on-device streaming ASR**|Jian You et.al.|[2407.13142](http://arxiv.org/abs/2407.13142)|null|\n", "2407.12389": "|**2024-07-17**|**Morphosyntactic Analysis for CHILDES**|Houjun Liu et.al.|[2407.12389](http://arxiv.org/abs/2407.12389)|null|\n", "2407.12240": "|**2024-07-17**|**Adaptive Cascading Network for Continual Test-Time Adaptation**|Kien X. 
Nguyen et.al.|[2407.12240](http://arxiv.org/abs/2407.12240)|null|\n", "2407.12094": "|**2024-07-16**|**Identifying Speakers in Dialogue Transcripts: A Text-based Approach Using Pretrained Language Models**|Minh Nguyen et.al.|[2407.12094](http://arxiv.org/abs/2407.12094)|**[link](https://github.com/adobe-research/speaker-identification)**|\n", "2407.11828": "|**2024-07-17**|**Vibravox: A Dataset of French Speech Captured with Body-conduction Audio Sensors**|Julien Hauret et.al.|[2407.11828](http://arxiv.org/abs/2407.11828)|**[link](https://github.com/jhauret/vibravox)**|\n", "2407.11641": "|**2024-07-16**|**Investigating the Effect of Label Topology and Training Criterion on ASR Performance and Alignment Quality**|Tina Raissi et.al.|[2407.11641](http://arxiv.org/abs/2407.11641)|null|\n", "2407.11516": "|**2024-07-16**|**The VoicePrivacy 2022 Challenge: Progress and Perspectives in Voice Anonymisation**|Michele Panariello et.al.|[2407.11516](http://arxiv.org/abs/2407.11516)|null|\n", "2407.11345": "|**2024-07-16**|**Beyond Binary: Multiclass Paraphasia Detection with Generative Pretrained Transformers and End-to-End Models**|Matthew Perez et.al.|[2407.11345](http://arxiv.org/abs/2407.11345)|null|\n", "2407.10603": "|**2024-07-15**|**Leave No Knowledge Behind During Knowledge Distillation: Towards Practical and Effective Knowledge Distillation for Code-Switching ASR Using Realistic Data**|Liang-Hsuan Tseng et.al.|[2407.10603](http://arxiv.org/abs/2407.10603)|null|\n", "2407.10303": "|**2024-07-14**|**Improving Neural Biasing for Contextual Speech Recognition by Early Context Injection and Text Perturbation**|Ruizhe Huang et.al.|[2407.10303](http://arxiv.org/abs/2407.10303)|null|\n", "2407.10255": "|**2024-07-14**|**CUSIDE-T: Chunking, Simulating Future and Decoding for Transducer based Streaming ASR**|Wenbo Zhao et.al.|[2407.10255](http://arxiv.org/abs/2407.10255)|null|\n", "2407.10118": "|**2024-07-14**|**Textless Dependency Parsing by Labeled Sequence 
Prediction**|Shunsuke Kando et.al.|[2407.10118](http://arxiv.org/abs/2407.10118)|**[link](https://github.com/mynlp/speechparser)**|\n", "2407.10048": "|**2024-07-14**|**Whisper-SV: Adapting Whisper for Low-data-resource Speaker Verification**|Li Zhang et.al.|[2407.10048](http://arxiv.org/abs/2407.10048)|null|\n", "2407.09849": "|**2024-07-13**|**Text-Based Detection of On-Hold Scripts in Contact Center Calls**|Dmitrii Galimzianov et.al.|[2407.09849](http://arxiv.org/abs/2407.09849)|**[link](https://github.com/gal-dmitry/HOLD_DETECTION_PUBLIC)**|\n", "2407.09817": "|**2024-08-24**|**Empowering Whisper as a Joint Multi-Talker and Target-Talker Speech Recognition System**|Lingwei Meng et.al.|[2407.09817](http://arxiv.org/abs/2407.09817)|**[link](https://github.com/LingweiMeng/Whisper-Sidecar)**|\n", "2407.09807": "|**2024-07-13**|**A Streaming Multi-Channel End-to-End Speech Recognition System with Realistic Evaluations**|Xiangzhu Kong et.al.|[2407.09807](http://arxiv.org/abs/2407.09807)|**[link](https://github.com/thu-spmi/cat)**|\n", "2407.09732": "|**2024-07-13**|**Speech Slytherin: Examining the Performance and Efficiency of Mamba for Speech Separation, Recognition, and Synthesis**|Xilin Jiang et.al.|[2407.09732](http://arxiv.org/abs/2407.09732)|**[link](https://github.com/xi-j/Mamba-TasNet)**|\n", "2407.08618": "|**2024-08-12**|**Tamil Language Computing: the Present and the Future**|Kengatharaiyer Sarveswaran et.al.|[2407.08618](http://arxiv.org/abs/2407.08618)|null|\n", "2407.08658": "|**2024-07-10**|**Evaluating Voice Command Pipelines for Drone Control: From STT and LLM to Direct Classification and Siamese Networks**|Lucca Emmanuel Pineli Sim\u00f5es et.al.|[2407.08658](http://arxiv.org/abs/2407.08658)|null|\n", "2407.07566": "|**2024-07-10**|**HebDB: a Weakly Supervised Dataset for Hebrew Speech Processing**|Arnon Turetzky et.al.|[2407.07566](http://arxiv.org/abs/2407.07566)|null|\n", "2407.18930": "|**2024-07-10**|**Dynamic Encoder Size Based on Data-Driven 
Layer-wise Pruning for Speech Recognition**|Jingjing Xu et.al.|[2407.18930](http://arxiv.org/abs/2407.18930)|null|\n", "2407.17416": "|**2024-07-10**|**Explaining Spectrograms in Machine Learning: A Study on Neural Networks for Speech Classification**|Jesin James et.al.|[2407.17416](http://arxiv.org/abs/2407.17416)|null|\n", "2407.06606": "|**2024-07-09**|**Tailored Design of Audio-Visual Speech Recognition Models using Branchformers**|David Gimeno-G\u00f3mez et.al.|[2407.06606](http://arxiv.org/abs/2407.06606)|**[link](https://github.com/david-gimeno/tailored-avsr)**|\n", "2407.06310": "|**2024-07-08**|**Homogeneous Speaker Features for On-the-Fly Dysarthric and Elderly Speaker Adaptation**|Mengzhe Geng et.al.|[2407.06310](http://arxiv.org/abs/2407.06310)|null|\n", "2407.18332": "|**2024-07-08**|**Analyzing Speech Unit Selection for Textless Speech-to-Speech Translation**|Jarod Duret et.al.|[2407.18332](http://arxiv.org/abs/2407.18332)|null|\n", "2407.05407": "|**2024-07-09**|**CosyVoice: A Scalable Multilingual Zero-shot Text-to-speech Synthesizer based on Supervised Semantic Tokens**|Zhihao Du et.al.|[2407.05407](http://arxiv.org/abs/2407.05407)|null|\n", "2407.14525": "|**2024-07-07**|**Morse Code-Enabled Speech Recognition for Individuals with Visual and Hearing Impairments**|Ritabrata Roy Choudhury et.al.|[2407.14525](http://arxiv.org/abs/2407.14525)|null|\n", "2407.04675": "|**2024-07-10**|**Seed-ASR: Understanding Diverse Speech and Contexts with LLM-based Speech Recognition**|Ye Bai et.al.|[2407.04675](http://arxiv.org/abs/2407.04675)|null|\n", "2407.04662": "|**2024-07-05**|**Multitaper mel-spectrograms for keyword spotting**|Douglas Baptista de Souza et.al.|[2407.04662](http://arxiv.org/abs/2407.04662)|null|\n", "2407.04652": "|**2024-07-05**|**Pretraining End-to-End Keyword Search with Automatically Discovered Acoustic Units**|Bolaji Yusuf et.al.|[2407.04652](http://arxiv.org/abs/2407.04652)|**[link](https://github.com/beer-asr/beer)**|\n", 
"2407.04641": "|**2024-07-05**|**Speculative Speech Recognition by Audio-Prefixed Low-Rank Adaptation of Language Models**|Bolaji Yusuf et.al.|[2407.04641](http://arxiv.org/abs/2407.04641)|null|\n", "2407.04601": "|**2024-07-05**|**Written Term Detection Improves Spoken Term Detection**|Bolaji Yusuf et.al.|[2407.04601](http://arxiv.org/abs/2407.04601)|**[link](https://github.com/bolajiy/golden-retriever)**|\n", "2407.04533": "|**2024-07-09**|**Performance Analysis of Speech Encoders for Low-Resource SLU and ASR in Tunisian Dialect**|Salima Mdhaffar et.al.|[2407.04533](http://arxiv.org/abs/2407.04533)|**[link](https://github.com/speechbrain/speechbrain)**|\n", "2407.04482": "|**2024-07-05**|**Controlling Whisper: Universal Acoustic Adversarial Attacks to Control Speech Foundation Models**|Vyas Raina et.al.|[2407.04482](http://arxiv.org/abs/2407.04482)|null|\n", "2407.04439": "|**2024-07-05**|**XLSR-Transducer: Streaming ASR for Self-Supervised Pretrained Models**|Shashi Kumar et.al.|[2407.04439](http://arxiv.org/abs/2407.04439)|null|\n", "2407.04368": "|**2024-07-05**|**Romanization Encoding For Multilingual ASR**|Wen Ding et.al.|[2407.04368](http://arxiv.org/abs/2407.04368)|null|\n", "2407.04280": "|**2024-07-05**|**LearnerVoice: A Dataset of Non-Native English Learners' Spontaneous Speech**|Haechan Kim et.al.|[2407.04280](http://arxiv.org/abs/2407.04280)|null|\n", "2407.04219": "|**2024-07-05**|**Semi-supervised Learning for Code-Switching ASR with Large Language Model Filter**|Yu Xi et.al.|[2407.04219](http://arxiv.org/abs/2407.04219)|null|\n", "2407.04051": "|**2024-07-11**|**FunAudioLLM: Voice Understanding and Generation Foundation Models for Natural Interaction Between Humans and LLMs**|Keyu An et.al.|[2407.04051](http://arxiv.org/abs/2407.04051)|**[link](https://github.com/FunAudioLLM/SenseVoice)**|\n", "2407.04047": "|**2024-07-04**|**Improving Accented Speech Recognition using Data Augmentation based on Unsupervised Text-to-Speech Synthesis**|Cong-Thanh Do 
et.al.|[2407.04047](http://arxiv.org/abs/2407.04047)|null|\n", "2407.03966": "|**2024-07-04**|**Serialized Output Training by Learned Dominance**|Ying Shi et.al.|[2407.03966](http://arxiv.org/abs/2407.03966)|null|\n", "2407.03809": "|**2024-07-04**|**Finetuning End-to-End Models for Estonian Conversational Spoken Language Translation**|Tiia Sildam et.al.|[2407.03809](http://arxiv.org/abs/2407.03809)|null|\n", "2407.03734": "|**2024-07-04**|**Improving Self-supervised Pre-training using Accent-Specific Codebooks**|Darshan Prabhu et.al.|[2407.03734](http://arxiv.org/abs/2407.03734)|**[link](https://github.com/csalt-research/accented-codebooks-asr)**|\n", "2407.03718": "|**2024-07-24**|**Multi-Convformer: Extending Conformer with Multiple Convolution Kernels**|Darshan Prabhu et.al.|[2407.03718](http://arxiv.org/abs/2407.03718)|**[link](https://github.com/espnet/espnet)**|\n", "2407.03563": "|**2024-07-04**|**Learning Video Temporal Dynamics with Cross-Modal Attention for Robust Audio-Visual Speech Recognition**|Sungnyun Kim et.al.|[2407.03563](http://arxiv.org/abs/2407.03563)|null|\n", "2407.03495": "|**2024-07-03**|**Codec-ASR: Training Performant Automatic Speech Recognition Systems with Discrete Speech Representations**|Kunal Dhawan et.al.|[2407.03495](http://arxiv.org/abs/2407.03495)|null|\n", "2407.03440": "|**2024-07-03**|**Advanced Framework for Animal Sound Classification With Features Optimization**|Qiang Yang et.al.|[2407.03440](http://arxiv.org/abs/2407.03440)|null|\n", "2407.03026": "|**2024-07-03**|**Qifusion-Net: Layer-adapted Stream/Non-stream Model for End-to-End Multi-Accent Speech Recognition**|Jinming Chen et.al.|[2407.03026](http://arxiv.org/abs/2407.03026)|null|\n", "2407.13782": "|**2024-07-03**|**Self-supervised ASR Models and Features For Dysarthric and Elderly Speech Recognition**|Shujie Hu et.al.|[2407.13782](http://arxiv.org/abs/2407.13782)|null|\n", "2407.02052": "|**2024-07-02**|**The USTC-NERCSLIP Systems for The ICMC-ASR 
Challenge**|Minghui Wu et.al.|[2407.02052](http://arxiv.org/abs/2407.02052)|null|\n", "2407.02543": "|**2024-07-02**|**Towards the Next Frontier in Speech Representation Learning Using Disentanglement**|Varun Krishna et.al.|[2407.02543](http://arxiv.org/abs/2407.02543)|null|\n", "2407.01909": "|**2024-07-02**|**Pinyin Regularization in Error Correction for Chinese Speech Recognition with Large Language Models**|Zhiyuan Tang et.al.|[2407.01909](http://arxiv.org/abs/2407.01909)|**[link](https://github.com/tzyll/ChineseHP)**|\n", "2407.17477": "|**2024-07-30**|**Toward Automated Detection of Biased Social Signals from the Content of Clinical Conversations**|Feng Chen et.al.|[2407.17477](http://arxiv.org/abs/2407.17477)|null|\n", "2407.00756": "|**2024-06-30**|**Less Forgetting for Better Generalization: Exploring Continual-learning Fine-tuning Methods for Speech Self-supervised Representations**|Salah Zaiem et.al.|[2407.00756](http://arxiv.org/abs/2407.00756)|null|\n", "2407.00518": "|**2024-06-29**|**When Robots Get Chatty: Grounding Multimodal Human-Robot Conversation and Collaboration**|Philipp Allgeuer et.al.|[2407.00518](http://arxiv.org/abs/2407.00518)|null|\n", "2407.12817": "|**2024-06-29**|**Error Correction by Paying Attention to Both Acoustic and Confidence References for Automatic Speech Recognition**|Yuchun Shu et.al.|[2407.12817](http://arxiv.org/abs/2407.12817)|null|\n", "2407.00463": "|**2024-07-18**|**Open-Source Conversational AI with SpeechBrain 1.0**|Mirco Ravanelli et.al.|[2407.00463](http://arxiv.org/abs/2407.00463)|null|\n", "2407.12029": "|**2024-06-29**|**A Quality-Aware Voltage Overscaling Framework to Improve the Energy Efficiency and Lifetime of TPUs based on Statistical Error Modeling**|Alireza Senobari et.al.|[2407.12029](http://arxiv.org/abs/2407.12029)|null|\n", "2407.12028": "|**2024-06-28**|**TreeSeg: Hierarchical Topic Segmentation of Large Transcripts**|Dimitrios C. 
Gklezakos et.al.|[2407.12028](http://arxiv.org/abs/2407.12028)|null|\n", "2406.19706": "|**2024-06-28**|**SAML: Speaker Adaptive Mixture of LoRA Experts for End-to-End ASR**|Qiuming Zhao et.al.|[2406.19706](http://arxiv.org/abs/2406.19706)|null|\n", "2406.19674": "|**2024-06-28**|**Less is More: Accurate Speech Recognition & Translation without Web-Scale Data**|Krishna C. Puvvada et.al.|[2406.19674](http://arxiv.org/abs/2406.19674)|null|\n", "2406.19564": "|**2024-06-27**|**Voices Unheard: NLP Resources and Models for Yor\u00f9b\u00e1 Regional Dialects**|Orevaoghene Ahia et.al.|[2406.19564](http://arxiv.org/abs/2406.19564)|**[link](https://github.com/orevaahia/yorulect)**|\n", "2406.19363": "|**2024-06-27**|**Tradition or Innovation: A Comparison of Modern ASR Methods for Forced Alignment**|Rotem Rousso et.al.|[2406.19363](http://arxiv.org/abs/2406.19363)|null|\n", "2406.19311": "|**2024-06-27**|**Zero-Query Adversarial Attack on Black-box Automatic Speech Recognition Systems**|Zheng Fang et.al.|[2406.19311](http://arxiv.org/abs/2406.19311)|null|\n", "2406.18972": "|**2024-06-27**|**Applying LLMs for Rescoring N-best ASR Hypotheses of Casual Conversations: Effects of Domain Adaptation and Context Carry-over**|Atsunori Ogawa et.al.|[2406.18972](http://arxiv.org/abs/2406.18972)|null|\n", "2406.18928": "|**2024-06-27**|**Enhanced ASR Robustness to Packet Loss with a Front-End Adaptation Network**|Yehoshua Dissen et.al.|[2406.18928](http://arxiv.org/abs/2406.18928)|null|\n", "2406.18862": "|**2024-06-27**|**Streaming Decoder-Only Automatic Speech Recognition with Discrete Speech Units: A Pilot Study**|Peikun Chen et.al.|[2406.18862](http://arxiv.org/abs/2406.18862)|**[link](https://github.com/chenpk00/IS2024_stream_decoder_only_asr)**|\n", "2406.18373": "|**2024-06-26**|**Dynamic Data Pruning for Automatic Speech Recognition**|Qiao Xiao et.al.|[2406.18373](http://arxiv.org/abs/2406.18373)|null|\n", "2406.18301": "|**2024-06-26**|**MSR-86K: An Evolving, Multilingual 
Corpus with 86,300 Hours of Transcribed Audio for Speech Recognition Research**|Song Li et.al.|[2406.18301](http://arxiv.org/abs/2406.18301)|null|\n", "2406.18135": "|**2024-06-26**|**Automatic Speech Recognition for Hindi**|Anish Saha et.al.|[2406.18135](http://arxiv.org/abs/2406.18135)|null|\n", "2406.18120": "|**2024-07-12**|**ArzEn-LLM: Code-Switched Egyptian Arabic-English Translation and Speech Recognition Using LLMs**|Ahmed Heakl et.al.|[2406.18120](http://arxiv.org/abs/2406.18120)|**[link](https://github.com/ahmedheakl/arazn-llm)**|\n", "2406.18021": "|**2024-06-26**|**SC-MoE: Switch Conformer Mixture of Experts for Unified Streaming and Non-streaming Code-Switching ASR**|Shuaishuai Ye et.al.|[2406.18021](http://arxiv.org/abs/2406.18021)|null|\n", "2406.17935": "|**2024-06-25**|**Sequential Editing for Lifelong Training of Speech Recognition Models**|Devang Kulshreshtha et.al.|[2406.17935](http://arxiv.org/abs/2406.17935)|null|\n", "2406.17926": "|**2024-06-25**|**FASA: a Flexible and Automatic Speech Aligner for Extracting High-quality Aligned Children Speech Data**|Dancheng Liu et.al.|[2406.17926](http://arxiv.org/abs/2406.17926)|**[link](https://github.com/DanchengLiu/FASA)**|\n", "2406.17618": "|**2024-06-25**|**Towards Building an End-to-End Multilingual Automatic Lyrics Transcription Model**|Jiawen Huang et.al.|[2406.17618](http://arxiv.org/abs/2406.17618)|**[link](https://github.com/jhuang448/MultilingualALT)**|\n", "2406.17614": "|**2024-06-25**|**MSRS: Training Multimodal Speech Recognition Models from Scratch with Sparse Mask Optimization**|Adriana Fernandez-Lopez et.al.|[2406.17614](http://arxiv.org/abs/2406.17614)|null|\n", "2406.17825": "|**2024-06-25**|**Automatic speech recognition for the Nepali language using CNN, bidirectional LSTM and ResNet**|Manish Dhakal et.al.|[2406.17825](http://arxiv.org/abs/2406.17825)|**[link](https://github.com/manishdhakal/asr-nepali-using-cnn-bilstm-resnet)**|\n", "2406.17272": "|**2024-06-25**|**A 
Comprehensive Solution to Connect Speech Encoder and Large Language Model for ASR**|Van Tung Pham et.al.|[2406.17272](http://arxiv.org/abs/2406.17272)|null|\n", "2406.17124": "|**2024-06-24**|**Investigating Confidence Estimation Measures for Speaker Diarization**|Anurag Chowdhury et.al.|[2406.17124](http://arxiv.org/abs/2406.17124)|null|\n", "2406.16808": "|**2024-06-24**|**Exploring the Capability of Mamba in Speech Applications**|Koichi Miyazaki et.al.|[2406.16808](http://arxiv.org/abs/2406.16808)|null|\n", "2406.16777": "|**2024-06-24**|**Blending LLMs into Cascaded Speech Translation: KIT's Offline Speech Translation System for IWSLT 2024**|Sai Koneru et.al.|[2406.16777](http://arxiv.org/abs/2406.16777)|null|\n", "2406.16120": "|**2024-06-23**|**Contextualized End-to-end Automatic Speech Recognition with Intermediate Biasing Loss**|Muhammad Shakeel et.al.|[2406.16120](http://arxiv.org/abs/2406.16120)|null|\n", "2406.16107": "|**2024-08-01**|**Decoder-only Architecture for Streaming End-to-end Speech Recognition**|Emiru Tsunoo et.al.|[2406.16107](http://arxiv.org/abs/2406.16107)|null|\n", "2406.15723": "|**2024-06-22**|**Acoustic Feature Mixup for Balanced Multi-aspect Pronunciation Assessment**|Heejin Do et.al.|[2406.15723](http://arxiv.org/abs/2406.15723)|null|\n", "2406.15668": "|**2024-06-21**|**PI-Whisper: An Adaptive and Incremental ASR Framework for Diverse and Evolving Speaker Characteristics**|Amir Nassereldine et.al.|[2406.15668](http://arxiv.org/abs/2406.15668)|null|\n", "2406.15265": "|**2024-06-21**|**Perception of Phonological Assimilation by Neural Speech Recognition Models**|Charlotte Pouw et.al.|[2406.15265](http://arxiv.org/abs/2406.15265)|null|\n", "2406.14890": "|**2024-06-21**|**InterBiasing: Boost Unseen Word Recognition through Biasing Intermediate Predictions**|Yu Nakagome et.al.|[2406.14890](http://arxiv.org/abs/2406.14890)|null|\n", "2406.14747": "|**2024-06-20**|**An Adapter-Based Unified Model for Multiple Spoken Language Processing 
Tasks**|Varsha Suresh et.al.|[2406.14747](http://arxiv.org/abs/2406.14747)|null|\n", "2406.14294": "|**2024-06-21**|**DASB - Discrete Audio and Speech Benchmark**|Pooneh Mousavi et.al.|[2406.14294](http://arxiv.org/abs/2406.14294)|null|\n", "2406.14266": "|**2024-06-20**|**Intelligent Interface: Enhancing Lecture Engagement with Didactic Activity Summaries**|Anna Wr\u00f3blewska et.al.|[2406.14266](http://arxiv.org/abs/2406.14266)|null|\n", "2406.13842": "|**2024-06-19**|**Joint vs Sequential Speaker-Role Detection and Automatic Speech Recognition for Air-traffic Control**|Alexander Blatt et.al.|[2406.13842](http://arxiv.org/abs/2406.13842)|null|\n", "2406.13502": "|**2024-06-19**|**ManWav: The First Manchu ASR Model**|Jean Seo et.al.|[2406.13502](http://arxiv.org/abs/2406.13502)|null|\n", "2406.13431": "|**2024-06-24**|**Children's Speech Recognition through Discrete Token Enhancement**|Vrunda N. Sukhadia et.al.|[2406.13431](http://arxiv.org/abs/2406.13431)|null|\n", "2406.12699": "|**2024-06-18**|**Bridging the Gap: Integrating Pre-trained Speech Enhancement and Recognition Models for Robust Speech Recognition**|Kuan-Chen Wang et.al.|[2406.12699](http://arxiv.org/abs/2406.12699)|null|\n", "2406.12674": "|**2024-06-18**|**Transcribe, Align and Segment: Creating speech datasets for low-resource languages**|Taras Sereda et.al.|[2406.12674](http://arxiv.org/abs/2406.12674)|null|\n", "2406.12621": "|**2024-06-18**|**Growing Trees on Sounds: Assessing Strategies for End-to-End Dependency Parsing of Speech**|Adrien Pupier et.al.|[2406.12621](http://arxiv.org/abs/2406.12621)|**[link](https://github.com/Pupiera/Growing_tree_on_sound)**|\n", "2406.12611": "|**2024-06-18**|**Rapid Language Adaptation for Multilingual E2E Speech Recognition Using Encoder Prompting**|Yosuke Kashiwagi et.al.|[2406.12611](http://arxiv.org/abs/2406.12611)|null|\n", "2406.12503": "|**2024-06-18**|**Unsupervised Online Continual Learning for Automatic Speech Recognition**|Steven Vander Eeckt 
et.al.|[2406.12503](http://arxiv.org/abs/2406.12503)|**[link](https://github.com/stevenvdeeckt/unsupervised-ocl-for-asr)**|\n", "2406.12387": "|**2024-06-18**|**Performant ASR Models for Medical Entities in Accented Speech**|Tejumade Afonja et.al.|[2406.12387](http://arxiv.org/abs/2406.12387)|null|\n", "2406.12317": "|**2024-06-18**|**Finding Task-specific Subnetworks in Multi-task Spoken Language Understanding Model**|Hayato Futami et.al.|[2406.12317](http://arxiv.org/abs/2406.12317)|null|\n", "2406.12233": "|**2024-06-18**|**SyncVSR: Data-Efficient Visual Speech Recognition with End-to-End Crossmodal Audio Token Synchronization**|Young Jin Ahn et.al.|[2406.12233](http://arxiv.org/abs/2406.12233)|**[link](https://github.com/KAIST-AILab/SyncVSR)**|\n", "2406.11546": "|**2024-06-17**|**GigaSpeech 2: An Evolving, Large-Scale and Multi-domain ASR Corpus for Low-Resource Languages with Automated Crawling, Transcription and Refinement**|Yifan Yang et.al.|[2406.11546](http://arxiv.org/abs/2406.11546)|**[link](https://github.com/SpeechColab/GigaSpeech2)**|\n", "2406.12937": "|**2024-06-17**|**Self-Train Before You Transcribe**|Robert Flynn et.al.|[2406.12937](http://arxiv.org/abs/2406.12937)|**[link](https://github.com/robflynnyh/Self-Train-Before-You-Transcribe)**|\n", "2406.11064": "|**2024-06-16**|**Continual Test-time Adaptation for End-to-end Speech Recognition on Noisy Speech**|Guan-Ting Lin et.al.|[2406.11064](http://arxiv.org/abs/2406.11064)|null|\n", "2406.11037": "|**2024-06-16**|**NAST: Noise Aware Speech Tokenization for Speech Language Models**|Shoval Messica et.al.|[2406.11037](http://arxiv.org/abs/2406.11037)|**[link](https://github.com/ShovalMessica/NAST)**|\n", "2406.11025": "|**2024-06-16**|**Large Language Models for Dysfluency Detection in Stuttered Speech**|Dominik Wagner et.al.|[2406.11025](http://arxiv.org/abs/2406.11025)|null|\n", "2406.11022": "|**2024-06-16**|**Outlier Reduction with Gated Attention for Improved Post-training Quantization in 
Large Sequence-to-sequence Speech Foundation Models**|Dominik Wagner et.al.|[2406.11022](http://arxiv.org/abs/2406.11022)|null|\n", "2406.11016": "|**2024-06-16**|**Optimized Speculative Sampling for GPU Hardware Accelerators**|Dominik Wagner et.al.|[2406.11016](http://arxiv.org/abs/2406.11016)|null|\n", "2406.10993": "|**2024-06-16**|**CoSTA: Code-Switched Speech Translation using Aligned Speech-Text Interleaving**|Bhavani Shankar et.al.|[2406.10993](http://arxiv.org/abs/2406.10993)|null|\n", "2406.10932": "|**2024-06-16**|**Imperceptible Rhythm Backdoor Attacks: Exploring Rhythm Transformation for Embedding Undetectable Vulnerabilities on Speech Recognition**|Wenhan Yao et.al.|[2406.10932](http://arxiv.org/abs/2406.10932)|null|\n", "2406.12931": "|**2024-06-16**|**Automatic Speech Recognition for Biomedical Data in Bengali Language**|Shariar Kabir et.al.|[2406.12931](http://arxiv.org/abs/2406.12931)|null|\n", "2406.10741": "|**2024-06-15**|**Speech Emotion Recognition Using CNN and Its Use Case in Digital Healthcare**|Nishargo Nigar et.al.|[2406.10741](http://arxiv.org/abs/2406.10741)|null|\n", "2406.10719": "|**2024-06-21**|**Trading Devil: Robust backdoor attack via Stochastic investment models and Bayesian approach**|Orson Mengara et.al.|[2406.10719](http://arxiv.org/abs/2406.10719)|null|\n", "2406.10177": "|**2024-06-14**|**Inclusive ASR for Disfluent Speech: Cascaded Large-Scale Self-Supervised Learning with Targeted Fine-Tuning and Data Augmentation**|Dena Mujtaba et.al.|[2406.10177](http://arxiv.org/abs/2406.10177)|null|\n", "2406.10083": "|**2024-06-14**|**On the Evaluation of Speech Foundation Models for Spoken Language Understanding**|Siddhant Arora et.al.|[2406.10083](http://arxiv.org/abs/2406.10083)|null|\n", "2406.10082": "|**2024-06-14**|**Whisper-Flamingo: Integrating Visual Features into Whisper for Audio-Visual Speech Recognition and Translation**|Andrew Rouditchenko 
et.al.|[2406.10082](http://arxiv.org/abs/2406.10082)|**[link](https://github.com/roudimit/whisper-flamingo)**|\n", "2406.10052": "|**2024-06-14**|**Simul-Whisper: Attention-Guided Streaming Whisper with Truncation Detection**|Haoyu Wang et.al.|[2406.10052](http://arxiv.org/abs/2406.10052)|**[link](https://github.com/backspacetg/simul_whisper)**|\n", "2406.09999": "|**2024-06-14**|**ROAR: Reinforcing Original to Augmented Data Ratio Dynamics for Wav2Vec2.0 Based ASR**|Vishwanath Pratap Singh et.al.|[2406.09999](http://arxiv.org/abs/2406.09999)|null|\n", "2406.10313": "|**2024-06-14**|**CNVSRC 2023: The First Chinese Continuous Visual Speech Recognition Challenge**|Chen Chen et.al.|[2406.10313](http://arxiv.org/abs/2406.10313)|null|\n", "2406.09950": "|**2024-06-14**|**An efficient text augmentation approach for contextualized Mandarin speech recognition**|Naijun Zheng et.al.|[2406.09950](http://arxiv.org/abs/2406.09950)|null|\n", "2406.09873": "|**2024-06-14**|**Perceiver-Prompt: Flexible Speaker Adaptation in Whisper for Chinese Disordered Speech Recognition**|Yicong Jiang et.al.|[2406.09873](http://arxiv.org/abs/2406.09873)|null|\n", "2406.09869": "|**2024-06-14**|**MMM: Multi-Layer Multi-Residual Multi-Stream Discrete Speech Representation from Self-supervised Learning Model**|Jiatong Shi et.al.|[2406.09869](http://arxiv.org/abs/2406.09869)|null|\n", "2406.09676": "|**2024-06-14**|**Optimizing Byte-level Representation for End-to-end ASR**|Roger Hsiao et.al.|[2406.09676](http://arxiv.org/abs/2406.09676)|null|\n", "2406.09662": "|**2024-06-14**|**Learning Language Structures through Grounding**|Freda Shi et.al.|[2406.09662](http://arxiv.org/abs/2406.09662)|null|\n", "2406.09618": "|**2024-06-13**|**Multi-Modal Retrieval For Large Language Model Based Speech Recognition**|Jari Kolehmainen et.al.|[2406.09618](http://arxiv.org/abs/2406.09618)|null|\n", "2406.09569": "|**2024-06-13**|**Speech ReaLLM -- Real-time Streaming Speech Recognition with Multimodal LLMs by 
Teaching the Flow of Time**|Frank Seide et.al.|[2406.09569](http://arxiv.org/abs/2406.09569)|null|\n", "2406.09494": "|**2024-06-13**|**The Second DISPLACE Challenge : DIarization of SPeaker and LAnguage in Conversational Environments**|Shareef Babu Kalluri et.al.|[2406.09494](http://arxiv.org/abs/2406.09494)|null|\n", "2406.09202": "|**2024-06-13**|**Language Complexity and Speech Recognition Accuracy: Orthographic Complexity Hurts, Phonological Complexity Doesn't**|Chihiro Taguchi et.al.|[2406.09202](http://arxiv.org/abs/2406.09202)|**[link](https://github.com/ctaguchi/asrcomplexity)**|\n", "2406.09153": "|**2024-06-13**|**LASER: Learning by Aligning Self-supervised Representations of Speech for Improving Content-related Tasks**|Amit Meghanani et.al.|[2406.09153](http://arxiv.org/abs/2406.09153)|**[link](https://github.com/Trikaldarshi/LASER)**|\n", "2406.08914": "|**2024-06-13**|**Transcription-Free Fine-Tuning of Speech Separation Models for Noisy and Reverberant Multi-Speaker Automatic Speech Recognition**|William Ravenscroft et.al.|[2406.08914](http://arxiv.org/abs/2406.08914)|null|\n", "2406.08904": "|**2024-06-13**|**AdaPTwin: Low-Cost Adaptive Compression of Product Twins in Transformers**|Emil Biju et.al.|[2406.08904](http://arxiv.org/abs/2406.08904)|null|\n", "2406.08641": "|**2024-06-12**|**ML-SUPERB 2.0: Benchmarking Multilingual Speech Models Across Modeling Constraints, Languages, and Datasets**|Jiatong Shi et.al.|[2406.08641](http://arxiv.org/abs/2406.08641)|null|\n", "2406.08396": "|**2024-06-12**|**Neural Blind Source Separation and Diarization for Distant Speech Recognition**|Yoshiaki Bando et.al.|[2406.08396](http://arxiv.org/abs/2406.08396)|null|\n", "2406.08380": "|**2024-06-12**|**Towards Unsupervised Speech Recognition Without Pronunciation Models**|Junrui Ni et.al.|[2406.08380](http://arxiv.org/abs/2406.08380)|null|\n", "2406.08353": "|**2024-06-12**|**Speech Emotion Recognition with ASR Transcripts: A Comprehensive Study on Word Error Rate 
and Fusion Techniques**|Yuanchao Li et.al.|[2406.08353](http://arxiv.org/abs/2406.08353)|**[link](https://github.com/yc-li20/SER-on-WER-and-Fusion)**|\n", "2406.08266": "|**2024-06-13**|**Refining Self-Supervised Learnt Speech Representation using Brain Activations**|Hengyu Li et.al.|[2406.08266](http://arxiv.org/abs/2406.08266)|null|\n", "2406.08207": "|**2024-06-12**|**Transformer-based Model for ASR N-Best Rescoring and Rewriting**|Iwen E. Kang et.al.|[2406.08207](http://arxiv.org/abs/2406.08207)|null|\n", "2406.08111": "|**2024-06-12**|**Audio-conditioned phonemic and prosodic annotation for building text-to-speech models from unlabeled speech data**|Yuma Shirahata et.al.|[2406.08111](http://arxiv.org/abs/2406.08111)|null|\n", "2406.10284": "|**2024-06-12**|**Improving child speech recognition with augmented child-like speech**|Yuanyuan Zhang et.al.|[2406.10284](http://arxiv.org/abs/2406.10284)|null|\n", "2406.07914": "|**2024-06-14**|**Can Large Language Models Understand Spatial Audio?**|Changli Tang et.al.|[2406.07914](http://arxiv.org/abs/2406.07914)|null|\n", "2406.07909": "|**2024-06-12**|**Guiding Frame-Level CTC Alignments Using Self-knowledge Distillation**|Eungbeom Kim et.al.|[2406.07909](http://arxiv.org/abs/2406.07909)|null|\n", "2406.07846": "|**2024-06-12**|**DualVC 3: Leveraging Language Model Generated Pseudo Context for End-to-end Low Latency Streaming Voice Conversion**|Ziqian Ning et.al.|[2406.07846](http://arxiv.org/abs/2406.07846)|null|\n", "2406.07842": "|**2024-06-12**|**Dual-Pipeline with Low-Rank Adaptation for New Language Integration in Multilingual ASR**|Yerbolat Khassanov et.al.|[2406.07842](http://arxiv.org/abs/2406.07842)|null|\n", "2406.07823": "|**2024-06-12**|**PRoDeliberation: Parallel Robust Deliberation for End-to-End Spoken Language Understanding**|Trang Le et.al.|[2406.07823](http://arxiv.org/abs/2406.07823)|null|\n", "2406.07801": "|**2024-06-12**|**PolySpeech: Exploring Unified Multitask Speech Models for Competitiveness 
with Single-task Models**|Runyan Yang et.al.|[2406.07801](http://arxiv.org/abs/2406.07801)|null|\n", "2406.09443": "|**2024-06-12**|**Comparative Analysis of Personalized Voice Activity Detection Systems: Assessing Real-World Effectiveness**|Satyam Kumar et.al.|[2406.09443](http://arxiv.org/abs/2406.09443)|null|\n", "2406.07725": "|**2024-06-11**|**The Interspeech 2024 Challenge on Speech Processing Using Discrete Units**|Xuankai Chang et.al.|[2406.07725](http://arxiv.org/abs/2406.07725)|null|\n", "2406.07256": "|**2024-06-11**|**AS-70: A Mandarin stuttered speech dataset for automatic speech recognition and stuttering event detection**|Rong Gong et.al.|[2406.07256](http://arxiv.org/abs/2406.07256)|null|\n", "2406.07589": "|**2024-06-11**|**Tag and correct: high precision post-editing approach to correction of speech recognition errors**|Tomasz Zi\u0119tkiewicz et.al.|[2406.07589](http://arxiv.org/abs/2406.07589)|null|\n", "2406.07096": "|**2024-06-11**|**Fast Context-Biasing for CTC and Transducer ASR models with CTC-based Word Spotter**|Andrei Andrusenko et.al.|[2406.07096](http://arxiv.org/abs/2406.07096)|null|\n", "2406.07090": "|**2024-07-29**|**Spoken Language Corpora Augmentation with Domain-Specific Voice-Cloned Speech**|Mateusz Czy\u017cnikiewicz et.al.|[2406.07090](http://arxiv.org/abs/2406.07090)|null|\n", "2406.07060": "|**2024-06-11**|**Reading Miscue Detection in Primary School through Automatic Speech Recognition**|Lingyun Gao et.al.|[2406.07060](http://arxiv.org/abs/2406.07060)|null|\n", "2406.06729": "|**2024-06-10**|**Synthetic Query Generation using Large Language Models for Virtual Assistants**|Sonal Sannigrahi et.al.|[2406.06729](http://arxiv.org/abs/2406.06729)|null|\n", "2406.06664": "|**2024-06-13**|**ASTRA: Aligning Speech and Text Representations for Asr without Sampling**|Neeraj Gaur et.al.|[2406.06664](http://arxiv.org/abs/2406.06664)|null|\n", "2406.06329": "|**2024-06-10**|**A Parameter-efficient Language Extension Framework for 
Multilingual ASR**|Wei Liu et.al.|[2406.06329](http://arxiv.org/abs/2406.06329)|null|\n", "2406.05968": "|**2024-06-10**|**Prompting Large Language Models with Audio for General-Purpose Speech Summarization**|Wonjune Kang et.al.|[2406.05968](http://arxiv.org/abs/2406.05968)|**[link](https://github.com/wonjune-kang/llm-speech-summarization)**|\n", "2406.05806": "|**2024-07-18**|**Do Prompts Really Prompt? Exploring the Prompt Understanding Capability of Whisper**|Chih-Kai Yang et.al.|[2406.05806](http://arxiv.org/abs/2406.05806)|null|\n", "2406.05784": "|**2024-07-20**|**Optimizing Multi-Stuttered Speech Classification: Leveraging Whisper's Encoder for Efficient Parameter Reduction in Automated Assessment**|Huma Ameer et.al.|[2406.05784](http://arxiv.org/abs/2406.05784)|null|\n", "2406.05661": "|**2024-06-09**|**MS-HuBERT: Mitigating Pre-training and Inference Mismatch in Masked Language Modelling methods for learning Speech Representations**|Hemant Yadav et.al.|[2406.05661](http://arxiv.org/abs/2406.05661)|null|\n", "2406.04927": "|**2024-06-07**|**LLM-based speaker diarization correction: A generalizable approach**|Georgios Efstathiadis et.al.|[2406.04927](http://arxiv.org/abs/2406.04927)|**[link](https://github.com/GeorgeEfstathiadis/LLM-Diarize-ASR-Agnostic)**|\n", "2406.04791": "|**2024-07-02**|**Speaker-Smoothed kNN Speaker Adaptation for End-to-End ASR**|Shaojun Li et.al.|[2406.04791](http://arxiv.org/abs/2406.04791)|null|\n", "2406.06619": "|**2024-06-07**|**LoRA-Whisper: Parameter-Efficient and Extensible Multilingual ASR**|Zheshu Song et.al.|[2406.06619](http://arxiv.org/abs/2406.06619)|null|\n", "2406.04595": "|**2024-06-07**|**Pitch-Aware RNN-T for Mandarin Chinese Mispronunciation Detection and Diagnosis**|Xintong Wang et.al.|[2406.04595](http://arxiv.org/abs/2406.04595)|null|\n", "2406.04552": "|**2024-06-06**|**Flexible Multichannel Speech Enhancement for Noise-Robust Frontend**|Ante Juki\u0107 
et.al.|[2406.04552](http://arxiv.org/abs/2406.04552)|null|\n", "2406.04541": "|**2024-06-06**|**Label-Synchronous Neural Transducer for E2E Simultaneous Speech Translation**|Keqi Deng et.al.|[2406.04541](http://arxiv.org/abs/2406.04541)|**[link](https://github.com/D-Keqi/LS-Transducer-SST)**|\n", "2406.04512": "|**2024-06-06**|**To Distill or Not to Distill? On the Robustness of Robust Knowledge Distillation**|Abdul Waheed et.al.|[2406.04512](http://arxiv.org/abs/2406.04512)|null|\n", "2406.04432": "|**2024-06-06**|**LipGER: Visually-Conditioned Generative Error Correction for Robust Automatic Speech Recognition**|Sreyan Ghosh et.al.|[2406.04432](http://arxiv.org/abs/2406.04432)|**[link](https://github.com/sreyan88/lipger)**|\n", "2406.04269": "|**2024-06-06**|**Beyond Performance Plateaus: A Comprehensive Study on Scalability in Speech Enhancement**|Wangyou Zhang et.al.|[2406.04269](http://arxiv.org/abs/2406.04269)|**[link](https://github.com/emrys365/se-scaling)**|\n", "2406.04240": "|**2024-07-02**|**Hypernetworks for Personalizing ASR to Atypical Speech**|Max M\u00fcller-Eberstein et.al.|[2406.04240](http://arxiv.org/abs/2406.04240)|null|\n", "2406.04123": "|**2024-06-06**|**Helsinki Speech Challenge 2024**|Martin Ludvigsen et.al.|[2406.04123](http://arxiv.org/abs/2406.04123)|null|\n", "2406.03872": "|**2024-06-06**|**BLSP-Emo: Towards Empathetic Large Speech-Language Models**|Chen Wang et.al.|[2406.03872](http://arxiv.org/abs/2406.03872)|**[link](https://github.com/cwang621/blsp-emo)**|\n", "2406.03814": "|**2024-06-14**|**Improving Zero-Shot Chinese-English Code-Switching ASR with kNN-CTC and Gated Monolingual Datastores**|Jiaming Zhou et.al.|[2406.03814](http://arxiv.org/abs/2406.03814)|null|\n", "2406.03791": "|**2024-06-06**|**Speed of Light Exact Greedy Decoding for RNN-T Speech Recognition Models on GPU**|Daniel Galvez et.al.|[2406.03791](http://arxiv.org/abs/2406.03791)|null|\n", "2406.03274": "|**2024-06-11**|**Enhancing CTC-based speech recognition 
with diverse modeling units**|Shiyi Han et.al.|[2406.03274](http://arxiv.org/abs/2406.03274)|null|\n", "2406.03235": "|**2024-06-05**|**Error-preserving Automatic Speech Recognition of Young English Learners' Language**|Janick Michot et.al.|[2406.03235](http://arxiv.org/abs/2406.03235)|**[link](https://github.com/mict-zhaw/chall_e2e_stt)**|\n", "2406.03049": "|**2024-06-05**|**StreamSpeech: Simultaneous Speech-to-Speech Translation with Multi-task Learning**|Shaolei Zhang et.al.|[2406.03049](http://arxiv.org/abs/2406.03049)|**[link](https://github.com/ictnlp/streamspeech)**|\n", "2406.02950": "|**2024-06-05**|**4D ASR: Joint Beam Search Integrating CTC, Attention, Transducer, and Mask Predict Decoders**|Yui Sudo et.al.|[2406.02950](http://arxiv.org/abs/2406.02950)|null|\n", "2406.02925": "|**2024-06-15**|**Task Arithmetic can Mitigate Synthetic-to-Real Gap in Automatic Speech Recognition**|Hsuan Su et.al.|[2406.02925](http://arxiv.org/abs/2406.02925)|null|\n", "2406.02921": "|**2024-06-11**|**Text Injection for Neural Contextual Biasing**|Zhong Meng et.al.|[2406.02921](http://arxiv.org/abs/2406.02921)|null|\n", "2406.06582": "|**2024-06-25**|**Discrete Multimodal Transformers with a Pretrained Large Language Model for Mixed-Supervision Speech Processing**|Viet Anh Trinh et.al.|[2406.06582](http://arxiv.org/abs/2406.06582)|null|\n", "2406.02649": "|**2024-06-04**|**Keyword-Guided Adaptation of Automatic Speech Recognition**|Aviv Shamsian et.al.|[2406.02649](http://arxiv.org/abs/2406.02649)|null|\n", "2406.02166": "|**2024-06-04**|**Whistle: Data-Efficient Multilingual and Crosslingual Speech Recognition via Weakly Phonetic Supervision**|Saierdaer Yusuyin et.al.|[2406.02166](http://arxiv.org/abs/2406.02166)|**[link](https://github.com/thu-spmi/cat)**|\n", "2406.02004": "|**2024-06-05**|**Efficiently Train ASR Models that Memorize Less and Perform Better with Per-core Clipping**|Lun Wang et.al.|[2406.02004](http://arxiv.org/abs/2406.02004)|null|\n", "2406.01446": 
"|**2024-06-03**|**Enabling ASR for Low-Resource Languages: A Comprehensive Dataset Creation Approach**|Ara Yeroyan et.al.|[2406.01446](http://arxiv.org/abs/2406.01446)|null|\n", "2406.01314": "|**2024-06-03**|**Compute-Efficient Medical Image Classification with Softmax-Free Transformers and Sequence Normalization**|Firas Khader et.al.|[2406.01314](http://arxiv.org/abs/2406.01314)|null|\n", "2406.00899": "|**2024-06-02**|**YODAS: Youtube-Oriented Dataset for Audio and Speech**|Xinjian Li et.al.|[2406.00899](http://arxiv.org/abs/2406.00899)|null|\n", "2406.00522": "|**2024-06-01**|**Wav2Prompt: End-to-End Speech Prompt Generation and Tuning For LLM in Zero and Few-shot Learning**|Keqi Deng et.al.|[2406.00522](http://arxiv.org/abs/2406.00522)|null|\n", "2407.11982": "|**2024-05-31**|**Open the Data! Chuvash Datasets**|Nikolay Plotnikov et.al.|[2407.11982](http://arxiv.org/abs/2407.11982)|null|\n", "2405.18669": "|**2024-05-31**|**Zipper: A Multi-Tower Decoder Architecture for Fusing Modalities**|Vicky Zayats et.al.|[2405.18669](http://arxiv.org/abs/2405.18669)|null|\n", "2405.18537": "|**2024-05-28**|**Augmented Conversation with Embedded Speech-Driven On-the-Fly Referencing in AR**|Shivesh Jadon et.al.|[2405.18537](http://arxiv.org/abs/2405.18537)|null|\n", "2405.18346": "|**2024-05-28**|**Intelligent Clinical Documentation: Harnessing Generative AI for Patient-Centric Clinical Note Generation**|Anjanava Biswas et.al.|[2405.18346](http://arxiv.org/abs/2405.18346)|null|\n", "2405.17874": "|**2024-05-28**|**NUTS, NARS, and Speech**|D. 
van der Sluis et.al.|[2405.17874](http://arxiv.org/abs/2405.17874)|null|\n", "2405.17809": "|**2024-05-28**|**TransVIP: Speech to Speech Translation System with Voice and Isochrony Preservation**|Chenyang Le et.al.|[2405.17809](http://arxiv.org/abs/2405.17809)|null|\n", "2405.17376": "|**2024-05-27**|**Federating Dynamic Models using Early-Exit Architectures for Automatic Speech Recognition on Heterogeneous Clients**|Mohamed Nabih Ali et.al.|[2405.17376](http://arxiv.org/abs/2405.17376)|null|\n", "2405.17250": "|**2024-05-27**|**\"Pass the butter\": A study on desktop-classic multitasking robotic arm based on advanced YOLOv7 and BERT**|Haohua Que et.al.|[2405.17250](http://arxiv.org/abs/2405.17250)|null|\n", "2406.00038": "|**2024-05-27**|**ViSpeR: Multilingual Audio-Visual Speech Recognition**|Sanath Narayan et.al.|[2406.00038](http://arxiv.org/abs/2406.00038)|null|\n", "2405.16952": "|**2024-05-27**|**A Variance-Preserving Interpolation Approach for Diffusion Models with Applications to Single Channel Speech Enhancement and Recognition**|Zilu Guo et.al.|[2405.16952](http://arxiv.org/abs/2405.16952)|**[link](https://github.com/zelokuo/VPIDM)**|\n", "2405.15216": "|**2024-05-24**|**Denoising LM: Pushing the Limits of Error Correction Models for Speech Recognition**|Zijin Gu et.al.|[2405.15216](http://arxiv.org/abs/2405.15216)|null|\n", "2405.15097": "|**2024-05-23**|**Contrastive and Consistency Learning for Neural Noisy-Channel Model in Spoken Language Understanding**|Suyoung Kim et.al.|[2405.15097](http://arxiv.org/abs/2405.15097)|**[link](https://github.com/syoung7388/ccl)**|\n", "2405.14259": "|**2024-06-02**|**Let's Fuse Step by Step: A Generative Fusion Decoding Algorithm with LLMs for Multi-modal Text Recognition**|Chan-Jan Hsu et.al.|[2405.14259](http://arxiv.org/abs/2405.14259)|**[link](https://github.com/mtkresearch/generative-fusion-decoding)**|\n", "2405.14161": "|**2024-05-23**|**Self-Taught Recognizer: Toward Unsupervised Adaptation for Speech 
Foundation Models**|Yuchen Hu et.al.|[2405.14161](http://arxiv.org/abs/2405.14161)|**[link](https://github.com/yuchen005/star-adapt)**|\n", "2405.14093": "|**2024-05-23**|**A Survey on Vision-Language-Action Models for Embodied AI**|Yueen Ma et.al.|[2405.14093](http://arxiv.org/abs/2405.14093)|null|\n", "2405.13903": "|**2024-05-22**|**ST-Gait++: Leveraging spatio-temporal convolutions for gait-based emotion recognition on videos**|Maria Lu\u00edsa Lima et.al.|[2405.13903](http://arxiv.org/abs/2405.13903)|null|\n", "2405.13514": "|**2024-05-22**|**Joint Optimization of Streaming and Non-Streaming Automatic Speech Recognition with Multi-Decoder and Knowledge Distillation**|Muhammad Shakeel et.al.|[2405.13514](http://arxiv.org/abs/2405.13514)|null|\n", "2405.13477": "|**2024-05-22**|**A Near-Real-Time Processing Ego Speech Filtering Pipeline Designed for Speech Interruption During Human-Robot Interaction**|Yue Li et.al.|[2405.13477](http://arxiv.org/abs/2405.13477)|null|\n", "2405.13379": "|**2024-05-22**|**You don't understand me!: Comparing ASR results for L1 and L2 speakers of Swedish**|Ronald Cumbal et.al.|[2405.13379](http://arxiv.org/abs/2405.13379)|null|\n", "2405.13344": "|**2024-05-22**|**Contextualized Automatic Speech Recognition with Dynamic Vocabulary**|Yui Sudo et.al.|[2405.13344](http://arxiv.org/abs/2405.13344)|null|\n", "2405.13166": "|**2024-05-28**|**FairLENS: Assessing Fairness in Law Enforcement Speech Recognition**|Yicheng Wang et.al.|[2405.13166](http://arxiv.org/abs/2405.13166)|null|\n", "2405.13162": "|**2024-05-21**|**Non-autoregressive real-time Accent Conversion model with voice cloning**|Vladimir Nechaev et.al.|[2405.13162](http://arxiv.org/abs/2405.13162)|null|\n", "2405.12815": "|**2024-05-21**|**Could a Computer Architect Understand our Brain?**|Valentin Puente-Varona et.al.|[2405.12815](http://arxiv.org/abs/2405.12815)|null|\n", "2405.12609": "|**2024-07-01**|**Mamba in Speech: Towards an Alternative to Self-Attention**|Xiangyu Zhang 
et.al.|[2405.12609](http://arxiv.org/abs/2405.12609)|null|\n", "2405.12018": "|**2024-05-20**|**Continuous Sign Language Recognition with Adapted Conformer via Unsupervised Pretraining**|Neena Aloysius et.al.|[2405.12018](http://arxiv.org/abs/2405.12018)|null|\n", "2405.11078": "|**2024-05-17**|**Acoustic modeling for Overlapping Speech Recognition: JHU Chime-5 Challenge System**|Vimal Manohar et.al.|[2405.11078](http://arxiv.org/abs/2405.11078)|**[link](https://github.com/fgnt/nara_wpe)**|\n", "2405.10025": "|**2024-05-16**|**Listen Again and Choose the Right Answer: A New Paradigm for Automatic Speech Recognition with Large Language Models**|Yuchen Hu et.al.|[2405.10025](http://arxiv.org/abs/2405.10025)|null|\n", "2405.09708": "|**2024-05-15**|**No More Mumbles: Enhancing Robot Intelligibility through Speech Adaptation**|Qiaoqiao Ren et.al.|[2405.09708](http://arxiv.org/abs/2405.09708)|**[link](https://github.com/qiaoqiao2323/robot-speech-intelligibility)**|\n", "2405.09470": "|**2024-05-15**|**Towards Evaluating the Robustness of Automatic Speech Recognition Systems via Audio Style Transfer**|Weifei Jin et.al.|[2405.09470](http://arxiv.org/abs/2405.09470)|null|\n", "2405.13018": "|**2024-05-15**|**Continued Pretraining for Domain Adaptation of Wav2vec2.0 in Automatic Speech Recognition for Elementary Math Classroom Settings**|Ahmed Adel Attia et.al.|[2405.13018](http://arxiv.org/abs/2405.13018)|null|\n", "2405.19342": "|**2024-05-14**|**Sonos Voice Control Bias Assessment Dataset: A Methodology for Demographic Bias Assessment in Voice Assistants**|Chlo\u00e9 Sekkat et.al.|[2405.19342](http://arxiv.org/abs/2405.19342)|null|\n", "2405.08402": "|**2024-05-14**|**Investigating the 'Autoencoder Behavior' in Speech Self-Supervised Models: a focus on HuBERT's Pretraining**|Valentin Vielzeuf et.al.|[2405.08402](http://arxiv.org/abs/2405.08402)|null|\n", "2405.08295": "|**2024-05-31**|**SpeechVerse: A Large-scale Generalizable Audio Language Model**|Nilaksh Das 
et.al.|[2405.08295](http://arxiv.org/abs/2405.08295)|null|\n", "2405.07442": "|**2024-06-07**|**Rene: A Pre-trained Multi-modal Architecture for Auscultation of Respiratory Diseases**|Pengfei Zhang et.al.|[2405.07442](http://arxiv.org/abs/2405.07442)|**[link](https://github.com/zpforlove/rene)**|\n", "2405.07354": "|**2024-05-12**|**SoccerNet-Echoes: A Soccer Game Audio Commentary Dataset**|Sushant Gautam et.al.|[2405.07354](http://arxiv.org/abs/2405.07354)|**[link](https://github.com/SoccerNet/sn-echoes)**|\n", "2405.13001": "|**2024-05-12**|**Large Language Models for Education: A Survey**|Hanyi Xu et.al.|[2405.13001](http://arxiv.org/abs/2405.13001)|null|\n", "2405.06368": "|**2024-07-22**|**DP-DyLoRA: Fine-Tuning Transformer-Based Models On-Device under Differentially Private Federated Learning using Dynamic Low-Rank Adaptation**|Jie Xu et.al.|[2405.06368](http://arxiv.org/abs/2405.06368)|null|\n", "2405.06150": "|**2024-05-10**|**Lost in Transcription: Identifying and Quantifying the Accuracy Biases of Automatic Speech Recognition Systems Against Disfluent Speech**|Dena Mujtaba et.al.|[2405.06150](http://arxiv.org/abs/2405.06150)|null|\n", "2405.06134": "|**2024-07-17**|**Muting Whisper: A Universal Acoustic Adversarial Attack on Speech Foundation Models**|Vyas Raina et.al.|[2405.06134](http://arxiv.org/abs/2405.06134)|**[link](https://github.com/rainavyas/prepend_acoustic_attack)**|\n", "2405.05498": "|**2024-05-09**|**The RoyalFlush Automatic Speech Diarization and Recognition System for In-Car Multi-Channel Automatic Speech Recognition Challenge**|Jingguang Tian et.al.|[2405.05498](http://arxiv.org/abs/2405.05498)|null|\n", "2405.04296": "|**2024-05-07**|**Open Implementation and Study of BEST-RQ for Speech Processing**|Ryan Whetten et.al.|[2405.04296](http://arxiv.org/abs/2405.04296)|**[link](https://github.com/speechbrain/speechbrain)**|\n", "2405.03484": "|**2024-05-06**|**Whispy: Adapting STT Whisper Models to Real-Time Environments**|Antonio Bevilacqua 
et.al.|[2405.03484](http://arxiv.org/abs/2405.03484)|null|\n", "2405.03152": "|**2024-05-06**|**MMGER: Multi-modal and Multi-granularity Generative Error Correction with LLM for Joint Accent and Speech Recognition**|Bingshen Mu et.al.|[2405.03152](http://arxiv.org/abs/2405.03152)|null|\n", "2405.02995": "|**2024-05-11**|**Analysis about Theoretical Foundations for Method to Enhancing ASR Performance using OCR Word Frequency Differences**|Kyudan Jung et.al.|[2405.02995](http://arxiv.org/abs/2405.02995)|null|\n", "2405.02578": "|**2024-05-04**|**Mixat: A Data Set of Bilingual Emirati-English Speech**|Maryam Al Ali et.al.|[2405.02578](http://arxiv.org/abs/2405.02578)|**[link](https://github.com/mbzuai-nlp/mixat)**|\n", "2406.02566": "|**2024-05-03**|**Combining X-Vectors and Bayesian Batch Active Learning: Two-Stage Active Learning Pipeline for Speech Recognition**|Ognjen Kundacina et.al.|[2406.02566](http://arxiv.org/abs/2406.02566)|null|\n", "2405.02132": "|**2024-05-06**|**Unveiling the Potential of LLM-Based ASR on Chinese Open-Source Datasets**|Xuelong Geng et.al.|[2405.02132](http://arxiv.org/abs/2405.02132)|null|\n", "2406.02565": "|**2024-05-02**|**Sequence-to-sequence models in peer-to-peer learning: A practical application**|Robert \u0160ajina et.al.|[2406.02565](http://arxiv.org/abs/2406.02565)|null|\n", "2405.01293": "|**2024-05-02**|**Low-resource speech recognition and dialect identification of Irish in a multi-task framework**|Liam Lonergan et.al.|[2405.01293](http://arxiv.org/abs/2405.01293)|null|\n", "2405.01207": "|**2024-05-02**|**Improving Membership Inference in ASR Model Auditing with Perturbed Loss Features**|Francisco Teixeira et.al.|[2405.01207](http://arxiv.org/abs/2405.01207)|null|\n", "2405.01004": "|**2024-05-02**|**Deep Learning Models in Speech Recognition: Measuring GPU Energy Consumption, Impact of Noise and Model Quantization for Edge Deployment**|Aditya Chakravarty 
et.al.|[2405.01004](http://arxiv.org/abs/2405.01004)|**[link](https://github.com/zzadiues3338/asr-energy-jetson)**|\n", "2405.00966": "|**2024-05-02**|**Efficient Compression of Multitask Multilingual Speech Models**|Thomas Palmeira Ferraz et.al.|[2405.00966](http://arxiv.org/abs/2405.00966)|null|\n", "2405.01601": "|**2024-05-01**|**Efficient Sample-Specific Encoder Perturbations**|Yassir Fathullah et.al.|[2405.01601](http://arxiv.org/abs/2405.01601)|null|\n", "2405.00307": "|**2024-05-01**|**Active Learning with Task Adaptation Pre-training for Speech Emotion Recognition**|Dongyuan Li et.al.|[2405.00307](http://arxiv.org/abs/2405.00307)|null|\n", "2405.00223": "|**2024-07-24**|**Confides: A Visual Analytics Solution for Automated Speech Recognition Analysis and Exploration**|Sunwoo Ha et.al.|[2405.00223](http://arxiv.org/abs/2405.00223)|null|\n", "2404.19310": "|**2024-05-09**|**Does Whisper understand Swiss German? An automatic, qualitative, and human evaluation**|Eyal Liron Dolev et.al.|[2404.19310](http://arxiv.org/abs/2404.19310)|null|\n", "2404.19214": "|**2024-04-30**|**EfficientASR: Speech Recognition Network Compression via Attention Redundancy and Chunk-Level FFN Optimization**|Jianzong Wang et.al.|[2404.19214](http://arxiv.org/abs/2404.19214)|null|\n", "2404.18739": "|**2024-04-29**|**Towards Dog Bark Decoding: Leveraging Human Speech Processing for Automated Bark Classification**|Artem Abzaliev et.al.|[2404.18739](http://arxiv.org/abs/2404.18739)|null|\n", "2406.02563": "|**2024-04-29**|**A cost minimization approach to fix the vocabulary size in a tokenizer for an End-to-End ASR system**|Sunil Kumar Kopparapu et.al.|[2406.02563](http://arxiv.org/abs/2406.02563)|null|\n", "2404.17394": "|**2024-04-26**|**Child Speech Recognition in Human-Robot Interaction: Problem Solved?**|Ruben Janssens et.al.|[2404.17394](http://arxiv.org/abs/2404.17394)|null|\n", "2404.16743": "|**2024-04-26**|**Automatic Speech Recognition System-Independent Word Error Rate 
Estimation**|Chanho Park et.al.|[2404.16743](http://arxiv.org/abs/2404.16743)|null|\n", "2404.16547": "|**2024-04-25**|**Developing Acoustic Models for Automatic Speech Recognition in Swedish**|Giampiero Salvi et.al.|[2404.16547](http://arxiv.org/abs/2404.16547)|null|\n", "2404.16407": "|**2024-04-25**|**U2++ MoE: Scaling 4.7x parameters with minimal impact on RTF**|Xingchen Song et.al.|[2404.16407](http://arxiv.org/abs/2404.16407)|null|\n", "2404.16112": "|**2024-04-24**|**Mamba-360: Survey of State Space Models as Transformer Alternative for Long Sequence Modelling: Methods, Applications, and Challenges**|Badri Narayana Patro et.al.|[2404.16112](http://arxiv.org/abs/2404.16112)|**[link](https://github.com/badripatro/mamba360)**|\n", "2406.02562": "|**2024-04-24**|**Gated Low-rank Adaptation for personalized Code-Switching Automatic Speech Recognition on the low-spec devices**|Gwantae Kim et.al.|[2406.02562](http://arxiv.org/abs/2406.02562)|null|\n", "2404.15501": "|**2024-04-23**|**Killkan: The Automatic Speech Recognition Dataset for Kichwa with Morphosyntactic Information**|Chihiro Taguchi et.al.|[2404.15501](http://arxiv.org/abs/2404.15501)|**[link](https://github.com/ctaguchi/killkan)**|\n", "2406.02561": "|**2024-04-23**|**Breaking Walls: Pioneering Automatic Speech Recognition for Central Kurdish: End-to-End Transformer Paradigm**|Abdulhady Abas Abdullah et.al.|[2406.02561](http://arxiv.org/abs/2406.02561)|null|\n", "2404.14860": "|**2024-04-23**|**Rethinking Processing Distortions: Disentangling the Impact of Speech Enhancement Errors on Speech Recognition Performance**|Tsubasa Ochiai et.al.|[2404.14860](http://arxiv.org/abs/2404.14860)|null|\n", "2404.14605": "|**2024-04-22**|**Assessment of Sign Language-Based versus Touch-Based Input for Deaf Users Interacting with Intelligent Personal Assistants**|Nina Tran et.al.|[2404.14605](http://arxiv.org/abs/2404.14605)|null|\n", "2406.02560": "|**2024-07-18**|**Less Peaky and More Accurate CTC Forced Alignment 
by Label Priors**|Ruizhe Huang et.al.|[2406.02560](http://arxiv.org/abs/2406.02560)|**[link](https://github.com/huangruizhe/audio)**|\n", "2404.14024": "|**2024-04-22**|**Exploring neural oscillations during speech perception via surrogate gradient spiking neural networks**|Alexandre Bittar et.al.|[2404.14024](http://arxiv.org/abs/2404.14024)|null|\n", "2404.13362": "|**2024-04-20**|**Semantically Corrected Amharic Automatic Speech Recognition**|Samuael Adnew et.al.|[2404.13362](http://arxiv.org/abs/2404.13362)|**[link](https://github.com/samuael/postprocessed_geez_asr)**|\n", "2404.12888": "|**2024-04-19**|**Learn2Talk: 3D Talking Face Learns from 2D Talking Face**|Yixiang Zhuang et.al.|[2404.12888](http://arxiv.org/abs/2404.12888)|null|\n", "2404.12628": "|**2024-04-19**|**Efficient infusion of self-supervised representations in Automatic Speech Recognition**|Darshan Prabhu et.al.|[2404.12628](http://arxiv.org/abs/2404.12628)|null|\n", "2404.15168": "|**2024-04-18**|**Artificial Neural Networks to Recognize Speakers Division from Continuous Bengali Speech**|Hasmot Ali et.al.|[2404.15168](http://arxiv.org/abs/2404.15168)|null|\n", "2404.10922": "|**2024-04-16**|**Teaching a Multilingual Large Language Model to Understand Multilingual Speech via Multi-Instructional Training**|Pavel Denisov et.al.|[2404.10922](http://arxiv.org/abs/2404.10922)|**[link](https://github.com/akreal/bloomzmms)**|\n", "2404.09841": "|**2024-04-16**|**Anatomy of Industrial Scale Multilingual ASR**|Francis McCann Ramirez et.al.|[2404.09841](http://arxiv.org/abs/2404.09841)|null|\n", "2404.09754": "|**2024-04-15**|**Resilience of Large Language Models for Noisy Instructions**|Bin Wang et.al.|[2404.09754](http://arxiv.org/abs/2404.09754)|null|\n", "2406.09425": "|**2024-04-13**|**SGPRS: Seamless GPU Partitioning Real-Time Scheduler for Periodic Deep Learning Workloads**|Amir Fakhim Babaei et.al.|[2406.09425](http://arxiv.org/abs/2406.09425)|null|\n", "2404.08424": "|**2024-04-12**|**Comparing 
Apples to Oranges: LLM-powered Multimodal Intention Prediction in an Object Categorization Task**|Hassan Ali et.al.|[2404.08424](http://arxiv.org/abs/2404.08424)|null|\n", "2404.08368": "|**2024-07-26**|**Automatic Speech Recognition Advancements for Indigenous Languages of the Americas**|Monica Romero et.al.|[2404.08368](http://arxiv.org/abs/2404.08368)|null|\n", "2404.07575": "|**2024-04-12**|**An Effective Automated Speaking Assessment Approach to Mitigating Data Scarcity and Imbalanced Distribution**|Tien-Hong Lo et.al.|[2404.07575](http://arxiv.org/abs/2404.07575)|null|\n", "2404.07341": "|**2024-04-12**|**Conformer-1: Robust ASR via Large-Scale Semisupervised Bootstrapping**|Kevin Zhang et.al.|[2404.07341](http://arxiv.org/abs/2404.07341)|null|\n", "2404.08011": "|**2024-04-10**|**An inclusive review on deep learning techniques and their scope in handwriting recognition**|Sukhdeep Singh et.al.|[2404.08011](http://arxiv.org/abs/2404.08011)|null|\n", "2404.06079": "|**2024-04-10**|**The X-LANCE Technical Report for Interspeech 2024 Speech Processing Using Discrete Speech Unit Challenge**|Yiwei Guo et.al.|[2404.06079](http://arxiv.org/abs/2404.06079)|null|\n", "2404.05659": "|**2024-05-28**|**VietMed: A Dataset and Benchmark for Automatic Speech Recognition of Vietnamese in the Medical Domain**|Khai Le-Duc et.al.|[2404.05659](http://arxiv.org/abs/2404.05659)|**[link](https://github.com/leduckhai/multimed)**|\n", "2404.04769": "|**2024-04-07**|**Safeguarding Voice Privacy: Harnessing Near-Ultrasonic Interference To Protect Against Unauthorized Audio Recording**|Forrest McKee et.al.|[2404.04769](http://arxiv.org/abs/2404.04769)|null|\n", "2404.04295": "|**2024-04-04**|**Transducers with Pronunciation-aware Embeddings for Automatic Speech Recognition**|Hainan Xu et.al.|[2404.04295](http://arxiv.org/abs/2404.04295)|null|\n", "2404.03073": "|**2024-04-03**|**Mai Ho'om\u0101una i ka 'Ai: Language Models Improve Automatic Speech Recognition in Hawaiian**|Kaavya 
Chaparala et.al.|[2404.03073](http://arxiv.org/abs/2404.03073)|null|\n", "2404.02408": "|**2024-04-03**|**CMULAB: An Open-Source Framework for Training and Deployment of Natural Language Processing Models**|Zaid Sheikh et.al.|[2404.02408](http://arxiv.org/abs/2404.02408)|**[link](https://github.com/neulab/cmulab)**|\n", "2404.02098": "|**2024-04-02**|**BRAVEn: Improving Self-Supervised Pre-training for Visual and Auditory Speech Recognition**|Alexandros Haliassos et.al.|[2404.02098](http://arxiv.org/abs/2404.02098)|**[link](https://github.com/ahaliassos/raven)**|\n", "2404.02052": "|**2024-04-02**|**Noise Masking Attacks and Defenses for Pretrained Speech Models**|Matthew Jagielski et.al.|[2404.02052](http://arxiv.org/abs/2404.02052)|null|\n", "2404.01991": "|**2024-04-02**|**Kallaama: A Transcribed Speech Dataset about Agriculture in the Three Most Widely Spoken Languages in Senegal**|Elodie Gauthier et.al.|[2404.01991](http://arxiv.org/abs/2404.01991)|**[link](https://github.com/gauthelo/kallaama-speech-dataset)**|\n", "2404.01737": "|**2024-04-02**|**Transfer Learning from Whisper for Microscopic Intelligibility Prediction**|Paul Best et.al.|[2404.01737](http://arxiv.org/abs/2404.01737)|null|\n", "2404.07226": "|**2024-03-31**|**Houston we have a Divergence: A Subgroup Performance Analysis of ASR Models**|Alkis Koudounas et.al.|[2404.07226](http://arxiv.org/abs/2404.07226)|null|\n", "2403.20262": "|**2024-07-22**|**ELITR-Bench: A Meeting Assistant Benchmark for Long-Context Language Models**|Thibaut Thonet et.al.|[2403.20262](http://arxiv.org/abs/2403.20262)|**[link](https://github.com/utter-project/elitr-bench)**|\n", "2403.19822": "|**2024-03-28**|**Multi-Stage Multi-Modal Pre-Training for Automatic Speech Recognition**|Yash Jain et.al.|[2403.19822](http://arxiv.org/abs/2403.19822)|null|\n", "2403.19224": "|**2024-03-28**|**Emotion Neural Transducer for Fine-Grained Speech Emotion Recognition**|Siyuan Shen 
et.al.|[2403.19224](http://arxiv.org/abs/2403.19224)|**[link](https://github.com/ecnu-cross-innovation-lab/ent)**|\n", "2403.19207": "|**2024-03-28**|**LV-CTC: Non-autoregressive ASR with CTC and latent variable models**|Yuya Fujita et.al.|[2403.19207](http://arxiv.org/abs/2403.19207)|null|\n", "2403.18721": "|**2024-06-04**|**PhysicsAssistant: An LLM-Powered Interactive Learning Robot for Physics Lab Investigations**|Ehsan Latif et.al.|[2403.18721](http://arxiv.org/abs/2403.18721)|null|\n", "2406.02555": "|**2024-03-27**|**PhoWhisper: Automatic Speech Recognition for Vietnamese**|Thanh-Thien Le et.al.|[2406.02555](http://arxiv.org/abs/2406.02555)|**[link](https://github.com/vinairesearch/phowhisper)**|\n", "2403.18182": "|**2024-03-27**|**ZAEBUC-Spoken: A Multilingual Multidialectal Arabic-English Speech Corpus**|Injy Hamed et.al.|[2403.18182](http://arxiv.org/abs/2403.18182)|null|\n", "2403.17645": "|**2024-04-11**|**DANCER: Entity Description Augmented Named Entity Corrector for Automatic Speech Recognition**|Yi-Cheng Wang et.al.|[2403.17645](http://arxiv.org/abs/2403.17645)|null|\n", "2403.17363": "|**2024-03-26**|**Extracting Biomedical Entities from Noisy Audio Transcripts**|Nima Ebadi et.al.|[2403.17363](http://arxiv.org/abs/2403.17363)|null|\n", "2403.19709": "|**2024-03-25**|**Hierarchical Recurrent Adapters for Efficient Multi-Task Adaptation of Large Speech Models**|Tsendsuren Munkhdalai et.al.|[2403.19709](http://arxiv.org/abs/2403.19709)|null|\n", "2403.16655": "|**2024-03-25**|**Grammatical vs Spelling Error Correction: An Investigation into the Responsiveness of Transformer-based Language Models using BART and MarianMT**|Rohit Raju et.al.|[2403.16655](http://arxiv.org/abs/2403.16655)|null|\n", "2403.15510": "|**2024-03-22**|**Privacy-Preserving End-to-End Spoken Language Understanding**|Yinggui Wang et.al.|[2403.15510](http://arxiv.org/abs/2403.15510)|null|\n", "2403.14438": "|**2024-03-26**|**A Multimodal Approach to Device-Directed Speech Detection 
with Large Language Models**|Dominik Wagner et.al.|[2403.14438](http://arxiv.org/abs/2403.14438)|null|\n", "2403.14402": "|**2024-03-21**|**XLAVS-R: Cross-Lingual Audio-Visual Speech Representation Learning for Noise-Robust Speech Perception**|HyoJung Han et.al.|[2403.14402](http://arxiv.org/abs/2403.14402)|null|\n", "2403.14168": "|**2024-06-04**|**M$^3$AV: A Multimodal, Multigenre, and Multipurpose Audio-Visual Academic Lecture Dataset**|Zhe Chen et.al.|[2403.14168](http://arxiv.org/abs/2403.14168)|null|\n", "2403.13960": "|**2024-03-20**|**Open Access NAO (OAN): a ROS2-based software framework for HRI applications with the NAO robot**|Antonio Bono et.al.|[2403.13960](http://arxiv.org/abs/2403.13960)|null|\n", "2403.13465": "|**2024-03-20**|**BanglaNum -- A Public Dataset for Bengali Digit Recognition from Speech**|Mir Sayeed Mohammad et.al.|[2403.13465](http://arxiv.org/abs/2403.13465)|null|\n", "2403.13423": "|**2024-03-20**|**Advanced Long-Content Speech Recognition With Factorized Neural Transducer**|Xun Gong et.al.|[2403.13423](http://arxiv.org/abs/2403.13423)|null|\n", "2403.15469": "|**2024-03-20**|**Isometric Neural Machine Translation using Phoneme Count Ratio Reward-based Reinforcement Learning**|Shivam Ratnakant Mhaskar et.al.|[2403.15469](http://arxiv.org/abs/2403.15469)|null|\n", "2403.12821": "|**2024-03-21**|**FlowerFormer: Empowering Neural Architecture Encoding using a Flow-aware Graph Transformer**|Dongyeong Hwang et.al.|[2403.12821](http://arxiv.org/abs/2403.12821)|**[link](https://github.com/y0ngjaenius/cvpr2024_flowerformer)**|\n", "2403.12477": "|**2024-03-19**|**Real-time Speech Extraction Using Spatially Regularized Independent Low-rank Matrix Analysis and Rank-constrained Spatial Covariance Matrix Estimation**|Yuto Ishikawa et.al.|[2403.12477](http://arxiv.org/abs/2403.12477)|null|\n", "2403.12273": "|**2024-03-18**|**Multimodal Human-Autonomous Agents Interaction Using Pre-Trained Language and Visual Foundation Models**|Linus Nwankwo 
et.al.|[2403.12273](http://arxiv.org/abs/2403.12273)|null|\n", "2403.11578": "|**2024-03-18**|**AdaMER-CTC: Connectionist Temporal Classification with Adaptive Maximum Entropy Regularization for Automatic Speech Recognition**|SooHwan Eom et.al.|[2403.11578](http://arxiv.org/abs/2403.11578)|null|\n", "2403.15442": "|**2024-07-21**|**Artificial Intelligence for Cochlear Implants: Review of Strategies, Challenges, and Perspectives**|Billel Essaid et.al.|[2403.15442](http://arxiv.org/abs/2403.15442)|null|\n", "2403.10961": "|**2024-03-16**|**Energy-Based Models with Applications to Speech and Language Processing**|Zhijian Ou et.al.|[2403.10961](http://arxiv.org/abs/2403.10961)|null|\n", "2403.10937": "|**2024-03-16**|**Initial Decoding with Minimally Augmented Language Model for Improved Lattice Rescoring in Low Resource ASR**|Savitha Murthy et.al.|[2403.10937](http://arxiv.org/abs/2403.10937)|null|\n", "2403.10420": "|**2024-03-15**|**Neural Networks Hear You Loud And Clear: Hearing Loss Compensation Using Deep Neural Networks**|Peter Leer et.al.|[2403.10420](http://arxiv.org/abs/2403.10420)|null|\n", "2403.09753": "|**2024-03-14**|**SpokeN-100: A Cross-Lingual Benchmarking Dataset for The Classification of Spoken Numbers in Different Languages**|Ren\u00e9 Groh et.al.|[2403.09753](http://arxiv.org/abs/2403.09753)|**[link](https://github.com/ankilab/spoken-100)**|\n", "2403.09298": "|**2024-03-14**|**More than words: Advancements and challenges in speech recognition for singing**|Anna Kruspe et.al.|[2403.09298](http://arxiv.org/abs/2403.09298)|null|\n", "2405.12983": "|**2024-03-14**|**Multilingual Audio-Visual Speech Recognition with Hybrid CTC/RNN-T Fast Conformer**|Maxime Burchi et.al.|[2405.12983](http://arxiv.org/abs/2405.12983)|null|\n", "2403.08258": "|**2024-05-21**|**Skipformer: A Skip-and-Recover Strategy for Efficient Speech Recognition**|Wenjing Zhu et.al.|[2403.08258](http://arxiv.org/abs/2403.08258)|null|\n", "2403.08196": "|**2024-03-13**|**SpeechColab 
Leaderboard: An Open-Source Platform for Automatic Speech Recognition Evaluation**|Jiayu Du et.al.|[2403.08196](http://arxiv.org/abs/2403.08196)|**[link](https://github.com/speechcolab/leaderboard)**|\n", "2403.08187": "|**2024-03-13**|**Automatic Speech Recognition (ASR) for the Diagnosis of pronunciation of Speech Sound Disorders in Korean children**|Taekyung Ahn et.al.|[2403.08187](http://arxiv.org/abs/2403.08187)|null|\n", "2403.08011": "|**2024-03-12**|**Gujarati-English Code-Switching Speech Recognition using ensemble prediction of spoken language**|Yash Sharma et.al.|[2403.08011](http://arxiv.org/abs/2403.08011)|null|\n", "2403.07767": "|**2024-03-12**|**Beyond the Labels: Unveiling Text-Dependency in Paralinguistic Speech Recognition Datasets**|Jan Pe\u0161\u00e1n et.al.|[2403.07767](http://arxiv.org/abs/2403.07767)|null|\n", "2403.07947": "|**2024-03-11**|**The evaluation of a code-switched Sepedi-English automatic speech recognition system**|Amanda Phaladi et.al.|[2403.07947](http://arxiv.org/abs/2403.07947)|null|\n", "2403.06734": "|**2024-03-11**|**Real-Time Multimodal Cognitive Assistant for Emergency Medical Services**|Keshara Weerasinghe et.al.|[2403.06734](http://arxiv.org/abs/2403.06734)|**[link](https://github.com/uva-dsa/ems-pipeline)**|\n", "2403.06387": "|**2024-03-11**|**Towards Decoupling Frontend Enhancement and Backend Recognition in Monaural Robust ASR**|Yufeng Yang et.al.|[2403.06387](http://arxiv.org/abs/2403.06387)|null|\n", "2403.06260": "|**2024-03-10**|**SCORE: Self-supervised Correspondence Fine-tuning for Improved Content Representations**|Amit Meghanani et.al.|[2403.06260](http://arxiv.org/abs/2403.06260)|**[link](https://github.com/trikaldarshi/score_finetuning)**|\n", "2403.05887": "|**2024-03-09**|**Aligning Speech to Languages to Enhance Code-switching Speech Recognition**|Hexin Liu et.al.|[2403.05887](http://arxiv.org/abs/2403.05887)|null|\n", "2403.07937": "|**2024-03-08**|**Speech Robust Bench: A Robustness Benchmark For 
Speech Recognition**|Muhammad A. Shah et.al.|[2403.07937](http://arxiv.org/abs/2403.07937)|null|\n", "2403.04445": "|**2024-03-07**|**Classist Tools: Social Class Correlates with Performance in NLP**|Amanda Cercas Curry et.al.|[2403.04445](http://arxiv.org/abs/2403.04445)|null|\n", "2403.04280": "|**2024-05-30**|**A New Benchmark for Evaluating Automatic Speech Recognition in the Arabic Call Domain**|Qusai Abo Obaidah et.al.|[2403.04280](http://arxiv.org/abs/2403.04280)|null|\n", "2403.04245": "|**2024-03-07**|**A Study of Dropout-Induced Modality Bias on Robustness to Missing Video Frames for Audio-Visual Speech Recognition**|Yusheng Dai et.al.|[2403.04245](http://arxiv.org/abs/2403.04245)|**[link](https://github.com/dalision/modalbiasavsr)**|\n", "2403.03538": "|**2024-03-06**|**RADIA -- Radio Advertisement Detection with Intelligent Analytics**|Jorge \u00c1lvarez et.al.|[2403.03538](http://arxiv.org/abs/2403.03538)|null|\n", "2403.03522": "|**2024-03-13**|**Non-verbal information in spontaneous speech -- towards a new framework of analysis**|Tirza Biron et.al.|[2403.03522](http://arxiv.org/abs/2403.03522)|null|\n", "2403.02938": "|**2024-03-05**|**AIx Speed: Playback Speed Optimization Using Listening Comprehension of Speech Recognition Models**|Kazuki Kawamura et.al.|[2403.02938](http://arxiv.org/abs/2403.02938)|null|\n", "2403.02288": "|**2024-03-04**|**PixIT: Joint Training of Speaker Diarization and Speech Separation from Real-world Multi-speaker Recordings**|Joonas Kalda et.al.|[2403.02288](http://arxiv.org/abs/2403.02288)|**[link](https://github.com/joonaskalda/pixit)**|\n", "2403.02173": "|**2024-03-04**|**What has LeBenchmark Learnt about French Syntax?**|Zdravko Dugonji\u0107 et.al.|[2403.02173](http://arxiv.org/abs/2403.02173)|null|\n", "2403.02010": "|**2024-03-04**|**SA-SOT: Speaker-Aware Serialized Output Training for Multi-Talker ASR**|Zhiyun Fan et.al.|[2403.02010](http://arxiv.org/abs/2403.02010)|null|\n", "2403.01983": 
"|**2024-03-04**|**Language and Speech Technology for Central Kurdish Varieties**|Sina Ahmadi et.al.|[2403.01983](http://arxiv.org/abs/2403.01983)|**[link](https://github.com/sinaahmadi/cordi)**|\n", "2403.18843": "|**2024-03-04**|**JEP-KD: Joint-Embedding Predictive Architecture Based Knowledge Distillation for Visual Speech Recognition**|Chang Sun et.al.|[2403.18843](http://arxiv.org/abs/2403.18843)|null|\n", "2403.01369": "|**2024-03-03**|**A Closer Look at Wav2Vec2 Embeddings for On-Device Single-Channel Speech Enhancement**|Ravi Shankar et.al.|[2403.01369](http://arxiv.org/abs/2403.01369)|null|\n", "2403.05583": "|**2024-03-02**|**A Cross-Modal Approach to Silent Speech with LLM-Enhanced Recognition**|Tyler Benster et.al.|[2403.05583](http://arxiv.org/abs/2403.05583)|**[link](https://github.com/tbenst/silent_speech)**|\n", "2403.01255": "|**2024-04-18**|**Automatic Speech Recognition using Advanced Deep Learning Approaches: A survey**|Hamza Kheddar et.al.|[2403.01255](http://arxiv.org/abs/2403.01255)|null|\n", "2403.00370": "|**2024-03-01**|**Post-decoder Biasing for End-to-End Speech Recognition of Multi-turn Medical Interview**|Heyang Liu et.al.|[2403.00370](http://arxiv.org/abs/2403.00370)|null|\n", "2402.19443": "|**2024-02-29**|**Probing the Information Encoded in Neural-based Acoustic Models of Automatic Speech Recognition Systems**|Quentin Raymondaud et.al.|[2402.19443](http://arxiv.org/abs/2402.19443)|null|\n", "2402.18923": "|**2024-02-29**|**Inappropriate Pause Detection In Dysarthric Speech Using Large-Scale Speech Recognition**|Jeehyun Lee et.al.|[2402.18923](http://arxiv.org/abs/2402.18923)|null|\n", "2402.18275": "|**2024-06-04**|**Exploration of Adapter for Noise Robust Automatic Speech Recognition**|Hao Shi et.al.|[2402.18275](http://arxiv.org/abs/2402.18275)|null|\n", "2402.17954": "|**2024-06-19**|**Twists, Humps, and Pebbles: Multilingual Speech Recognition Models Exhibit Gender Performance Gaps**|Giuseppe Attanasio 
et.al.|[2402.17954](http://arxiv.org/abs/2402.17954)|**[link](https://github.com/g8a9/multilingual-asr-gender-gap)**|\n", "2402.17189": "|**2024-02-27**|**An Effective Mixture-Of-Experts Approach For Code-Switching Speech Recognition Leveraging Encoder Disentanglement**|Tzu-Ting Yang et.al.|[2402.17189](http://arxiv.org/abs/2402.17189)|null|\n", "2402.17184": "|**2024-02-27**|**Extreme Encoder Output Frame Rate Reduction: Improving Computational Latencies of Large End-to-End Models**|Rohit Prabhavalkar et.al.|[2402.17184](http://arxiv.org/abs/2402.17184)|null|\n", "2402.15733": "|**2024-04-01**|**ArEEG_Chars: Dataset for Envisioned Speech Recognition using EEG for Arabic Characters**|Hazem Darwish et.al.|[2402.15733](http://arxiv.org/abs/2402.15733)|null|\n", "2402.15151": "|**2024-05-14**|**Where Visual Speech Meets Language: VSP-LLM Framework for Efficient and Context-Aware Visual Speech Processing**|Jeong Hun Yeo et.al.|[2402.15151](http://arxiv.org/abs/2402.15151)|**[link](https://github.com/sally-sh/vsp-llm)**|\n", "2402.14563": "|**2024-02-22**|**Wizard of Oz Experimentation for Language Technology Applications: Challenges and Tools**|Stephan Schl\u00f6gl et.al.|[2402.14563](http://arxiv.org/abs/2402.14563)|null|\n", "2402.14888": "|**2024-02-22**|**Efficient data selection employing Semantic Similarity-based Graph Structures for model training**|Roxana Petcu et.al.|[2402.14888](http://arxiv.org/abs/2402.14888)|null|\n", "2402.14185": "|**2024-02-22**|**HINT: High-quality INPainting Transformer with Mask-Aware Encoding and Enhanced Attention**|Shuang Chen et.al.|[2402.14185](http://arxiv.org/abs/2402.14185)|**[link](https://github.com/chrischen1023/hint)**|\n", "2402.13687": "|**2024-02-21**|**An Augmented Lagrangian Method for Training Recurrent Neural Networks**|Yue Wang et.al.|[2402.13687](http://arxiv.org/abs/2402.13687)|null|\n", "2402.13511": "|**2024-02-22**|**Mel-FullSubNet: Mel-Spectrogram Enhancement for Improving Both Speech Quality and ASR**|Rui 
Zhou et.al.|[2402.13511](http://arxiv.org/abs/2402.13511)|null|\n", "2402.13208": "|**2024-02-20**|**How do Hyenas deal with Human Speech? Speech Recognition and Translation with ConfHyena**|Marco Gaido et.al.|[2402.13208](http://arxiv.org/abs/2402.13208)|**[link](https://github.com/hlt-mt/fbk-fairseq)**|\n", "2402.13076": "|**2024-02-20**|**Not All Weights Are Created Equal: Enhancing Energy Efficiency in On-Device Streaming Speech Recognition**|Yang Li et.al.|[2402.13076](http://arxiv.org/abs/2402.13076)|null|\n", "2402.13004": "|**2024-02-20**|**Comparison of Conventional Hybrid and CTC/Attention Decoders for Continuous Visual Speech Recognition**|David Gimeno-G\u00f3mez et.al.|[2402.13004](http://arxiv.org/abs/2402.13004)|null|\n", "2402.12654": "|**2024-06-16**|**OWSM-CTC: An Open Encoder-Only Speech Foundation Model for Speech Recognition, Translation, and Language Identification**|Yifan Peng et.al.|[2402.12654](http://arxiv.org/abs/2402.12654)|null|\n", "2402.11954": "|**2024-02-19**|**Multimodal Emotion Recognition from Raw Audio with Sinc-convolution**|Xiaohui Zhang et.al.|[2402.11954](http://arxiv.org/abs/2402.11954)|null|\n", "2402.11571": "|**2024-02-18**|**Ain't Misbehavin' -- Using LLMs to Generate Expressive Robot Behavior in Conversations with the Tabletop Robot Haru**|Zining Wang et.al.|[2402.11571](http://arxiv.org/abs/2402.11571)|null|\n", "2402.11520": "|**2024-02-18**|**Cross-Attention Fusion of Visual and Geometric Features for Large Vocabulary Arabic Lipreading**|Samar Daou et.al.|[2402.11520](http://arxiv.org/abs/2402.11520)|null|\n", "2402.09797": "|**2024-02-15**|**A cross-talk robust multichannel VAD model for multiparty agent interactions trained using synthetic re-recordings**|Hyewon Han et.al.|[2402.09797](http://arxiv.org/abs/2402.09797)|null|\n", "2402.08932": "|**2024-02-14**|**Listening to Multi-talker Conversations: Modular and End-to-end Perspectives**|Desh Raj et.al.|[2402.08932](http://arxiv.org/abs/2402.08932)|null|\n", 
"2402.08898": "|**2024-02-14**|**UniEnc-CASSNAT: An Encoder-only Non-autoregressive ASR for Speech SSL Models**|Ruchao Fan et.al.|[2402.08898](http://arxiv.org/abs/2402.08898)|null|\n", "2402.08846": "|**2024-02-13**|**An Embarrassingly Simple Approach for LLM with Strong ASR Capacity**|Ziyang Ma et.al.|[2402.08846](http://arxiv.org/abs/2402.08846)|**[link](https://github.com/X-LANCE/SLAM-LLM)**|\n", "2402.08788": "|**2024-02-13**|**Syllable based DNN-HMM Cantonese Speech to Text System**|Timothy Wong et.al.|[2402.08788](http://arxiv.org/abs/2402.08788)|null|\n", "2402.08021": "|**2024-05-03**|**Careless Whisper: Speech-to-Text Hallucination Harms**|Allison Koenecke et.al.|[2402.08021](http://arxiv.org/abs/2402.08021)|**[link](https://github.com/koenecke/hallucination_harms)**|\n", "2402.07729": "|**2024-07-26**|**AIR-Bench: Benchmarking Large Audio-Language Models via Generative Comprehension**|Qian Yang et.al.|[2402.07729](http://arxiv.org/abs/2402.07729)|**[link](https://github.com/ofa-sys/air-bench)**|\n", "2402.07658": "|**2024-02-12**|**The Sound of Healthcare: Improving Medical Transcription ASR Accuracy with Large Language Models**|Ayo Adedeji et.al.|[2402.07658](http://arxiv.org/abs/2402.07658)|null|\n", "2402.07513": "|**2024-02-12**|**The Balancing Act: Unmasking and Alleviating ASR Biases in Portuguese**|Ajinkya Kulkarni et.al.|[2402.07513](http://arxiv.org/abs/2402.07513)|null|\n", "2402.07431": "|**2024-02-13**|**SALAD: Smart AI Language Assistant Daily**|Ragib Amin Nihal et.al.|[2402.07431](http://arxiv.org/abs/2402.07431)|null|\n", "2402.07095": "|**2024-02-11**|**Does ChatGPT and Whisper Make Humanoid Robots More Relatable?**|Xiaohui Chen et.al.|[2402.07095](http://arxiv.org/abs/2402.07095)|null|\n", "2402.06966": "|**2024-02-10**|**DeepCover: Advancing RNN Test Coverage and Online Error Prediction using State Machine Extraction**|Pouria Golshanrad 
et.al.|[2402.06966](http://arxiv.org/abs/2402.06966)|**[link](https://github.com/pouriagr/deep-cover)**|\n", "2402.06923": "|**2024-02-10**|**CochCeps-Augment: A Novel Self-Supervised Contrastive Learning Using Cochlear Cepstrum-based Masking for Speech Emotion Recognition**|Ioannis Ziogas et.al.|[2402.06923](http://arxiv.org/abs/2402.06923)|null|\n", "2402.06592": "|**2024-02-09**|**Self-consistent context aware conformer transducer for speech recognition**|Konstantin Kolokolov et.al.|[2402.06592](http://arxiv.org/abs/2402.06592)|null|\n", "2402.05706": "|**2024-02-08**|**Unified Speech-Text Pretraining for Spoken Dialog Modeling**|Heeseung Kim et.al.|[2402.05706](http://arxiv.org/abs/2402.05706)|null|\n", "2402.05457": "|**2024-02-08**|**It's Never Too Late: Fusing Acoustic Information into Large Language Models for Automatic Speech Recognition**|Chen Chen et.al.|[2402.05457](http://arxiv.org/abs/2402.05457)|null|\n", "2402.04805": "|**2024-02-07**|**Progressive unsupervised domain adaptation for ASR using ensemble models and multi-stage training**|Rehan Ahmad et.al.|[2402.04805](http://arxiv.org/abs/2402.04805)|null|\n", "2402.03988": "|**2024-05-28**|**REBORN: Reinforcement-Learned Boundary Segmentation with Iterative Training for Unsupervised ASR**|Liang-Hsuan Tseng et.al.|[2402.03988](http://arxiv.org/abs/2402.03988)|**[link](https://github.com/andybi7676/reborn-uasr)**|\n", "2402.03519": "|**2024-02-05**|**Resolving Transcription Ambiguity in Spanish: A Hybrid Acoustic-Lexical System for Punctuation Restoration**|Xiliang Zhu et.al.|[2402.03519](http://arxiv.org/abs/2402.03519)|null|\n", "2402.03050": "|**2024-02-05**|**A Comprehensive Study of the Current State-of-the-Art in Nepali Automatic Speech Recognition Systems**|Rupak Raj Ghimire et.al.|[2402.03050](http://arxiv.org/abs/2402.03050)|null|\n", "2402.02302": "|**2024-02-03**|**Predicting positive transfer for improved low-resource speech recognition using acoustic pseudo-tokens**|Nay San 
et.al.|[2402.02302](http://arxiv.org/abs/2402.02302)|null|\n", "2402.01931": "|**2024-02-02**|**Digits micro-model for accurate and secure transactions**|Chirag Chhablani et.al.|[2402.01931](http://arxiv.org/abs/2402.01931)|null|\n", "2402.01917": "|**2024-02-02**|**Whispering in Norwegian: Navigating Orthographic and Dialectic Challenges**|Per E Kummervold et.al.|[2402.01917](http://arxiv.org/abs/2402.01917)|null|\n", "2402.01172": "|**2024-02-02**|**Streaming Sequence Transduction through Dynamic Compression**|Weiting Tan et.al.|[2402.01172](http://arxiv.org/abs/2402.01172)|**[link](https://github.com/steventan0110/star)**|\n", "2402.01152": "|**2024-02-05**|**AccentFold: A Journey through African Accents for Zero-Shot ASR Adaptation to Target Accents**|Abraham Toluwase Owodunni et.al.|[2402.01152](http://arxiv.org/abs/2402.01152)|null|\n", "2402.01778": "|**2024-02-01**|**Introduction to speech recognition**|Gabriel Dauphin et.al.|[2402.01778](http://arxiv.org/abs/2402.01778)|null|\n", "2402.00632": "|**2024-02-01**|**Prosody in Cascade and Direct Speech-to-Text Translation: a case study on Korean Wh-Phrases**|Giulio Zhou et.al.|[2402.00632](http://arxiv.org/abs/2402.00632)|null|\n", "2402.00235": "|**2024-01-31**|**Exploring the limits of decoder-only models trained on public speech recognition corpora**|Ankit Gupta et.al.|[2402.00235](http://arxiv.org/abs/2402.00235)|null|\n", "2401.18045": "|**2024-01-31**|**SpeechComposer: Unifying Multiple Speech Tasks with Prompt Composition**|Yihan Wu et.al.|[2401.18045](http://arxiv.org/abs/2401.18045)|null|\n", "2401.17604": "|**2024-02-08**|**Computation and Parameter Efficient Multi-Modal Fusion Transformer for Cued Speech Recognition**|Lei Liu et.al.|[2401.17604](http://arxiv.org/abs/2401.17604)|null|\n", "2401.16658": "|**2024-06-16**|**OWSM v3.1: Better and Faster Open Whisper-Style Speech Models based on E-Branchformer**|Yifan Peng et.al.|[2401.16658](http://arxiv.org/abs/2401.16658)|null|\n", "2401.15704": 
"|**2024-01-28**|**Phoneme-Based Proactive Anti-Eavesdropping with Controlled Recording Privilege**|Peng Huang et.al.|[2401.15704](http://arxiv.org/abs/2401.15704)|null|\n", "2401.15676": "|**2024-01-28**|**On Speaker Attribution with SURT**|Desh Raj et.al.|[2401.15676](http://arxiv.org/abs/2401.15676)|**[link](https://github.com/k2-fsa/icefall)**|\n", "2401.15532": "|**2024-01-28**|**Byte Pair Encoding Is All You Need For Automatic Bengali Speech Recognition**|Ahnaf Mozib Samin et.al.|[2401.15532](http://arxiv.org/abs/2401.15532)|null|\n", "2401.15385": "|**2024-01-27**|**Towards Event Extraction from Speech with Contextual Clues**|Jingqi Kang et.al.|[2401.15385](http://arxiv.org/abs/2401.15385)|**[link](https://github.com/jodie-kang/speechee)**|\n", "2401.14890": "|**2024-01-26**|**Comparison of parameters of vowel sounds of russian and english languages**|V. I. Fedoseev et.al.|[2401.14890](http://arxiv.org/abs/2401.14890)|null|\n", "2401.14625": "|**2024-01-26**|**Toward Practical Automatic Speech Recognition and Post-Processing: a Call for Explainable Error Benchmark Guideline**|Seonmin Koo et.al.|[2401.14625](http://arxiv.org/abs/2401.14625)|null|\n", "2401.14185": "|**2024-01-25**|**TDFNet: An Efficient Audio-Visual Speech Separation Model with Top-down Fusion**|Samuel Pegg et.al.|[2401.14185](http://arxiv.org/abs/2401.14185)|**[link](https://github.com/spkgyk/TDFNet)**|\n", "2401.13575": "|**2024-01-24**|**CNN architecture extraction on edge GPU**|Peter Horvath et.al.|[2401.13575](http://arxiv.org/abs/2401.13575)|null|\n", "2401.13463": "|**2024-03-18**|**SpeechDPR: End-to-End Spoken Passage Retrieval for Open-Domain Spoken Question Answering**|Chyi-Jiunn Lin et.al.|[2401.13463](http://arxiv.org/abs/2401.13463)|null|\n", "2401.13260": "|**2024-05-28**|**MF-AED-AEC: Speech Emotion Recognition by Leveraging Multimodal Fusion, Asr Error Detection, and Asr Error Correction**|Jiajun He et.al.|[2401.13260](http://arxiv.org/abs/2401.13260)|null|\n", "2401.13146": 
"|**2024-01-23**|**Locality enhanced dynamic biasing and sampling strategies for contextual ASR**|Md Asif Jalal et.al.|[2401.13146](http://arxiv.org/abs/2401.13146)|null|\n", "2401.12789": "|**2024-01-23**|**Multilingual and Fully Non-Autoregressive ASR with Large Language Model Fusion: A Comprehensive Study**|W. Ronny Huang et.al.|[2401.12789](http://arxiv.org/abs/2401.12789)|null|\n", "2401.12085": "|**2024-01-22**|**Consistency Based Unsupervised Self-training For ASR Personalisation**|Jisi Zhang et.al.|[2401.12085](http://arxiv.org/abs/2401.12085)|null|\n", "2401.11983": "|**2024-01-22**|**Lightweight Protection for Privacy in Offloaded Speech Understanding**|Dongqi Cai et.al.|[2401.11983](http://arxiv.org/abs/2401.11983)|null|\n", "2401.11700": "|**2024-01-22**|**Keep Decoding Parallel with Effective Knowledge Distillation from Language Models to End-to-end Speech Recognisers**|Michael Hentschel et.al.|[2401.11700](http://arxiv.org/abs/2401.11700)|null|\n", "2401.11382": "|**2024-06-06**|**Using Large Language Model for End-to-End Chinese ASR and NER**|Yuang Li et.al.|[2401.11382](http://arxiv.org/abs/2401.11382)|null|\n", "2401.11268": "|**2024-02-02**|**Word-Level ASR Quality Estimation for Efficient Corpus Sampling and Post-Editing through Analyzing Attentions of a Reference-Free Metric**|Golara Javadi et.al.|[2401.11268](http://arxiv.org/abs/2401.11268)|**[link](https://github.com/aixplain/NoRefER)**|\n", "2401.11132": "|**2024-01-20**|**ConceptThread: Visualizing Threaded Concepts in MOOC Videos**|Zhiguang Zhou et.al.|[2401.11132](http://arxiv.org/abs/2401.11132)|null|\n", "2401.10449": "|**2024-01-19**|**Contextualized Automatic Speech Recognition with Attention-Based Bias Phrase Boosted Beam Search**|Yui Sudo et.al.|[2401.10449](http://arxiv.org/abs/2401.10449)|null|\n", "2401.10447": "|**2024-01-19**|**Investigating Training Strategies and Model Robustness of Low-Rank Adaptation for Language Modeling in Speech Recognition**|Yu Yu 
et.al.|[2401.10447](http://arxiv.org/abs/2401.10447)|null|\n", "2401.10446": "|**2024-01-19**|**Large Language Models are Efficient Learners of Noise-Robust Speech Recognition**|Yuchen Hu et.al.|[2401.10446](http://arxiv.org/abs/2401.10446)|**[link](https://github.com/yuchen005/robustger)**|\n", "2401.10411": "|**2024-01-18**|**AGADIR: Towards Array-Geometry Agnostic Directional Speech Recognition**|Ju Lin et.al.|[2401.10411](http://arxiv.org/abs/2401.10411)|null|\n", "2401.10070": "|**2024-01-18**|**Communication-Efficient Personalized Federated Learning for Speech-to-Text Tasks**|Yichao Du et.al.|[2401.10070](http://arxiv.org/abs/2401.10070)|null|\n", "2401.09802": "|**2024-07-18**|**Efficient Training for Multilingual Visual Speech Recognition: Pre-training with Discretized Visual Speech Representation**|Minsu Kim et.al.|[2401.09802](http://arxiv.org/abs/2401.09802)|null|\n", "2401.09759": "|**2024-07-02**|**SlideAVSR: A Dataset of Paper Explanation Videos for Audio-Visual Speech Recognition**|Hao Wang et.al.|[2401.09759](http://arxiv.org/abs/2401.09759)|null|\n", "2401.09315": "|**2024-01-17**|**On Speech Pre-emphasis as a Simple and Inexpensive Method to Boost Speech Enhancement**|Iv\u00e1n L\u00f3pez-Espejo et.al.|[2401.09315](http://arxiv.org/abs/2401.09315)|null|\n", "2401.08916": "|**2024-01-17**|**Two-pass Endpoint Detection for Speech Recognition**|Anirudh Raju et.al.|[2401.08916](http://arxiv.org/abs/2401.08916)|null|\n", "2401.08887": "|**2024-01-16**|**NOTSOFAR-1 Challenge: New Datasets, Baseline, and Tasks for Distant Meeting Transcription**|Alon Vinnikov et.al.|[2401.08887](http://arxiv.org/abs/2401.08887)|null|\n", "2401.08835": "|**2024-01-16**|**Improving ASR Contextual Biasing with Guided Attention**|Jiyang Tang et.al.|[2401.08835](http://arxiv.org/abs/2401.08835)|null|\n", "2401.08833": "|**2024-01-16**|**Revisiting Self-supervised Learning of Speech Representation from a Mutual Information Perspective**|Alexander H. 
Liu et.al.|[2401.08833](http://arxiv.org/abs/2401.08833)|null|\n", "2401.08052": "|**2024-03-01**|**Multi-Input Multi-Output Target-Speaker Voice Activity Detection For Unified, Flexible, and Robust Audio-Visual Speaker Diarization**|Ming Cheng et.al.|[2401.08052](http://arxiv.org/abs/2401.08052)|null|\n", "2401.07957": "|**2024-01-15**|**Machine Perceptual Quality: Evaluating the Impact of Severe Lossy Compression on Audio and Image Models**|Dan Jacobellis et.al.|[2401.07957](http://arxiv.org/abs/2401.07957)|**[link](https://github.com/danjacobellis/mpq)**|\n", "2401.07575": "|**2024-07-24**|**Cascaded Cross-Modal Transformer for Audio-Textual Classification**|Nicolae-Catalin Ristea et.al.|[2401.07575](http://arxiv.org/abs/2401.07575)|**[link](https://github.com/ristea/ccmt)**|\n", "2401.07506": "|**2024-01-15**|**SeMaScore : a new evaluation metric for automatic speech recognition tasks**|Zitha Sasindran et.al.|[2401.07506](http://arxiv.org/abs/2401.07506)|null|\n", "2401.07360": "|**2024-01-14**|**Promptformer: Prompted Conformer Transducer for ASR**|Sergio Duarte-Torres et.al.|[2401.07360](http://arxiv.org/abs/2401.07360)|null|\n", "2401.06980": "|**2024-01-13**|**Joint Unsupervised and Supervised Training for Automatic Speech Recognition via Bilevel Optimization**|A F M Saif et.al.|[2401.06980](http://arxiv.org/abs/2401.06980)|**[link](https://github.com/afmsaif/joint-unsupervised-and-supervised-training-for-automatic-speech-recognition-via-bilevel-optimization)**|\n", "2401.09354": "|**2024-01-12**|**Transcending Controlled Environments Assessing the Transferability of ASRRobust NLU Models to Real-World Applications**|Hania Khan et.al.|[2401.09354](http://arxiv.org/abs/2401.09354)|null|\n", "2401.06588": "|**2024-01-12**|**Dynamic Behaviour of Connectionist Speech Recognition with Strong Latency Constraints**|Giampiero Salvi et.al.|[2401.06588](http://arxiv.org/abs/2401.06588)|null|\n", "2401.06832": "|**2024-01-12**|**XLS-R Deep Learning Model for 
Multilingual ASR on Low- Resource Languages: Indonesian, Javanese, and Sundanese**|Panji Arisaputra et.al.|[2401.06832](http://arxiv.org/abs/2401.06832)|null|\n", "2401.06390": "|**2024-01-12**|**LCB-net: Long-Context Biasing for Audio-Visual Speech Recognition**|Fan Yu et.al.|[2401.06390](http://arxiv.org/abs/2401.06390)|**[link](https://github.com/alibaba-damo-academy/FunASR)**|\n", "2401.05689": "|**2024-01-11**|**UCorrect: An Unsupervised Framework for Automatic Speech Recognition Error Correction**|Jiaxin Guo et.al.|[2401.05689](http://arxiv.org/abs/2401.05689)|null|\n", "2401.06183": "|**2024-01-11**|**End to end Hindi to English speech conversion using Bark, mBART and a finetuned XLSR Wav2Vec2**|Aniket Tathe et.al.|[2401.06183](http://arxiv.org/abs/2401.06183)|null|\n", "2401.05551": "|**2024-01-10**|**Useful Blunders: Can Automated Speech Recognition Errors Improve Downstream Dementia Classification?**|Changye Li et.al.|[2401.05551](http://arxiv.org/abs/2401.05551)|null|\n", "2401.05336": "|**2024-01-10**|**Towards Online Sign Language Recognition and Translation**|Ronglai Zuo et.al.|[2401.05336](http://arxiv.org/abs/2401.05336)|**[link](https://github.com/FangyunWei/SLRT)**|\n", "2401.04482": "|**2024-07-17**|**Continuously Learning New Words in Automatic Speech Recognition**|Christian Huber et.al.|[2401.04482](http://arxiv.org/abs/2401.04482)|null|\n", "2401.04235": "|**2024-01-08**|**High-precision Voice Search Query Correction via Retrievable Speech-text Embedings**|Christopher Li et.al.|[2401.04235](http://arxiv.org/abs/2401.04235)|null|\n", "2401.04152": "|**2024-07-22**|**Cross-Speaker Encoding Network for Multi-Talker Speech Recognition**|Jiawen Kang et.al.|[2401.04152](http://arxiv.org/abs/2401.04152)|**[link](https://github.com/kjw11/csenet-asr)**|\n", "2401.03936": "|**2024-01-08**|**Exploratory Evaluation of Speech Content Masking**|Jennifer Williams et.al.|[2401.03936](http://arxiv.org/abs/2401.03936)|null|\n", "2401.03697": 
"|**2024-03-07**|**An audio-quality-based multi-strategy approach for target speaker extraction in the MISP 2023 Challenge**|Runduo Han et.al.|[2401.03697](http://arxiv.org/abs/2401.03697)|null|\n", "2401.03689": "|**2024-06-10**|**LUPET: Incorporating Hierarchical Information Path into Multilingual ASR**|Wei Liu et.al.|[2401.03689](http://arxiv.org/abs/2401.03689)|null|\n", "2401.03687": "|**2024-01-08**|**BS-PLCNet: Band-split Packet Loss Concealment Network with Multi-task Learning Framework and Multi-discriminators**|Zihan Zhang et.al.|[2401.03687](http://arxiv.org/abs/2401.03687)|null|\n", "2401.03506": "|**2024-07-22**|**DiarizationLM: Speaker Diarization Post-Processing with Large Language Models**|Quan Wang et.al.|[2401.03506](http://arxiv.org/abs/2401.03506)|**[link](https://github.com/google/speaker-id)**|\n", "2401.06788": "|**2024-02-29**|**The NPU-ASLP-LiAuto System Description for Visual Speech Recognition in CNVSRC 2023**|He Wang et.al.|[2401.06788](http://arxiv.org/abs/2401.06788)|**[link](https://github.com/mkt-dataoceanai/cnvsrc2023baseline)**|\n", "2401.03473": "|**2024-02-21**|**ICMC-ASR: The ICASSP 2024 In-Car Multi-Channel Automatic Speech Recognition Challenge**|He Wang et.al.|[2401.03473](http://arxiv.org/abs/2401.03473)|null|\n", "2401.03468": "|**2024-01-07**|**Multichannel AV-wav2vec2: A Framework for Learning Multichannel Multi-Modal Speech Representation**|Qiushi Zhu et.al.|[2401.03468](http://arxiv.org/abs/2401.03468)|**[link](https://github.com/zqs01/multi-channel-wav2vec2)**|\n", "2401.03424": "|**2024-04-08**|**MLCA-AVSR: Multi-Layer Cross Attention Fusion based Audio-Visual Speech Recognition**|He Wang et.al.|[2401.03424](http://arxiv.org/abs/2401.03424)|null|\n", "2401.03251": "|**2024-01-06**|**TeLeS: Temporal Lexeme Similarity Score to Estimate Confidence in End-to-End ASR**|Nagarathna Ravi et.al.|[2401.03251](http://arxiv.org/abs/2401.03251)|**[link](https://github.com/madhavlab/2023_teles_wlc)**|\n", "2401.03175": 
"|**2024-01-06**|**Part-of-Speech Tagger for Bodo Language using Deep Learning approach**|Dhrubajyoti Pathak et.al.|[2401.03175](http://arxiv.org/abs/2401.03175)|null|\n", "2401.02921": "|**2024-01-05**|**Towards ASR Robust Spoken Language Understanding Through In-Context Learning With Word Confusion Networks**|Kevin Everson et.al.|[2401.02921](http://arxiv.org/abs/2401.02921)|null|\n", "2401.02890": "|**2024-01-05**|**Nonlinear functional regression by functional deep neural network with kernel embedding**|Zhongjie Shi et.al.|[2401.02890](http://arxiv.org/abs/2401.02890)|null|\n", "2401.02673": "|**2024-01-05**|**A unified multichannel far-field speech recognition system: combining neural beamforming with attention based end-to-end model**|Dongdi Zhao et.al.|[2401.02673](http://arxiv.org/abs/2401.02673)|null|\n", "2401.02417": "|**2024-01-04**|**Task Oriented Dialogue as a Catalyst for Self-Supervised Automatic Speech Recognition**|David M. Chan et.al.|[2401.02417](http://arxiv.org/abs/2401.02417)|**[link](https://github.com/amazon-science/amazon-od3)**|\n", "2402.10218": "|**2024-01-04**|**AntiDeepFake: AI for Deep Fake Speech Recognition**|Enkhtogtokh Togootogtokh et.al.|[2402.10218](http://arxiv.org/abs/2402.10218)|null|\n", "2401.02046": "|**2024-01-04**|**CTC Blank Triggered Dynamic Layer-Skipping for Efficient CTC-based Speech Recognition**|Junfeng Hou et.al.|[2401.02046](http://arxiv.org/abs/2401.02046)|null|\n", "2401.01572": "|**2024-01-03**|**Hallucinations in Neural Automatic Speech Recognition: Identifying Errors and Hallucinatory Models**|Rita Frieske et.al.|[2401.01572](http://arxiv.org/abs/2401.01572)|null|\n", "2401.01537": "|**2024-06-04**|**The Art of Deception: Robust Backdoor Attack using Dynamic Stacking of Triggers**|Orson Mengara et.al.|[2401.01537](http://arxiv.org/abs/2401.01537)|null|\n", "2401.00662": "|**2024-01-01**|**Enhancing Pre-trained ASR System Fine-tuning for Dysarthric Speech Recognition using Adversarial Data 
Augmentation**|Huimeng Wang et.al.|[2401.00662](http://arxiv.org/abs/2401.00662)|null|\n", "2312.17279": "|**2024-05-02**|**Stateful Conformer with Cache-based Inference for Streaming Automatic Speech Recognition**|Vahid Noroozi et.al.|[2312.17279](http://arxiv.org/abs/2312.17279)|null|\n", "2312.16002": "|**2023-12-26**|**The NUS-HLT System for ICASSP2024 ICMC-ASR Grand Challenge**|Meng Ge et.al.|[2312.16002](http://arxiv.org/abs/2312.16002)|null|\n", "2312.15922": "|**2023-12-26**|**Towards Probing Contact Center Large Language Models**|Varun Nathan et.al.|[2312.15922](http://arxiv.org/abs/2312.15922)|null|\n", "2312.15499": "|**2023-12-24**|**Exploring data augmentation in bias mitigation against non-native-accented speech**|Yuanyuan Zhang et.al.|[2312.15499](http://arxiv.org/abs/2312.15499)|null|\n", "2312.14609": "|**2023-12-22**|**BLSTM-Based Confidence Estimation for End-to-End Speech Recognition**|Atsunori Ogawa et.al.|[2312.14609](http://arxiv.org/abs/2312.14609)|null|\n", "2312.14378": "|**2024-02-09**|**Multimodal Attention Merging for Improved Speech Recognition and Audio Event Classification**|Anirudh S. 
Sundar et.al.|[2312.14378](http://arxiv.org/abs/2312.14378)|null|\n", "2312.14055": "|**2024-07-22**|**Multi-Sentence Grounding for Long-term Instructional Video**|Zeqian Li et.al.|[2312.14055](http://arxiv.org/abs/2312.14055)|null|\n", "2312.14020": "|**2023-12-21**|**BANSpEmo: A Bangla Emotional Speech Recognition Dataset**|Md Gulzar Hussain et.al.|[2312.14020](http://arxiv.org/abs/2312.14020)|null|\n", "2312.13873": "|**2023-12-21**|**Self-Supervised Adaptive AV Fusion Module for Pre-Trained ASR Models**|Christopher Simic et.al.|[2312.13873](http://arxiv.org/abs/2312.13873)|null|\n", "2312.13560": "|**2024-02-03**|**kNN-CTC: Enhancing ASR via Retrieval of CTC Pseudo Labels**|Jiaming Zhou et.al.|[2312.13560](http://arxiv.org/abs/2312.13560)|**[link](https://github.com/nku-hlt/knn-ctc)**|\n", "2408.02582": "|**2024-08-05**|**Clustering and Mining Accented Speech for Inclusive and Fair Speech Recognition**|Jaeyoung Kim et.al.|[2408.02582](http://arxiv.org/abs/2408.02582)|null|\n", "2408.02369": "|**2024-08-08**|**The NPU-ASLP System Description for Visual Speech Recognition in CNVSRC 2024**|He Wang et.al.|[2408.02369](http://arxiv.org/abs/2408.02369)|**[link](https://gitlab.com/csltstu/sunine)**|\n", "2408.02178": "|**2024-08-05**|**StreamVoice+: Evolving into End-to-end Streaming Zero-shot Voice Conversion**|Zhichao Wang et.al.|[2408.02178](http://arxiv.org/abs/2408.02178)|null|\n", "2408.01808": "|**2024-08-03**|**ALIF: Low-Cost Adversarial Audio Attacks on Black-Box Speech Platforms using Linguistic Features**|Peng Cheng et.al.|[2408.01808](http://arxiv.org/abs/2408.01808)|**[link](https://github.com/TASER2023/TASER)**|\n", "2408.02978": "|**2024-08-06**|**ASR-enhanced Multimodal Representation Learning for Cross-Domain Product Retrieval**|Ruixiang Zhao et.al.|[2408.02978](http://arxiv.org/abs/2408.02978)|null|\n", "2408.02945": "|**2024-08-06**|**Self-Supervised Learning for Multi-Channel Neural Transducer**|Atsushi Kojima 
et.al.|[2408.02945](http://arxiv.org/abs/2408.02945)|null|\n", "2408.04325": "|**2024-08-08**|**HydraFormer: One Encoder For All Subsampling Rates**|Yaoxun Xu et.al.|[2408.04325](http://arxiv.org/abs/2408.04325)|**[link](https://github.com/hydraformer/hydraformer)**|\n", "2408.04306": "|**2024-08-08**|**Preserving spoken content in voice anonymisation with character-level vocoder conditioning**|Michele Panariello et.al.|[2408.04306](http://arxiv.org/abs/2408.04306)|**[link](https://github.com/m-pana/spk_anon_nac_lm)**|\n", "2408.04174": "|**2024-08-08**|**wav2graph: A Framework for Supervised Learning Knowledge Graph from Speech**|Khai Le-Duc et.al.|[2408.04174](http://arxiv.org/abs/2408.04174)|**[link](https://github.com/leduckhai/wav2graph)**|\n", "2408.03979": "|**2024-08-07**|**Speaker Adaptation for Quantised End-to-End ASR Models**|Qiuming Zhao et.al.|[2408.03979](http://arxiv.org/abs/2408.03979)|null|\n", "2408.05101": "|**2024-08-09**|**MooER: LLM-based Speech Recognition and Translation Models from Moore Threads**|Junhao Xu et.al.|[2408.05101](http://arxiv.org/abs/2408.05101)|**[link](https://github.com/moorethreads/mooer)**|\n", "2408.06264": "|**2024-08-12**|**Audio Enhancement for Computer Audition -- An Iterative Training Paradigm Using Sample Importance**|Manuel Milling et.al.|[2408.06264](http://arxiv.org/abs/2408.06264)|null|\n", "2408.06043": "|**2024-08-12**|**Enhancing Dialogue Speech Recognition with Robust Contextual Awareness via Noise Representation Learning**|Wonjun Lee et.al.|[2408.06043](http://arxiv.org/abs/2408.06043)|null|\n", "2408.05769": "|**2024-08-11**|**LI-TTA: Language Informed Test-Time Adaptation for Automatic Speech Recognition**|Eunseop Yoon et.al.|[2408.05769](http://arxiv.org/abs/2408.05769)|null|\n", "2408.05758": "|**2024-08-11**|**VQ-CTAP: Cross-Modal Fine-Grained Sequence Representation Learning for Speech Processing**|Chunyu Qiang et.al.|[2408.05758](http://arxiv.org/abs/2408.05758)|null|\n", "2408.05554": 
"|**2024-08-10**|**Improving Whisper's Recognition Performance for Under-Represented Language Kazakh Leveraging Unpaired Speech and Text**|Jinpeng Li et.al.|[2408.05554](http://arxiv.org/abs/2408.05554)|null|\n", "2408.06484": "|**2024-08-12**|**Cross-Lingual Conversational Speech Summarization with Large Language Models**|Max Nelson et.al.|[2408.06484](http://arxiv.org/abs/2408.06484)|null|\n", "2408.07388": "|**2024-08-14**|**DPSNN: Spiking Neural Network for Low-Latency Streaming Speech Enhancement**|Tao Sun et.al.|[2408.07388](http://arxiv.org/abs/2408.07388)|null|\n", "2408.08027": "|**2024-08-15**|**Enhancing Large Language Model-based Speech Recognition by Contextualization for Rare and Ambiguous Words**|Kento Nozawa et.al.|[2408.08027](http://arxiv.org/abs/2408.08027)|null|\n", "2408.07851": "|**2024-08-14**|**SER Evals: In-domain and Out-of-domain Benchmarking for Speech Emotion Recognition**|Mohamed Osman et.al.|[2408.07851](http://arxiv.org/abs/2408.07851)|**[link](https://github.com/spaghettiSystems/serval)**|\n", "2408.07081": "|**2024-08-16**|**MathBridge: A Large Corpus Dataset for Translating Spoken Mathematical Expressions into $LaTeX$ Formulas for Improved Readability**|Kyudan Jung et.al.|[2408.07081](http://arxiv.org/abs/2408.07081)|null|\n", "2408.09688": "|**2024-08-19**|**Recording for Eyes, Not Echoing to Ears: Contextualized Spoken-to-Written Conversion of ASR Transcripts**|Jiaqing Liu et.al.|[2408.09688](http://arxiv.org/abs/2408.09688)|null|\n", "2408.09491": "|**2024-08-18**|**A Transcription Prompt-based Efficient Audio Large Language Model for Robust Speech Recognition**|Yangze Li et.al.|[2408.09491](http://arxiv.org/abs/2408.09491)|null|\n", "2408.09215": "|**2024-08-17**|**Generating Data with Text-to-Speech and Large-Language Models for Conversational Speech Recognition**|Samuele Cornell et.al.|[2408.09215](http://arxiv.org/abs/2408.09215)|**[link](https://github.com/popcornell/ASRLightningFT)**|\n", "2408.10524": 
"|**2024-08-20**|**XCB: an effective contextual biasing approach to bias cross-lingual phrases in speech recognition**|Xucheng Wan et.al.|[2408.10524](http://arxiv.org/abs/2408.10524)|null|\n", "2408.11804": "|**2024-08-21**|**Approaching Deep Learning through the Spectral Dynamics of Weights**|David Yunis et.al.|[2408.11804](http://arxiv.org/abs/2408.11804)|**[link](https://github.com/dyunis/spectral_dynamics)**|\n", "2408.11258": "|**2024-08-21**|**Improving Speech Recognition Error Prediction for Modern and Off-the-shelf Speech Recognizers**|Prashant Serai et.al.|[2408.11258](http://arxiv.org/abs/2408.11258)|null|\n", "2408.12500": "|**2024-08-22**|**WhisperMask: A Noise Suppressive Mask-Type Microphone for Whisper Speech**|Hirotaka Hiraki et.al.|[2408.12500](http://arxiv.org/abs/2408.12500)|null|\n", "2408.12430": "|**2024-08-22**|**Positional Description for Numerical Normalization**|Deepanshu Gupta et.al.|[2408.12430](http://arxiv.org/abs/2408.12430)|null|\n", "2408.12279": "|**2024-08-22**|**Developing vocal system impaired patient-aimed voice quality assessment approach using ASR representation-included multiple features**|Shaoxiang Dang et.al.|[2408.12279](http://arxiv.org/abs/2408.12279)|null|\n", "2408.11940": "|**2024-08-21**|**The State of Commercial Automatic French Legal Speech Recognition Systems and their Impact on Court Reporters et al**|Nicolad Garneau et.al.|[2408.11940](http://arxiv.org/abs/2408.11940)|null|\n", "2408.11873": "|**2024-08-19**|**Parameter-Efficient Transfer Learning under Federated Learning for Automatic Speech Recognition**|Xuan Kan et.al.|[2408.11873](http://arxiv.org/abs/2408.11873)|null|\n", "2408.11849": "|**2024-08-13**|**Style-Talker: Finetuning Audio Language Model and Style-Based Text-to-Speech Model for Fast Spoken Dialogue Generation**|Yinghao Aaron Li et.al.|[2408.11849](http://arxiv.org/abs/2408.11849)|null|\n", "2408.13106": "|**2024-08-28**|**NEST: Self-supervised Fast Conformer as All-purpose Seasoning to Speech 
Processing Tasks**|He Huang et.al.|[2408.13106](http://arxiv.org/abs/2408.13106)|null|\n", "2408.13008": "|**2024-08-23**|**Focused Discriminative Training For Streaming CTC-Trained Automatic Speech Recognition Models**|Adnan Haider et.al.|[2408.13008](http://arxiv.org/abs/2408.13008)|null|\n", "2408.12734": "|**2024-08-22**|**Towards measuring fairness in speech recognition: Fair-Speech dataset**|Irina-Elena Veliche et.al.|[2408.12734](http://arxiv.org/abs/2408.12734)|null|\n", "2408.14418": "|**2024-08-26**|**MEDSAGE: Enhancing Robustness of Medical Dialogue Summarization to ASR Errors with LLM-generated Synthetic Dialogues**|Kuluhan Binici et.al.|[2408.14418](http://arxiv.org/abs/2408.14418)|null|\n", "2408.14262": "|**2024-08-26**|**Self-supervised Speech Representations Still Struggle with African American Vernacular English**|Kalvin Chang et.al.|[2408.14262](http://arxiv.org/abs/2408.14262)|**[link](https://github.com/cmu-llab/s3m-aave)**|\n", "2408.14082": "|**2024-08-26**|**Automatic recognition and detection of aphasic natural speech**|Mara Barberis et.al.|[2408.14082](http://arxiv.org/abs/2408.14082)|null|\n", "2408.13996": "|**2024-08-28**|**Research Advances and New Paradigms for Biology-inspired Spiking Neural Networks**|Tianyu Zheng et.al.|[2408.13996](http://arxiv.org/abs/2408.13996)|null|\n", "2408.13739": "|**2024-08-25**|**Literary and Colloquial Tamil Dialect Identification**|M. Nanmalar et.al.|[2408.13739](http://arxiv.org/abs/2408.13739)|null|\n", "2408.13644": "|**2024-08-24**|**Studying the Effect of Audio Filters in Pre-Trained Models for Environmental Sound Classification**|Aditya Dawn et.al.|[2408.13644](http://arxiv.org/abs/2408.13644)|null|\n", "2408.14991": "|**2024-08-27**|**Speech Recognition Transformers: Topological-lingualism Perspective**|Shruti Singh et.al.|[2408.14991](http://arxiv.org/abs/2408.14991)|null|\n", "2408.14887": "|**2024-08-27**|**Literary and Colloquial Dialect Identification for Tamil using Acoustic Features**|M. 
Nanmalar et.al.|[2408.14887](http://arxiv.org/abs/2408.14887)|null|\n", "2408.15616": "|**2024-08-28**|**Beyond Levenshtein: Leveraging Multiple Algorithms for Robust Word Error Rate Computations And Granular Error Classifications**|Korbinian Kuhn et.al.|[2408.15616](http://arxiv.org/abs/2408.15616)|**[link](https://github.com/shuffle-project/beyond-levenshtein)**|\n", "2408.15585": "|**2024-08-28**|**Whisper-PMFA: Partial Multi-Scale Feature Aggregation for Speaker Verification using Whisper Models**|Yiyang Zhao et.al.|[2408.15585](http://arxiv.org/abs/2408.15585)|null|\n", "2408.16589": "|**2024-08-29**|**CrisperWhisper: Accurate Timestamps on Verbatim Speech Transcriptions**|Laurin Wagner et.al.|[2408.16589](http://arxiv.org/abs/2408.16589)|null|\n", "2408.16564": "|**2024-08-29**|**Human-Inspired Audio-Visual Speech Recognition: Spike Activity, Cueing Interaction and Causal Processing**|Qianhui Liu et.al.|[2408.16564](http://arxiv.org/abs/2408.16564)|null|\n", "2408.16287": "|**2024-08-29**|**Measuring the Accuracy of Automatic Speech Recognition Solutions**|Korbinian Kuhn et.al.|[2408.16287](http://arxiv.org/abs/2408.16287)|**[link](https://github.com/shuffle-project/asr-comparison)**|\n", "2408.16204": "|**2024-08-29**|**Revisit Micro-batch Clipping: Adaptive Data Pruning via Gradient Manipulation**|Lun Wang et.al.|[2408.16204](http://arxiv.org/abs/2408.16204)|null|\n", "2408.16180": "|**2024-08-29**|**Benchmarking Japanese Speech Recognition on ASR-LLM Setups with Multi-Pass Augmented Generative Error Correction**|Yuka Ko et.al.|[2408.16180](http://arxiv.org/abs/2408.16180)|null|\n"}, "TTS": {"2408.06227": "|**2024-08-12**|**FLEURS-R: A Restored Multilingual Speech Corpus for Generation Tasks**|Min Ma et.al.|[2408.06227](http://arxiv.org/abs/2408.06227)|null|\n", "2408.05758": "|**2024-08-11**|**VQ-CTAP: Cross-Modal Fine-Grained Sequence Representation Learning for Speech Processing**|Chunyu Qiang 
et.al.|[2408.05758](http://arxiv.org/abs/2408.05758)|null|\n", "2408.03887": "|**2024-08-06**|**Central Kurdish Text-to-Speech Synthesis with Novel End-to-End Transformer Training**|Hawraz A. Ahmad et.al.|[2408.03887](http://arxiv.org/abs/2408.03887)|null|\n", "2408.01808": "|**2024-08-03**|**ALIF: Low-Cost Adversarial Audio Attacks on Black-Box Speech Platforms using Linguistic Features**|Peng Cheng et.al.|[2408.01808](http://arxiv.org/abs/2408.01808)|**[link](https://github.com/TASER2023/TASER)**|\n", "2408.00284": "|**2024-08-01**|**Bailing-TTS: Chinese Dialectal Speech Synthesis Towards Human-like Spontaneous Representation**|Xinhan Di et.al.|[2408.00284](http://arxiv.org/abs/2408.00284)|null|\n", "2407.21491": "|**2024-08-01**|**Generative Expressive Conversational Speech Synthesis**|Rui Liu et.al.|[2407.21491](http://arxiv.org/abs/2407.21491)|**[link](https://github.com/ai-s2-lab/gpt-talker)**|\n", "2407.21476": "|**2024-07-31**|**On the Problem of Text-To-Speech Model Selection for Synthetic Data Generation in Automatic Speech Recognition**|Nick Rossenbach et.al.|[2407.21476](http://arxiv.org/abs/2407.21476)|null|\n", "2407.18571": "|**2024-07-29**|**Speech Bandwidth Expansion Via High Fidelity Generative Adversarial Networks**|Mahmoud Salhab et.al.|[2407.18571](http://arxiv.org/abs/2407.18571)|null|\n", "2407.18541": "|**2024-07-26**|**Towards Improving NAM-to-Speech Synthesis Intelligibility using Self-Supervised Speech Models**|Neil Shah et.al.|[2407.18541](http://arxiv.org/abs/2407.18541)|null|\n", "2407.18505": "|**2024-07-26**|**VoxSim: A perceptual voice similarity dataset**|Junseok Ahn et.al.|[2407.18505](http://arxiv.org/abs/2407.18505)|null|\n", "2407.17997": "|**2024-07-25**|**On the Effect of Purely Synthetic Training Data for Different Automatic Speech Recognition Architectures**|Nick Rossenbach et.al.|[2407.17997](http://arxiv.org/abs/2407.17997)|null|\n", "2407.17167": "|**2024-07-24**|**Zero-Shot vs. 
Few-Shot Multi-Speaker TTS Using Pre-trained Czech SpeechT5 Model**|Jan Lehe\u010dka et.al.|[2407.17167](http://arxiv.org/abs/2407.17167)|null|\n", "2407.16840": "|**2024-07-23**|**Synth4Kws: Synthesized Speech for User Defined Keyword Spotting in Low Resource Environments**|Pai Zhu et.al.|[2407.16840](http://arxiv.org/abs/2407.16840)|null|\n", "2407.15835": "|**2024-07-22**|**dMel: Speech Tokenization made Simple**|He Bai et.al.|[2407.15835](http://arxiv.org/abs/2407.15835)|null|\n", "2407.15188": "|**2024-07-21**|**Overview of Speaker Modeling and Its Applications: From the Lens of Deep Speaker Representation Learning**|Shuai Wang et.al.|[2407.15188](http://arxiv.org/abs/2407.15188)|null|\n", "2407.14212": "|**2024-07-19**|**Braille-to-Speech Generator: Audio Generation Based on Joint Fine-Tuning of CLIP and Fastspeech2**|Chun Xu et.al.|[2407.14212](http://arxiv.org/abs/2407.14212)|null|\n", "2407.14056": "|**2024-07-19**|**Rasa: Building Expressive Speech Synthesis Systems for Indian Languages in Low-resource Settings**|Praveen Srinivasa Varadhan et.al.|[2407.14056](http://arxiv.org/abs/2407.14056)|**[link](https://github.com/AI4Bharat/Rasa)**|\n", "2407.14006": "|**2024-07-19**|**MSceneSpeech: A Multi-Scene Speech Dataset For Expressive Speech Synthesis**|Qian Yang et.al.|[2407.14006](http://arxiv.org/abs/2407.14006)|null|\n", "2407.13509": "|**2024-07-18**|**Spontaneous Style Text-to-Speech Synthesis with Controllable Spontaneous Behaviors Based on Language Models**|Weiqin Li et.al.|[2407.13509](http://arxiv.org/abs/2407.13509)|null|\n", "2408.00004": "|**2024-07-18**|**Handling Numeric Expressions in Automatic Speech Recognition**|Christian Huber et.al.|[2408.00004](http://arxiv.org/abs/2408.00004)|null|\n", "2407.12707": "|**2024-07-22**|**TTSDS -- Text-to-Speech Distribution Score**|Christoph Minixhofer et.al.|[2407.12707](http://arxiv.org/abs/2407.12707)|**[link](https://github.com/ttsds/ttsds)**|\n", "2408.00788": "|**2024-07-17**|**SpikeVoice: 
High-Quality Text-to-Speech Via Efficient Spiking Neural Network**|Kexin Wang et.al.|[2408.00788](http://arxiv.org/abs/2408.00788)|null|\n", "2407.12229": "|**2024-07-17**|**Laugh Now Cry Later: Controlling Time-Varying Emotional States of Flow-Matching-Based Zero-Shot Text-to-Speech**|Haibin Wu et.al.|[2407.12229](http://arxiv.org/abs/2407.12229)|null|\n", "2407.12206": "|**2024-07-16**|**A Language Modeling Approach to Diacritic-Free Hebrew TTS**|Amit Roth et.al.|[2407.12206](http://arxiv.org/abs/2407.12206)|null|\n", "2407.09732": "|**2024-07-13**|**Speech Slytherin: Examining the Performance and Efficiency of Mamba for Speech Separation, Recognition, and Synthesis**|Xilin Jiang et.al.|[2407.09732](http://arxiv.org/abs/2407.09732)|**[link](https://github.com/xi-j/Mamba-TasNet)**|\n", "2407.09370": "|**2024-07-17**|**Learning High-Frequency Functions Made Easy with Sinusoidal Positional Encoding**|Chuanhao Sun et.al.|[2407.09370](http://arxiv.org/abs/2407.09370)|**[link](https://github.com/zhyuan11/SPE)**|\n", "2407.08551": "|**2024-07-11**|**Autoregressive Speech Synthesis without Vector Quantization**|Lingwei Meng et.al.|[2407.08551](http://arxiv.org/abs/2407.08551)|null|\n", "2407.08248": "|**2024-07-11**|**Toward accessible comics for blind and low vision readers**|Christophe Rigaud et.al.|[2407.08248](http://arxiv.org/abs/2407.08248)|null|\n", "2407.08016": "|**2024-07-10**|**Source Tracing of Audio Deepfake Systems**|Nicholas Klein et.al.|[2407.08016](http://arxiv.org/abs/2407.08016)|null|\n", "2407.18332": "|**2024-07-08**|**Analyzing Speech Unit Selection for Textless Speech-to-Speech Translation**|Jarod Duret et.al.|[2407.18332](http://arxiv.org/abs/2407.18332)|null|\n", "2407.05471": "|**2024-07-07**|**Fine-Grained and Interpretable Neural Speech Editing**|Max Morrison et.al.|[2407.05471](http://arxiv.org/abs/2407.05471)|**[link](https://github.com/maxrmorrison/torbi)**|\n", "2407.05421": "|**2024-07-07**|**ASRRL-TTS: Agile Speaker Representation 
Reinforcement Learning for Text-to-Speech Speaker Adaptation**|Ruibo Fu et.al.|[2407.05421](http://arxiv.org/abs/2407.05421)|null|\n", "2407.05407": "|**2024-07-09**|**CosyVoice: A Scalable Multilingual Zero-shot Text-to-speech Synthesizer based on Supervised Semantic Tokens**|Zhihao Du et.al.|[2407.05407](http://arxiv.org/abs/2407.05407)|null|\n", "2407.04575": "|**2024-07-05**|**FA-GAN: Artifacts-free and Phase-aware High-fidelity GAN-based Vocoder**|Rubing Shen et.al.|[2407.04575](http://arxiv.org/abs/2407.04575)|null|\n", "2407.04291": "|**2024-07-05**|**We Need Variations in Speech Synthesis: Sub-center Modelling for Speaker Embeddings**|Ismail Rasim Ulgen et.al.|[2407.04291](http://arxiv.org/abs/2407.04291)|null|\n", "2407.04047": "|**2024-07-04**|**Improving Accented Speech Recognition using Data Augmentation based on Unsupervised Text-to-Speech Synthesis**|Cong-Thanh Do et.al.|[2407.04047](http://arxiv.org/abs/2407.04047)|null|\n", "2407.04034": "|**2024-07-04**|**Optimizing a-DCF for Spoofing-Robust Speaker Verification**|O\u011fuzhan Kurnaz et.al.|[2407.04034](http://arxiv.org/abs/2407.04034)|null|\n", "2407.03892": "|**2024-07-04**|**On the Effectiveness of Acoustic BPE in Decoder-Only TTS**|Bohan Li et.al.|[2407.03892](http://arxiv.org/abs/2407.03892)|null|\n", "2407.03236": "|**2024-07-14**|**CATT: Character-based Arabic Tashkeel Transformer**|Faris Alasmary et.al.|[2407.03236](http://arxiv.org/abs/2407.03236)|**[link](https://github.com/abjadai/catt)**|\n", "2407.02937": "|**2024-07-03**|**Probing the Feasibility of Multilingual Speaker Anonymization**|Sarina Meyer et.al.|[2407.02937](http://arxiv.org/abs/2407.02937)|**[link](https://github.com/digitalphonetics/speaker-anonymization)**|\n", "2407.02243": "|**2024-07-02**|**Robust Zero-Shot Text-to-Speech Synthesis with Reverse Inference Optimization**|Yuchen Hu et.al.|[2407.02243](http://arxiv.org/abs/2407.02243)|null|\n", "2407.01927": "|**2024-07-02**|**TTSlow: Slow Down Text-to-Speech with 
Efficiency Robustness Evaluations**|Xiaoxue Gao et.al.|[2407.01927](http://arxiv.org/abs/2407.01927)|null|\n", "2407.01291": "|**2024-07-01**|**Lightweight Zero-shot Text-to-Speech with Mixture of Adapters**|Kenichi Fujita et.al.|[2407.01291](http://arxiv.org/abs/2407.01291)|null|\n", "2407.12038": "|**2024-07-31**|**ICAGC 2024: Inspirational and Convincing Audio Generation Challenge 2024**|Ruibo Fu et.al.|[2407.12038](http://arxiv.org/abs/2407.12038)|null|\n", "2407.00826": "|**2024-06-30**|**NAIST Simultaneous Speech Translation System for IWSLT 2024**|Yuka Ko et.al.|[2407.00826](http://arxiv.org/abs/2407.00826)|null|\n", "2407.00766": "|**2024-06-30**|**An Attribute Interpolation Method in Speech Synthesis by Model Merging**|Masato Murata et.al.|[2407.00766](http://arxiv.org/abs/2407.00766)|null|\n", "2407.00753": "|**2024-06-30**|**FLY-TTS: Fast, Lightweight and High-Quality End-to-End Text-to-Speech Synthesis**|Yinlin Guo et.al.|[2407.00753](http://arxiv.org/abs/2407.00753)|null|\n", "2407.00463": "|**2024-07-18**|**Open-Source Conversational AI with SpeechBrain 1.0**|Mirco Ravanelli et.al.|[2407.00463](http://arxiv.org/abs/2407.00463)|null|\n", "2406.19243": "|**2024-06-27**|**Application of ASV for Voice Identification after VC and Duration Predictor Improvement in TTS Models**|Borodin Kirill Nikolayevich et.al.|[2406.19243](http://arxiv.org/abs/2406.19243)|null|\n", "2406.19135": "|**2024-06-27**|**DEX-TTS: Diffusion-based EXpressive Text-to-Speech with Style Modeling on Time Variability**|Hyun Joon Park et.al.|[2406.19135](http://arxiv.org/abs/2406.19135)|**[link](https://github.com/winddori2002/dex-tts)**|\n", "2406.18135": "|**2024-06-26**|**Automatic Speech Recognition for Hindi**|Anish Saha et.al.|[2406.18135](http://arxiv.org/abs/2406.18135)|null|\n", "2406.18089": "|**2024-06-26**|**A Study on Synthesizing Expressive Violin Performances: Approaches and Comparisons**|Tzu-Yun Hung et.al.|[2406.18089](http://arxiv.org/abs/2406.18089)|null|\n", 
"2406.18088": "|**2024-06-29**|**LLM-Driven Multimodal Opinion Expression Identification**|Bonian Jia et.al.|[2406.18088](http://arxiv.org/abs/2406.18088)|null|\n", "2406.18009": "|**2024-06-26**|**E2 TTS: Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS**|Sefik Emre Eskimez et.al.|[2406.18009](http://arxiv.org/abs/2406.18009)|null|\n", "2406.17957": "|**2024-06-25**|**Improving Robustness of LLM-based Speech Synthesis by Learning Monotonic Alignment**|Paarth Neekhara et.al.|[2406.17957](http://arxiv.org/abs/2406.17957)|null|\n", "2406.17310": "|**2024-06-25**|**High Fidelity Text-to-Speech Via Discrete Tokens Using Token Transducer and Group Masked Language Model**|Joun Yeop Lee et.al.|[2406.17310](http://arxiv.org/abs/2406.17310)|null|\n", "2406.17257": "|**2024-06-25**|**Leveraging Parameter-Efficient Transfer Learning for Multi-Lingual Text-to-Speech Adaptation**|Yingting Li et.al.|[2406.17257](http://arxiv.org/abs/2406.17257)|null|\n", "2406.16808": "|**2024-06-24**|**Exploring the Capability of Mamba in Speech Applications**|Koichi Miyazaki et.al.|[2406.16808](http://arxiv.org/abs/2406.16808)|null|\n", "2406.16751": "|**2024-07-07**|**Towards Zero-Shot Text-To-Speech for Arabic Dialects**|Khai Duy Doan et.al.|[2406.16751](http://arxiv.org/abs/2406.16751)|null|\n", "2406.16716": "|**2024-06-24**|**One-Class Learning with Adaptive Centroid Shift for Audio Deepfake Detection**|Hyun Myung Kim et.al.|[2406.16716](http://arxiv.org/abs/2406.16716)|null|\n", "2406.17801": "|**2024-06-22**|**A multi-speaker multi-lingual voice cloning system based on vits2 for limmits 2024 challenge**|Xiaopeng Wang et.al.|[2406.17801](http://arxiv.org/abs/2406.17801)|null|\n", "2406.15752": "|**2024-06-22**|**TacoLM: GaTed Attention Equipped Codec Language Model are Efficient Zero-Shot Text to Speech Synthesizers**|Yakun Song et.al.|[2406.15752](http://arxiv.org/abs/2406.15752)|**[link](https://github.com/Ereboas/TacoLM)**|\n", "2406.14890": "|**2024-06-21**|**InterBiasing: 
Boost Unseen Word Recognition through Biasing Intermediate Predictions**|Yu Nakagome et.al.|[2406.14890](http://arxiv.org/abs/2406.14890)|null|\n", "2406.14875": "|**2024-06-21**|**GLOBE: A High-quality English Corpus with Global Accents for Zero-shot Speaker Adaptive Text-to-Speech**|Wenbin Wang et.al.|[2406.14875](http://arxiv.org/abs/2406.14875)|null|\n", "2406.14294": "|**2024-06-21**|**DASB - Discrete Audio and Speech Benchmark**|Pooneh Mousavi et.al.|[2406.14294](http://arxiv.org/abs/2406.14294)|null|\n", "2406.12946": "|**2024-06-18**|**Instruction Data Generation and Unsupervised Adaptation for Speech Language Models**|Vahid Noroozi et.al.|[2406.12946](http://arxiv.org/abs/2406.12946)|null|\n", "2406.12164": "|**2024-07-09**|**A Mel Spectrogram Enhancement Paradigm Based on CWT in Speech Synthesis**|Guoqiang Hu et.al.|[2406.12164](http://arxiv.org/abs/2406.12164)|null|\n", "2406.11727": "|**2024-06-27**|**1000 African Voices: Advancing inclusive multi-speaker multi-accent speech synthesis**|Sewade Ogun et.al.|[2406.11727](http://arxiv.org/abs/2406.11727)|null|\n", "2406.11427": "|**2024-06-17**|**DiTTo-TTS: Efficient and Scalable Zero-Shot Text-to-Speech with Diffusion Transformer**|Keon Lee et.al.|[2406.11427](http://arxiv.org/abs/2406.11427)|null|\n", "2406.11037": "|**2024-06-16**|**NAST: Noise Aware Speech Tokenization for Speech Language Models**|Shoval Messica et.al.|[2406.11037](http://arxiv.org/abs/2406.11037)|**[link](https://github.com/ShovalMessica/NAST)**|\n", "2406.10844": "|**2024-06-16**|**Multi-Scale Accent Modeling with Disentangling for Multi-Speaker Multi-Accent TTS Synthesis**|Xuehao Zhou et.al.|[2406.10844](http://arxiv.org/abs/2406.10844)|null|\n", "2406.10514": "|**2024-06-15**|**GTR-Voice: Articulatory Phonetics Informed Controllable Expressive Speech Synthesis**|Zehua Kcriss Li et.al.|[2406.10514](http://arxiv.org/abs/2406.10514)|null|\n", "2406.10422": "|**2024-06-14**|**Phoneme Discretized Saliency Maps for Explainable Detection 
of AI-Generated Voice**|Shubham Gupta et.al.|[2406.10422](http://arxiv.org/abs/2406.10422)|null|\n", "2406.10056": "|**2024-06-14**|**UniAudio 1.5: Large Language Model-driven Audio Codec is A Few-shot Audio Task Learner**|Dongchao Yang et.al.|[2406.10056](http://arxiv.org/abs/2406.10056)|**[link](https://github.com/yangdongchao/llm-codec)**|\n", "2406.09869": "|**2024-06-14**|**MMM: Multi-Layer Multi-Residual Multi-Stream Discrete Speech Representation from Self-supervised Learning Model**|Jiatong Shi et.al.|[2406.09869](http://arxiv.org/abs/2406.09869)|null|\n", "2406.08989": "|**2024-06-13**|**ToneUnit: A Speech Discretization Approach for Tonal Language Speech Synthesis**|Dehua Tao et.al.|[2406.08989](http://arxiv.org/abs/2406.08989)|null|\n", "2406.08820": "|**2024-06-13**|**DisfluencySpeech -- Single-Speaker Conversational Speech Dataset with Paralanguage**|Kyra Wang et.al.|[2406.08820](http://arxiv.org/abs/2406.08820)|null|\n", "2406.08812": "|**2024-06-13**|**Generating Speakers by Prompting Listener Impressions for Pre-trained Multi-Speaker Text-to-Speech Systems**|Zhengyang Chen et.al.|[2406.08812](http://arxiv.org/abs/2406.08812)|null|\n", "2406.08802": "|**2024-06-13**|**DubWise: Video-Guided Speech Duration Control in Multimodal LLM-based Text-to-Speech for Dubbing**|Neha Sahipjohn et.al.|[2406.08802](http://arxiv.org/abs/2406.08802)|null|\n", "2406.08568": "|**2024-06-12**|**Training Data Augmentation for Dysarthric Automatic Speech Recognition by Text-to-Dysarthric-Speech Synthesis**|Wing-Zin Leung et.al.|[2406.08568](http://arxiv.org/abs/2406.08568)|null|\n", "2406.08416": "|**2024-06-20**|**TokSing: Singing Voice Synthesis based on Discrete Tokens**|Yuning Wu et.al.|[2406.08416](http://arxiv.org/abs/2406.08416)|null|\n", "2406.08196": "|**2024-06-12**|**FreeV: Free Lunch For Vocoders Through Pseudo Inversed Mel Filter**|Yuanjun Lv et.al.|[2406.08196](http://arxiv.org/abs/2406.08196)|**[link](https://github.com/bakerbunker/freev)**|\n", 
"2406.08111": "|**2024-06-12**|**Audio-conditioned phonemic and prosodic annotation for building text-to-speech models from unlabeled speech data**|Yuma Shirahata et.al.|[2406.08111](http://arxiv.org/abs/2406.08111)|null|\n", "2406.08076": "|**2024-06-12**|**VECL-TTS: Voice identity and Emotional style controllable Cross-Lingual Text-to-Speech**|Ashishkumar Gudmalwar et.al.|[2406.08076](http://arxiv.org/abs/2406.08076)|null|\n", "2406.07969": "|**2024-06-12**|**LibriTTS-P: A Corpus with Speaking Style and Speaker Identity Prompts for Text-to-Speech and Style Captioning**|Masaya Kawamura et.al.|[2406.07969](http://arxiv.org/abs/2406.07969)|**[link](https://github.com/line/libritts-p)**|\n", "2406.07855": "|**2024-06-12**|**VALL-E R: Robust and Efficient Zero-Shot Text-to-Speech Synthesis via Monotonic Alignment**|Bing Han et.al.|[2406.07855](http://arxiv.org/abs/2406.07855)|null|\n", "2406.07803": "|**2024-06-12**|**EmoSphere-TTS: Emotional Style and Intensity Modeling via Spherical Emotion Vector for Controllable Emotional Text-to-Speech**|Deok-Hyeon Cho et.al.|[2406.07803](http://arxiv.org/abs/2406.07803)|**[link](https://github.com/Choddeok/EmoSphere-TTS)**|\n", "2406.07801": "|**2024-06-12**|**PolySpeech: Exploring Unified Multitask Speech Models for Competitiveness with Single-task Models**|Runyan Yang et.al.|[2406.07801](http://arxiv.org/abs/2406.07801)|null|\n", "2406.07725": "|**2024-06-11**|**The Interspeech 2024 Challenge on Speech Processing Using Discrete Units**|Xuankai Chang et.al.|[2406.07725](http://arxiv.org/abs/2406.07725)|null|\n", "2406.07289": "|**2024-06-11**|**Can We Achieve High-quality Direct Speech-to-Speech Translation without Parallel Speech Data?**|Qingkai Fang et.al.|[2406.07289](http://arxiv.org/abs/2406.07289)|null|\n", "2406.07237": "|**2024-06-11**|**CodecFake: Enhancing Anti-Spoofing Models Against Deepfake Audios from Codec-Based Speech Synthesis Systems**|Haibin Wu et.al.|[2406.07237](http://arxiv.org/abs/2406.07237)|null|\n", 
"2406.06979": "|**2024-06-11**|**AudioMarkBench: Benchmarking Robustness of Audio Watermarking**|Hongbin Liu et.al.|[2406.06979](http://arxiv.org/abs/2406.06979)|**[link](https://github.com/moyangkuo/audiomarkbench)**|\n", "2406.06406": "|**2024-06-11**|**Controlling Emotion in Text-to-Speech with Natural Language Prompts**|Thomas Bott et.al.|[2406.06406](http://arxiv.org/abs/2406.06406)|**[link](https://github.com/digitalphonetics/ims-toucan)**|\n", "2406.06403": "|**2024-06-10**|**Meta Learning Text-to-Speech Synthesis in over 7000 Languages**|Florian Lux et.al.|[2406.06403](http://arxiv.org/abs/2406.06403)|**[link](https://github.com/digitalphonetics/ims-toucan)**|\n", "2406.06111": "|**2024-06-10**|**JenGAN: Stacked Shifted Filters in GAN-Based Speech Synthesis**|Hyunjae Cho et.al.|[2406.06111](http://arxiv.org/abs/2406.06111)|null|\n", "2406.05965": "|**2024-06-10**|**MakeSinger: A Semi-Supervised Training Method for Data-Efficient Singing Voice Synthesis via Classifier-free Diffusion Guidance**|Semin Kim et.al.|[2406.05965](http://arxiv.org/abs/2406.05965)|null|\n", "2406.05763": "|**2024-06-19**|**WenetSpeech4TTS: A 12,800-hour Mandarin TTS Corpus for Large Speech Generation Model Benchmark**|Linhan Ma et.al.|[2406.05763](http://arxiv.org/abs/2406.05763)|**[link](https://github.com/dukGuo/valle-audiodec)**|\n", "2406.05699": "|**2024-06-09**|**An Investigation of Noise Robustness for Flow-Matching-Based Zero-Shot TTS**|Xiaofei Wang et.al.|[2406.05699](http://arxiv.org/abs/2406.05699)|null|\n", "2406.05681": "|**2024-06-11**|**Towards Expressive Zero-Shot Speech Synthesis with Hierarchical Prosody Modeling**|Yuepeng Jiang et.al.|[2406.05681](http://arxiv.org/abs/2406.05681)|null|\n", "2406.05672": "|**2024-06-12**|**Text-aware and Context-aware Expressive Audiobook Speech Synthesis**|Dake Guo et.al.|[2406.05672](http://arxiv.org/abs/2406.05672)|null|\n", "2408.06906": "|**2024-08-13**|**VNet: A GAN-based Multi-Tier Discriminator Network for Speech Synthesis 
Vocoders**|Yubing Cao et.al.|[2408.06906](http://arxiv.org/abs/2408.06906)|null|\n", "2408.06858": "|**2024-08-13**|**SaSLaW: Dialogue Speech Corpus with Audio-visual Egocentric Information Toward Environment-adaptive Dialogue Speech Synthesis**|Osamu Take et.al.|[2408.06858](http://arxiv.org/abs/2408.06858)|**[link](https://github.com/sarulab-speech/saslaw)**|\n", "2408.06827": "|**2024-08-13**|**PRESENT: Zero-Shot Text-to-Prosody Control**|Perry Lam et.al.|[2408.06827](http://arxiv.org/abs/2408.06827)|**[link](https://github.com/iamanigeeit/present)**|\n", "2408.07547": "|**2024-08-14**|**PeriodWave: Multi-Period Flow Matching for High-Fidelity Waveform Generation**|Sang-Hoon Lee et.al.|[2408.07547](http://arxiv.org/abs/2408.07547)|**[link](https://github.com/sh-lee-prml/periodwave)**|\n", "2408.07414": "|**2024-08-14**|**WavLM model ensemble for audio deepfake detection**|David Combei et.al.|[2408.07414](http://arxiv.org/abs/2408.07414)|null|\n", "2408.09215": "|**2024-08-17**|**Generating Data with Text-to-Speech and Large-Language Models for Conversational Speech Recognition**|Samuele Cornell et.al.|[2408.09215](http://arxiv.org/abs/2408.09215)|**[link](https://github.com/popcornell/ASRLightningFT)**|\n", "2408.10852": "|**2024-08-20**|**EELE: Exploring Efficient and Extensible LoRA Integration in Emotional Text-to-Speech**|Xin Qi et.al.|[2408.10852](http://arxiv.org/abs/2408.10852)|null|\n", "2408.10771": "|**2024-08-20**|**SSL-TTS: Leveraging Self-Supervised Embeddings and kNN Retrieval for Zero-Shot Multi-speaker TTS**|Karl El Hajal et.al.|[2408.10771](http://arxiv.org/abs/2408.10771)|null|\n", "2408.10549": "|**2024-08-20**|**AI-Based IVR**|Gassyrbek Kosherbay et.al.|[2408.10549](http://arxiv.org/abs/2408.10549)|null|\n", "2408.10463": "|**2024-08-20**|**Adversarial training of Keyword Spotting to Minimize TTS Data Overfitting**|Hyun Jin Park et.al.|[2408.10463](http://arxiv.org/abs/2408.10463)|null|\n", "2408.10207": "|**2024-07-01**|**A Comprehensive 
Survey on Diffusion Models and Their Applications**|Md Manjurul Ahsan et.al.|[2408.10207](http://arxiv.org/abs/2408.10207)|null|\n", "2408.12430": "|**2024-08-22**|**Positional Description for Numerical Normalization**|Deepanshu Gupta et.al.|[2408.12430](http://arxiv.org/abs/2408.12430)|null|\n", "2408.12170": "|**2024-08-22**|**VoiceX: A Text-To-Speech Framework for Custom Voices**|Silvan Mertes et.al.|[2408.12170](http://arxiv.org/abs/2408.12170)|null|\n", "2408.11849": "|**2024-08-13**|**Style-Talker: Finetuning Audio Language Model and Style-Based Text-to-Speech Model for Fast Spoken Dialogue Generation**|Yinghao Aaron Li et.al.|[2408.11849](http://arxiv.org/abs/2408.11849)|null|\n", "2408.13240": "|**2024-08-23**|**Which Prosodic Features Matter Most for Pragmatics?**|Nigel G. Ward et.al.|[2408.13240](http://arxiv.org/abs/2408.13240)|null|\n", "2408.14423": "|**2024-08-27**|**DualSpeech: Enhancing Speaker-Fidelity and Text-Intelligibility Through Dual Classifier-Free Guidance**|Jinhyeok Yang et.al.|[2408.14423](http://arxiv.org/abs/2408.14423)|null|\n", "2408.13970": "|**2024-08-26**|**Anonymization of Voices in Spaces for Civic Dialogue: Measuring Impact on Empathy, Trust, and Feeling Heard**|Wonjune Kang et.al.|[2408.13970](http://arxiv.org/abs/2408.13970)|null|\n", "2408.13893": "|**2024-08-28**|**SimpleSpeech 2: Towards Simple and Efficient Text-to-Speech with Flow-based Scalar Latent Transformer Diffusion Models**|Dongchao Yang et.al.|[2408.13893](http://arxiv.org/abs/2408.13893)|null|\n", "2408.13608": "|**2024-08-24**|**SpeechCraft: A Fine-grained Expressive Speech Dataset with Natural Language Description**|Zeyu Jin et.al.|[2408.13608](http://arxiv.org/abs/2408.13608)|**[link](https://github.com/thuhcsi/speechcraft)**|\n", "2408.14887": "|**2024-08-27**|**Literary and Colloquial Dialect Identification for Tamil using Acoustic Features**|M. 
Nanmalar et.al.|[2408.14887](http://arxiv.org/abs/2408.14887)|null|\n", "2408.14739": "|**2024-08-28**|**VoiceTailor: Lightweight Plug-In Adapter for Diffusion-Based Personalized Text-to-Speech**|Heeseung Kim et.al.|[2408.14739](http://arxiv.org/abs/2408.14739)|null|\n", "2408.14713": "|**2024-08-27**|**StyleSpeech: Parameter-efficient Fine Tuning for Pre-trained Controllable Text-to-Speech**|Haowei Lou et.al.|[2408.14713](http://arxiv.org/abs/2408.14713)|null|\n", "2408.15916": "|**2024-08-28**|**Multi-modal Adversarial Training for Zero-Shot Voice Cloning**|John Janiczek et.al.|[2408.15916](http://arxiv.org/abs/2408.15916)|null|\n", "2408.15775": "|**2024-08-29**|**Easy, Interpretable, Effective: openSMILE for voice deepfake detection**|Octavian Pascu et.al.|[2408.15775](http://arxiv.org/abs/2408.15775)|null|\n", "2408.15676": "|**2024-08-28**|**VoxInstruct: Expressive Human Instruction-to-Speech Generation with Unified Multilingual Codec Language Modelling**|Yixuan Zhou et.al.|[2408.15676](http://arxiv.org/abs/2408.15676)|null|\n", "2408.16725": "|**2024-08-30**|**Mini-Omni: Language Models Can Hear, Talk While Thinking in Streaming**|Zhifei Xie et.al.|[2408.16725](http://arxiv.org/abs/2408.16725)|**[link](https://github.com/gpt-omni/mini-omni)**|\n", "2408.16546": "|**2024-08-29**|**RAVE for Speech: Efficient Voice Conversion at High Sampling Rates**|Anders R. 
Bargum et.al.|[2408.16546](http://arxiv.org/abs/2408.16546)|null|\n", "2408.16373": "|**2024-08-29**|**Enabling Beam Search for Language Model-Based Text-to-Speech Synthesis**|Zehai Tu et.al.|[2408.16373](http://arxiv.org/abs/2408.16373)|null|\n"}} \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 5d61016..3f20c31 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,7 +2,7 @@ layout: default --- -## Updated on 2024.09.01 +## Updated on 2024.09.02 > Usage instructions: [here](./docs/README.md#usage) > This page is modified from [here](https://github.com/Vincentqyw/cv-arxiv-daily) @@ -563,7 +563,7 @@ layout: default | Publish Date | Title | Authors | PDF | Code | |:---------|:-----------------------|:---------|:------|:------| -|**2024-08-29**|**Mini-Omni: Language Models Can Hear, Talk While Thinking in Streaming**|Zhifei Xie et.al.|[2408.16725](http://arxiv.org/abs/2408.16725)|null| +|**2024-08-30**|**Mini-Omni: Language Models Can Hear, Talk While Thinking in Streaming**|Zhifei Xie et.al.|[2408.16725](http://arxiv.org/abs/2408.16725)|**[link](https://github.com/gpt-omni/mini-omni)**| |**2024-08-29**|**RAVE for Speech: Efficient Voice Conversion at High Sampling Rates**|Anders R. Bargum et.al.|[2408.16546](http://arxiv.org/abs/2408.16546)|null| |**2024-08-29**|**Enabling Beam Search for Language Model-Based Text-to-Speech Synthesis**|Zehai Tu et.al.|[2408.16373](http://arxiv.org/abs/2408.16373)|null| |**2024-08-28**|**Multi-modal Adversarial Training for Zero-Shot Voice Cloning**|John Janiczek et.al.|[2408.15916](http://arxiv.org/abs/2408.15916)|null|