From 5d11fa49ec04dca8cece73e2809f89a76f6916aa Mon Sep 17 00:00:00 2001
From: miro
Date: Fri, 13 Dec 2024 16:39:40 +0000
Subject: [PATCH 1/5] packaging:drop dependency on neon package

---
 ovos_stt_plugin_citrinet/__init__.py |   5 +-
 ovos_stt_plugin_citrinet/engine.py   | 204 +++++++++++++++++++++++++++
 requirements.txt                     |  12 +-
 3 files changed, 216 insertions(+), 5 deletions(-)
 create mode 100644 ovos_stt_plugin_citrinet/engine.py

diff --git a/ovos_stt_plugin_citrinet/__init__.py b/ovos_stt_plugin_citrinet/__init__.py
index 3fbc475..721b4dd 100644
--- a/ovos_stt_plugin_citrinet/__init__.py
+++ b/ovos_stt_plugin_citrinet/__init__.py
@@ -4,15 +4,14 @@
 from ovos_plugin_manager.templates.stt import STT
 from ovos_utils.log import LOG
 from speech_recognition import AudioData
-from streaming_stt_nemo import Model, available_languages
+
+from ovos_stt_plugin_citrinet.engine import Model, available_languages
 
 
 class CitrinetSTT(STT):
     def __init__(self, config: dict = None):
         super().__init__(config)
-        # replace default Neon model with project aina model
-        Model.langs["ca"]["model"] = "projecte-aina/stt-ca-citrinet-512"
         self.lang = self.config.get('lang') or "ca"
         self.models: Dict[str, Model] = {}
         lang = self.lang.split("-")[0]
diff --git a/ovos_stt_plugin_citrinet/engine.py b/ovos_stt_plugin_citrinet/engine.py
new file mode 100644
index 0000000..03cd8b8
--- /dev/null
+++ b/ovos_stt_plugin_citrinet/engine.py
@@ -0,0 +1,204 @@
+# taken from https://github.com/NeonGeckoCom/streaming-stt-nemo
+
+# NEON AI (TM) SOFTWARE, Software Development Kit & Application Framework
+# All trademark and other rights reserved by their respective owners
+# Copyright 2008-2022 Neongecko.com Inc.
+# Contributors: Daniel McKnight, Guy Daniels, Elon Gasper, Richard Leeds,
+# Regina Bloomstine, Casimiro Ferreira, Andrii Pernatii, Kirill Hrymailo
+# BSD-3 License
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# 1. Redistributions of source code must retain the above copyright notice,
+#    this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+# 3. Neither the name of the copyright holder nor the names of its
+#    contributors may be used to endorse or promote products derived from this
+#    software without specific prior written permission.
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import ctypes
+import gc
+import os.path
+
+import numpy as np
+import onnxruntime as ort
+import sentencepiece as spm
+import soxr
+import torch
+from huggingface_hub import hf_hub_download
+from pydub import AudioSegment
+
+languages = {
+    "en": {
+        "model": "neongeckocom/stt_en_citrinet_512_gamma_0_25",
+    },
+    "es": {
+        "model": "neongeckocom/stt_es_citrinet_512_gamma_0_25",
+    },
+    "fr": {
+        "model": "neongeckocom/stt_fr_citrinet_512_gamma_0_25",
+    },
+    "de": {
+        "model": "neongeckocom/stt_de_citrinet_512_gamma_0_25",
+    },
+    "it": {
+        "model": "neongeckocom/stt_it_citrinet_512_gamma_0_25",
+    },
+    "uk": {
+        "model": "neongeckocom/stt_uk_citrinet_512_gamma_0_25",
+    },
+    "nl": {
+        "model": "neongeckocom/stt_nl_citrinet_512_gamma_0_25",
+    },
+    "pt": {
+        "model": "neongeckocom/stt_pt_citrinet_512_gamma_0_25",
+    },
+    "ca": {
+        "model": "projecte-aina/stt-ca-citrinet-512"
+    },
+}
+
+sample_rate = 16000
+subfolder_name = "onnx"
+available_languages = list(languages.keys())
+
+
+class Model:
+    langs = languages
+    sample_rate = sample_rate
+
+    def __init__(self, lang="en", model_folder=None):
+        if model_folder:
+            self._init_model_from_path(model_folder)
+        else:
+            self._init_model(lang)
+
+    def _init_model(self, lang: str):
+        model_name = self.langs[lang]["model"]
+        self._init_preprocessor(model_name)
+        self._init_encoder(model_name)
+        self._init_tokenizer(model_name)
+        self._trim_memory()
+
+    def _init_model_from_path(self, path: str):
+        if not os.path.isdir(path):
+            raise ValueError(f"'{path}' is not valid NemoSTT onnx model folder")
+        preprocessor_path = f"{path}/preprocessor.ts"
+        encoder_path = f"{path}/model.onnx"
+        tokenizer_path = f"{path}/tokenizer.spm"
+        self._init_preprocessor(preprocessor_path)
+        self._init_encoder(encoder_path)
+        self._init_tokenizer(tokenizer_path)
+        self._trim_memory()
+
+    def _init_preprocessor(self, model_name: str):
+        if os.path.isfile(model_name):
+            preprocessor_path = model_name
+        else:
+            preprocessor_path = hf_hub_download(model_name, "preprocessor.ts", subfolder=subfolder_name)
+        self.preprocessor = torch.jit.load(preprocessor_path)
+
+    def _init_encoder(self, model_name: str):
+        if os.path.isfile(model_name):
+            encoder_path = model_name
+        else:
+            encoder_path = hf_hub_download(model_name, "model.onnx", subfolder=subfolder_name)
+        self.encoder = ort.InferenceSession(encoder_path)
+
+    def _init_tokenizer(self, model_name: str):
+        if os.path.isfile(model_name):
+            tokenizer_path = model_name
+        else:
+            tokenizer_path = hf_hub_download(model_name, "tokenizer.spm", subfolder=subfolder_name)
+        self.tokenizer = spm.SentencePieceProcessor(tokenizer_path)
+
+    def _run_preprocessor(self, audio_16k: np.array):
+        input_signal = torch.tensor(audio_16k).unsqueeze(0)
+        length = torch.tensor(len(audio_16k)).unsqueeze(0)
+        processed_signal, processed_signal_len = self.preprocessor.forward(
+            input_signal=input_signal, length=length
+        )
+        processed_signal = processed_signal.numpy()
+        processed_signal_len = processed_signal_len.numpy()
+        return processed_signal, processed_signal_len
+
+    def _run_encoder(self, processed_signal: np.array, processed_signal_len: np.array):
+        outputs = self.encoder.run(None, {'audio_signal': processed_signal,
+                                          'length': processed_signal_len})
+        logits = outputs[0][0]
+        return logits
+
+    def _run_tokenizer(self, logits: np.array):
+        blank_id = self.tokenizer.vocab_size()
+        decoded_prediction = self._ctc_decode(logits, blank_id)
+        text = self.tokenizer.decode_ids(decoded_prediction)
+        current_hypotheses = [text]
+        return current_hypotheses
+
+    @staticmethod
+    def _ctc_decode(logits: np.array, blank_id: int):
+        labels = logits.argmax(axis=1).tolist()
+
+        previous = blank_id
+        decoded_prediction = []
+        for p in labels:
+            if (p != previous or previous == blank_id) and p != blank_id:
+                decoded_prediction.append(p)
+            previous = p
+        return decoded_prediction
+
+    def stt(self, audio_buffer: np.array, sr: int):
+        audio_fp32 = self._to_float32(audio_buffer)
+        audio_16k = self._resample(audio_fp32, sr)
+
+        processed_signal, processed_signal_len = self._run_preprocessor(audio_16k)
+        logits = self._run_encoder(processed_signal, processed_signal_len)
+        current_hypotheses = self._run_tokenizer(logits)
+
+        self._trim_memory()
+        return current_hypotheses
+
+    def stt_file(self, file_path: str):
+        audio_buffer, sr = self.read_file(file_path)
+        current_hypotheses = self.stt(audio_buffer, sr)
+        return current_hypotheses
+
+    def read_file(self, file_path: str):
+        audio_file = AudioSegment.from_file(file_path)
+        sr = audio_file.frame_rate
+
+        samples = audio_file.get_array_of_samples()
+        audio_buffer = np.array(samples)
+        return audio_buffer, sr
+
+    @staticmethod
+    def _trim_memory():
+        """
+        If possible, gives memory allocated by PyTorch back to the system
+        """
+        libc = ctypes.CDLL("libc.so.6")
+        libc.malloc_trim(0)
+        gc.collect()
+
+    def _resample(self, audio_fp32: np.array, sr: int):
+        audio_16k = soxr.resample(audio_fp32, sr, self.sample_rate)
+        return audio_16k
+
+    def _to_float32(self, audio_buffer: np.array):
+        audio_fp32 = np.divide(audio_buffer, np.iinfo(audio_buffer.dtype).max, dtype=np.float32)
+        return audio_fp32
+
+
+__all__ = ["Model", "available_languages"]
diff --git a/requirements.txt b/requirements.txt
index 9d8ff17..4282e20 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,12 @@
 ovos-plugin-manager>=0.0.24
 ovos-utils~=0.0,>=0.0.30
-streaming-stt-nemo~=0.2
-SpeechRecognition~=3.8
\ No newline at end of file
+SpeechRecognition~=3.8
+# model
+torch
+onnxruntime
+sentencepiece
+# resampling
+soxr
+pydub
+# huggingface
+huggingface-hub
\ No newline at end of file

From 2bdca955843f4dc31896df80eaad2ac1a97306fe Mon Sep 17 00:00:00 2001
From: miro
Date: Fri, 13 Dec 2024 17:18:16 +0000
Subject: [PATCH 2/5] cpu only pytorch

---
 README.md        | 6 ++++++
 requirements.txt | 5 +++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 0cff8c6..d00304a 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,12 @@ for [Nemo Citrinet](https://docs.nvidia.com/nemo-framework/user-guide/latest/nem
 
 ## Install
 
+By default this plugin installs the full PyTorch package. To avoid pulling in all of its dependencies, it is recommended to install the CPU-only build of PyTorch **before** installing the plugin:
+
+`pip install torch==2.1.0+cpu -f https://download.pytorch.org/whl/torch_stable.html`
+
+If you skip this step, the full PyTorch package will be installed together with the plugin.
+
 `pip install ovos-stt-plugin-citrinet`
 
 ## Configuration
diff --git a/requirements.txt b/requirements.txt
index 4282e20..75a1eff 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,11 +2,12 @@ ovos-plugin-manager>=0.0.24
 ovos-utils~=0.0,>=0.0.30
 SpeechRecognition~=3.8
 # model
-torch
+torch>=2.1.0
 onnxruntime
 sentencepiece
 # resampling
 soxr
 pydub
 # huggingface
-huggingface-hub
\ No newline at end of file
+huggingface-hub
+numpy<2.0.0
\ No newline at end of file

From 66fe63610e9e69ccfa1092075662e16911db369f Mon Sep 17 00:00:00 2001
From: JarbasAI <33701864+JarbasAl@users.noreply.github.com>
Date: Fri, 13 Dec 2024 17:24:33 +0000
Subject: [PATCH 3/5] Apply suggestions from code review

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
---
 ovos_stt_plugin_citrinet/engine.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ovos_stt_plugin_citrinet/engine.py b/ovos_stt_plugin_citrinet/engine.py
index 03cd8b8..f0e6cff 100644
--- a/ovos_stt_plugin_citrinet/engine.py
+++ b/ovos_stt_plugin_citrinet/engine.py
@@ -86,6 +86,8 @@ def __init__(self, lang="en", model_folder=None):
             self._init_model(lang)
 
     def _init_model(self, lang: str):
+        if lang not in self.langs:
+            raise ValueError(f"Unsupported language '{lang}'. Available languages: {list(self.langs.keys())}")
         model_name = self.langs[lang]["model"]
         self._init_preprocessor(model_name)
         self._init_encoder(model_name)

From 404a4034609fbc045a29c3b337a12d49ef36c6eb Mon Sep 17 00:00:00 2001
From: miro
Date: Fri, 13 Dec 2024 17:27:25 +0000
Subject: [PATCH 4/5] allow lower pytorch version

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 75a1eff..7d706c8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@ ovos-plugin-manager>=0.0.24
 ovos-utils~=0.0,>=0.0.30
 SpeechRecognition~=3.8
 # model
-torch>=2.1.0
+torch>=1.13.1
 onnxruntime
 sentencepiece
 # resampling

From 4ac43e16f1096e72c7026a01b96e82a13f54c625 Mon Sep 17 00:00:00 2001
From: miro
Date: Fri, 13 Dec 2024 17:29:50 +0000
Subject: [PATCH 5/5] .

---
 .github/workflows/unit_tests.yml | 66 --------------------------------
 1 file changed, 66 deletions(-)
 delete mode 100644 .github/workflows/unit_tests.yml

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
deleted file mode 100644
index 70ff43a..0000000
--- a/.github/workflows/unit_tests.yml
+++ /dev/null
@@ -1,66 +0,0 @@
-name: Run UnitTests
-on:
-  pull_request:
-    branches:
-      - dev
-    paths-ignore:
-      - 'ovos_stt_plugin_citrinet/version.py'
-      - 'requirements/**'
-      - 'examples/**'
-      - '.github/**'
-      - '.gitignore'
-      - 'LICENSE'
-      - 'CHANGELOG.md'
-      - 'MANIFEST.in'
-      - 'README.md'
-      - 'scripts/**'
-  push:
-    branches:
-      - master
-    paths-ignore:
-      - 'ovos_stt_plugin_citrinet/version.py'
-      - 'requirements/**'
-      - 'examples/**'
-      - '.github/**'
-      - '.gitignore'
-      - 'LICENSE'
-      - 'CHANGELOG.md'
-      - 'MANIFEST.in'
-      - 'README.md'
-      - 'scripts/**'
-  workflow_dispatch:
-
-jobs:
-  unit_tests:
-    strategy:
-      max-parallel: 2
-      matrix:
-        python-version: [ 3.7, 3.8, 3.9, "3.10" ]
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-      - name: Set up python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install System Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt install python3-dev
-          python -m pip install build wheel
-      - name: Install core repo
-        run: |
-          pip install .
-      - name: Install test dependencies
-        run: |
-          pip install pytest pytest-timeout pytest-cov
-      - name: Run unittests
-        run: |
-          pytest --cov=ovos_stt_plugin_citrinet --cov-report xml test/unittests
-        # NOTE: additional pytest invocations should also add the --cov-append flag
-        # or they will overwrite previous invocations' coverage reports
-        # (for an example, see OVOS Skill Manager's workflow)
-      - name: Upload coverage
-        env:
-          CODECOV_TOKEN: ${{secrets.CODECOV_TOKEN}}
-        uses: codecov/codecov-action@v2
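
For reference, a minimal usage sketch of the vendored engine added in PATCH 1/5, based only on the API shown in the diff above. It assumes the model files (preprocessor.ts, model.onnx, tokenizer.spm) are fetched from HuggingFace Hub on first run; "speech.wav" and "/path/to/onnx" are hypothetical placeholders, not part of the patches.

from ovos_stt_plugin_citrinet.engine import Model, available_languages

# languages defined in engine.py: en, es, fr, de, it, uk, nl, pt, ca
print(available_languages)

# Loading a language downloads the onnx model files for the repo listed in the
# `languages` dict (for "ca" this is projecte-aina/stt-ca-citrinet-512).
model = Model(lang="ca")

# stt_file() reads the recording with pydub, resamples it to 16 kHz and returns
# a list holding a single transcription string.
hypotheses = model.stt_file("speech.wav")  # "speech.wav" is a placeholder path
print(hypotheses[0])

# A local folder containing preprocessor.ts, model.onnx and tokenizer.spm can
# also be used instead of a HuggingFace Hub repo:
# model = Model(model_folder="/path/to/onnx")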