From 15ae292d22139f030cf01e13c82d7895397e658f Mon Sep 17 00:00:00 2001
From: Donghyeon Jeong
Date: Wed, 15 Jan 2025 10:10:42 +0900
Subject: [PATCH 1/2] [Blas] copy functionality for signed int16 data type

This pull request adds support for copying the signed int16 data type
into fp32. Note that this implementation does not use SIMD yet.

**Self-evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Donghyeon Jeong
---
 nntrainer/tensor/blas_interface.cpp | 10 ++++++++++
 nntrainer/tensor/blas_interface.h   |  8 ++++++++
 nntrainer/tensor/blas_neon.cpp      |  8 ++++++++
 nntrainer/tensor/blas_neon.h        |  9 +++++++++
 nntrainer/tensor/float_tensor.cpp   |  3 +++
 5 files changed, 38 insertions(+)

diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp
index b5da66722..47181452d 100644
--- a/nntrainer/tensor/blas_interface.cpp
+++ b/nntrainer/tensor/blas_interface.cpp
@@ -979,6 +979,16 @@ void scopy_int8_to_float32(const unsigned int N, const int8_t *X,
   }
 }
 
+void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y) {
+#ifdef USE_NEON
+  nntrainer::neon::copy_s16_fp32(N, X, Y);
+#else
+  for (unsigned int idx = 0; idx < N; ++idx) {
+    Y[idx] = (float)X[idx];
+  }
+#endif
+}
+
 float snrm2(const int N, const float *X, const int incX) {
 #ifdef USE_BLAS
 #ifdef BLAS_NUM_THREADS
diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/blas_interface.h
index 066ed6e53..f31e84d30 100644
--- a/nntrainer/tensor/blas_interface.h
+++ b/nntrainer/tensor/blas_interface.h
@@ -320,6 +320,14 @@ void scopy_int8_to_float32(const unsigned int N, const uint8_t *X,
 void scopy_int8_to_float32(const unsigned int N, const int8_t *X,
                            const int incX, float *Y, const int intY);
 
+/**
+ * @brief copy function : Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X int16_t * for Vector X
+ * @param[out] Y float * for Vector Y
+ */
+void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y);
+
 /**
  * @brief sdot computation : sum of all X * Y
  * @param[in] N number of elements in Y
diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp
index 3a47b949a..a6f598867 100644
--- a/nntrainer/tensor/blas_neon.cpp
+++ b/nntrainer/tensor/blas_neon.cpp
@@ -1597,6 +1597,14 @@ void copy_int8_to_fp32(const unsigned int N, const int8_t *X, float *Y) {
   }
 }
 
+void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y) {
+  /// @todo implement a NEON kernel for int16_t to fp32
+  unsigned int idx = 0;
+  for (; (N - idx) >= 1; ++idx) {
+    Y[idx] = X[idx];
+  }
+}
+
 void copy_fp16_to_fp32(const unsigned int N, const __fp16 *X, float *Y) {
 
   unsigned int idx = 0;
diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h
index 1cfd15712..26522a3c9 100644
--- a/nntrainer/tensor/blas_neon.h
+++ b/nntrainer/tensor/blas_neon.h
@@ -88,6 +88,15 @@ void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y);
  * @param[in] Y int8_t * for Vector Y
  */
 void copy_int8(const unsigned int N, const int8_t *X, int8_t *Y);
+
+/**
+ * @brief copy function with neon: Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X int16_t * for Vector X
+ * @param[out] Y float * for Vector Y
+ */
+void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y);
+
 /**
  * @brief sine with neon: Y = sin(alpha * X)
  * @param[in] N number of elements in X
diff --git a/nntrainer/tensor/float_tensor.cpp b/nntrainer/tensor/float_tensor.cpp
index 9c31c40f2..22d0b4805 100644
--- a/nntrainer/tensor/float_tensor.cpp
+++ b/nntrainer/tensor/float_tensor.cpp
@@ -763,6 +763,9 @@ void FloatTensor::copyData(const Tensor &from) {
     throw std::invalid_argument("Error: enable-fp16 is not enabled");
 #endif
     break;
+  case ml::train::TensorDim::DataType::QINT16:
+    copy_s16_fp32(from.size(), from.getData<int16_t>(), (float *)getData());
+    break;
   case ml::train::TensorDim::DataType::QINT8:
     scopy_int8_to_float32(from.size(), from.getData<int8_t>(), 1,
                           (float *)getData(), 1);
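For reference, here is a minimal sketch of how the new scalar path could be exercised. The forward declaration, namespace placement, and test scaffolding are assumptions made for illustration; only the `copy_s16_fp32(N, X, Y)` signature comes from the patch above.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Assumed declaration, matching the signature added to blas_interface.h above.
namespace nntrainer {
void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y);
}

int main() {
  const unsigned int N = 10;
  std::vector<int16_t> src(N);
  std::vector<float> dst(N, 0.0f);
  for (unsigned int i = 0; i < N; ++i)
    src[i] = static_cast<int16_t>(static_cast<int>(i) - 5); // include negatives

  // Widen each signed 16-bit element to fp32: Y = (float)X.
  nntrainer::copy_s16_fp32(N, src.data(), dst.data());

  for (unsigned int i = 0; i < N; ++i)
    assert(dst[i] == static_cast<float>(src[i]));
  return 0;
}
```

Every `int16_t` value is exactly representable in fp32 (24-bit significand), so the output can be checked with exact equality.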
From 4439e05491ffd1b1328f721472d97c27870314c1 Mon Sep 17 00:00:00 2001
From: skykongkong8
Date: Thu, 16 Jan 2025 13:37:50 +0900
Subject: [PATCH 2/2] [ neon ] Implement neon kernel for copy_s16_fp32

- Load s16 values, widen to s32, convert to f32, and store.
- Add a fallback function with the same parameters as the NEON kernel to
  make later refactoring easier.

**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8
---
 nntrainer/tensor/blas_interface.cpp | 11 ++++++++---
 nntrainer/tensor/blas_neon.cpp      | 17 ++++++++++++++++-
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp
index 47181452d..d795aad3f 100644
--- a/nntrainer/tensor/blas_interface.cpp
+++ b/nntrainer/tensor/blas_interface.cpp
@@ -979,16 +979,21 @@ void scopy_int8_to_float32(const unsigned int N, const int8_t *X,
   }
 }
 
+static inline void copy_s16_fp32_fallback(const unsigned int N,
+                                          const int16_t *X, float *Y) {
+  for (unsigned int idx = 0; idx < N; ++idx) {
+    Y[idx] = (float)X[idx];
+  }
+}
+
 void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y) {
 #ifdef USE_NEON
   nntrainer::neon::copy_s16_fp32(N, X, Y);
 #else
-  for (unsigned int idx = 0; idx < N; ++idx) {
-    Y[idx] = (float)X[idx];
-  }
+  copy_s16_fp32_fallback(N, X, Y);
 #endif
 }
 
 float snrm2(const int N, const float *X, const int incX) {
 #ifdef USE_BLAS
 #ifdef BLAS_NUM_THREADS
diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp
index a6f598867..e8e18061e 100644
--- a/nntrainer/tensor/blas_neon.cpp
+++ b/nntrainer/tensor/blas_neon.cpp
@@ -1598,8 +1598,23 @@ void copy_int8_to_fp32(const unsigned int N, const int8_t *X, float *Y) {
 }
 
 void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y) {
-  /// @todo implement a NEON kernel for int16_t to fp32
   unsigned int idx = 0;
+  for (; (N - idx) >= 8; idx += 8) {
+    int16x8_t batch = vld1q_s16(&X[idx]);
+    int16x4_t low = vget_low_s16(batch);
+    int16x4_t high = vget_high_s16(batch);
+
+    // widen to s32
+    int32x4_t low_s32 = vmovl_s16(low);
+    int32x4_t high_s32 = vmovl_s16(high);
+
+    // convert to f32
+    float32x4_t low_f32 = vcvtq_f32_s32(low_s32);
+    float32x4_t high_f32 = vcvtq_f32_s32(high_s32);
+
+    vst1q_f32(&Y[idx], low_f32);
+    vst1q_f32(&Y[idx + 4], high_f32);
+  }
   for (; (N - idx) >= 1; ++idx) {
     Y[idx] = X[idx];
   }
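A quick way to sanity-check the NEON kernel against the scalar path is to run both over a length that is not a multiple of 8, so that the vector body and the remainder loop are both exercised. The snippet below is a hypothetical standalone harness, not part of the patch; it restates the kernel for illustration and assumes an AArch64 build where `arm_neon.h` is available.

```cpp
#include <arm_neon.h>
#include <cstdint>
#include <cstdio>
#include <vector>

// Standalone restatement of the kernel above, for illustration only.
static void copy_s16_fp32_neon(unsigned int N, const int16_t *X, float *Y) {
  unsigned int idx = 0;
  for (; (N - idx) >= 8; idx += 8) {
    int16x8_t batch = vld1q_s16(&X[idx]);           // load 8 x s16
    int32x4_t lo = vmovl_s16(vget_low_s16(batch));  // widen low 4 lanes to s32
    int32x4_t hi = vmovl_s16(vget_high_s16(batch)); // widen high 4 lanes to s32
    vst1q_f32(&Y[idx], vcvtq_f32_s32(lo));          // convert to f32 and store
    vst1q_f32(&Y[idx + 4], vcvtq_f32_s32(hi));
  }
  for (; idx < N; ++idx) // scalar tail when N is not a multiple of 8
    Y[idx] = static_cast<float>(X[idx]);
}

int main() {
  const unsigned int N = 19; // deliberately not a multiple of 8
  std::vector<int16_t> x(N);
  std::vector<float> got(N), ref(N);
  for (unsigned int i = 0; i < N; ++i)
    x[i] = static_cast<int16_t>(1000 * static_cast<int>(i) - 9000);

  copy_s16_fp32_neon(N, x.data(), got.data());
  for (unsigned int i = 0; i < N; ++i)
    ref[i] = static_cast<float>(x[i]); // scalar reference

  for (unsigned int i = 0; i < N; ++i)
    if (got[i] != ref[i]) {
      std::printf("mismatch at %u: %f vs %f\n", i, got[i], ref[i]);
      return 1;
    }
  std::printf("NEON path matches scalar reference for N = %u\n", N);
  return 0;
}
```

Since both the widening move (`vmovl_s16`) and the s32-to-f32 conversion are exact over the int16 range, the NEON and scalar paths should agree bit-for-bit.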