[ Wait for #2876 ] [ neon ] Implement neon kernel for copy_s16_fp32 #2881

Open · wants to merge 2 commits into base: main
14 changes: 14 additions & 0 deletions nntrainer/tensor/blas_interface.cpp
@@ -979,6 +979,20 @@ void scopy_int8_to_float32(const unsigned int N, const int8_t *X,
  }
}

static inline void copy_s16_fp32_fallback(const unsigned int N,
                                          const int16_t *X, float *Y) {
  for (unsigned int idx = 0; idx < N; ++idx) {
    Y[idx] = (float)X[idx];
  }
}

void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y) {
#ifdef USE_NEON
  nntrainer::neon::copy_s16_fp32(N, X, Y);
#else
  copy_s16_fp32_fallback(N, X, Y);
#endif
}

float snrm2(const int N, const float *X, const int incX) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
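Reviewer note: every int16_t value is exactly representable in a float (|x| ≤ 2^15 < 2^24), so the NEON path and the scalar fallback must agree bit-for-bit. A minimal spot-check for the new entry point, assuming it is linked from blas_interface (hypothetical harness, not part of this PR):

```cpp
// Hypothetical spot-check: 10 elements exercise one full 8-lane
// iteration plus a 2-element scalar tail on NEON builds.
#include <cassert>
#include <cstdint>
#include <vector>

// Declared in nntrainer/tensor/blas_interface.h (namespace
// qualification may be required depending on the build).
void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y);

int main() {
  std::vector<int16_t> src = {-32768, -1000, -1, 0, 1, 7, 999, 32767, 42, -42};
  std::vector<float> dst(src.size(), 0.0f);
  copy_s16_fp32(src.size(), src.data(), dst.data());
  for (size_t i = 0; i < src.size(); ++i) {
    assert(dst[i] == (float)src[i]); // exact widening, no rounding
  }
  return 0;
}
```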
8 changes: 8 additions & 0 deletions nntrainer/tensor/blas_interface.h
@@ -320,6 +320,14 @@ void scopy_int8_to_float32(const unsigned int N, const uint8_t *X,
void scopy_int8_to_float32(const unsigned int N, const int8_t *X,
                           const int incX, float *Y, const int intY);

/**
 * @brief copy function : Y = X
 * @param[in] N number of elements in X
 * @param[in] X int16_t * for Vector X
 * @param[in] Y float * for Vector Y
 */
void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y);

/**
 * @brief sdot computation : sum of all X * Y
 * @param[in] N number of elements in Y
23 changes: 23 additions & 0 deletions nntrainer/tensor/blas_neon.cpp
@@ -1597,6 +1597,29 @@ void copy_int8_to_fp32(const unsigned int N, const int8_t *X, float *Y) {
  }
}

void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y) {
  unsigned int idx = 0;
  // process 8 elements per iteration
  for (; (N - idx) >= 8; idx += 8) {
    int16x8_t batch = vld1q_s16(&X[idx]);
    int16x4_t low = vget_low_s16(batch);
    int16x4_t high = vget_high_s16(batch);

    // widen to s32
    int32x4_t low_s32 = vmovl_s16(low);
    int32x4_t high_s32 = vmovl_s16(high);

    // convert to f32
    float32x4_t low_f32 = vcvtq_f32_s32(low_s32);
    float32x4_t high_f32 = vcvtq_f32_s32(high_s32);

    vst1q_f32(&Y[idx], low_f32);
    vst1q_f32(&Y[idx + 4], high_f32);
  }
  // scalar loop for the remaining N % 8 elements
  for (; (N - idx) >= 1; ++idx) {
    Y[idx] = (float)X[idx];
  }
}

void copy_fp16_to_fp32(const unsigned int N, const __fp16 *X, float *Y) {
  unsigned int idx = 0;

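The low/high split in the kernel above is needed because NEON has no single s16-to-f32 conversion: each 8-lane batch is sign-extended to two int32x4_t halves with vmovl_s16, then converted with vcvtq_f32_s32 (exact, since every int16 fits inside float's 24-bit significand). A standalone sketch of that data flow, assuming an AArch64 toolchain with arm_neon.h (illustrative, not part of the PR):

```cpp
// Widen-then-convert pattern for one 8-element block.
#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

int main() {
  const int16_t x[8] = {-4, -3, -2, -1, 1, 2, 3, 4};
  float y[8];

  int16x8_t batch = vld1q_s16(x);
  // Sign-extend each 4-lane half from s16 to s32.
  int32x4_t lo_s32 = vmovl_s16(vget_low_s16(batch));  // lanes 0..3
  int32x4_t hi_s32 = vmovl_s16(vget_high_s16(batch)); // lanes 4..7
  // Convert s32 -> f32 and store both halves.
  vst1q_f32(y, vcvtq_f32_s32(lo_s32));
  vst1q_f32(y + 4, vcvtq_f32_s32(hi_s32));

  for (int i = 0; i < 8; ++i)
    printf("%d -> %.1f\n", x[i], y[i]);
  return 0;
}
```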
9 changes: 9 additions & 0 deletions nntrainer/tensor/blas_neon.h
@@ -88,6 +88,15 @@ void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y);
 * @param[in] Y int8_t * for Vector Y
 */
void copy_int8(const unsigned int N, const int8_t *X, int8_t *Y);

/**
 * @brief copy function with neon: Y = X
 * @param[in] N number of elements in X
 * @param[in] X int16_t * for Vector X
 * @param[in] Y float * for Vector Y
 */
void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y);

/**
 * @brief sine with neon: Y = sin(alpha * X)
 * @param[in] N number of elements in X
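Since the point of the kernel is throughput, a reviewer may want to time it against the scalar loop. A hypothetical micro-benchmark sketch (not part of the PR; the declaration mirrors blas_neon.h, and the buffer size and repetition count are arbitrary):

```cpp
// Hypothetical timing harness: NEON kernel vs. plain scalar loop.
// Build for AArch64 with -O2 and link against nntrainer.
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <vector>

namespace nntrainer {
namespace neon {
void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y);
} // namespace neon
} // namespace nntrainer

static void scalar_ref(unsigned int N, const int16_t *X, float *Y) {
  for (unsigned int i = 0; i < N; ++i)
    Y[i] = (float)X[i];
}

int main() {
  const unsigned int N = 1u << 20;
  std::vector<int16_t> x(N, 123);
  std::vector<float> y(N);

  auto time_ms = [&](auto &&fn) {
    auto t0 = std::chrono::steady_clock::now();
    for (int rep = 0; rep < 100; ++rep)
      fn(N, x.data(), y.data());
    auto t1 = std::chrono::steady_clock::now();
    return std::chrono::duration<double, std::milli>(t1 - t0).count();
  };

  printf("scalar: %.2f ms\n", time_ms(scalar_ref));
  printf("neon  : %.2f ms\n", time_ms(nntrainer::neon::copy_s16_fp32));
  return 0;
}
```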
3 changes: 3 additions & 0 deletions nntrainer/tensor/float_tensor.cpp
@@ -763,6 +763,9 @@ void FloatTensor::copyData(const Tensor &from) {
    throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
    break;
  case ml::train::TensorDim::DataType::QINT16:
    copy_s16_fp32(from.size(), from.getData<int16_t>(), (float *)getData());
    break;
  case ml::train::TensorDim::DataType::QINT8:
    scopy_int8_to_float32(from.size(), from.getData<int8_t>(), 1,
                          (float *)getData(), 1);