diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp
index b5da66722..47181452d 100644
--- a/nntrainer/tensor/blas_interface.cpp
+++ b/nntrainer/tensor/blas_interface.cpp
@@ -979,6 +979,16 @@ void scopy_int8_to_float32(const unsigned int N, const int8_t *X,
   }
 }
 
+void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y) {
+#ifdef USE_NEON
+  nntrainer::neon::copy_s16_fp32(N, X, Y);
+#else
+  for (unsigned int idx = 0; idx < N; ++idx) {
+    Y[idx] = (float)X[idx];
+  }
+#endif
+}
+
 float snrm2(const int N, const float *X, const int incX) {
 #ifdef USE_BLAS
 #ifdef BLAS_NUM_THREADS
diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/blas_interface.h
index 066ed6e53..f31e84d30 100644
--- a/nntrainer/tensor/blas_interface.h
+++ b/nntrainer/tensor/blas_interface.h
@@ -320,6 +320,14 @@ void scopy_int8_to_float32(const unsigned int N, const uint8_t *X,
 void scopy_int8_to_float32(const unsigned int N, const int8_t *X,
                            const int incX, float *Y, const int intY);
 
+/**
+ * @brief copy function : Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X int16_t * for Vector X
+ * @param[in] Y float * for Vector Y
+ */
+void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y);
+
 /**
  * @brief sdot computation : sum of all X * Y
  * @param[in] N number of elements in Y
diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp
index 3a47b949a..a6f598867 100644
--- a/nntrainer/tensor/blas_neon.cpp
+++ b/nntrainer/tensor/blas_neon.cpp
@@ -1597,6 +1597,14 @@ void copy_int8_to_fp32(const unsigned int N, const int8_t *X, float *Y) {
   }
 }
 
+void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y) {
+  /// @todo vectorize the int16_t to fp32 copy with NEON intrinsics
+  unsigned int idx = 0;
+  for (; (N - idx) >= 1; ++idx) {
+    Y[idx] = X[idx];
+  }
+}
+
 void copy_fp16_to_fp32(const unsigned int N, const __fp16 *X, float *Y) {
 
   unsigned int idx = 0;
diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h
index 1cfd15712..26522a3c9 100644
--- a/nntrainer/tensor/blas_neon.h
+++ b/nntrainer/tensor/blas_neon.h
@@ -88,6 +88,15 @@ void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y);
  * @param[in] Y int8_t * for Vector Y
  */
 void copy_int8(const unsigned int N, const int8_t *X, int8_t *Y);
+
+/**
+ * @brief copy function with neon: Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X int16_t * for Vector X
+ * @param[in] Y float * for Vector Y
+ */
+void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y);
+
 /**
  * @brief sine with neon: Y = sin(alpha * X)
  * @param[in] N number of elements in X
diff --git a/nntrainer/tensor/float_tensor.cpp b/nntrainer/tensor/float_tensor.cpp
index 9c31c40f2..22d0b4805 100644
--- a/nntrainer/tensor/float_tensor.cpp
+++ b/nntrainer/tensor/float_tensor.cpp
@@ -763,6 +763,9 @@ void FloatTensor::copyData(const Tensor &from) {
     throw std::invalid_argument("Error: enable-fp16 is not enabled");
 #endif
     break;
+  case ml::train::TensorDim::DataType::QINT16:
+    copy_s16_fp32(from.size(), from.getData<int16_t>(), (float *)getData());
+    break;
   case ml::train::TensorDim::DataType::QINT8:
     scopy_int8_to_float32(from.size(), from.getData<int8_t>(), 1,
                           (float *)getData(), 1);
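
Note on the @todo left in nntrainer::neon::copy_s16_fp32: the body added in blas_neon.cpp is still a plain scalar loop. A possible vectorized follow-up is sketched below. It is not part of this patch; it assumes AArch64 NEON with <arm_neon.h> available and widens eight int16_t lanes per iteration before converting them to float.

#include <arm_neon.h>

// Sketch only: copy/convert int16_t -> float, 8 elements per iteration,
// with a scalar tail loop for the remaining elements. Assumes X and Y
// do not overlap.
void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y) {
  unsigned int idx = 0;
  for (; (N - idx) >= 8; idx += 8) {
    int16x8_t v = vld1q_s16(&X[idx]);           // load 8 x int16
    int32x4_t lo = vmovl_s16(vget_low_s16(v));  // widen lanes 0..3 to int32
    int32x4_t hi = vmovl_s16(vget_high_s16(v)); // widen lanes 4..7 to int32
    vst1q_f32(&Y[idx], vcvtq_f32_s32(lo));      // convert + store lanes 0..3
    vst1q_f32(&Y[idx + 4], vcvtq_f32_s32(hi));  // convert + store lanes 4..7
  }
  for (; idx < N; ++idx) {                      // scalar tail
    Y[idx] = (float)X[idx];
  }
}

The widen-then-convert sequence (vmovl_s16 followed by vcvtq_f32_s32) keeps the same chunk-plus-scalar-tail structure used by the other copy_*_to_fp32 helpers in blas_neon.cpp.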