diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp
index b5da66722..47181452d 100644
--- a/nntrainer/tensor/blas_interface.cpp
+++ b/nntrainer/tensor/blas_interface.cpp
@@ -979,6 +979,16 @@ void scopy_int8_to_float32(const unsigned int N, const int8_t *X,
   }
 }
 
+void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y) {
+#ifdef USE_NEON
+  nntrainer::neon::copy_s16_fp32(N, X, Y);
+#else
+  for (unsigned int idx = 0; idx < N; ++idx) {
+    Y[idx] = (float)X[idx];
+  }
+#endif
+}
+
 float snrm2(const int N, const float *X, const int incX) {
 #ifdef USE_BLAS
 #ifdef BLAS_NUM_THREADS
diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/blas_interface.h
index 066ed6e53..f31e84d30 100644
--- a/nntrainer/tensor/blas_interface.h
+++ b/nntrainer/tensor/blas_interface.h
@@ -320,6 +320,14 @@ void scopy_int8_to_float32(const unsigned int N, const uint8_t *X,
 void scopy_int8_to_float32(const unsigned int N, const int8_t *X,
                            const int incX, float *Y, const int intY);
 
+/**
+ * @brief copy function : Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X int16_t * for Vector X
+ * @param[in] Y float * for Vector Y
+ */
+void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y);
+
 /**
  * @brief sdot computation : sum of all X * Y
  * @param[in] N number of elements in Y
diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp
index 3a47b949a..a6f598867 100644
--- a/nntrainer/tensor/blas_neon.cpp
+++ b/nntrainer/tensor/blas_neon.cpp
@@ -1597,6 +1597,14 @@ void copy_int8_to_fp32(const unsigned int N, const int8_t *X, float *Y) {
   }
 }
 
+void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y) {
+  /// @todo vectorize the int16_t to fp32 copy with NEON intrinsics
+  unsigned int idx = 0;
+  for (; (N - idx) >= 1; ++idx) {
+    Y[idx] = X[idx];
+  }
+}
+
 void copy_fp16_to_fp32(const unsigned int N, const __fp16 *X, float *Y) {
 
   unsigned int idx = 0;
diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h
index 1cfd15712..26522a3c9 100644
--- a/nntrainer/tensor/blas_neon.h
+++ b/nntrainer/tensor/blas_neon.h
@@ -88,6 +88,15 @@ void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y);
  * @param[in] Y int8_t * for Vector Y
  */
 void copy_int8(const unsigned int N, const int8_t *X, int8_t *Y);
+
+/**
+ * @brief copy function with neon: Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X int16_t * for Vector X
+ * @param[in] Y float * for Vector Y
+ */
+void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y);
+
 /**
  * @brief sine with neon: Y = sin(alpha * X)
  * @param[in] N number of elements in X
diff --git a/nntrainer/tensor/float_tensor.cpp b/nntrainer/tensor/float_tensor.cpp
index 9c31c40f2..22d0b4805 100644
--- a/nntrainer/tensor/float_tensor.cpp
+++ b/nntrainer/tensor/float_tensor.cpp
@@ -763,6 +763,9 @@ void FloatTensor::copyData(const Tensor &from) {
     throw std::invalid_argument("Error: enable-fp16 is not enabled");
 #endif
     break;
+  case ml::train::TensorDim::DataType::QINT16:
+    copy_s16_fp32(from.size(), from.getData<int16_t>(), (float *)getData());
+    break;
   case ml::train::TensorDim::DataType::QINT8:
     scopy_int8_to_float32(from.size(), from.getData<int8_t>(), 1,
                           (float *)getData(), 1);
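
Note on the @todo left in nntrainer::neon::copy_s16_fp32: the body added in blas_neon.cpp is still a plain scalar loop. A possible vectorized follow-up is sketched below. It is not part of this patch; it assumes AArch64 NEON with <arm_neon.h> available and widens eight int16_t lanes per iteration before converting them to float.

#include <arm_neon.h>

// Sketch only: copy/convert int16_t -> float, 8 elements per iteration,
// with a scalar tail loop for the remaining elements. Assumes X and Y
// do not overlap.
void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y) {
  unsigned int idx = 0;
  for (; (N - idx) >= 8; idx += 8) {
    int16x8_t v = vld1q_s16(&X[idx]);           // load 8 x int16
    int32x4_t lo = vmovl_s16(vget_low_s16(v));  // widen lanes 0..3 to int32
    int32x4_t hi = vmovl_s16(vget_high_s16(v)); // widen lanes 4..7 to int32
    vst1q_f32(&Y[idx], vcvtq_f32_s32(lo));      // convert + store lanes 0..3
    vst1q_f32(&Y[idx + 4], vcvtq_f32_s32(hi));  // convert + store lanes 4..7
  }
  for (; idx < N; ++idx) {                      // scalar tail
    Y[idx] = (float)X[idx];
  }
}

The widen-then-convert sequence (vmovl_s16 followed by vcvtq_f32_s32) keeps the same chunk-plus-scalar-tail structure used by the other copy_*_to_fp32 helpers in blas_neon.cpp.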