diff --git a/include/tim/vx/ops/scatternd_onnx_v16.h b/include/tim/vx/ops/scatternd_onnx_v16.h
index 9b47efff4..9698e9a0c 100644
--- a/include/tim/vx/ops/scatternd_onnx_v16.h
+++ b/include/tim/vx/ops/scatternd_onnx_v16.h
@@ -34,14 +34,23 @@ namespace ops {
  *
  * Scatter updates into a new tensor according to indices.
  *
- * - shape : The shape of the resulting tensor.
+ * - reduction: Type of reduction to apply: none (default), add, mul, max, min.
  */
 
 class ScatterND_ONNX_V16 : public BuiltinOp {
  public:
-  ScatterND_ONNX_V16(Graph* graph);
+  enum ReductionType {
+    REDUCTION_NONE,
+    REDUCTION_ADD,
+    REDUCTION_MUL,
+    REDUCTION_MAX,
+    REDUCTION_MIN
+  };
+  ScatterND_ONNX_V16(Graph* graph,
+                     ReductionType reduction = ReductionType::REDUCTION_NONE);
   std::shared_ptr<Operation> Clone(std::shared_ptr<Graph>& graph) const override;
+ protected:
+  ReductionType reduction_;
 };
 
 }  // namespace ops
diff --git a/include/tim/vx/ops/simple_operations.h b/include/tim/vx/ops/simple_operations.h
index 3bd5bd3b7..7f8cc98a2 100644
--- a/include/tim/vx/ops/simple_operations.h
+++ b/include/tim/vx/ops/simple_operations.h
@@ -83,14 +83,14 @@ namespace ops {
  * returns the largest integer less than or equal to a given number.
  *
  * ## Ceil
- * 
+ *
  * returns the largest integer more than or equal to a given number.
- * 
+ *
  * ## Cast
  *
  * Change the format from input tensor to output tensor. This operation ignores
  * the scale and zeroPoint of quanized tensors.
- * 
+ *
  * ## Rcp
  * Computes the reciprocal of input element-wise.
  */
@@ -99,8 +99,8 @@ DECLARE_SIMPLE_OP(DataConvert)
 DECLARE_SIMPLE_OP(Neg)
 DECLARE_SIMPLE_OP(Abs)
 DECLARE_SIMPLE_OP(Sin)
-// TODO(jiangbo): enable it when internal ops supports `Cos`
-//DECLARE_SIMPLE_OP(Cos)
+DECLARE_SIMPLE_OP(Cos)
+DECLARE_SIMPLE_OP(Tan)
 DECLARE_SIMPLE_OP(Exp)
 DECLARE_SIMPLE_OP(Log)
 DECLARE_SIMPLE_OP(Sqrt)
diff --git a/src/tim/vx/ops/bidirectional_sequence_lstm.cc b/src/tim/vx/ops/bidirectional_sequence_lstm.cc
index 7a8745e8e..51e3a42bd 100644
--- a/src/tim/vx/ops/bidirectional_sequence_lstm.cc
+++ b/src/tim/vx/ops/bidirectional_sequence_lstm.cc
@@ -109,6 +109,16 @@ class BidirectionalSequenceLstmImpl : public OpImpl {
     BI_LSTM_BW_INPUT_LAYERNORM_C = 54,
     BI_LSTM_BW_INPUT_LAYERNORM_O = 55,
 
+    BI_LSTM_FW_INPUT_BIAS_R2I = 56,
+    BI_LSTM_FW_INPUT_BIAS_R2F = 57,
+    BI_LSTM_FW_INPUT_BIAS_R2C = 58,
+    BI_LSTM_FW_INPUT_BIAS_R2O = 59,
+
+    BI_LSTM_BW_INPUT_BIAS_R2I = 60,
+    BI_LSTM_BW_INPUT_BIAS_R2F = 61,
+    BI_LSTM_BW_INPUT_BIAS_R2C = 62,
+    BI_LSTM_BW_INPUT_BIAS_R2O = 63,
+
     INPUT_CNT,
 
     BI_LSTM_FW_OUTPUT_OUTPUT = 0,
@@ -147,7 +157,7 @@ class BidirectionalSequenceLstmImpl : public OpImpl {
       const std::shared_ptr<Tensor>& tensor) override {
     in_tensors_[input_tensor_index] = tensor;
 
-    if (this->input_tensor_index == INPUT_CNT - 1) {
+    if (this->input_tensor_index >= INPUT_CNT - 9) {
       // Get all input tensor
       lstm_forward_->BindInput(in_tensors_[BI_LSTM_INPUT_INPUT]);
       reverse_input_->BindInput(in_tensors_[BI_LSTM_INPUT_INPUT]);
@@ -183,6 +193,12 @@ class BidirectionalSequenceLstmImpl : public OpImpl {
       lstm_forward_->BindInput(in_tensors_[BI_LSTM_FW_INPUT_LAYERNORM_F]);
       lstm_forward_->BindInput(in_tensors_[BI_LSTM_FW_INPUT_LAYERNORM_C]);
       lstm_forward_->BindInput(in_tensors_[BI_LSTM_FW_INPUT_LAYERNORM_O]);
+      if(this->input_tensor_index == input_cnt_ - 1) {
+        lstm_forward_->BindInput(in_tensors_[BI_LSTM_FW_INPUT_BIAS_R2I]);
+        lstm_forward_->BindInput(in_tensors_[BI_LSTM_FW_INPUT_BIAS_R2F]);
+        lstm_forward_->BindInput(in_tensors_[BI_LSTM_FW_INPUT_BIAS_R2C]);
+        lstm_forward_->BindInput(in_tensors_[BI_LSTM_FW_INPUT_BIAS_R2O]);
+      }
 
       lstm_backward_->BindInput(bw_input_tensor_);
       lstm_backward_->BindInput(in_tensors_[BI_LSTM_BW_INPUT_H_STATE]);
@@ -214,6 +230,12 @@ class BidirectionalSequenceLstmImpl : public OpImpl {
       lstm_backward_->BindInput(in_tensors_[BI_LSTM_BW_INPUT_LAYERNORM_F]);
       lstm_backward_->BindInput(in_tensors_[BI_LSTM_BW_INPUT_LAYERNORM_C]);
       lstm_backward_->BindInput(in_tensors_[BI_LSTM_BW_INPUT_LAYERNORM_O]);
+      if(this->input_tensor_index == input_cnt_ - 1) {
+        lstm_backward_->BindInput(in_tensors_[BI_LSTM_BW_INPUT_BIAS_R2I]);
+        lstm_backward_->BindInput(in_tensors_[BI_LSTM_BW_INPUT_BIAS_R2F]);
+        lstm_backward_->BindInput(in_tensors_[BI_LSTM_BW_INPUT_BIAS_R2C]);
+        lstm_backward_->BindInput(in_tensors_[BI_LSTM_BW_INPUT_BIAS_R2O]);
+      }
     }
     this->input_tensor_index++;
     return *this;
diff --git a/src/tim/vx/ops/scatternd_onnx_v16.cc b/src/tim/vx/ops/scatternd_onnx_v16.cc
index 758783a3f..6c5669717 100644
--- a/src/tim/vx/ops/scatternd_onnx_v16.cc
+++ b/src/tim/vx/ops/scatternd_onnx_v16.cc
@@ -29,9 +29,29 @@ namespace tim {
 namespace vx {
 namespace ops {
 
+vsi_nn_reduction_type_e downcast_reduction_type (ScatterND_ONNX_V16::ReductionType type) {
+  switch (type)
+  {
+    case ScatterND_ONNX_V16::ReductionType::REDUCTION_NONE:
+      return VSI_NN_REDUCTION_TYPE_NONE;
+    case ScatterND_ONNX_V16::ReductionType::REDUCTION_ADD:
+      return VSI_NN_REDUCTION_TYPE_ADD;
+    case ScatterND_ONNX_V16::ReductionType::REDUCTION_MUL:
+      return VSI_NN_REDUCTION_TYPE_MUL;
+    case ScatterND_ONNX_V16::ReductionType::REDUCTION_MAX:
+      return VSI_NN_REDUCTION_TYPE_MAX;
+    case ScatterND_ONNX_V16::ReductionType::REDUCTION_MIN:
+      return VSI_NN_REDUCTION_TYPE_MIN;
+    default:
+      return VSI_NN_REDUCTION_TYPE_NONE;
+  }
+}
+
+ScatterND_ONNX_V16::ScatterND_ONNX_V16(Graph* graph, ReductionType reduction)
+    : BuiltinOp(graph, VSI_NN_OP_SCATTER_ND_UPDATE),
+      reduction_(reduction) {
+  this->impl()->node()->nn_param.scatter_nd_update.reduction =
+      downcast_reduction_type(reduction_);
-ScatterND_ONNX_V16::ScatterND_ONNX_V16(Graph* graph)
-    : BuiltinOp(graph, VSI_NN_OP_SCATTER_ND_UPDATE) {
 }
 
 std::shared_ptr<Operation> ScatterND_ONNX_V16::Clone(
     std::shared_ptr<Graph>& graph) const {
diff --git a/src/tim/vx/ops/simple_operations.cc b/src/tim/vx/ops/simple_operations.cc
index d6f486282..e2250151f 100644
--- a/src/tim/vx/ops/simple_operations.cc
+++ b/src/tim/vx/ops/simple_operations.cc
@@ -40,8 +40,8 @@ DEFINE_SIMPLE_OP(DataConvert, VSI_NN_OP_DATACONVERT)
 DEFINE_SIMPLE_OP(Neg, VSI_NN_OP_NEG)
 DEFINE_SIMPLE_OP(Abs, VSI_NN_OP_ABS)
 DEFINE_SIMPLE_OP(Sin, VSI_NN_OP_SIN)
-// TODO(jiangbo): enable it when ovxlib supports `Cos`
-//DEFINE_SIMPLE_OP(Cos, VSI_NN_OP_COS)
+DEFINE_SIMPLE_OP(Cos, VSI_NN_OP_COS)
+DEFINE_SIMPLE_OP(Tan, VSI_NN_OP_TAN)
 DEFINE_SIMPLE_OP(Exp, VSI_NN_OP_EXP)
 DEFINE_SIMPLE_OP(Log, VSI_NN_OP_LOG)
 DEFINE_SIMPLE_OP(Sqrt, VSI_NN_OP_SQRT)
diff --git a/src/tim/vx/ops/simple_operations_test.cc b/src/tim/vx/ops/simple_operations_test.cc
index 0987d136e..81cb19a56 100644
--- a/src/tim/vx/ops/simple_operations_test.cc
+++ b/src/tim/vx/ops/simple_operations_test.cc
@@ -47,8 +47,8 @@ TEST(Floor, shape_5_1_fp32) {
 
   EXPECT_TRUE(input_tensor->CopyDataToTensor(in_data.data(), in_data.size()*4));
 
-  auto add = graph->CreateOperation<tim::vx::ops::Floor>();
-  (*add).BindInputs({input_tensor}).BindOutputs({output_tensor});
+  auto op = graph->CreateOperation<tim::vx::ops::Floor>();
+  (*op).BindInputs({input_tensor}).BindOutputs({output_tensor});
 
   EXPECT_TRUE(graph->Compile());
   EXPECT_TRUE(graph->Run());
@@ -79,8 +79,8 @@ TEST(Round, shape_15_1_fp32) {
 
   EXPECT_TRUE(input_tensor->CopyDataToTensor(in_data.data(), in_data.size()*4));
 
-  auto add = graph->CreateOperation<tim::vx::ops::Round>();
-  (*add).BindInputs({input_tensor}).BindOutputs({output_tensor});
+  auto op = graph->CreateOperation<tim::vx::ops::Round>();
+  (*op).BindInputs({input_tensor}).BindOutputs({output_tensor});
 
   EXPECT_TRUE(graph->Compile());
   EXPECT_TRUE(graph->Run());
@@ -107,8 +107,8 @@ TEST(Ceil, shape_5_1_fp32) {
 
   EXPECT_TRUE(input_tensor->CopyDataToTensor(in_data.data(), in_data.size()*4));
 
-  auto add = graph->CreateOperation<tim::vx::ops::Ceil>();
-  (*add).BindInputs({input_tensor}).BindOutputs({output_tensor});
+  auto op = graph->CreateOperation<tim::vx::ops::Ceil>();
+  (*op).BindInputs({input_tensor}).BindOutputs({output_tensor});
 
   EXPECT_TRUE(graph->Compile());
   EXPECT_TRUE(graph->Run());
@@ -135,8 +135,8 @@ TEST(Cast, shape_5_1_fp32_to_int32) {
 
   EXPECT_TRUE(input_tensor->CopyDataToTensor(in_data.data(), in_data.size()*4));
 
-  auto add = graph->CreateOperation<tim::vx::ops::Cast>();
-  (*add).BindInputs({input_tensor}).BindOutputs({output_tensor});
+  auto op = graph->CreateOperation<tim::vx::ops::Cast>();
+  (*op).BindInputs({input_tensor}).BindOutputs({output_tensor});
 
   EXPECT_TRUE(graph->Compile());
   EXPECT_TRUE(graph->Run());
@@ -253,12 +253,68 @@ TEST(Rcp, shape_5_1_fp32) {
 
   EXPECT_TRUE(input_tensor->CopyDataToTensor(in_data.data(), in_data.size()*4));
 
-  auto add = graph->CreateOperation<tim::vx::ops::Rcp>();
-  (*add).BindInputs({input_tensor}).BindOutputs({output_tensor});
+  auto op = graph->CreateOperation<tim::vx::ops::Rcp>();
+  (*op).BindInputs({input_tensor}).BindOutputs({output_tensor});
 
   EXPECT_TRUE(graph->Compile());
   EXPECT_TRUE(graph->Run());
   std::vector<float> output(5, 0);
   EXPECT_TRUE(output_tensor->CopyDataFromTensor(output.data()));
   EXPECT_TRUE(ArraysMatch(golden, output, 1e-5f));
-}
\ No newline at end of file
+}
+
+TEST(Cos, shape_5_1_fp32) {
+  auto ctx = tim::vx::Context::Create();
+  auto graph = ctx->CreateGraph();
+
+  tim::vx::ShapeType io_shape({5, 1});
+  tim::vx::TensorSpec input_spec(tim::vx::DataType::FLOAT32,
+                                 io_shape, tim::vx::TensorAttribute::INPUT);
+  tim::vx::TensorSpec output_spec(tim::vx::DataType::FLOAT32,
+                                  io_shape, tim::vx::TensorAttribute::OUTPUT);
+
+  auto input_tensor = graph->CreateTensor(input_spec);
+  auto output_tensor = graph->CreateTensor(output_spec);
+
+  std::vector<float> in_data = { 1.0, 0.0, -1.0, 0.5, -0.5};
+  std::vector<float> golden = {0.5403023, 1, 0.5403023, 0.87758255, 0.87758255};
+
+  EXPECT_TRUE(input_tensor->CopyDataToTensor(in_data.data(), in_data.size()*4));
+
+  auto op = graph->CreateOperation<tim::vx::ops::Cos>();
+  (*op).BindInputs({input_tensor}).BindOutputs({output_tensor});
+
+  EXPECT_TRUE(graph->Compile());
+  EXPECT_TRUE(graph->Run());
+  std::vector<float> output(5);
+  EXPECT_TRUE(output_tensor->CopyDataFromTensor(output.data()));
+  EXPECT_TRUE(ArraysMatch(golden, output, 1e-5f));
+}
+
+TEST(Tan, shape_5_1_fp32) {
+  auto ctx = tim::vx::Context::Create();
+  auto graph = ctx->CreateGraph();
+
+  tim::vx::ShapeType io_shape({5, 1});
+  tim::vx::TensorSpec input_spec(tim::vx::DataType::FLOAT32,
+                                 io_shape, tim::vx::TensorAttribute::INPUT);
+  tim::vx::TensorSpec output_spec(tim::vx::DataType::FLOAT32,
+                                  io_shape, tim::vx::TensorAttribute::OUTPUT);
+
+  auto input_tensor = graph->CreateTensor(input_spec);
+  auto output_tensor = graph->CreateTensor(output_spec);
+
+  std::vector<float> in_data = { 1, 0, 1.5, 0.5, -0.5};
+  std::vector<float> golden = { 1.5574077, 0, 14.10142, 0.5463025, -0.5463025};
+
+  EXPECT_TRUE(input_tensor->CopyDataToTensor(in_data.data(), in_data.size()*4));
+
+  auto op = graph->CreateOperation<tim::vx::ops::Tan>();
+  (*op).BindInputs({input_tensor}).BindOutputs({output_tensor});
+
+  EXPECT_TRUE(graph->Compile());
+  EXPECT_TRUE(graph->Run());
+  std::vector<float> output(5);
+  EXPECT_TRUE(output_tensor->CopyDataFromTensor(output.data()));
+  EXPECT_TRUE(ArraysMatch(golden, output, 1e-4f));
+}
\ No newline at end of file