diff --git a/onert-micro/onert-micro/include/pal/common/PALSoftmaxInputGrad.h b/onert-micro/onert-micro/include/pal/common/PALSoftmaxInputGrad.h
new file mode 100644
index 00000000000..172a19c770d
--- /dev/null
+++ b/onert-micro/onert-micro/include/pal/common/PALSoftmaxInputGrad.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ONERT_MICRO_EXECUTE_PAL_COMMON_SOFTMAX_INPUT_GRAD_H
+#define ONERT_MICRO_EXECUTE_PAL_COMMON_SOFTMAX_INPUT_GRAD_H
+
+#include "OMStatus.h"
+#include "PALUtils.h"
+
+#include <cmath>
+
+namespace onert_micro
+{
+namespace train
+{
+namespace pal
+{
+
+void inline SoftmaxInputGrad(const float *dloss_doutput_data,
+                             const core::OMRuntimeShape &dloss_doutput_shape,
+                             const float *calculated_data, float *jacobian_row_data,
+                             float *dloss_dinput_data)
+{
+  assert(dloss_doutput_shape.dimensionsCount() == 2);
+  assert(dloss_doutput_shape.dims(0) == 1);
+  const uint32_t output_dim = dloss_doutput_shape.dims(dloss_doutput_shape.dimensionsCount() - 1);
+  for (int i = 0; i < output_dim; ++i)
+  {
+    for (int j = 0; j < output_dim; ++j)
+    {
+      jacobian_row_data[j] = -calculated_data[i] * calculated_data[j];
+    }
+    jacobian_row_data[i] += calculated_data[i];
+    float total = 0.f;
+    for (int j = 0; j < output_dim; ++j)
+    {
+      total += jacobian_row_data[j] * dloss_doutput_data[j];
+    }
+    dloss_dinput_data[i] = total;
+  }
+}
+
+} // namespace pal
+} // namespace train
+} // namespace onert_micro
+
+#endif // ONERT_MICRO_EXECUTE_PAL_COMMON_SOFTMAX_INPUT_GRAD_H
diff --git a/onert-micro/onert-micro/src/train/kernels/Softmax.cpp b/onert-micro/onert-micro/src/train/kernels/Softmax.cpp
new file mode 100644
index 00000000000..b2ab791fa42
--- /dev/null
+++ b/onert-micro/onert-micro/src/train/kernels/Softmax.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "OMStatus.h"
+#include "core/OMUtils.h"
+#include "core/OMDataType.h"
+#include "train/OMBackpropExecutionBuilder.h"
+#include "execute/OMRuntimeKernel.h"
+#include "core/memory/OMMemoryManager.h"
+#include "PALSoftmaxInputGrad.h"
+
+using namespace onert_micro;
+using namespace onert_micro::core;
+using namespace onert_micro::train;
+
+namespace
+{
+
+constexpr uint32_t inputTensorIdx = 0;
+constexpr uint32_t outputTensorIdx = 0;
+
+} // namespace
+
+/*
+ * - Calculate input gradient - Optional (not required if it is last op)
+ */
+OMStatus onert_micro::train::train_kernel_CircleSoftmax(const OMBackpropExecuteArgs &args)
+{
+  // Check is it last layer for training
+  if (args.is_last_layer)
+  {
+    return Ok;
+  }
+
+  core::OMRuntimeStorage &forward_storage = args.forward_storage;
+  core::OMRuntimeStorage &backward_storage = args.backward_storage;
+  core::OMRuntimeContext &context = args.backward_context;
+  uint16_t op_index = args.kernel_index;
+
+  const circle::Tensor *input;
+  const circle::Tensor *output;
+
+  uint8_t *dloss_dinput_data;
+
+  uint8_t *output_data;
+  uint8_t *dloss_doutput_data;
+
+  // Read kernel
+  {
+    execute::OMRuntimeKernel runtime_kernel;
+    runtime_kernel.readKernel(op_index, context);
+
+    input = runtime_kernel.inputs[inputTensorIdx];
+    output = runtime_kernel.outputs[outputTensorIdx];
+    assert(input != nullptr);
+    assert(output != nullptr);
+
+    // Read forward storage
+    {
+      runtime_kernel.getDataFromStorage(op_index, forward_storage, context);
+
+      output_data = runtime_kernel.outputs_data[outputTensorIdx];
+      assert(output_data != nullptr);
+    }
+
+    // Read backward storage
+    {
+      runtime_kernel.getDataFromStorage(op_index, backward_storage, context);
+
+      dloss_dinput_data = runtime_kernel.inputs_data[inputTensorIdx];
+      dloss_doutput_data = runtime_kernel.outputs_data[outputTensorIdx];
+
+      assert(dloss_dinput_data != nullptr);
+      assert(dloss_doutput_data != nullptr);
+    }
+  }
+
+  OMRuntimeShape input_shape(input);
+  OMRuntimeShape output_shape(output);
+
+  // Check Softmax output and input shape
+  assert(output_shape.dimensionsCount() == 2);
+  assert(output_shape.dims(0) == 1);
+  if (output_shape.dimensionsCount() != 2 or output_shape.dims(0) != 1)
+    return UnsupportedType;
+
+  // Allocate temporary buffer to save Jacobian row
+  uint8_t *jacobian_row_data = nullptr;
+  OMStatus status = core::memory::OMMemoryManager::allocateMemory(
+    output_shape.flatSize() * sizeof(OMDataType(output->type())), &jacobian_row_data);
+  assert(status == Ok);
+  if (status != Ok)
+    return status;
+
+  // Calculate input grad
+  pal::SoftmaxInputGrad(core::utils::castInputData<float>(dloss_doutput_data), output_shape,
+                        core::utils::castInputData<float>(output_data),
+                        core::utils::castOutputData<float>(jacobian_row_data),
+                        core::utils::castOutputData<float>(dloss_dinput_data));
+
+  // Deallocate temporary buffer with Jacobian row
+  status = core::memory::OMMemoryManager::deallocateMemory(jacobian_row_data);
+
+  return status;
+}