intel · steffenlarsen · Jun 7, 2024 · Apr 18, 2024 · May 28, 2024 · May 31, 2024
@@ -105,7 +105,8 @@ then it supports the `bfloat16` math functions described in the next section.
 
 The following functions are only available when `T` is `bfloat16` or
 `sycl::marray<bfloat16, {N}>`, where `{N}` means any positive value of
-`size_t` type.
+`size_t` type, or `sycl::vec<bfloat16, {N}>`, where `{N}` is a valid
+`sycl::vec` size.
 
 ==== isnan
 
@@ -116,12 +117,15 @@ bool isnan(bfloat16 x);
 
 template <size_t N>
 sycl::marray<bool, N> isnan(sycl::marray<bfloat16, N> x);
+
+template <int N>
+sycl::vec<bool, N> isnan(sycl::vec<bfloat16, N> x);
 } // namespace sycl::ext::oneapi::experimental
 ```
 
 ===== Description
 
-Returns true if x is NAN value, otherwise returns false.
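+Returns true if `x` is a NaN value, otherwise returns false. For the `sycl::marray` and `sycl::vec` overloads the check is applied to each element of `x`, and the result holds one `bool` per element.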
 
 ==== fma
 
@@ -187,7 +191,7 @@ T fabs(T x);
 
 ===== Description
 
-Compute absolute value of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+Compute absolute value of a `bfloat16` value.
 
 ==== ceil
 
@@ -200,7 +204,7 @@ T ceil(T x);
 
 ===== Description
 
-Returns `x` rounded to an integral value using the round to positive infinity rounding mode
+Returns `x` rounded to an integral value using the round to positive infinity rounding mode.
 
 ==== floor
 
@@ -214,7 +218,7 @@ T floor(T x);
 ===== Description
 
 Returns `x` rounded to an integral value using the round to negative infinity rounding mode
-for a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+for a `bfloat16` value.
 
 ==== cos
 
@@ -227,7 +231,7 @@ T cos(T x);
 
 ===== Description
 
-Compute cosine of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+Compute cosine of a `bfloat16` value.
 
 ==== sin
 
@@ -240,7 +244,7 @@ T sin(T x);
 
 ===== Description
 
-Compute sine of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+Compute sine of a `bfloat16` value.
 
 
 ==== exp
@@ -254,7 +258,7 @@ T exp(T x);
 
 ===== Description
 
-Compute the base-e exponential of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+Compute the base-e exponential of a `bfloat16` value.
 
 ==== exp2
 
@@ -267,7 +271,7 @@ T exp2(T x);
 
 ===== Description
 
-Compute the base-2 exponential of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+Compute the base-2 exponential of a `bfloat16` value.
 
 ==== exp10
 
@@ -280,7 +284,7 @@ T exp10(T x);
 
 ===== Description
 
-Compute the base-10 exponential of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+Compute the base-10 exponential of a `bfloat16` value.
 
 ==== log
 
@@ -293,7 +297,7 @@ T log(T x);
 
 ===== Description
 
-Compute natural logarithm of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+Compute natural logarithm of a `bfloat16` value.
 
 ==== log2
 
@@ -306,7 +310,7 @@ T log2(T x);
 
 ===== Description
 
-Compute base-2 logarithm of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+Compute base-2 logarithm of a `bfloat16` value.
 
 ==== log10
 
@@ -319,7 +323,7 @@ T log10(T x);
 
 ===== Description
 
-Compute base-10 logarithm of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+Compute base-10 logarithm of a `bfloat16` value.
 
 
 ==== rint
@@ -334,7 +338,7 @@ T rint(T x);
 ===== Description
 
 Returns `x` rounded to an integral value using the round to nearest even rounding mode
-for a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+for a `bfloat16` value.
 
 ==== sqrt
 
@@ -347,7 +351,7 @@ T sqrt(T x);
 
 ===== Description
 
-Compute square root of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+Compute square root of a `bfloat16` value.
 
 ==== rsqrt
 
@@ -360,7 +364,7 @@ T rsqrt(T x);
 
 ===== Description
 
-Compute inverse square root of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+Compute inverse square root of a `bfloat16` value.
 
 ==== trunc
 
@@ -374,15 +378,9 @@ T trunc(T x);
 ===== Description
 
 Returns `x` rounded to an integral value using the round to zero rounding mode
-for a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+for a `bfloat16` value.
 
 == Issues
 
-1. The CUDA backend does not have a use case that would necessitate support
-of the `vec` class in bfloat16 math functions, and `marray` would always be
-preferred over `vec` if `vec` support were to be added in the CUDA backend.
-For portability reasons, support for the `vec` class can be easily added if
-other backends require it.
-
-2. We should decide on a roadmap to extend support of `bfloat16` to other
+1. We should decide on a roadmap to extend support of `bfloat16` to other
 SYCL 2020 math functions.
@@ -46,6 +46,14 @@ template <size_t N> sycl::marray<bool, N> isnan(sycl::marray<bfloat16, N> x) {
   return res;
 }
 
+// Element-wise NaN test for a sycl::vec of bfloat16.
+//
+// Returns a vector in which element `i` holds the result of the scalar
+// `isnan(x[i])`.
+template <int N> sycl::vec<bool, N> isnan(sycl::vec<bfloat16, N> x) {
+  sycl::vec<bool, N> res;
+  // `N` is an `int` template parameter, so the index is an `int` too; this
+  // keeps the loop comparison same-signed.
+  for (int i = 0; i < N; i++) {
+    res[i] = isnan(x[i]);
+  }
+  return res;
+}
+
 template <typename T>
 std::enable_if_t<std::is_same_v<T, bfloat16>, T> fabs(T x) {
 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
@@ -89,6 +97,14 @@ sycl::marray<bfloat16, N> fabs(sycl::marray<bfloat16, N> x) {
   return res;
 }
 
+// Element-wise absolute value for a sycl::vec of bfloat16.
+//
+// Returns a vector in which element `i` holds the result of the scalar
+// `fabs(x[i])`.
+template <int N> sycl::vec<bfloat16, N> fabs(sycl::vec<bfloat16, N> x) {
+  sycl::vec<bfloat16, N> res;
+  // `N` is an `int` template parameter, so the index is an `int` too; this
+  // keeps the loop comparison same-signed.
+  for (int i = 0; i < N; i++) {
+    res[i] = fabs(x[i]);
+  }
+  return res;
+}
+
 template <typename T>
 std::enable_if_t<std::is_same_v<T, bfloat16>, T> fmin(T x, T y) {
 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
@@ -146,6 +162,16 @@ sycl::marray<bfloat16, N> fmin(sycl::marray<bfloat16, N> x,
   return res;
 }
 
+// Element-wise minimum of two sycl::vec operands of bfloat16.
+//
+// Returns a vector in which element `i` holds the result of the scalar
+// `fmin(x[i], y[i])`.
+template <int N>
+sycl::vec<bfloat16, N> fmin(sycl::vec<bfloat16, N> x,
+                            sycl::vec<bfloat16, N> y) {
+  sycl::vec<bfloat16, N> res;
+  // `N` is an `int` template parameter, so the index is an `int` too; this
+  // keeps the loop comparison same-signed.
+  for (int i = 0; i < N; i++) {
+    res[i] = fmin(x[i], y[i]);
+  }
+  return res;
+}
+
 template <typename T>
 std::enable_if_t<std::is_same_v<T, bfloat16>, T> fmax(T x, T y) {
 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
@@ -202,6 +228,16 @@ sycl::marray<bfloat16, N> fmax(sycl::marray<bfloat16, N> x,
   return res;
 }
 
+// Element-wise maximum of two sycl::vec operands of bfloat16.
+//
+// Returns a vector in which element `i` holds the result of the scalar
+// `fmax(x[i], y[i])`.
+template <int N>
+sycl::vec<bfloat16, N> fmax(sycl::vec<bfloat16, N> x,
+                            sycl::vec<bfloat16, N> y) {
+  sycl::vec<bfloat16, N> res;
+  // `N` is an `int` template parameter, so the index is an `int` too; this
+  // keeps the loop comparison same-signed.
+  for (int i = 0; i < N; i++) {
+    res[i] = fmax(x[i], y[i]);
+  }
+  return res;
+}
+
 template <typename T>
 std::enable_if_t<std::is_same_v<T, bfloat16>, T> fma(T x, T y, T z) {
 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
@@ -248,6 +284,16 @@ sycl::marray<bfloat16, N> fma(sycl::marray<bfloat16, N> x,
   return res;
 }
 
+// Element-wise fused multiply-add for sycl::vec operands of bfloat16.
+//
+// Returns a vector in which element `i` holds the result of the scalar
+// `fma(x[i], y[i], z[i])`.
+template <int N>
+sycl::vec<bfloat16, N> fma(sycl::vec<bfloat16, N> x, sycl::vec<bfloat16, N> y,
+                           sycl::vec<bfloat16, N> z) {
+  sycl::vec<bfloat16, N> res;
+  // `N` is an `int` template parameter, so the index is an `int` too; this
+  // keeps the loop comparison same-signed.
+  for (int i = 0; i < N; i++) {
+    res[i] = fma(x[i], y[i], z[i]);
+  }
+  return res;
+}
+
 #define BFLOAT16_MATH_FP32_WRAPPERS(op)                                        \
   template <typename T>                                                        \
   std::enable_if_t<std::is_same<T, bfloat16>::value, T> op(T x) {              \
@@ -264,37 +310,74 @@ sycl::marray<bfloat16, N> fma(sycl::marray<bfloat16, N> x,
     return res;                                                                \
   }
 
+// Generates an element-wise sycl::vec<bfloat16, N> overload of `op`: the
+// result's element `i` is the scalar `op(x[i])`. `N` is an `int` template
+// parameter, so the loop index is an `int` as well to keep the comparison
+// same-signed.
+#define BFLOAT16_MATH_FP32_WRAPPERS_VEC(op)                                    \
+  template <int N> sycl::vec<bfloat16, N> op(sycl::vec<bfloat16, N> x) {       \
+    sycl::vec<bfloat16, N> res;                                                \
+    for (int i = 0; i < N; i++) {                                              \
+      res[i] = op(x[i]);                                                       \
+    }                                                                          \
+    return res;                                                                \
+  }
+
 // Instantiate the wrapper overloads for every supported math function. Each
 // function name is passed to three generator macros; the _VEC variant emits
 // the element-wise sycl::vec<bfloat16, N> overload.
 BFLOAT16_MATH_FP32_WRAPPERS(ceil)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(ceil)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(ceil)
+
 BFLOAT16_MATH_FP32_WRAPPERS(cos)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(cos)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(cos)
+
 BFLOAT16_MATH_FP32_WRAPPERS(exp)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(exp)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(exp)
+
 BFLOAT16_MATH_FP32_WRAPPERS(exp10)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(exp10)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(exp10)
+
 BFLOAT16_MATH_FP32_WRAPPERS(exp2)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(exp2)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(exp2)
+
 BFLOAT16_MATH_FP32_WRAPPERS(floor)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(floor)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(floor)
+
 BFLOAT16_MATH_FP32_WRAPPERS(log)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(log)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(log)
+
 BFLOAT16_MATH_FP32_WRAPPERS(log2)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(log2)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(log2)
+
 BFLOAT16_MATH_FP32_WRAPPERS(log10)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(log10)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(log10)
+
 BFLOAT16_MATH_FP32_WRAPPERS(rint)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(rint)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(rint)
+
 BFLOAT16_MATH_FP32_WRAPPERS(rsqrt)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(rsqrt)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(rsqrt)
+
 BFLOAT16_MATH_FP32_WRAPPERS(sin)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(sin)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(sin)
+
 BFLOAT16_MATH_FP32_WRAPPERS(sqrt)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(sqrt)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(sqrt)
+
 BFLOAT16_MATH_FP32_WRAPPERS(trunc)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(trunc)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(trunc)
 
 // The generator macros are only needed for the instantiations above;
 // undefine them so they are not visible past this point.
 #undef BFLOAT16_MATH_FP32_WRAPPERS
 #undef BFLOAT16_MATH_FP32_WRAPPERS_MARRAY
+#undef BFLOAT16_MATH_FP32_WRAPPERS_VEC
 } // namespace ext::oneapi::experimental
 } // namespace _V1
 } // namespace sycl