diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc
index 3261a94b17cdf..6359515a67b9d 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc
@@ -103,286 +103,556 @@ then it supports the `bfloat16` math functions described in the next section.
 
 === Math Functions
 
-The following functions are only available when `T` is `bfloat16` or
-`sycl::marray<bfloat16, {N}>`, where `{N}` means any positive value of
-`size_t` type.
-
 ==== isnan
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
 
-bool isnan(bfloat16 x);
+bool isnan(bfloat16 x);                                                 (1)
 
-template <size_t N>
-sycl::marray<bool, N> isnan(sycl::marray<bfloat16, N> x);
+template<typename NonScalar> 
+/*return type*/ isnan(NonScalar x);                                     (2)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
 ===== Description
+====== Overload (1)
+
+Returns `true` if `x` is a NaN value, otherwise returns `false`.
+
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+ - `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+ - The element type is `bfloat16`.
 
-Returns true if x is NAN value, otherwise returns false.
+*Returns:* If `NonScalar` is `marray`, returns `true` for each element of `x` only if `x[i]` has a NaN value. If `NonScalar` is `vec` or the `[code]#+__swizzled_vec__+#` type, returns -1 for each element of `x` if `x[i]` is a NaN value and returns 0 otherwise.
+
+The return type depends on `NonScalar`. For `marray`, the return type is `marray<bool, N>` and for `vec`, `[code]#+__swizzled_vec__+#` type, the return type is `vec<int16_t, N>`.
 
 ==== fma
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
 
-template <typename T>
-T fma(T a, T b, T c);
+bfloat16 fma(bfloat16 a, bfloat16 b, bfloat16 c);                          (1)
+
+template<typename NonScalar1, typename NonScalar2, typename NonScalar3>    (2)
+/*return-type*/ fma(NonScalar1 a, NonScalar2 b, NonScalar3 c)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
 ===== Description
 
-Returns the correctly rounded floating-point representation of the
+====== Overload (1)
+
+*Returns:* Returns the correctly rounded floating-point representation of the
 sum of `c` with the infinitely precise product of `a` and `b`.
 Rounding of intermediate products shall not occur. The mantissa
 LSB rounds to the nearest even. Subnormal numbers are supported.
 
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* One of the following conditions must hold for `NonScalar1`, `NonScalar2`, and `NonScalar3`:
+** `NonScalar1`, `NonScalar2`, and `NonScalar3` are each `marray`; or
+** `NonScalar1`, `NonScalar2`, and `NonScalar3` are any combination of `vec` and the `[code]#+__swizzled_vec__+#` type;
+* `NonScalar1`, `NonScalar2`, and `NonScalar3` have the same number of elements;
+* `NonScalar1`, `NonScalar2`, and `NonScalar3` have the same element type; and
+* The element type of `NonScalar1`, `NonScalar2`, and `NonScalar3` is `bfloat16`.
+
+*Returns:* For each element of `a`, `b`, and `c`; the correctly rounded floating-point representation of the sum of `c[i]` with the infinitely precise product of `a[i]` and `b[i]`. Rounding of intermediate products shall not occur. Edge case behavior is per the IEEE 754-2008 standard.
+
+The return type is `NonScalar1` unless `NonScalar1` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
+
 ==== fmax
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T fmax(T x, T y);
+bfloat16 fmax(bfloat16 x, bfloat16 y);                                           (1)
+
+template<typename NonScalar1, typename NonScalar2>                               (2)
+/*return-type*/ fmax(NonScalar1 x, NonScalar2 y)
+
+template<typename NonScalar>                                                     (3)
+/*return-type*/ fmax(NonScalar x, bfloat16 y)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
 ===== Description
 
-Returns `y` if
-`x < y`, otherwise it
-returns `x`. If one argument is a
-NaN, `fmax()` returns the other
-argument. If both arguments are
-NaNs, `fmax()` returns a NaN.
+====== Overload (1)
+
+Returns `y` if `x < y`, otherwise it returns `x`. If one argument is a NaN, `fmax()` returns the other
+argument. If both arguments are NaNs, `fmax()` returns a NaN.
+
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* One of the following conditions must hold for `NonScalar1` and `NonScalar2`:
+** Both `NonScalar1` and `NonScalar2` are `marray`; or
+** `NonScalar1` and `NonScalar2` are any combination of `vec` and the `[code]#+__swizzled_vec__+#` type;
+* `NonScalar1` and `NonScalar2` have the same number of elements;
+* `NonScalar1` and `NonScalar2` have the same element type; and
+* The element type of `NonScalar1` and `NonScalar2` is bfloat16.
+
+*Returns:* For each element of `x` and `y`, the value `y[i]` if `x[i] < y[i]`, otherwise `x[i]`. If one element is a NaN, the result is the other element. If both elements are NaNs, the result is NaN.
+
+The return type is `NonScalar1` unless `NonScalar1` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
+
+====== Overload (3)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* NonScalar is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is bfloat16.
+
+*Returns:* For each element of `x`, the value `y` if `x[i] < y`, otherwise `x[i]`. If one value is a NaN, the result is the other value. If both value are NaNs, the result is a NaN.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== fmin
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T fmin(T x, T y);
+bfloat16 fmin(bfloat16 x, bfloat16 y);                                           (1)
+
+template<typename NonScalar1, typename NonScalar2>                               (2)
+/*return-type*/ fmin(NonScalar1 x, NonScalar2 y)
+
+template<typename NonScalar>                                                     (3)
+/*return-type*/ fmin(NonScalar x, bfloat16 y)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
 ===== Description
 
-Returns `y` if
-`y < x`, otherwise it
-returns `x`. If one argument is a
-NaN, `fmax()` returns the other
-argument. If both arguments are
-NaNs, `fmax()` returns a NaN.
+====== Overload (1)
+
+Returns `x` if `x < y`, otherwise it returns `y`. If one argument is a
+NaN, `fmin()` returns the other argument. If both arguments are NaNs, `fmin()` returns a NaN.
+
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* One of the following conditions must hold for `NonScalar1` and `NonScalar2`:
+** Both `NonScalar1` and `NonScalar2` are `marray`; or
+** `NonScalar1` and `NonScalar2` are any combination of `vec` and the `[code]#+__swizzled_vec__+#` type;
+* `NonScalar1` and `NonScalar2` have the same number of elements;
+* `NonScalar1` and `NonScalar2` have the same element type; and
+* The element type of `NonScalar1` and `NonScalar2` is bfloat16.
+
+*Returns:* For each element of `x` and `y`, the value `x[i]` if `x[i] < y[i]`, otherwise `y[i]`. If one element is a NaN, the result is the other element. If both elements are NaNs, the result is NaN.
+
+The return type is `NonScalar1` unless `NonScalar1` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
+
+====== Overload (3)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* NonScalar is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is bfloat16.
+
+*Returns:* For each element of `x`, the value `x[i]` if `x[i] < y`, otherwise `y`. If one value is a NaN, the result is the other value. If both value are NaNs, the result is a NaN.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== fabs
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T fabs(T x);
+bfloat16 fabs(bfloat16 x);                                            (1)
+
+template<typename NonScalar>                                          (2)
+/*return-type*/ fabs(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Compute absolute value(s) of a scalar `bfloat16` value.
+
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
 
-Compute absolute value of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+*Returns:* For each element of `x`, the absolute value of `x[i]`.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== ceil
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T ceil(T x);
+bfloat16 ceil(bfloat16 x);                                            (1)
+
+template<typename NonScalar>                                          (2)
+/*return-type*/ ceil(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+The value `x` rounded to an integral value using the round to positive infinity rounding mode.
+
+====== Overload (2)
 
-Returns `x` rounded to an integral value using the round to positive infinity rounding mode
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
+
+*Returns:* For each element of `x`, the value `x[i]` rounded to an integral value using the round to positive infinity rounding mode.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== floor
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T floor(T x);
+bfloat16 floor(bfloat16 x);                                            (1)
+
+template<typename NonScalar>                                           (2)
+/*return-type*/ floor(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+The value `x` rounded to an integral value using the round to negative infinity rounding mode.
+
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
 
-Returns `x` rounded to an integral value using the round to negative infinity rounding mode
-for a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+*Returns:* For each element of `x`, the value `x[i]` rounded to an integral value using the round to negative infinity rounding mode.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== cos
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T cos(T x);
+bfloat16 cos(bfloat16 x);                                            (1)
+
+template<typename NonScalar>                                         (2)
+/*return-type*/ cos(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the cosine of `x`.
 
-Compute cosine of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
+
+*Returns:* For each element of `x`, the cosine of `x[i]`.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== sin
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T sin(T x);
+bfloat16 sin(bfloat16 x);                                            (1)
+
+template<typename NonScalar>                                         (2)
+/*return-type*/ sin(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the sine of `x`.
+
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
 
-Compute sine of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
 
+*Returns:* For each element of `x`, the sine of `x[i]`.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== exp
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T exp(T x);
+bfloat16 exp(bfloat16 x);                                            (1)
+
+template<typename NonScalar>                                         (2)
+/*return-type*/ exp(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the base-e exponential of `x`.
+
+====== Overload (2)
 
-Compute the base-e exponential of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
+
+*Returns:* For each element of `x`, the base-e exponential of `x[i]`.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== exp2
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T exp2(T x);
+bfloat16 exp2(bfloat16 x);                                            (1)
+
+template<typename NonScalar>                                         (2)
+/*return-type*/ exp2(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the base-2 exponential of `x`.
+
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
 
-Compute the base-2 exponential of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+*Returns:* For each element of `x`, the base-2 exponential of `x[i]`.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== exp10
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T exp10(T x);
+bfloat16 exp10(bfloat16 x);                                            (1)
+
+template<typename NonScalar>                                           (2)
+/*return-type*/ exp10(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the base-10 exponential of `x`.
 
-Compute the base-10 exponential of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
+
+*Returns:* For each element of `x`, the base-10 exponential of `x[i]`.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== log
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T log(T x);
+bfloat16 log(bfloat16 x);                                              (1)
+
+template<typename NonScalar>                                           (2)
+/*return-type*/ log(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the natural logarithm of `x`.
+
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
 
-Compute natural logarithm of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
+
+*Returns:* For each element of `x`, the natural logarithm of `x[i]`.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== log2
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T log2(T x);
+bfloat16 log2(bfloat16 x);                                              (1)
+
+template<typename NonScalar>                                            (2)
+/*return-type*/ log2(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the base-2 logarithm of `x`.
+
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
+
+*Returns:* For each element of `x`, the base-2 logarithm of `x[i]`.
 
-Compute base-2 logarithm of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== log10
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T log10(T x);
+bfloat16 log10(bfloat16 x);                                              (1)
+
+template<typename NonScalar>                                             (2)
+/*return-type*/ log10(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the base-10 logarithm of `x`.
+
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
 
-Compute base-10 logarithm of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
+
+*Returns:* For each element of `x`, the base-10 logarithm of `x[i]`.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 
 ==== rint
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T rint(T x);
+bfloat16 rint(bfloat16 x);                                              (1)
+
+template<typename NonScalar>                                            (2)
+/*return-type*/ rint(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
 
-Returns `x` rounded to an integral value using the round to nearest even rounding mode
-for a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+Returns the value `x` rounded to an integral value (using round to nearest even rounding mode) in floating-point format. Refer to section 7.1 of the OpenCL 1.2 specification document: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#opencl12 for a description of the rounding modes.
+
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
+
+*Returns:* For each element of `x`, the value `x[i]` rounded to an integral value (using round to nearest even rounding mode) in floating-point format.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== sqrt
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T sqrt(T x);
+bfloat16 sqrt(bfloat16 x);                                              (1)
+
+template<typename NonScalar>                                            (2)
+/*return-type*/ sqrt(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the square root of `x`.
+
+====== Overload (2)
 
-Compute square root of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
+
+*Returns:* For each element of `x`, the square root of `x[i]`.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== rsqrt
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T rsqrt(T x);
+bfloat16 rsqrt(bfloat16 x);                                             (1)
+
+template<typename NonScalar>                                            (2)
+/*return-type*/ rsqrt(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the inverse square root of `x`.
+
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
 
-Compute inverse square root of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+*Returns:* For each element of `x`, the inverse square root of `x[i]`.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== trunc
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T trunc(T x);
+bfloat16 trunc(bfloat16 x);                                             (1)
+
+template<typename NonScalar>                                            (2)
+/*return-type*/ trunc(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
 
-Returns `x` rounded to an integral value using the round to zero rounding mode
-for a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+Returns the value `x` rounded to an integral value using the round to zero rounding mode.
 
-== Issues
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
 
-1. The CUDA backend does not have a use case that would necessitate support
-of the `vec` class in bfloat16 math functions, and `marray` would always be
-preferred over `vec` if `vec` support were to be added in the CUDA backend.
-For portability reasons, support for the `vec` class can be easily added if
-other backends require it.
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
+
+*Returns:* For each element of `x`, the value `x[i]` rounded to an integral value using the round to zero rounding mode.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
+
+== Issues
 
-2. We should decide on a roadmap to extend support of `bfloat16` to other
+1. We should decide on a roadmap to extend support of `bfloat16` to other
 SYCL 2020 math functions.
diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp
index 2b611f46ddadd..fb4b49a44d4d3 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp
@@ -9,6 +9,7 @@
 #pragma once
 
 #include <sycl/builtins.hpp>            // for ceil, cos, exp, exp10, exp2
+#include <sycl/builtins_utils_vec.hpp>  // For simplify_if_swizzle, is_swizzle
 #include <sycl/detail/memcpy.hpp>       // sycl::detail::memcpy
 #include <sycl/ext/oneapi/bfloat16.hpp> // for bfloat16, bfloat16ToBits
 #include <sycl/marray.hpp>              // for marray
@@ -30,6 +31,17 @@ uint32_t to_uint32_t(sycl::marray<bfloat16, N> x, size_t start) {
 }
 } // namespace detail
 
+// Trait to check if the type is a vector or swizzle of bfloat16.
+template <typename T>
+constexpr bool is_vec_or_swizzle_bf16_v =
+    sycl::detail::is_vec_or_swizzle_v<T> &&
+    sycl::detail::is_valid_elem_type_v<T, bfloat16>;
+
+template <typename T>
+constexpr int num_elements_v = sycl::detail::num_elements<T>::value;
+
+/******************* isnan ********************/
+
 // According to bfloat16 format, NAN value's exponent field is 0xFF and
 // significand has non-zero bits.
 template <typename T>
@@ -46,6 +58,21 @@ template <size_t N> sycl::marray<bool, N> isnan(sycl::marray<bfloat16, N> x) {
   return res;
 }
 
+// Overload for BF16 vec and swizzles.
+template <typename T, int N = num_elements_v<T>>
+std::enable_if_t<is_vec_or_swizzle_bf16_v<T>, sycl::vec<int16_t, N>>
+isnan(T x) {
+  sycl::vec<int16_t, N> res;
+  for (size_t i = 0; i < N; i++) {
+    // The result of isnan is 0 or 1 but SPEC requires
+    // isnan() of vec/swizzle to return -1 or 0.
+    res[i] = isnan(x[i]) ? -1 : 0;
+  }
+  return res;
+}
+
+/******************* fabs ********************/
+
 template <typename T>
 std::enable_if_t<std::is_same_v<T, bfloat16>, T> fabs(T x) {
 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
@@ -89,6 +116,19 @@ sycl::marray<bfloat16, N> fabs(sycl::marray<bfloat16, N> x) {
   return res;
 }
 
+// Overload for BF16 vec and swizzles.
+template <typename T, int N = num_elements_v<T>>
+std::enable_if_t<is_vec_or_swizzle_bf16_v<T>, sycl::vec<bfloat16, N>>
+fabs(T x) {
+  sycl::vec<bfloat16, N> res;
+  for (size_t i = 0; i < N; i++) {
+    res[i] = fabs(x[i]);
+  }
+  return res;
+}
+
+/******************* fmin ********************/
+
 template <typename T>
 std::enable_if_t<std::is_same_v<T, bfloat16>, T> fmin(T x, T y) {
 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
@@ -146,6 +186,22 @@ sycl::marray<bfloat16, N> fmin(sycl::marray<bfloat16, N> x,
   return res;
 }
 
+// Overload for different combination of BF16 vec and swizzles.
+template <typename T1, typename T2, int N1 = num_elements_v<T1>,
+          int N2 = num_elements_v<T2>>
+std::enable_if_t<is_vec_or_swizzle_bf16_v<T1> && is_vec_or_swizzle_bf16_v<T2> &&
+                     N1 == N2,
+                 sycl::vec<bfloat16, N1>>
+fmin(T1 x, T2 y) {
+  sycl::vec<bfloat16, N1> res;
+  for (size_t i = 0; i < N1; i++) {
+    res[i] = fmin(x[i], y[i]);
+  }
+  return res;
+}
+
+/******************* fmax ********************/
+
 template <typename T>
 std::enable_if_t<std::is_same_v<T, bfloat16>, T> fmax(T x, T y) {
 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
@@ -202,6 +258,22 @@ sycl::marray<bfloat16, N> fmax(sycl::marray<bfloat16, N> x,
   return res;
 }
 
+// Overload for different combination of BF16 vec and swizzles.
+template <typename T1, typename T2, int N1 = num_elements_v<T1>,
+          int N2 = num_elements_v<T2>>
+std::enable_if_t<is_vec_or_swizzle_bf16_v<T1> && is_vec_or_swizzle_bf16_v<T2> &&
+                     N1 == N2,
+                 sycl::vec<bfloat16, N1>>
+fmax(T1 x, T2 y) {
+  sycl::vec<bfloat16, N1> res;
+  for (size_t i = 0; i < N1; i++) {
+    res[i] = fmax(x[i], y[i]);
+  }
+  return res;
+}
+
+/******************* fma *********************/
+
 template <typename T>
 std::enable_if_t<std::is_same_v<T, bfloat16>, T> fma(T x, T y, T z) {
 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
@@ -248,6 +320,22 @@ sycl::marray<bfloat16, N> fma(sycl::marray<bfloat16, N> x,
   return res;
 }
 
+// Overload for different combination of BF16 vec and swizzles.
+template <typename T1, typename T2, typename T3, int N1 = num_elements_v<T1>,
+          int N2 = num_elements_v<T2>, int N3 = num_elements_v<T3>>
+std::enable_if_t<is_vec_or_swizzle_bf16_v<T1> && is_vec_or_swizzle_bf16_v<T2> &&
+                     is_vec_or_swizzle_bf16_v<T3> && N1 == N2 && N2 == N3,
+                 sycl::vec<bfloat16, N1>>
+fma(T1 x, T2 y, T3 z) {
+  sycl::vec<bfloat16, N1> res;
+  for (size_t i = 0; i < N1; i++) {
+    res[i] = fma(x[i], y[i], z[i]);
+  }
+  return res;
+}
+
+/******************* unary math operations ********************/
+
 #define BFLOAT16_MATH_FP32_WRAPPERS(op)                                        \
   template <typename T>                                                        \
   std::enable_if_t<std::is_same<T, bfloat16>::value, T> op(T x) {              \
@@ -264,37 +352,77 @@ sycl::marray<bfloat16, N> fma(sycl::marray<bfloat16, N> x,
     return res;                                                                \
   }
 
+#define BFLOAT16_MATH_FP32_WRAPPERS_VEC(op)                                    \
+  /* Overload for BF16 vec and swizzles. */                                    \
+  template <typename T, int N = num_elements_v<T>>                             \
+  std::enable_if_t<is_vec_or_swizzle_bf16_v<T>, sycl::vec<bfloat16, N>> op(    \
+      T x) {                                                                   \
+    sycl::vec<bfloat16, N> res;                                                \
+    for (size_t i = 0; i < N; i++) {                                           \
+      res[i] = op(x[i]);                                                       \
+    }                                                                          \
+    return res;                                                                \
+  }
+
 BFLOAT16_MATH_FP32_WRAPPERS(ceil)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(ceil)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(ceil)
+
 BFLOAT16_MATH_FP32_WRAPPERS(cos)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(cos)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(cos)
+
 BFLOAT16_MATH_FP32_WRAPPERS(exp)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(exp)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(exp)
+
 BFLOAT16_MATH_FP32_WRAPPERS(exp10)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(exp10)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(exp10)
+
 BFLOAT16_MATH_FP32_WRAPPERS(exp2)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(exp2)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(exp2)
+
 BFLOAT16_MATH_FP32_WRAPPERS(floor)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(floor)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(floor)
+
 BFLOAT16_MATH_FP32_WRAPPERS(log)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(log)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(log)
+
 BFLOAT16_MATH_FP32_WRAPPERS(log2)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(log2)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(log2)
+
 BFLOAT16_MATH_FP32_WRAPPERS(log10)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(log10)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(log10)
+
 BFLOAT16_MATH_FP32_WRAPPERS(rint)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(rint)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(rint)
+
 BFLOAT16_MATH_FP32_WRAPPERS(rsqrt)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(rsqrt)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(rsqrt)
+
 BFLOAT16_MATH_FP32_WRAPPERS(sin)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(sin)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(sin)
+
 BFLOAT16_MATH_FP32_WRAPPERS(sqrt)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(sqrt)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(sqrt)
+
 BFLOAT16_MATH_FP32_WRAPPERS(trunc)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(trunc)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(trunc)
 
 #undef BFLOAT16_MATH_FP32_WRAPPERS
 #undef BFLOAT16_MATH_FP32_WRAPPERS_MARRAY
+#undef BFLOAT16_MATH_FP32_WRAPPERS_VEC
 } // namespace ext::oneapi::experimental
 } // namespace _V1
 } // namespace sycl
diff --git a/sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp b/sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp
new file mode 100644
index 0000000000000..481aa35e3cedf
--- /dev/null
+++ b/sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp
@@ -0,0 +1,278 @@
+// RUN: %{build} -fno-fast-math -o %t.out
+// RUN: %{run} %t.out
+
+// Test new, ABI-breaking for all platforms.
+// RUN:  %if preview-breaking-changes-supported %{  %{build} -fpreview-breaking-changes -o %t-pfrev.out %}
+// RUN:  %if preview-breaking-changes-supported %{  %{run} %t-pfrev.out  %}
+
+#include <sycl/detail/core.hpp>
+#include <sycl/ext/oneapi/experimental/bfloat16_math.hpp>
+
+#include <cmath>
+#include <iostream>
+#include <vector>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi;
+using namespace sycl::ext::oneapi::experimental;
+
+constexpr float bf16_eps = 0.00390625;
+
+bool check(float a, float b) {
+  return sycl::fabs(2 * (a - b) / (a + b)) > bf16_eps * 2;
+}
+
+bool check(bool a, bool b) { return (a != b); }
+
+#define TEST_UNARY_OP(NAME, SZ, RETTY, INPVAL)                                 \
+  {                                                                            \
+    vec<bfloat16, SZ> arg;                                                     \
+    /* Initialize the vector with INPVAL */                                    \
+    for (int i = 0; i < SZ; i++) {                                             \
+      arg[i] = INPVAL;                                                         \
+    }                                                                          \
+    /* Perform the operation. */                                               \              
+    vec<RETTY, SZ>                                                             \
+        res = sycl::ext::oneapi::experimental::NAME(arg);                      \
+    vec<RETTY, 2> res2 =                                                       \
+        sycl::ext::oneapi::experimental::NAME(arg.template swizzle<0, 0>());   \
+    /* Check the result. */                                                    \                   
+    if (res2[0] != res[0] || res2[1] != res[0]) {                              \
+      ERR[0] += 1;                                                             \
+    }                                                                          \
+    for (int i = 0; i < SZ; i++) {                                             \
+      if (check(res[i], sycl::NAME(INPVAL))) {                                 \
+        ERR[0] += 1;                                                           \
+      }                                                                        \
+    }                                                                          \
+  }
+
+#define TEST_BINARY_OP(NAME, SZ, RETTY, INPVAL)                                \
+  {                                                                            \
+    vec<bfloat16, SZ> arg, arg2;                                               \
+    bfloat16 inpVal2 = 1.0f;                                                   \
+    /* Initialize the vector with INPVAL */                                    \
+    for (int i = 0; i < SZ; i++) {                                             \
+      arg[i] = INPVAL;                                                         \
+      arg2[i] = inpVal2;                                                       \
+    }                                                                          \
+    /* Perform the operation. */                                               \              
+    vec<RETTY, SZ>                                                             \
+        res = sycl::ext::oneapi::experimental::NAME(arg, arg2);                \
+    /* Swizzle and vec different combination. */                               \
+    vec<RETTY, 2> res2 = sycl::ext::oneapi::experimental::NAME(                \
+        arg.template swizzle<0, 0>(), arg2.template swizzle<0, 0>());          \
+    vec<RETTY, 2> res3 = sycl::ext::oneapi::experimental::NAME(                \
+        vec<bfloat16, 2>(arg[0], arg[0]), arg2.template swizzle<0, 0>());      \
+    vec<RETTY, 2> res4 = sycl::ext::oneapi::experimental::NAME(                \
+        arg.template swizzle<0, 0>(), vec<bfloat16, 2>(arg2[0], arg2[0]));     \
+    /* Check the result. */                                                    \
+    if (res2[0] != res[0] || res2[1] != res[0] || res3[0] != res[0] ||         \
+        res3[1] != res[0] || res4[0] != res[0] || res4[1] != res[0]) {         \
+      ERR[0] += 1;                                                             \
+    }                                                                          \
+    for (int i = 0; i < SZ; i++) {                                             \
+      if (check(res[i], sycl::NAME(INPVAL, inpVal2))) {                        \
+        ERR[0] += 1;                                                           \
+      }                                                                        \
+    }                                                                          \
+  }
+
+#define TEST_BUILTIN_VEC(NAME, SZ, RETTY, INPVAL, OPTEST)                      \
+  { /* On Device */                                                            \
+    buffer<int> err_buf(&err, 1);                                              \
+    q.submit([&](handler &cgh) {                                               \
+       accessor<int, 1, access::mode::write, target::device> ERR(err_buf,      \
+                                                                 cgh);         \
+       cgh.single_task([=]() { OPTEST(NAME, SZ, RETTY, INPVAL) });             \
+     }).wait();                                                                \
+  }                                                                            \
+  assert(err == 0);                                                            \
+  { /* On Host */                                                              \
+    int ERR[1] = {0};                                                          \
+    OPTEST(NAME, SZ, RETTY, INPVAL)                                            \
+    assert(ERR[0] == 0);                                                       \
+  }
+
+#define TEST_BUILTIN_UNARY(NAME, RETTY, INPVAL)                                \
+  TEST_BUILTIN_VEC(NAME, 1, RETTY, INPVAL, TEST_UNARY_OP)                      \
+  TEST_BUILTIN_VEC(NAME, 2, RETTY, INPVAL, TEST_UNARY_OP)                      \
+  TEST_BUILTIN_VEC(NAME, 3, RETTY, INPVAL, TEST_UNARY_OP)                      \
+  TEST_BUILTIN_VEC(NAME, 4, RETTY, INPVAL, TEST_UNARY_OP)                      \
+  TEST_BUILTIN_VEC(NAME, 8, RETTY, INPVAL, TEST_UNARY_OP)                      \
+  TEST_BUILTIN_VEC(NAME, 16, RETTY, INPVAL, TEST_UNARY_OP)
+
+#define TEST_BUILTIN_BINARY(NAME, RETTY, INPVAL)                               \
+  TEST_BUILTIN_VEC(NAME, 1, RETTY, INPVAL, TEST_BINARY_OP)                     \
+  TEST_BUILTIN_VEC(NAME, 2, RETTY, INPVAL, TEST_BINARY_OP)                     \
+  TEST_BUILTIN_VEC(NAME, 3, RETTY, INPVAL, TEST_BINARY_OP)                     \
+  TEST_BUILTIN_VEC(NAME, 4, RETTY, INPVAL, TEST_BINARY_OP)                     \
+  TEST_BUILTIN_VEC(NAME, 8, RETTY, INPVAL, TEST_BINARY_OP)                     \
+  TEST_BUILTIN_VEC(NAME, 16, RETTY, INPVAL, TEST_BINARY_OP)
+
+void test() {
+  queue q;
+  int err = 0;
+  float nan = std::nanf("");
+
+  // Test isnan on host
+  {
+    vec<bfloat16, 3> arg{1.0f, nan, 2.0f};
+    vec<int16_t, 3> res = sycl::ext::oneapi::experimental::isnan(arg);
+    assert((res[0] == 0 && res[1] == -1 && res[2] == 0) &&
+           "isnan() failed on host for vec");
+
+    // Test for swizzles
+    vec<int16_t, 2> res2 = sycl::ext::oneapi::experimental::isnan(arg.lo());
+    assert((res2[0] == 0 && res2[1] == -1) &&
+           "isnan() failed on host for vec swizzles");
+  }
+
+  // Tets isnan on device.
+  {
+    buffer<int> err_buf(&err, 1);
+    q.submit([&](handler &cgh) {
+       accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh);
+       cgh.single_task([=]() {
+         vec<bfloat16, 3> arg{1.0f, nan, 2.0f};
+         vec<int16_t, 3> res = sycl::ext::oneapi::experimental::isnan(arg);
+         if (res[0] != 0 || res[1] != -1 || res[2] != 0) {
+           ERR[0] += 1;
+         }
+       });
+     }).wait();
+    assert(err == 0 && "isnan failed on device for vec");
+  }
+
+  // Unary math builtins.
+  TEST_BUILTIN_UNARY(fabs, bfloat16, -1.0f);
+  TEST_BUILTIN_UNARY(fabs, bfloat16, 1.0f);
+
+  TEST_BUILTIN_UNARY(cos, bfloat16, 0.1f);
+  TEST_BUILTIN_UNARY(sin, bfloat16, 0.2f);
+
+  TEST_BUILTIN_UNARY(ceil, bfloat16, 0.9f);
+  TEST_BUILTIN_UNARY(floor, bfloat16, 0.9f);
+  TEST_BUILTIN_UNARY(trunc, bfloat16, 0.9f);
+  TEST_BUILTIN_UNARY(exp, bfloat16, 0.9f);
+  TEST_BUILTIN_UNARY(exp10, bfloat16, 0.9f);
+  TEST_BUILTIN_UNARY(exp2, bfloat16, 0.9f);
+  TEST_BUILTIN_UNARY(rint, bfloat16, 0.9f);
+
+  TEST_BUILTIN_UNARY(sqrt, bfloat16, 0.9f);
+  TEST_BUILTIN_UNARY(rsqrt, bfloat16, 0.9f);
+  TEST_BUILTIN_UNARY(log, bfloat16, 20.0f);
+  TEST_BUILTIN_UNARY(log2, bfloat16, 2.0f);
+  TEST_BUILTIN_UNARY(log10, bfloat16, 2.0f);
+
+  TEST_BUILTIN_BINARY(fmin, bfloat16, 0.9f);
+  TEST_BUILTIN_BINARY(fmax, bfloat16, 0.9f);
+  TEST_BUILTIN_BINARY(fmin, bfloat16, nan);
+  TEST_BUILTIN_BINARY(fmax, bfloat16, nan);
+
+  // Test fma operation on host.
+  {
+    vec<bfloat16, 3> arg1, arg2, arg3;
+    bfloat16 inpVal1 = 1.0f;
+    bfloat16 inpVal2 = 2.0f;
+    bfloat16 inpVal3 = 3.0f;
+    /* Initialize the vector with INPVAL */
+    for (int i = 0; i < 3; i++) {
+      arg1[i] = inpVal1;
+      arg2[i] = inpVal2;
+      arg3[i] = inpVal3;
+    }
+    /* Perform the operation. */
+    auto res = sycl::ext::oneapi::experimental::fma(arg1, arg2, arg3);
+
+    // Test different combination of vec an swizzle.
+    auto res1 = sycl::ext::oneapi::experimental::fma(
+        arg1.template swizzle<0, 0>(), arg2.template swizzle<0, 0>(),
+        arg3.template swizzle<0, 0>());
+
+    auto res2 = sycl::ext::oneapi::experimental::fma(
+        vec<bfloat16, 2>(arg1[0], arg1[0]), arg2.template swizzle<0, 0>(),
+        arg3.template swizzle<0, 0>());
+
+    auto res3 = sycl::ext::oneapi::experimental::fma(
+        arg1.template swizzle<0, 0>(), vec<bfloat16, 2>(arg2[0], arg2[0]),
+        arg3.template swizzle<0, 0>());
+
+    auto res4 = sycl::ext::oneapi::experimental::fma(
+        arg1.template swizzle<0, 0>(), arg2.template swizzle<0, 0>(),
+        vec<bfloat16, 2>(arg3[0], arg3[0]));
+
+    /* Check the result. */
+    if (res1[0] != res[0] || res1[1] != res[0] || res2[0] != res[0] ||
+        res2[1] != res[0] || res3[0] != res[0] || res3[1] != res[0] ||
+        res4[0] != res[0] || res4[1] != res[0]) {
+      err += 1;
+    }
+    for (int i = 0; i < 3; i++) {
+      if (check(res[i], sycl::ext::oneapi::experimental::fma(inpVal1, inpVal2,
+                                                             inpVal3))) {
+        err += 1;
+      }
+    }
+    assert(err == 0);
+  }
+
+  // Test fma on device.
+  {
+    buffer<int> err_buf(&err, 1);
+    q.submit([&](handler &cgh) {
+       accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh);
+       cgh.single_task([=]() {
+         vec<bfloat16, 3> arg1, arg2, arg3;
+         bfloat16 inpVal1 = 1.0f;
+         bfloat16 inpVal2 = 2.0f;
+         bfloat16 inpVal3 = 3.0f;
+         /* Initialize the vector with INPVAL */
+         for (int i = 0; i < 3; i++) {
+           arg1[i] = inpVal1;
+           arg2[i] = inpVal2;
+           arg3[i] = inpVal3;
+         }
+         /* Perform the operation. */
+         auto res = sycl::ext::oneapi::experimental::fma(arg1, arg2, arg3);
+
+         // Test different combination of vec an swizzle.
+         auto res1 = sycl::ext::oneapi::experimental::fma(
+             arg1.template swizzle<0, 0>(), arg2.template swizzle<0, 0>(),
+             arg3.template swizzle<0, 0>());
+
+         auto res2 = sycl::ext::oneapi::experimental::fma(
+             vec<bfloat16, 2>(arg1[0], arg1[0]), arg2.template swizzle<0, 0>(),
+             arg3.template swizzle<0, 0>());
+
+         auto res3 = sycl::ext::oneapi::experimental::fma(
+             arg1.template swizzle<0, 0>(), vec<bfloat16, 2>(arg2[0], arg2[0]),
+             arg3.template swizzle<0, 0>());
+
+         auto res4 = sycl::ext::oneapi::experimental::fma(
+             arg1.template swizzle<0, 0>(), arg2.template swizzle<0, 0>(),
+             vec<bfloat16, 2>(arg3[0], arg3[0]));
+
+         /* Check the result. */
+         if (res1[0] != res[0] || res1[1] != res[0] || res2[0] != res[0] ||
+             res2[1] != res[0] || res3[0] != res[0] || res3[1] != res[0] ||
+             res4[0] != res[0] || res4[1] != res[0]) {
+           ERR[0] += 1;
+         }
+         for (int i = 0; i < 3; i++) {
+           if (check(res[i], sycl::ext::oneapi::experimental::fma(
+                                 inpVal1, inpVal2, inpVal3))) {
+             ERR[0] += 1;
+           }
+         }
+       });
+     }).wait();
+    assert(err == 0);
+  }
+}
+
+int main() {
+
+  test();
+  return 0;
+}
diff --git a/sycl/test/check_device_code/vector/vector_bf16_builtins.cpp b/sycl/test/check_device_code/vector/vector_bf16_builtins.cpp
new file mode 100644
index 0000000000000..6aea590b6155c
--- /dev/null
+++ b/sycl/test/check_device_code/vector/vector_bf16_builtins.cpp
@@ -0,0 +1,377 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// NOTE: ..., followed by some manual cleanup.
+
+// Had to increase inline threashold for this test to force inline of the vec<>
+// math builtins.
+// RUN: %clangxx -I %sycl_include -mllvm -inline-threshold=400 -fno-discard-value-names -S -emit-llvm -fno-sycl-instrument-device-code -Xclang -disable-lifetime-markers -O3 -fsycl-device-only %s -o - | FileCheck %s
+
+// This test checks the device code generated for vec<bfloat16> math builtins.
+#include <sycl/detail/core.hpp>
+#include <sycl/ext/oneapi/experimental/bfloat16_math.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi;
+using namespace sycl::ext::oneapi::experimental;
+
+// CHECK-LABEL: define dso_local spir_func void @_Z8TestFMinN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi2EEES5_(
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias writable sret(%"class.sycl::_V1::vec") align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec") align 4 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec") align 4 [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !srcloc [[META5:![0-9]+]] !sycl_fixed_targets [[META6:![0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[AGG_TMP111_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8
+// CHECK-NEXT:    [[AGG_TMP10_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8
+// CHECK-NEXT:    [[AGG_TMP13:%.*]] = alloca %"class.sycl::_V1::vec", align 8
+// CHECK-NEXT:    [[AGG_TMP2:%.*]] = alloca %"class.sycl::_V1::vec", align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A]], align 4, !tbaa [[TBAA7:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B]], align 4, !tbaa [[TBAA7]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[AGG_TMP2]])
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[AGG_TMP13]])
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[AGG_TMP13]], align 1
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[AGG_TMP2]], align 1
+// CHECK-NEXT:    [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP2]] to ptr addrspace(4)
+// CHECK-NEXT:    [[Y_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP13]] to ptr addrspace(4)
+// CHECK-NEXT:    [[X_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP10_I]] to ptr addrspace(4)
+// CHECK-NEXT:    [[Y_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP111_I]] to ptr addrspace(4)
+// CHECK-NEXT:    br label [[FOR_COND_I:%.*]]
+// CHECK:       for.cond.i:
+// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I:%.*]] ]
+// CHECK-NEXT:    [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 2
+// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I:%.*]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS0_3VECINS2_8BFLOAT16ELI2EEES7_LI2ELI2EEENST9ENABLE_IFIXAAAA24IS_VEC_OR_SWIZZLE_BF16_VIT_E24IS_VEC_OR_SWIZZLE_BF16_VIT0_EEQT1_T2_ENS5_IS6_XT1_EEEE4TYPEES9_SA__EXIT:%.*]]
+// CHECK:       for.body.i:
+// CHECK-NEXT:    [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32
+// CHECK-NEXT:    [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi2EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 4 dereferenceable_or_null(4) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7:[0-9]+]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10:![0-9]+]]
+// CHECK-NEXT:    [[CALL3_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi2EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 4 dereferenceable_or_null(4) [[Y_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr addrspace(4) [[CALL3_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP10_I]]), !noalias [[META12:![0-9]+]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP111_I]]), !noalias [[META12]]
+// CHECK-NEXT:    store i16 [[TMP3]], ptr [[AGG_TMP111_I]], align 1, !noalias [[META12]]
+// CHECK-NEXT:    store i16 [[TMP2]], ptr [[AGG_TMP10_I]], align 1, !noalias [[META12]]
+// CHECK-NEXT:    [[CONV_I_I_I:%.*]] = zext i16 [[TMP2]] to i32
+// CHECK-NEXT:    [[AND_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 32640
+// CHECK-NEXT:    [[CMP_I_I_I:%.*]] = icmp eq i32 [[AND_I_I_I]], 32640
+// CHECK-NEXT:    [[AND2_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 127
+// CHECK-NEXT:    [[TOBOOL_I_I_I:%.*]] = icmp ne i32 [[AND2_I_I_I]], 0
+// CHECK-NEXT:    [[TMP4:%.*]] = and i1 [[CMP_I_I_I]], [[TOBOOL_I_I_I]]
+// CHECK-NEXT:    br i1 [[TMP4]], label [[LAND_LHS_TRUE_I_I:%.*]], label [[IF_END6_I_I:%.*]]
+// CHECK:       land.lhs.true.i.i:
+// CHECK-NEXT:    [[CONV_I25_I_I:%.*]] = zext i16 [[TMP3]] to i32
+// CHECK-NEXT:    [[AND_I26_I_I:%.*]] = and i32 [[CONV_I25_I_I]], 32640
+// CHECK-NEXT:    [[CMP_I27_I_I:%.*]] = icmp eq i32 [[AND_I26_I_I]], 32640
+// CHECK-NEXT:    [[AND2_I28_I_I:%.*]] = and i32 [[CONV_I25_I_I]], 127
+// CHECK-NEXT:    [[TOBOOL_I29_I_I:%.*]] = icmp ne i32 [[AND2_I28_I_I]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = and i1 [[CMP_I27_I_I]], [[TOBOOL_I29_I_I]]
+// CHECK-NEXT:    [[SPEC_SELECT_I:%.*]] = select i1 [[TMP5]], i16 32704, i16 [[TMP3]]
+// CHECK-NEXT:    br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]]
+// CHECK:       if.end6.i.i:
+// CHECK-NEXT:    [[CONV_I39_I_I:%.*]] = zext i16 [[TMP3]] to i32
+// CHECK-NEXT:    [[AND_I40_I_I:%.*]] = and i32 [[CONV_I39_I_I]], 32640
+// CHECK-NEXT:    [[CMP_I41_I_I:%.*]] = icmp eq i32 [[AND_I40_I_I]], 32640
+// CHECK-NEXT:    [[AND2_I42_I_I:%.*]] = and i32 [[CONV_I39_I_I]], 127
+// CHECK-NEXT:    [[TOBOOL_I43_I_I:%.*]] = icmp ne i32 [[AND2_I42_I_I]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = and i1 [[CMP_I41_I_I]], [[TOBOOL_I43_I_I]]
+// CHECK-NEXT:    br i1 [[TMP6]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]], label [[IF_END10_I_I:%.*]]
+// CHECK:       if.end10.i.i:
+// CHECK-NEXT:    [[OR_I_I:%.*]] = or i32 [[CONV_I_I_I]], [[CONV_I39_I_I]]
+// CHECK-NEXT:    [[CMP_I_I:%.*]] = icmp eq i32 [[OR_I_I]], 32768
+// CHECK-NEXT:    [[AND_I_I:%.*]] = and i32 [[CONV_I_I_I]], [[CONV_I39_I_I]]
+// CHECK-NEXT:    [[TOBOOL_NOT_I_I:%.*]] = icmp eq i32 [[AND_I_I]], 0
+// CHECK-NEXT:    [[OR_COND_I_I:%.*]] = and i1 [[CMP_I_I]], [[TOBOOL_NOT_I_I]]
+// CHECK-NEXT:    br i1 [[OR_COND_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]], label [[IF_END18_I_I:%.*]]
+// CHECK:       if.end18.i.i:
+// CHECK-NEXT:    [[CALL_I_I_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[X_ASCAST_I_I]]) #[[ATTR8:[0-9]+]], !noalias [[META15:![0-9]+]]
+// CHECK-NEXT:    [[CALL_I_I2_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[Y_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META15]]
+// CHECK-NEXT:    [[CMP_I44_I_I:%.*]] = fcmp olt float [[CALL_I_I_I_I_I]], [[CALL_I_I2_I_I_I]]
+// CHECK-NEXT:    [[X_ASCAST_VAL_I_I:%.*]] = load i16, ptr [[AGG_TMP10_I]], align 2, !noalias [[META18:![0-9]+]]
+// CHECK-NEXT:    [[Y_ASCAST_VAL_I_I:%.*]] = load i16, ptr [[AGG_TMP111_I]], align 2, !noalias [[META18]]
+// CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[CMP_I44_I_I]], i16 [[X_ASCAST_VAL_I_I]], i16 [[Y_ASCAST_VAL_I_I]]
+// CHECK-NEXT:    br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]]
+// CHECK:       _ZN4sycl3_V13ext6oneapi12experimental4fminINS2_8bfloat16EEENSt9enable_ifIXsr3stdE9is_same_vIT_S5_EES7_E4typeES7_S7_.exit.i:
+// CHECK-NEXT:    [[REF_TMP_SROA_0_0_I:%.*]] = phi i16 [ [[TMP7]], [[IF_END18_I_I]] ], [ [[TMP2]], [[IF_END6_I_I]] ], [ -32768, [[IF_END10_I_I]] ], [ [[SPEC_SELECT_I]], [[LAND_LHS_TRUE_I_I]] ]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP10_I]]), !noalias [[META12]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP111_I]]), !noalias [[META12]]
+// CHECK-NEXT:    [[CALL5_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi2EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 4 dereferenceable_or_null(4) [[AGG_RESULT]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    store i16 [[REF_TMP_SROA_0_0_I]], ptr addrspace(4) [[CALL5_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1
+// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP19:![0-9]+]]
+// CHECK:       _ZN4sycl3_V13ext6oneapi12experimental4fminINS0_3vecINS2_8bfloat16ELi2EEES7_Li2ELi2EEENSt9enable_ifIXaaaa24is_vec_or_swizzle_bf16_vIT_E24is_vec_or_swizzle_bf16_vIT0_EeqT1_T2_ENS5_IS6_XT1_EEEE4typeES9_SA_.exit:
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[AGG_TMP2]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[AGG_TMP13]])
+// CHECK-NEXT:    ret void
+//
+SYCL_EXTERNAL auto TestFMin(vec<bfloat16, 2> a, vec<bfloat16, 2> b) {
+  return experimental::fmin(a, b);
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z8TestFMaxN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEES5_(
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias writable sret(%"class.sycl::_V1::vec.0") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.0") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.0") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META22:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[AGG_TMP111_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8
+// CHECK-NEXT:    [[AGG_TMP10_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8
+// CHECK-NEXT:    [[AGG_TMP13:%.*]] = alloca %"class.sycl::_V1::vec.0", align 8
+// CHECK-NEXT:    [[AGG_TMP2:%.*]] = alloca %"class.sycl::_V1::vec.0", align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A]], align 8, !tbaa [[TBAA7]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[B]], align 8, !tbaa [[TBAA7]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[AGG_TMP2]])
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[AGG_TMP13]])
+// CHECK-NEXT:    store i64 [[TMP1]], ptr [[AGG_TMP13]], align 1
+// CHECK-NEXT:    store i64 [[TMP0]], ptr [[AGG_TMP2]], align 1
+// CHECK-NEXT:    [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP2]] to ptr addrspace(4)
+// CHECK-NEXT:    [[Y_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP13]] to ptr addrspace(4)
+// CHECK-NEXT:    [[X_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP10_I]] to ptr addrspace(4)
+// CHECK-NEXT:    [[Y_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP111_I]] to ptr addrspace(4)
+// CHECK-NEXT:    br label [[FOR_COND_I:%.*]]
+// CHECK:       for.cond.i:
+// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I:%.*]] ]
+// CHECK-NEXT:    [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 3
+// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I:%.*]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS0_3VECINS2_8BFLOAT16ELI3EEES7_LI3ELI3EEENST9ENABLE_IFIXAAAA24IS_VEC_OR_SWIZZLE_BF16_VIT_E24IS_VEC_OR_SWIZZLE_BF16_VIT0_EEQT1_T2_ENS5_IS6_XT1_EEEE4TYPEES9_SA__EXIT:%.*]]
+// CHECK:       for.body.i:
+// CHECK-NEXT:    [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32
+// CHECK-NEXT:    [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    [[CALL3_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) [[Y_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr addrspace(4) [[CALL3_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP10_I]]), !noalias [[META23:![0-9]+]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP111_I]]), !noalias [[META23]]
+// CHECK-NEXT:    store i16 [[TMP3]], ptr [[AGG_TMP111_I]], align 1, !noalias [[META23]]
+// CHECK-NEXT:    store i16 [[TMP2]], ptr [[AGG_TMP10_I]], align 1, !noalias [[META23]]
+// CHECK-NEXT:    [[CONV_I_I_I:%.*]] = zext i16 [[TMP2]] to i32
+// CHECK-NEXT:    [[AND_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 32640
+// CHECK-NEXT:    [[CMP_I_I_I:%.*]] = icmp eq i32 [[AND_I_I_I]], 32640
+// CHECK-NEXT:    [[AND2_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 127
+// CHECK-NEXT:    [[TOBOOL_I_I_I:%.*]] = icmp ne i32 [[AND2_I_I_I]], 0
+// CHECK-NEXT:    [[TMP4:%.*]] = and i1 [[CMP_I_I_I]], [[TOBOOL_I_I_I]]
+// CHECK-NEXT:    br i1 [[TMP4]], label [[LAND_LHS_TRUE_I_I:%.*]], label [[IF_END6_I_I:%.*]]
+// CHECK:       land.lhs.true.i.i:
+// CHECK-NEXT:    [[CONV_I25_I_I:%.*]] = zext i16 [[TMP3]] to i32
+// CHECK-NEXT:    [[AND_I26_I_I:%.*]] = and i32 [[CONV_I25_I_I]], 32640
+// CHECK-NEXT:    [[CMP_I27_I_I:%.*]] = icmp eq i32 [[AND_I26_I_I]], 32640
+// CHECK-NEXT:    [[AND2_I28_I_I:%.*]] = and i32 [[CONV_I25_I_I]], 127
+// CHECK-NEXT:    [[TOBOOL_I29_I_I:%.*]] = icmp ne i32 [[AND2_I28_I_I]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = and i1 [[CMP_I27_I_I]], [[TOBOOL_I29_I_I]]
+// CHECK-NEXT:    [[SPEC_SELECT_I:%.*]] = select i1 [[TMP5]], i16 32704, i16 [[TMP3]]
+// CHECK-NEXT:    br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]]
+// CHECK:       if.end6.i.i:
+// CHECK-NEXT:    [[CONV_I39_I_I:%.*]] = zext i16 [[TMP3]] to i32
+// CHECK-NEXT:    [[AND_I40_I_I:%.*]] = and i32 [[CONV_I39_I_I]], 32640
+// CHECK-NEXT:    [[CMP_I41_I_I:%.*]] = icmp eq i32 [[AND_I40_I_I]], 32640
+// CHECK-NEXT:    [[AND2_I42_I_I:%.*]] = and i32 [[CONV_I39_I_I]], 127
+// CHECK-NEXT:    [[TOBOOL_I43_I_I:%.*]] = icmp ne i32 [[AND2_I42_I_I]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = and i1 [[CMP_I41_I_I]], [[TOBOOL_I43_I_I]]
+// CHECK-NEXT:    br i1 [[TMP6]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]], label [[IF_END10_I_I:%.*]]
+// CHECK:       if.end10.i.i:
+// CHECK-NEXT:    [[OR_I_I:%.*]] = or i32 [[CONV_I_I_I]], [[CONV_I39_I_I]]
+// CHECK-NEXT:    [[CMP_I_I:%.*]] = icmp eq i32 [[OR_I_I]], 32768
+// CHECK-NEXT:    [[AND_I_I:%.*]] = and i32 [[CONV_I_I_I]], [[CONV_I39_I_I]]
+// CHECK-NEXT:    [[TOBOOL_NOT_I_I:%.*]] = icmp eq i32 [[AND_I_I]], 0
+// CHECK-NEXT:    [[OR_COND_I_I:%.*]] = and i1 [[CMP_I_I]], [[TOBOOL_NOT_I_I]]
+// CHECK-NEXT:    br i1 [[OR_COND_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]], label [[IF_END18_I_I:%.*]]
+// CHECK:       if.end18.i.i:
+// CHECK-NEXT:    [[CALL_I_I_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[X_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META26:![0-9]+]]
+// CHECK-NEXT:    [[CALL_I_I2_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[Y_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META26]]
+// CHECK-NEXT:    [[CMP_I44_I_I:%.*]] = fcmp ogt float [[CALL_I_I_I_I_I]], [[CALL_I_I2_I_I_I]]
+// CHECK-NEXT:    [[X_ASCAST_VAL_I_I:%.*]] = load i16, ptr [[AGG_TMP10_I]], align 2, !noalias [[META29:![0-9]+]]
+// CHECK-NEXT:    [[Y_ASCAST_VAL_I_I:%.*]] = load i16, ptr [[AGG_TMP111_I]], align 2, !noalias [[META29]]
+// CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[CMP_I44_I_I]], i16 [[X_ASCAST_VAL_I_I]], i16 [[Y_ASCAST_VAL_I_I]]
+// CHECK-NEXT:    br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]]
+// CHECK:       _ZN4sycl3_V13ext6oneapi12experimental4fmaxINS2_8bfloat16EEENSt9enable_ifIXsr3stdE9is_same_vIT_S5_EES7_E4typeES7_S7_.exit.i:
+// CHECK-NEXT:    [[REF_TMP_SROA_0_0_I:%.*]] = phi i16 [ [[TMP7]], [[IF_END18_I_I]] ], [ [[TMP2]], [[IF_END6_I_I]] ], [ 0, [[IF_END10_I_I]] ], [ [[SPEC_SELECT_I]], [[LAND_LHS_TRUE_I_I]] ]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP10_I]]), !noalias [[META23]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP111_I]]), !noalias [[META23]]
+// CHECK-NEXT:    [[CALL5_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) [[AGG_RESULT]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    store i16 [[REF_TMP_SROA_0_0_I]], ptr addrspace(4) [[CALL5_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1
+// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP30:![0-9]+]]
+// CHECK:       _ZN4sycl3_V13ext6oneapi12experimental4fmaxINS0_3vecINS2_8bfloat16ELi3EEES7_Li3ELi3EEENSt9enable_ifIXaaaa24is_vec_or_swizzle_bf16_vIT_E24is_vec_or_swizzle_bf16_vIT0_EeqT1_T2_ENS5_IS6_XT1_EEEE4typeES9_SA_.exit:
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[AGG_TMP2]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[AGG_TMP13]])
+// CHECK-NEXT:    ret void
+//
+SYCL_EXTERNAL auto TestFMax(vec<bfloat16, 3> a, vec<bfloat16, 3> b) {
+  return experimental::fmax(a, b);
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z9TestIsNanN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi4EEE(
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.1") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.2") align 8 [[A:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] !srcloc [[META31:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[AGG_TMP1:%.*]] = alloca %"class.sycl::_V1::vec.2", align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A]], align 8, !tbaa [[TBAA7]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META32:![0-9]+]])
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[AGG_TMP1]])
+// CHECK-NEXT:    store i64 [[TMP0]], ptr [[AGG_TMP1]], align 1
+// CHECK-NEXT:    [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP1]] to ptr addrspace(4)
+// CHECK-NEXT:    br label [[FOR_COND_I:%.*]]
+// CHECK:       for.cond.i:
+// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ]
+// CHECK-NEXT:    [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 4
+// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL5ISNANINS0_3VECINS2_8BFLOAT16ELI4EEELI4EEENST9ENABLE_IFIX24IS_VEC_OR_SWIZZLE_BF16_VIT_EENS5_ISXT0_EEEE4TYPEES9__EXIT:%.*]]
+// CHECK:       for.body.i:
+// CHECK-NEXT:    [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32
+// CHECK-NEXT:    [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi4EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]], !noalias [[META32]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10]], !noalias [[META32]]
+// CHECK-NEXT:    [[CONV_I_I:%.*]] = zext i16 [[TMP1]] to i32
+// CHECK-NEXT:    [[AND_I_I:%.*]] = and i32 [[CONV_I_I]], 32640
+// CHECK-NEXT:    [[CMP_I_I:%.*]] = icmp eq i32 [[AND_I_I]], 32640
+// CHECK-NEXT:    [[AND2_I_I:%.*]] = and i32 [[CONV_I_I]], 127
+// CHECK-NEXT:    [[TOBOOL_I_I:%.*]] = icmp ne i32 [[AND2_I_I]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = and i1 [[CMP_I_I]], [[TOBOOL_I_I]]
+// CHECK-NEXT:    [[CONV2_I:%.*]] = sext i1 [[TMP2]] to i16
+// CHECK-NEXT:    [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(4) [[AGG_RESULT]], i64 [[I_0_I]]
+// CHECK-NEXT:    store i16 [[CONV2_I]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA10]], !alias.scope [[META32]]
+// CHECK-NEXT:    [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1
+// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP35:![0-9]+]]
+// CHECK:       _ZN4sycl3_V13ext6oneapi12experimental5isnanINS0_3vecINS2_8bfloat16ELi4EEELi4EEENSt9enable_ifIX24is_vec_or_swizzle_bf16_vIT_EENS5_IsXT0_EEEE4typeES9_.exit:
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[AGG_TMP1]])
+// CHECK-NEXT:    ret void
+//
+SYCL_EXTERNAL auto TestIsNan(vec<bfloat16, 4> a) {
+  return experimental::isnan(a);
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z8TestFabsN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEE(
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias writable sret(%"class.sycl::_V1::vec.3") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.3") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR3]] !srcloc [[META36:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[AGG_TMP1:%.*]] = alloca %"class.sycl::_V1::vec.3", align 16
+// CHECK-NEXT:    [[AGG_TMP_SROA_0_0_COPYLOAD:%.*]] = load <8 x i16>, ptr [[A]], align 16, !tbaa [[TBAA7]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[AGG_TMP1]])
+// CHECK-NEXT:    store <8 x i16> [[AGG_TMP_SROA_0_0_COPYLOAD]], ptr [[AGG_TMP1]], align 1
+// CHECK-NEXT:    [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP1]] to ptr addrspace(4)
+// CHECK-NEXT:    br label [[FOR_COND_I:%.*]]
+// CHECK:       for.cond.i:
+// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ]
+// CHECK-NEXT:    [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 8
+// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FABSINS0_3VECINS2_8BFLOAT16ELI8EEELI8EEENST9ENABLE_IFIX24IS_VEC_OR_SWIZZLE_BF16_VIT_EENS5_IS6_XT0_EEEE4TYPEES9__EXIT:%.*]]
+// CHECK:       for.body.i:
+// CHECK-NEXT:    [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32
+// CHECK-NEXT:    [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 16 dereferenceable_or_null(16) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    [[CONV_I_I_I:%.*]] = zext i16 [[TMP0]] to i32
+// CHECK-NEXT:    [[AND_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 32640
+// CHECK-NEXT:    [[CMP_I_I_I:%.*]] = icmp eq i32 [[AND_I_I_I]], 32640
+// CHECK-NEXT:    [[AND2_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 127
+// CHECK-NEXT:    [[TOBOOL_I_I_I:%.*]] = icmp ne i32 [[AND2_I_I_I]], 0
+// CHECK-NEXT:    [[TMP1:%.*]] = and i1 [[CMP_I_I_I]], [[TOBOOL_I_I_I]]
+// CHECK-NEXT:    [[TMP2:%.*]] = and i16 [[TMP0]], 32767
+// CHECK-NEXT:    [[SPEC_SELECT_I_I:%.*]] = select i1 [[TMP1]], i16 [[TMP0]], i16 [[TMP2]]
+// CHECK-NEXT:    [[CALL2_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 16 dereferenceable_or_null(16) [[AGG_RESULT]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    store i16 [[SPEC_SELECT_I_I]], ptr addrspace(4) [[CALL2_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1
+// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP37:![0-9]+]]
+// CHECK:       _ZN4sycl3_V13ext6oneapi12experimental4fabsINS0_3vecINS2_8bfloat16ELi8EEELi8EEENSt9enable_ifIX24is_vec_or_swizzle_bf16_vIT_EENS5_IS6_XT0_EEEE4typeES9_.exit:
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr nonnull [[AGG_TMP1]])
+// CHECK-NEXT:    ret void
+//
+SYCL_EXTERNAL auto TestFabs(vec<bfloat16, 8> a) {
+  return experimental::fabs(a);
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z8TestCeilN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEE(
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias writable sret(%"class.sycl::_V1::vec.3") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.3") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META38:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[REF_TMP_I_I:%.*]] = alloca float, align 4
+// CHECK-NEXT:    [[AGG_TMP6_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8
+// CHECK-NEXT:    [[AGG_TMP1:%.*]] = alloca %"class.sycl::_V1::vec.3", align 16
+// CHECK-NEXT:    [[AGG_TMP_SROA_0_0_COPYLOAD:%.*]] = load <8 x i16>, ptr [[A]], align 16, !tbaa [[TBAA7]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[AGG_TMP1]])
+// CHECK-NEXT:    store <8 x i16> [[AGG_TMP_SROA_0_0_COPYLOAD]], ptr [[AGG_TMP1]], align 1
+// CHECK-NEXT:    [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP1]] to ptr addrspace(4)
+// CHECK-NEXT:    [[REF_TMP_ASCAST_I_I:%.*]] = addrspacecast ptr [[REF_TMP_I_I]] to ptr addrspace(4)
+// CHECK-NEXT:    [[X_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP6_I]] to ptr addrspace(4)
+// CHECK-NEXT:    br label [[FOR_COND_I:%.*]]
+// CHECK:       for.cond.i:
+// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ]
+// CHECK-NEXT:    [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 8
+// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4CEILINS0_3VECINS2_8BFLOAT16ELI8EEELI8EEENST9ENABLE_IFIX24IS_VEC_OR_SWIZZLE_BF16_VIT_EENS5_IS6_XT0_EEEE4TYPEES9__EXIT:%.*]]
+// CHECK:       for.body.i:
+// CHECK-NEXT:    [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32
+// CHECK-NEXT:    [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 16 dereferenceable_or_null(16) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP6_I]]), !noalias [[META39:![0-9]+]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META39]]
+// CHECK-NEXT:    store i16 [[TMP0]], ptr [[AGG_TMP6_I]], align 1, !noalias [[META39]]
+// CHECK-NEXT:    [[CALL_I_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[X_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META42:![0-9]+]]
+// CHECK-NEXT:    [[CALL_I_I_I:%.*]] = call spir_func noundef float @_Z16__spirv_ocl_ceilf(float noundef [[CALL_I_I_I_I]]) #[[ATTR9:[0-9]+]]
+// CHECK-NEXT:    store float [[CALL_I_I_I]], ptr [[REF_TMP_I_I]], align 4, !tbaa [[TBAA45:![0-9]+]], !noalias [[META47:![0-9]+]]
+// CHECK-NEXT:    [[CALL_I_I2_I_I:%.*]] = call spir_func noundef zeroext i16 @__devicelib_ConvertFToBF16INTEL(ptr addrspace(4) noundef align 4 dereferenceable(4) [[REF_TMP_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META42]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP6_I]]), !noalias [[META39]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META39]]
+// CHECK-NEXT:    [[CALL2_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 16 dereferenceable_or_null(16) [[AGG_RESULT]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    store i16 [[CALL_I_I2_I_I]], ptr addrspace(4) [[CALL2_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1
+// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP48:![0-9]+]]
+// CHECK:       _ZN4sycl3_V13ext6oneapi12experimental4ceilINS0_3vecINS2_8bfloat16ELi8EEELi8EEENSt9enable_ifIX24is_vec_or_swizzle_bf16_vIT_EENS5_IS6_XT0_EEEE4typeES9_.exit:
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr nonnull [[AGG_TMP1]])
+// CHECK-NEXT:    ret void
+//
+SYCL_EXTERNAL auto TestCeil(vec<bfloat16, 8> a) {
+  return experimental::ceil(a);
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z7TestFMAN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEES5_S5_(
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias writable sret(%"class.sycl::_V1::vec.4") align 32 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.4") align 32 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.4") align 32 [[B:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.4") align 32 [[C:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META49:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[REF_TMP_I_I:%.*]] = alloca float, align 4
+// CHECK-NEXT:    [[AGG_TMP416_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8
+// CHECK-NEXT:    [[AGG_TMP115_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8
+// CHECK-NEXT:    [[AGG_TMP14_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8
+// CHECK-NEXT:    [[AGG_TMP25:%.*]] = alloca %"class.sycl::_V1::vec.4", align 32
+// CHECK-NEXT:    [[AGG_TMP14:%.*]] = alloca %"class.sycl::_V1::vec.4", align 32
+// CHECK-NEXT:    [[AGG_TMP3:%.*]] = alloca %"class.sycl::_V1::vec.4", align 32
+// CHECK-NEXT:    [[AGG_TMP_SROA_0_0_COPYLOAD:%.*]] = load <16 x i16>, ptr [[A]], align 32, !tbaa [[TBAA7]]
+// CHECK-NEXT:    [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load <16 x i16>, ptr [[B]], align 32, !tbaa [[TBAA7]]
+// CHECK-NEXT:    [[AGG_TMP2_SROA_0_0_COPYLOAD:%.*]] = load <16 x i16>, ptr [[C]], align 32, !tbaa [[TBAA7]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[AGG_TMP3]])
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[AGG_TMP14]])
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[AGG_TMP25]])
+// CHECK-NEXT:    store <16 x i16> [[AGG_TMP2_SROA_0_0_COPYLOAD]], ptr [[AGG_TMP25]], align 1
+// CHECK-NEXT:    store <16 x i16> [[AGG_TMP1_SROA_0_0_COPYLOAD]], ptr [[AGG_TMP14]], align 1
+// CHECK-NEXT:    store <16 x i16> [[AGG_TMP_SROA_0_0_COPYLOAD]], ptr [[AGG_TMP3]], align 1
+// CHECK-NEXT:    [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP3]] to ptr addrspace(4)
+// CHECK-NEXT:    [[Y_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP14]] to ptr addrspace(4)
+// CHECK-NEXT:    [[Z_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP25]] to ptr addrspace(4)
+// CHECK-NEXT:    [[REF_TMP_ASCAST_I_I:%.*]] = addrspacecast ptr [[REF_TMP_I_I]] to ptr addrspace(4)
+// CHECK-NEXT:    [[X_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP14_I]] to ptr addrspace(4)
+// CHECK-NEXT:    [[Y_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP115_I]] to ptr addrspace(4)
+// CHECK-NEXT:    [[Z_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP416_I]] to ptr addrspace(4)
+// CHECK-NEXT:    br label [[FOR_COND_I:%.*]]
+// CHECK:       for.cond.i:
+// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ]
+// CHECK-NEXT:    [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 16
+// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL3FMAINS0_3VECINS2_8BFLOAT16ELI16EEES7_S7_LI16ELI16ELI16EEENST9ENABLE_IFIXAAAAAAAA24IS_VEC_OR_SWIZZLE_BF16_VIT_E24IS_VEC_OR_SWIZZLE_BF16_VIT0_E24IS_VEC_OR_SWIZZLE_BF16_VIT1_EEQT2_T3_EQT3_T4_ENS5_IS6_XT2_EEEE4TYPEES9_SA_SB__EXIT:%.*]]
+// CHECK:       for.body.i:
+// CHECK-NEXT:    [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32
+// CHECK-NEXT:    [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 32 dereferenceable_or_null(32) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    [[CALL3_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 32 dereferenceable_or_null(32) [[Y_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr addrspace(4) [[CALL3_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    [[CALL6_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 32 dereferenceable_or_null(32) [[Z_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[CALL6_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP14_I]]), !noalias [[META50:![0-9]+]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP115_I]]), !noalias [[META50]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP416_I]]), !noalias [[META50]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META50]]
+// CHECK-NEXT:    store i16 [[TMP2]], ptr [[AGG_TMP416_I]], align 1, !noalias [[META50]]
+// CHECK-NEXT:    store i16 [[TMP1]], ptr [[AGG_TMP115_I]], align 1, !noalias [[META50]]
+// CHECK-NEXT:    store i16 [[TMP0]], ptr [[AGG_TMP14_I]], align 1, !noalias [[META50]]
+// CHECK-NEXT:    [[CALL_I_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[X_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META53:![0-9]+]]
+// CHECK-NEXT:    [[CALL_I_I4_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[Y_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META53]]
+// CHECK-NEXT:    [[CALL_I_I5_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[Z_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META53]]
+// CHECK-NEXT:    [[CALL_I_I_I:%.*]] = call spir_func noundef float @_Z15__spirv_ocl_fmafff(float noundef [[CALL_I_I_I_I]], float noundef [[CALL_I_I4_I_I]], float noundef [[CALL_I_I5_I_I]]) #[[ATTR9]]
+// CHECK-NEXT:    store float [[CALL_I_I_I]], ptr [[REF_TMP_I_I]], align 4, !tbaa [[TBAA45]], !noalias [[META56:![0-9]+]]
+// CHECK-NEXT:    [[CALL_I_I6_I_I:%.*]] = call spir_func noundef zeroext i16 @__devicelib_ConvertFToBF16INTEL(ptr addrspace(4) noundef align 4 dereferenceable(4) [[REF_TMP_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META53]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP14_I]]), !noalias [[META50]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP115_I]]), !noalias [[META50]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP416_I]]), !noalias [[META50]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META50]]
+// CHECK-NEXT:    [[CALL8_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 32 dereferenceable_or_null(32) [[AGG_RESULT]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    store i16 [[CALL_I_I6_I_I]], ptr addrspace(4) [[CALL8_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1
+// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP57:![0-9]+]]
+// CHECK:       _ZN4sycl3_V13ext6oneapi12experimental3fmaINS0_3vecINS2_8bfloat16ELi16EEES7_S7_Li16ELi16ELi16EEENSt9enable_ifIXaaaaaaaa24is_vec_or_swizzle_bf16_vIT_E24is_vec_or_swizzle_bf16_vIT0_E24is_vec_or_swizzle_bf16_vIT1_EeqT2_T3_eqT3_T4_ENS5_IS6_XT2_EEEE4typeES9_SA_SB_.exit:
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[AGG_TMP3]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[AGG_TMP14]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[AGG_TMP25]])
+// CHECK-NEXT:    ret void
+//
+SYCL_EXTERNAL auto TestFMA(vec<bfloat16, 16> a, vec<bfloat16, 16> b,
+                           vec<bfloat16, 16> c) {
+  return experimental::fma(a, b, c);
+}