diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc index 3261a94b17cdf..6359515a67b9d 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc @@ -103,286 +103,556 @@ then it supports the `bfloat16` math functions described in the next section. === Math Functions -The following functions are only available when `T` is `bfloat16` or -`sycl::marray`, where `{N}` means any positive value of -`size_t` type. - ==== isnan ```c++ namespace sycl::ext::oneapi::experimental { -bool isnan(bfloat16 x); +bool isnan(bfloat16 x); (1) -template -sycl::marray isnan(sycl::marray x); +template +/*return type*/ isnan(NonScalar x); (2) } // namespace sycl::ext::oneapi::experimental ``` ===== Description +====== Overload (1) + +Returns `true` if `x` is a NaN value, otherwise returns `false`. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + + - `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and + - The element type is `bfloat16`. -Returns true if x is NAN value, otherwise returns false. +*Returns:* If `NonScalar` is `marray`, returns `true` for each element of `x` only if `x[i]` has a NaN value. If `NonScalar` is `vec` or the `[code]#+__swizzled_vec__+#` type, returns -1 for each element of `x` if `x[i]` is a NaN value and returns 0 otherwise. + +The return type depends on `NonScalar`. For `marray`, the return type is `marray` and for `vec`, `[code]#+__swizzled_vec__+#` type, the return type is `vec`. ==== fma ```c++ namespace sycl::ext::oneapi::experimental { -template -T fma(T a, T b, T c); +bfloat16 fma(bfloat16 a, bfloat16 b, bfloat16 c); (1) + +template (2) +/*return-type*/ fma(NonScalar1 a, NonScalar2 b, NonScalar3 c) } // namespace sycl::ext::oneapi::experimental ``` ===== Description -Returns the correctly rounded floating-point representation of the +====== Overload (1) + +*Returns:* Returns the correctly rounded floating-point representation of the sum of `c` with the infinitely precise product of `a` and `b`. Rounding of intermediate products shall not occur. The mantissa LSB rounds to the nearest even. Subnormal numbers are supported. +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* One of the following conditions must hold for `NonScalar1`, `NonScalar2`, and `NonScalar3`: +** `NonScalar1`, `NonScalar2`, and `NonScalar3` are each `marray`; or +** `NonScalar1`, `NonScalar2`, and `NonScalar3` are any combination of `vec` and the `[code]#+__swizzled_vec__+#` type; +* `NonScalar1`, `NonScalar2`, and `NonScalar3` have the same number of elements; +* `NonScalar1`, `NonScalar2`, and `NonScalar3` have the same element type; and +* The element type of `NonScalar1`, `NonScalar2`, and `NonScalar3` is `bfloat16`. + +*Returns:* For each element of `a`, `b`, and `c`; the correctly rounded floating-point representation of the sum of `c[i]` with the infinitely precise product of `a[i]` and `b[i]`. Rounding of intermediate products shall not occur. Edge case behavior is per the IEEE 754-2008 standard. + +The return type is `NonScalar1` unless `NonScalar1` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. + ==== fmax ```c++ namespace sycl::ext::oneapi::experimental { -template -T fmax(T x, T y); +bfloat16 fmax(bfloat16 x, bfloat16 y); (1) + +template (2) +/*return-type*/ fmax(NonScalar1 x, NonScalar2 y) + +template (3) +/*return-type*/ fmax(NonScalar x, bfloat16 y) } // namespace sycl::ext::oneapi::experimental ``` ===== Description -Returns `y` if -`x < y`, otherwise it -returns `x`. If one argument is a -NaN, `fmax()` returns the other -argument. If both arguments are -NaNs, `fmax()` returns a NaN. +====== Overload (1) + +Returns `y` if `x < y`, otherwise it returns `x`. If one argument is a NaN, `fmax()` returns the other +argument. If both arguments are NaNs, `fmax()` returns a NaN. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* One of the following conditions must hold for `NonScalar1` and `NonScalar2`: +** Both `NonScalar1` and `NonScalar2` are `marray`; or +** `NonScalar1` and `NonScalar2` are any combination of `vec` and the `[code]#+__swizzled_vec__+#` type; +* `NonScalar1` and `NonScalar2` have the same number of elements; +* `NonScalar1` and `NonScalar2` have the same element type; and +* The element type of `NonScalar1` and `NonScalar2` is bfloat16. + +*Returns:* For each element of `x` and `y`, the value `y[i]` if `x[i] < y[i]`, otherwise `x[i]`. If one element is a NaN, the result is the other element. If both elements are NaNs, the result is NaN. + +The return type is `NonScalar1` unless `NonScalar1` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. + +====== Overload (3) + +*Constraints:* Available only if all of the following conditions are met: + +* NonScalar is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is bfloat16. + +*Returns:* For each element of `x`, the value `y` if `x[i] < y`, otherwise `x[i]`. If one value is a NaN, the result is the other value. If both value are NaNs, the result is a NaN. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== fmin ```c++ namespace sycl::ext::oneapi::experimental { -template -T fmin(T x, T y); +bfloat16 fmin(bfloat16 x, bfloat16 y); (1) + +template (2) +/*return-type*/ fmin(NonScalar1 x, NonScalar2 y) + +template (3) +/*return-type*/ fmin(NonScalar x, bfloat16 y) } // namespace sycl::ext::oneapi::experimental ``` ===== Description -Returns `y` if -`y < x`, otherwise it -returns `x`. If one argument is a -NaN, `fmax()` returns the other -argument. If both arguments are -NaNs, `fmax()` returns a NaN. +====== Overload (1) + +Returns `x` if `x < y`, otherwise it returns `y`. If one argument is a +NaN, `fmin()` returns the other argument. If both arguments are NaNs, `fmin()` returns a NaN. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* One of the following conditions must hold for `NonScalar1` and `NonScalar2`: +** Both `NonScalar1` and `NonScalar2` are `marray`; or +** `NonScalar1` and `NonScalar2` are any combination of `vec` and the `[code]#+__swizzled_vec__+#` type; +* `NonScalar1` and `NonScalar2` have the same number of elements; +* `NonScalar1` and `NonScalar2` have the same element type; and +* The element type of `NonScalar1` and `NonScalar2` is bfloat16. + +*Returns:* For each element of `x` and `y`, the value `x[i]` if `x[i] < y[i]`, otherwise `y[i]`. If one element is a NaN, the result is the other element. If both elements are NaNs, the result is NaN. + +The return type is `NonScalar1` unless `NonScalar1` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. + +====== Overload (3) + +*Constraints:* Available only if all of the following conditions are met: + +* NonScalar is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is bfloat16. + +*Returns:* For each element of `x`, the value `x[i]` if `x[i] < y`, otherwise `y`. If one value is a NaN, the result is the other value. If both value are NaNs, the result is a NaN. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== fabs ```c++ namespace sycl::ext::oneapi::experimental { -template -T fabs(T x); +bfloat16 fabs(bfloat16 x); (1) + +template (2) +/*return-type*/ fabs(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Compute absolute value(s) of a scalar `bfloat16` value. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. -Compute absolute value of a `bfloat16` value or `sycl::marray`. +*Returns:* For each element of `x`, the absolute value of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== ceil ```c++ namespace sycl::ext::oneapi::experimental { -template -T ceil(T x); +bfloat16 ceil(bfloat16 x); (1) + +template (2) +/*return-type*/ ceil(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +The value `x` rounded to an integral value using the round to positive infinity rounding mode. + +====== Overload (2) -Returns `x` rounded to an integral value using the round to positive infinity rounding mode +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the value `x[i]` rounded to an integral value using the round to positive infinity rounding mode. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== floor ```c++ namespace sycl::ext::oneapi::experimental { -template -T floor(T x); +bfloat16 floor(bfloat16 x); (1) + +template (2) +/*return-type*/ floor(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +The value `x` rounded to an integral value using the round to negative infinity rounding mode. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. -Returns `x` rounded to an integral value using the round to negative infinity rounding mode -for a `bfloat16` value or `sycl::marray`. +*Returns:* For each element of `x`, the value `x[i]` rounded to an integral value using the round to negative infinity rounding mode. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== cos ```c++ namespace sycl::ext::oneapi::experimental { -template -T cos(T x); +bfloat16 cos(bfloat16 x); (1) + +template (2) +/*return-type*/ cos(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the cosine of `x`. -Compute cosine of a `bfloat16` value or `sycl::marray`. +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the cosine of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== sin ```c++ namespace sycl::ext::oneapi::experimental { -template -T sin(T x); +bfloat16 sin(bfloat16 x); (1) + +template (2) +/*return-type*/ sin(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the sine of `x`. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: -Compute sine of a `bfloat16` value or `sycl::marray`. +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. +*Returns:* For each element of `x`, the sine of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== exp ```c++ namespace sycl::ext::oneapi::experimental { -template -T exp(T x); +bfloat16 exp(bfloat16 x); (1) + +template (2) +/*return-type*/ exp(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the base-e exponential of `x`. + +====== Overload (2) -Compute the base-e exponential of a `bfloat16` value or `sycl::marray`. +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the base-e exponential of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== exp2 ```c++ namespace sycl::ext::oneapi::experimental { -template -T exp2(T x); +bfloat16 exp2(bfloat16 x); (1) + +template (2) +/*return-type*/ exp2(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the base-2 exponential of `x`. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. -Compute the base-2 exponential of a `bfloat16` value or `sycl::marray`. +*Returns:* For each element of `x`, the base-2 exponential of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== exp10 ```c++ namespace sycl::ext::oneapi::experimental { -template -T exp10(T x); +bfloat16 exp10(bfloat16 x); (1) + +template (2) +/*return-type*/ exp10(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the base-10 exponential of `x`. -Compute the base-10 exponential of a `bfloat16` value or `sycl::marray`. +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the base-10 exponential of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== log ```c++ namespace sycl::ext::oneapi::experimental { -template -T log(T x); +bfloat16 log(bfloat16 x); (1) + +template (2) +/*return-type*/ log(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the natural logarithm of `x`. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: -Compute natural logarithm of a `bfloat16` value or `sycl::marray`. +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the natural logarithm of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== log2 ```c++ namespace sycl::ext::oneapi::experimental { -template -T log2(T x); +bfloat16 log2(bfloat16 x); (1) + +template (2) +/*return-type*/ log2(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the base-2 logarithm of `x`. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the base-2 logarithm of `x[i]`. -Compute base-2 logarithm of a `bfloat16` value or `sycl::marray`. +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== log10 ```c++ namespace sycl::ext::oneapi::experimental { -template -T log10(T x); +bfloat16 log10(bfloat16 x); (1) + +template (2) +/*return-type*/ log10(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the base-10 logarithm of `x`. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: -Compute base-10 logarithm of a `bfloat16` value or `sycl::marray`. +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the base-10 logarithm of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== rint ```c++ namespace sycl::ext::oneapi::experimental { -template -T rint(T x); +bfloat16 rint(bfloat16 x); (1) + +template (2) +/*return-type*/ rint(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) -Returns `x` rounded to an integral value using the round to nearest even rounding mode -for a `bfloat16` value or `sycl::marray`. +Returns the value `x` rounded to an integral value (using round to nearest even rounding mode) in floating-point format. Refer to section 7.1 of the OpenCL 1.2 specification document: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#opencl12 for a description of the rounding modes. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the value `x[i]` rounded to an integral value (using round to nearest even rounding mode) in floating-point format. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== sqrt ```c++ namespace sycl::ext::oneapi::experimental { -template -T sqrt(T x); +bfloat16 sqrt(bfloat16 x); (1) + +template (2) +/*return-type*/ sqrt(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the square root of `x`. + +====== Overload (2) -Compute square root of a `bfloat16` value or `sycl::marray`. +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the square root of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== rsqrt ```c++ namespace sycl::ext::oneapi::experimental { -template -T rsqrt(T x); +bfloat16 rsqrt(bfloat16 x); (1) + +template (2) +/*return-type*/ rsqrt(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) + +Returns the inverse square root of `x`. + +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: + +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. -Compute inverse square root of a `bfloat16` value or `sycl::marray`. +*Returns:* For each element of `x`, the inverse square root of `x[i]`. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. ==== trunc ```c++ namespace sycl::ext::oneapi::experimental { -template -T trunc(T x); +bfloat16 trunc(bfloat16 x); (1) + +template (2) +/*return-type*/ trunc(NonScalar x) } // namespace sycl::ext::oneapi::experimental ``` -===== Description +===== Overload (1) -Returns `x` rounded to an integral value using the round to zero rounding mode -for a `bfloat16` value or `sycl::marray`. +Returns the value `x` rounded to an integral value using the round to zero rounding mode. -== Issues +====== Overload (2) + +*Constraints:* Available only if all of the following conditions are met: -1. The CUDA backend does not have a use case that would necessitate support -of the `vec` class in bfloat16 math functions, and `marray` would always be -preferred over `vec` if `vec` support were to be added in the CUDA backend. -For portability reasons, support for the `vec` class can be easily added if -other backends require it. +* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and +* The element type is `bfloat16`. + +*Returns:* For each element of `x`, the value `x[i]` rounded to an integral value using the round to zero rounding mode. + +The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`. + +== Issues -2. We should decide on a roadmap to extend support of `bfloat16` to other +1. We should decide on a roadmap to extend support of `bfloat16` to other SYCL 2020 math functions. diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp index 2b611f46ddadd..fb4b49a44d4d3 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp @@ -9,6 +9,7 @@ #pragma once #include // for ceil, cos, exp, exp10, exp2 +#include // For simplify_if_swizzle, is_swizzle #include // sycl::detail::memcpy #include // for bfloat16, bfloat16ToBits #include // for marray @@ -30,6 +31,17 @@ uint32_t to_uint32_t(sycl::marray x, size_t start) { } } // namespace detail +// Trait to check if the type is a vector or swizzle of bfloat16. +template +constexpr bool is_vec_or_swizzle_bf16_v = + sycl::detail::is_vec_or_swizzle_v && + sycl::detail::is_valid_elem_type_v; + +template +constexpr int num_elements_v = sycl::detail::num_elements::value; + +/******************* isnan ********************/ + // According to bfloat16 format, NAN value's exponent field is 0xFF and // significand has non-zero bits. template @@ -46,6 +58,21 @@ template sycl::marray isnan(sycl::marray x) { return res; } +// Overload for BF16 vec and swizzles. +template > +std::enable_if_t, sycl::vec> +isnan(T x) { + sycl::vec res; + for (size_t i = 0; i < N; i++) { + // The result of isnan is 0 or 1 but SPEC requires + // isnan() of vec/swizzle to return -1 or 0. + res[i] = isnan(x[i]) ? -1 : 0; + } + return res; +} + +/******************* fabs ********************/ + template std::enable_if_t, T> fabs(T x) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ @@ -89,6 +116,19 @@ sycl::marray fabs(sycl::marray x) { return res; } +// Overload for BF16 vec and swizzles. +template > +std::enable_if_t, sycl::vec> +fabs(T x) { + sycl::vec res; + for (size_t i = 0; i < N; i++) { + res[i] = fabs(x[i]); + } + return res; +} + +/******************* fmin ********************/ + template std::enable_if_t, T> fmin(T x, T y) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ @@ -146,6 +186,22 @@ sycl::marray fmin(sycl::marray x, return res; } +// Overload for different combination of BF16 vec and swizzles. +template , + int N2 = num_elements_v> +std::enable_if_t && is_vec_or_swizzle_bf16_v && + N1 == N2, + sycl::vec> +fmin(T1 x, T2 y) { + sycl::vec res; + for (size_t i = 0; i < N1; i++) { + res[i] = fmin(x[i], y[i]); + } + return res; +} + +/******************* fmax ********************/ + template std::enable_if_t, T> fmax(T x, T y) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ @@ -202,6 +258,22 @@ sycl::marray fmax(sycl::marray x, return res; } +// Overload for different combination of BF16 vec and swizzles. +template , + int N2 = num_elements_v> +std::enable_if_t && is_vec_or_swizzle_bf16_v && + N1 == N2, + sycl::vec> +fmax(T1 x, T2 y) { + sycl::vec res; + for (size_t i = 0; i < N1; i++) { + res[i] = fmax(x[i], y[i]); + } + return res; +} + +/******************* fma *********************/ + template std::enable_if_t, T> fma(T x, T y, T z) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ @@ -248,6 +320,22 @@ sycl::marray fma(sycl::marray x, return res; } +// Overload for different combination of BF16 vec and swizzles. +template , + int N2 = num_elements_v, int N3 = num_elements_v> +std::enable_if_t && is_vec_or_swizzle_bf16_v && + is_vec_or_swizzle_bf16_v && N1 == N2 && N2 == N3, + sycl::vec> +fma(T1 x, T2 y, T3 z) { + sycl::vec res; + for (size_t i = 0; i < N1; i++) { + res[i] = fma(x[i], y[i], z[i]); + } + return res; +} + +/******************* unary math operations ********************/ + #define BFLOAT16_MATH_FP32_WRAPPERS(op) \ template \ std::enable_if_t::value, T> op(T x) { \ @@ -264,37 +352,77 @@ sycl::marray fma(sycl::marray x, return res; \ } +#define BFLOAT16_MATH_FP32_WRAPPERS_VEC(op) \ + /* Overload for BF16 vec and swizzles. */ \ + template > \ + std::enable_if_t, sycl::vec> op( \ + T x) { \ + sycl::vec res; \ + for (size_t i = 0; i < N; i++) { \ + res[i] = op(x[i]); \ + } \ + return res; \ + } + BFLOAT16_MATH_FP32_WRAPPERS(ceil) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(ceil) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(ceil) + BFLOAT16_MATH_FP32_WRAPPERS(cos) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(cos) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(cos) + BFLOAT16_MATH_FP32_WRAPPERS(exp) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(exp) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(exp) + BFLOAT16_MATH_FP32_WRAPPERS(exp10) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(exp10) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(exp10) + BFLOAT16_MATH_FP32_WRAPPERS(exp2) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(exp2) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(exp2) + BFLOAT16_MATH_FP32_WRAPPERS(floor) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(floor) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(floor) + BFLOAT16_MATH_FP32_WRAPPERS(log) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(log) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(log) + BFLOAT16_MATH_FP32_WRAPPERS(log2) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(log2) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(log2) + BFLOAT16_MATH_FP32_WRAPPERS(log10) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(log10) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(log10) + BFLOAT16_MATH_FP32_WRAPPERS(rint) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(rint) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(rint) + BFLOAT16_MATH_FP32_WRAPPERS(rsqrt) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(rsqrt) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(rsqrt) + BFLOAT16_MATH_FP32_WRAPPERS(sin) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(sin) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(sin) + BFLOAT16_MATH_FP32_WRAPPERS(sqrt) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(sqrt) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(sqrt) + BFLOAT16_MATH_FP32_WRAPPERS(trunc) BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(trunc) +BFLOAT16_MATH_FP32_WRAPPERS_VEC(trunc) #undef BFLOAT16_MATH_FP32_WRAPPERS #undef BFLOAT16_MATH_FP32_WRAPPERS_MARRAY +#undef BFLOAT16_MATH_FP32_WRAPPERS_VEC } // namespace ext::oneapi::experimental } // namespace _V1 } // namespace sycl diff --git a/sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp b/sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp new file mode 100644 index 0000000000000..481aa35e3cedf --- /dev/null +++ b/sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp @@ -0,0 +1,278 @@ +// RUN: %{build} -fno-fast-math -o %t.out +// RUN: %{run} %t.out + +// Test new, ABI-breaking for all platforms. +// RUN: %if preview-breaking-changes-supported %{ %{build} -fpreview-breaking-changes -o %t-pfrev.out %} +// RUN: %if preview-breaking-changes-supported %{ %{run} %t-pfrev.out %} + +#include +#include + +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi; +using namespace sycl::ext::oneapi::experimental; + +constexpr float bf16_eps = 0.00390625; + +bool check(float a, float b) { + return sycl::fabs(2 * (a - b) / (a + b)) > bf16_eps * 2; +} + +bool check(bool a, bool b) { return (a != b); } + +#define TEST_UNARY_OP(NAME, SZ, RETTY, INPVAL) \ + { \ + vec arg; \ + /* Initialize the vector with INPVAL */ \ + for (int i = 0; i < SZ; i++) { \ + arg[i] = INPVAL; \ + } \ + /* Perform the operation. */ \ + vec \ + res = sycl::ext::oneapi::experimental::NAME(arg); \ + vec res2 = \ + sycl::ext::oneapi::experimental::NAME(arg.template swizzle<0, 0>()); \ + /* Check the result. */ \ + if (res2[0] != res[0] || res2[1] != res[0]) { \ + ERR[0] += 1; \ + } \ + for (int i = 0; i < SZ; i++) { \ + if (check(res[i], sycl::NAME(INPVAL))) { \ + ERR[0] += 1; \ + } \ + } \ + } + +#define TEST_BINARY_OP(NAME, SZ, RETTY, INPVAL) \ + { \ + vec arg, arg2; \ + bfloat16 inpVal2 = 1.0f; \ + /* Initialize the vector with INPVAL */ \ + for (int i = 0; i < SZ; i++) { \ + arg[i] = INPVAL; \ + arg2[i] = inpVal2; \ + } \ + /* Perform the operation. */ \ + vec \ + res = sycl::ext::oneapi::experimental::NAME(arg, arg2); \ + /* Swizzle and vec different combination. */ \ + vec res2 = sycl::ext::oneapi::experimental::NAME( \ + arg.template swizzle<0, 0>(), arg2.template swizzle<0, 0>()); \ + vec res3 = sycl::ext::oneapi::experimental::NAME( \ + vec(arg[0], arg[0]), arg2.template swizzle<0, 0>()); \ + vec res4 = sycl::ext::oneapi::experimental::NAME( \ + arg.template swizzle<0, 0>(), vec(arg2[0], arg2[0])); \ + /* Check the result. */ \ + if (res2[0] != res[0] || res2[1] != res[0] || res3[0] != res[0] || \ + res3[1] != res[0] || res4[0] != res[0] || res4[1] != res[0]) { \ + ERR[0] += 1; \ + } \ + for (int i = 0; i < SZ; i++) { \ + if (check(res[i], sycl::NAME(INPVAL, inpVal2))) { \ + ERR[0] += 1; \ + } \ + } \ + } + +#define TEST_BUILTIN_VEC(NAME, SZ, RETTY, INPVAL, OPTEST) \ + { /* On Device */ \ + buffer err_buf(&err, 1); \ + q.submit([&](handler &cgh) { \ + accessor ERR(err_buf, \ + cgh); \ + cgh.single_task([=]() { OPTEST(NAME, SZ, RETTY, INPVAL) }); \ + }).wait(); \ + } \ + assert(err == 0); \ + { /* On Host */ \ + int ERR[1] = {0}; \ + OPTEST(NAME, SZ, RETTY, INPVAL) \ + assert(ERR[0] == 0); \ + } + +#define TEST_BUILTIN_UNARY(NAME, RETTY, INPVAL) \ + TEST_BUILTIN_VEC(NAME, 1, RETTY, INPVAL, TEST_UNARY_OP) \ + TEST_BUILTIN_VEC(NAME, 2, RETTY, INPVAL, TEST_UNARY_OP) \ + TEST_BUILTIN_VEC(NAME, 3, RETTY, INPVAL, TEST_UNARY_OP) \ + TEST_BUILTIN_VEC(NAME, 4, RETTY, INPVAL, TEST_UNARY_OP) \ + TEST_BUILTIN_VEC(NAME, 8, RETTY, INPVAL, TEST_UNARY_OP) \ + TEST_BUILTIN_VEC(NAME, 16, RETTY, INPVAL, TEST_UNARY_OP) + +#define TEST_BUILTIN_BINARY(NAME, RETTY, INPVAL) \ + TEST_BUILTIN_VEC(NAME, 1, RETTY, INPVAL, TEST_BINARY_OP) \ + TEST_BUILTIN_VEC(NAME, 2, RETTY, INPVAL, TEST_BINARY_OP) \ + TEST_BUILTIN_VEC(NAME, 3, RETTY, INPVAL, TEST_BINARY_OP) \ + TEST_BUILTIN_VEC(NAME, 4, RETTY, INPVAL, TEST_BINARY_OP) \ + TEST_BUILTIN_VEC(NAME, 8, RETTY, INPVAL, TEST_BINARY_OP) \ + TEST_BUILTIN_VEC(NAME, 16, RETTY, INPVAL, TEST_BINARY_OP) + +void test() { + queue q; + int err = 0; + float nan = std::nanf(""); + + // Test isnan on host + { + vec arg{1.0f, nan, 2.0f}; + vec res = sycl::ext::oneapi::experimental::isnan(arg); + assert((res[0] == 0 && res[1] == -1 && res[2] == 0) && + "isnan() failed on host for vec"); + + // Test for swizzles + vec res2 = sycl::ext::oneapi::experimental::isnan(arg.lo()); + assert((res2[0] == 0 && res2[1] == -1) && + "isnan() failed on host for vec swizzles"); + } + + // Tets isnan on device. + { + buffer err_buf(&err, 1); + q.submit([&](handler &cgh) { + accessor ERR(err_buf, cgh); + cgh.single_task([=]() { + vec arg{1.0f, nan, 2.0f}; + vec res = sycl::ext::oneapi::experimental::isnan(arg); + if (res[0] != 0 || res[1] != -1 || res[2] != 0) { + ERR[0] += 1; + } + }); + }).wait(); + assert(err == 0 && "isnan failed on device for vec"); + } + + // Unary math builtins. + TEST_BUILTIN_UNARY(fabs, bfloat16, -1.0f); + TEST_BUILTIN_UNARY(fabs, bfloat16, 1.0f); + + TEST_BUILTIN_UNARY(cos, bfloat16, 0.1f); + TEST_BUILTIN_UNARY(sin, bfloat16, 0.2f); + + TEST_BUILTIN_UNARY(ceil, bfloat16, 0.9f); + TEST_BUILTIN_UNARY(floor, bfloat16, 0.9f); + TEST_BUILTIN_UNARY(trunc, bfloat16, 0.9f); + TEST_BUILTIN_UNARY(exp, bfloat16, 0.9f); + TEST_BUILTIN_UNARY(exp10, bfloat16, 0.9f); + TEST_BUILTIN_UNARY(exp2, bfloat16, 0.9f); + TEST_BUILTIN_UNARY(rint, bfloat16, 0.9f); + + TEST_BUILTIN_UNARY(sqrt, bfloat16, 0.9f); + TEST_BUILTIN_UNARY(rsqrt, bfloat16, 0.9f); + TEST_BUILTIN_UNARY(log, bfloat16, 20.0f); + TEST_BUILTIN_UNARY(log2, bfloat16, 2.0f); + TEST_BUILTIN_UNARY(log10, bfloat16, 2.0f); + + TEST_BUILTIN_BINARY(fmin, bfloat16, 0.9f); + TEST_BUILTIN_BINARY(fmax, bfloat16, 0.9f); + TEST_BUILTIN_BINARY(fmin, bfloat16, nan); + TEST_BUILTIN_BINARY(fmax, bfloat16, nan); + + // Test fma operation on host. + { + vec arg1, arg2, arg3; + bfloat16 inpVal1 = 1.0f; + bfloat16 inpVal2 = 2.0f; + bfloat16 inpVal3 = 3.0f; + /* Initialize the vector with INPVAL */ + for (int i = 0; i < 3; i++) { + arg1[i] = inpVal1; + arg2[i] = inpVal2; + arg3[i] = inpVal3; + } + /* Perform the operation. */ + auto res = sycl::ext::oneapi::experimental::fma(arg1, arg2, arg3); + + // Test different combination of vec an swizzle. + auto res1 = sycl::ext::oneapi::experimental::fma( + arg1.template swizzle<0, 0>(), arg2.template swizzle<0, 0>(), + arg3.template swizzle<0, 0>()); + + auto res2 = sycl::ext::oneapi::experimental::fma( + vec(arg1[0], arg1[0]), arg2.template swizzle<0, 0>(), + arg3.template swizzle<0, 0>()); + + auto res3 = sycl::ext::oneapi::experimental::fma( + arg1.template swizzle<0, 0>(), vec(arg2[0], arg2[0]), + arg3.template swizzle<0, 0>()); + + auto res4 = sycl::ext::oneapi::experimental::fma( + arg1.template swizzle<0, 0>(), arg2.template swizzle<0, 0>(), + vec(arg3[0], arg3[0])); + + /* Check the result. */ + if (res1[0] != res[0] || res1[1] != res[0] || res2[0] != res[0] || + res2[1] != res[0] || res3[0] != res[0] || res3[1] != res[0] || + res4[0] != res[0] || res4[1] != res[0]) { + err += 1; + } + for (int i = 0; i < 3; i++) { + if (check(res[i], sycl::ext::oneapi::experimental::fma(inpVal1, inpVal2, + inpVal3))) { + err += 1; + } + } + assert(err == 0); + } + + // Test fma on device. + { + buffer err_buf(&err, 1); + q.submit([&](handler &cgh) { + accessor ERR(err_buf, cgh); + cgh.single_task([=]() { + vec arg1, arg2, arg3; + bfloat16 inpVal1 = 1.0f; + bfloat16 inpVal2 = 2.0f; + bfloat16 inpVal3 = 3.0f; + /* Initialize the vector with INPVAL */ + for (int i = 0; i < 3; i++) { + arg1[i] = inpVal1; + arg2[i] = inpVal2; + arg3[i] = inpVal3; + } + /* Perform the operation. */ + auto res = sycl::ext::oneapi::experimental::fma(arg1, arg2, arg3); + + // Test different combination of vec an swizzle. + auto res1 = sycl::ext::oneapi::experimental::fma( + arg1.template swizzle<0, 0>(), arg2.template swizzle<0, 0>(), + arg3.template swizzle<0, 0>()); + + auto res2 = sycl::ext::oneapi::experimental::fma( + vec(arg1[0], arg1[0]), arg2.template swizzle<0, 0>(), + arg3.template swizzle<0, 0>()); + + auto res3 = sycl::ext::oneapi::experimental::fma( + arg1.template swizzle<0, 0>(), vec(arg2[0], arg2[0]), + arg3.template swizzle<0, 0>()); + + auto res4 = sycl::ext::oneapi::experimental::fma( + arg1.template swizzle<0, 0>(), arg2.template swizzle<0, 0>(), + vec(arg3[0], arg3[0])); + + /* Check the result. */ + if (res1[0] != res[0] || res1[1] != res[0] || res2[0] != res[0] || + res2[1] != res[0] || res3[0] != res[0] || res3[1] != res[0] || + res4[0] != res[0] || res4[1] != res[0]) { + ERR[0] += 1; + } + for (int i = 0; i < 3; i++) { + if (check(res[i], sycl::ext::oneapi::experimental::fma( + inpVal1, inpVal2, inpVal3))) { + ERR[0] += 1; + } + } + }); + }).wait(); + assert(err == 0); + } +} + +int main() { + + test(); + return 0; +} diff --git a/sycl/test/check_device_code/vector/vector_bf16_builtins.cpp b/sycl/test/check_device_code/vector/vector_bf16_builtins.cpp new file mode 100644 index 0000000000000..6aea590b6155c --- /dev/null +++ b/sycl/test/check_device_code/vector/vector_bf16_builtins.cpp @@ -0,0 +1,377 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// NOTE: ..., followed by some manual cleanup. + +// Had to increase inline threashold for this test to force inline of the vec<> +// math builtins. +// RUN: %clangxx -I %sycl_include -mllvm -inline-threshold=400 -fno-discard-value-names -S -emit-llvm -fno-sycl-instrument-device-code -Xclang -disable-lifetime-markers -O3 -fsycl-device-only %s -o - | FileCheck %s + +// This test checks the device code generated for vec math builtins. +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi; +using namespace sycl::ext::oneapi::experimental; + +// CHECK-LABEL: define dso_local spir_func void @_Z8TestFMinN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi2EEES5_( +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias writable sret(%"class.sycl::_V1::vec") align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec") align 4 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec") align 4 [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !srcloc [[META5:![0-9]+]] !sycl_fixed_targets [[META6:![0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[AGG_TMP111_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8 +// CHECK-NEXT: [[AGG_TMP10_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8 +// CHECK-NEXT: [[AGG_TMP13:%.*]] = alloca %"class.sycl::_V1::vec", align 8 +// CHECK-NEXT: [[AGG_TMP2:%.*]] = alloca %"class.sycl::_V1::vec", align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4, !tbaa [[TBAA7:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4, !tbaa [[TBAA7]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[AGG_TMP2]]) +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[AGG_TMP13]]) +// CHECK-NEXT: store i32 [[TMP1]], ptr [[AGG_TMP13]], align 1 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[AGG_TMP2]], align 1 +// CHECK-NEXT: [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP2]] to ptr addrspace(4) +// CHECK-NEXT: [[Y_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP13]] to ptr addrspace(4) +// CHECK-NEXT: [[X_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP10_I]] to ptr addrspace(4) +// CHECK-NEXT: [[Y_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP111_I]] to ptr addrspace(4) +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 2 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I:%.*]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS0_3VECINS2_8BFLOAT16ELI2EEES7_LI2ELI2EEENST9ENABLE_IFIXAAAA24IS_VEC_OR_SWIZZLE_BF16_VIT_E24IS_VEC_OR_SWIZZLE_BF16_VIT0_EEQT1_T2_ENS5_IS6_XT1_EEEE4TYPEES9_SA__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32 +// CHECK-NEXT: [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi2EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 4 dereferenceable_or_null(4) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7:[0-9]+]] +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10:![0-9]+]] +// CHECK-NEXT: [[CALL3_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi2EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 4 dereferenceable_or_null(4) [[Y_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr addrspace(4) [[CALL3_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP10_I]]), !noalias [[META12:![0-9]+]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP111_I]]), !noalias [[META12]] +// CHECK-NEXT: store i16 [[TMP3]], ptr [[AGG_TMP111_I]], align 1, !noalias [[META12]] +// CHECK-NEXT: store i16 [[TMP2]], ptr [[AGG_TMP10_I]], align 1, !noalias [[META12]] +// CHECK-NEXT: [[CONV_I_I_I:%.*]] = zext i16 [[TMP2]] to i32 +// CHECK-NEXT: [[AND_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 32640 +// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp eq i32 [[AND_I_I_I]], 32640 +// CHECK-NEXT: [[AND2_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 127 +// CHECK-NEXT: [[TOBOOL_I_I_I:%.*]] = icmp ne i32 [[AND2_I_I_I]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = and i1 [[CMP_I_I_I]], [[TOBOOL_I_I_I]] +// CHECK-NEXT: br i1 [[TMP4]], label [[LAND_LHS_TRUE_I_I:%.*]], label [[IF_END6_I_I:%.*]] +// CHECK: land.lhs.true.i.i: +// CHECK-NEXT: [[CONV_I25_I_I:%.*]] = zext i16 [[TMP3]] to i32 +// CHECK-NEXT: [[AND_I26_I_I:%.*]] = and i32 [[CONV_I25_I_I]], 32640 +// CHECK-NEXT: [[CMP_I27_I_I:%.*]] = icmp eq i32 [[AND_I26_I_I]], 32640 +// CHECK-NEXT: [[AND2_I28_I_I:%.*]] = and i32 [[CONV_I25_I_I]], 127 +// CHECK-NEXT: [[TOBOOL_I29_I_I:%.*]] = icmp ne i32 [[AND2_I28_I_I]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = and i1 [[CMP_I27_I_I]], [[TOBOOL_I29_I_I]] +// CHECK-NEXT: [[SPEC_SELECT_I:%.*]] = select i1 [[TMP5]], i16 32704, i16 [[TMP3]] +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]] +// CHECK: if.end6.i.i: +// CHECK-NEXT: [[CONV_I39_I_I:%.*]] = zext i16 [[TMP3]] to i32 +// CHECK-NEXT: [[AND_I40_I_I:%.*]] = and i32 [[CONV_I39_I_I]], 32640 +// CHECK-NEXT: [[CMP_I41_I_I:%.*]] = icmp eq i32 [[AND_I40_I_I]], 32640 +// CHECK-NEXT: [[AND2_I42_I_I:%.*]] = and i32 [[CONV_I39_I_I]], 127 +// CHECK-NEXT: [[TOBOOL_I43_I_I:%.*]] = icmp ne i32 [[AND2_I42_I_I]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = and i1 [[CMP_I41_I_I]], [[TOBOOL_I43_I_I]] +// CHECK-NEXT: br i1 [[TMP6]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]], label [[IF_END10_I_I:%.*]] +// CHECK: if.end10.i.i: +// CHECK-NEXT: [[OR_I_I:%.*]] = or i32 [[CONV_I_I_I]], [[CONV_I39_I_I]] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[OR_I_I]], 32768 +// CHECK-NEXT: [[AND_I_I:%.*]] = and i32 [[CONV_I_I_I]], [[CONV_I39_I_I]] +// CHECK-NEXT: [[TOBOOL_NOT_I_I:%.*]] = icmp eq i32 [[AND_I_I]], 0 +// CHECK-NEXT: [[OR_COND_I_I:%.*]] = and i1 [[CMP_I_I]], [[TOBOOL_NOT_I_I]] +// CHECK-NEXT: br i1 [[OR_COND_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]], label [[IF_END18_I_I:%.*]] +// CHECK: if.end18.i.i: +// CHECK-NEXT: [[CALL_I_I_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[X_ASCAST_I_I]]) #[[ATTR8:[0-9]+]], !noalias [[META15:![0-9]+]] +// CHECK-NEXT: [[CALL_I_I2_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[Y_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META15]] +// CHECK-NEXT: [[CMP_I44_I_I:%.*]] = fcmp olt float [[CALL_I_I_I_I_I]], [[CALL_I_I2_I_I_I]] +// CHECK-NEXT: [[X_ASCAST_VAL_I_I:%.*]] = load i16, ptr [[AGG_TMP10_I]], align 2, !noalias [[META18:![0-9]+]] +// CHECK-NEXT: [[Y_ASCAST_VAL_I_I:%.*]] = load i16, ptr [[AGG_TMP111_I]], align 2, !noalias [[META18]] +// CHECK-NEXT: [[TMP7:%.*]] = select i1 [[CMP_I44_I_I]], i16 [[X_ASCAST_VAL_I_I]], i16 [[Y_ASCAST_VAL_I_I]] +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental4fminINS2_8bfloat16EEENSt9enable_ifIXsr3stdE9is_same_vIT_S5_EES7_E4typeES7_S7_.exit.i: +// CHECK-NEXT: [[REF_TMP_SROA_0_0_I:%.*]] = phi i16 [ [[TMP7]], [[IF_END18_I_I]] ], [ [[TMP2]], [[IF_END6_I_I]] ], [ -32768, [[IF_END10_I_I]] ], [ [[SPEC_SELECT_I]], [[LAND_LHS_TRUE_I_I]] ] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP10_I]]), !noalias [[META12]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP111_I]]), !noalias [[META12]] +// CHECK-NEXT: [[CALL5_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi2EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 4 dereferenceable_or_null(4) [[AGG_RESULT]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: store i16 [[REF_TMP_SROA_0_0_I]], ptr addrspace(4) [[CALL5_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP19:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental4fminINS0_3vecINS2_8bfloat16ELi2EEES7_Li2ELi2EEENSt9enable_ifIXaaaa24is_vec_or_swizzle_bf16_vIT_E24is_vec_or_swizzle_bf16_vIT0_EeqT1_T2_ENS5_IS6_XT1_EEEE4typeES9_SA_.exit: +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[AGG_TMP2]]) +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[AGG_TMP13]]) +// CHECK-NEXT: ret void +// +SYCL_EXTERNAL auto TestFMin(vec a, vec b) { + return experimental::fmin(a, b); +} + +// CHECK-LABEL: define dso_local spir_func void @_Z8TestFMaxN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEES5_( +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias writable sret(%"class.sycl::_V1::vec.0") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.0") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.0") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META22:![0-9]+]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[AGG_TMP111_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8 +// CHECK-NEXT: [[AGG_TMP10_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8 +// CHECK-NEXT: [[AGG_TMP13:%.*]] = alloca %"class.sycl::_V1::vec.0", align 8 +// CHECK-NEXT: [[AGG_TMP2:%.*]] = alloca %"class.sycl::_V1::vec.0", align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8, !tbaa [[TBAA7]] +// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[B]], align 8, !tbaa [[TBAA7]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[AGG_TMP2]]) +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[AGG_TMP13]]) +// CHECK-NEXT: store i64 [[TMP1]], ptr [[AGG_TMP13]], align 1 +// CHECK-NEXT: store i64 [[TMP0]], ptr [[AGG_TMP2]], align 1 +// CHECK-NEXT: [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP2]] to ptr addrspace(4) +// CHECK-NEXT: [[Y_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP13]] to ptr addrspace(4) +// CHECK-NEXT: [[X_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP10_I]] to ptr addrspace(4) +// CHECK-NEXT: [[Y_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP111_I]] to ptr addrspace(4) +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 3 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I:%.*]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS0_3VECINS2_8BFLOAT16ELI3EEES7_LI3ELI3EEENST9ENABLE_IFIXAAAA24IS_VEC_OR_SWIZZLE_BF16_VIT_E24IS_VEC_OR_SWIZZLE_BF16_VIT0_EEQT1_T2_ENS5_IS6_XT1_EEEE4TYPEES9_SA__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32 +// CHECK-NEXT: [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: [[CALL3_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) [[Y_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr addrspace(4) [[CALL3_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP10_I]]), !noalias [[META23:![0-9]+]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP111_I]]), !noalias [[META23]] +// CHECK-NEXT: store i16 [[TMP3]], ptr [[AGG_TMP111_I]], align 1, !noalias [[META23]] +// CHECK-NEXT: store i16 [[TMP2]], ptr [[AGG_TMP10_I]], align 1, !noalias [[META23]] +// CHECK-NEXT: [[CONV_I_I_I:%.*]] = zext i16 [[TMP2]] to i32 +// CHECK-NEXT: [[AND_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 32640 +// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp eq i32 [[AND_I_I_I]], 32640 +// CHECK-NEXT: [[AND2_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 127 +// CHECK-NEXT: [[TOBOOL_I_I_I:%.*]] = icmp ne i32 [[AND2_I_I_I]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = and i1 [[CMP_I_I_I]], [[TOBOOL_I_I_I]] +// CHECK-NEXT: br i1 [[TMP4]], label [[LAND_LHS_TRUE_I_I:%.*]], label [[IF_END6_I_I:%.*]] +// CHECK: land.lhs.true.i.i: +// CHECK-NEXT: [[CONV_I25_I_I:%.*]] = zext i16 [[TMP3]] to i32 +// CHECK-NEXT: [[AND_I26_I_I:%.*]] = and i32 [[CONV_I25_I_I]], 32640 +// CHECK-NEXT: [[CMP_I27_I_I:%.*]] = icmp eq i32 [[AND_I26_I_I]], 32640 +// CHECK-NEXT: [[AND2_I28_I_I:%.*]] = and i32 [[CONV_I25_I_I]], 127 +// CHECK-NEXT: [[TOBOOL_I29_I_I:%.*]] = icmp ne i32 [[AND2_I28_I_I]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = and i1 [[CMP_I27_I_I]], [[TOBOOL_I29_I_I]] +// CHECK-NEXT: [[SPEC_SELECT_I:%.*]] = select i1 [[TMP5]], i16 32704, i16 [[TMP3]] +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]] +// CHECK: if.end6.i.i: +// CHECK-NEXT: [[CONV_I39_I_I:%.*]] = zext i16 [[TMP3]] to i32 +// CHECK-NEXT: [[AND_I40_I_I:%.*]] = and i32 [[CONV_I39_I_I]], 32640 +// CHECK-NEXT: [[CMP_I41_I_I:%.*]] = icmp eq i32 [[AND_I40_I_I]], 32640 +// CHECK-NEXT: [[AND2_I42_I_I:%.*]] = and i32 [[CONV_I39_I_I]], 127 +// CHECK-NEXT: [[TOBOOL_I43_I_I:%.*]] = icmp ne i32 [[AND2_I42_I_I]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = and i1 [[CMP_I41_I_I]], [[TOBOOL_I43_I_I]] +// CHECK-NEXT: br i1 [[TMP6]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]], label [[IF_END10_I_I:%.*]] +// CHECK: if.end10.i.i: +// CHECK-NEXT: [[OR_I_I:%.*]] = or i32 [[CONV_I_I_I]], [[CONV_I39_I_I]] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[OR_I_I]], 32768 +// CHECK-NEXT: [[AND_I_I:%.*]] = and i32 [[CONV_I_I_I]], [[CONV_I39_I_I]] +// CHECK-NEXT: [[TOBOOL_NOT_I_I:%.*]] = icmp eq i32 [[AND_I_I]], 0 +// CHECK-NEXT: [[OR_COND_I_I:%.*]] = and i1 [[CMP_I_I]], [[TOBOOL_NOT_I_I]] +// CHECK-NEXT: br i1 [[OR_COND_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]], label [[IF_END18_I_I:%.*]] +// CHECK: if.end18.i.i: +// CHECK-NEXT: [[CALL_I_I_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[X_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META26:![0-9]+]] +// CHECK-NEXT: [[CALL_I_I2_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[Y_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META26]] +// CHECK-NEXT: [[CMP_I44_I_I:%.*]] = fcmp ogt float [[CALL_I_I_I_I_I]], [[CALL_I_I2_I_I_I]] +// CHECK-NEXT: [[X_ASCAST_VAL_I_I:%.*]] = load i16, ptr [[AGG_TMP10_I]], align 2, !noalias [[META29:![0-9]+]] +// CHECK-NEXT: [[Y_ASCAST_VAL_I_I:%.*]] = load i16, ptr [[AGG_TMP111_I]], align 2, !noalias [[META29]] +// CHECK-NEXT: [[TMP7:%.*]] = select i1 [[CMP_I44_I_I]], i16 [[X_ASCAST_VAL_I_I]], i16 [[Y_ASCAST_VAL_I_I]] +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental4fmaxINS2_8bfloat16EEENSt9enable_ifIXsr3stdE9is_same_vIT_S5_EES7_E4typeES7_S7_.exit.i: +// CHECK-NEXT: [[REF_TMP_SROA_0_0_I:%.*]] = phi i16 [ [[TMP7]], [[IF_END18_I_I]] ], [ [[TMP2]], [[IF_END6_I_I]] ], [ 0, [[IF_END10_I_I]] ], [ [[SPEC_SELECT_I]], [[LAND_LHS_TRUE_I_I]] ] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP10_I]]), !noalias [[META23]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP111_I]]), !noalias [[META23]] +// CHECK-NEXT: [[CALL5_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) [[AGG_RESULT]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: store i16 [[REF_TMP_SROA_0_0_I]], ptr addrspace(4) [[CALL5_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP30:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental4fmaxINS0_3vecINS2_8bfloat16ELi3EEES7_Li3ELi3EEENSt9enable_ifIXaaaa24is_vec_or_swizzle_bf16_vIT_E24is_vec_or_swizzle_bf16_vIT0_EeqT1_T2_ENS5_IS6_XT1_EEEE4typeES9_SA_.exit: +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[AGG_TMP2]]) +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[AGG_TMP13]]) +// CHECK-NEXT: ret void +// +SYCL_EXTERNAL auto TestFMax(vec a, vec b) { + return experimental::fmax(a, b); +} + +// CHECK-LABEL: define dso_local spir_func void @_Z9TestIsNanN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi4EEE( +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.1") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.2") align 8 [[A:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] !srcloc [[META31:![0-9]+]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[AGG_TMP1:%.*]] = alloca %"class.sycl::_V1::vec.2", align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8, !tbaa [[TBAA7]] +// CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META32:![0-9]+]]) +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[AGG_TMP1]]) +// CHECK-NEXT: store i64 [[TMP0]], ptr [[AGG_TMP1]], align 1 +// CHECK-NEXT: [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP1]] to ptr addrspace(4) +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 4 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL5ISNANINS0_3VECINS2_8BFLOAT16ELI4EEELI4EEENST9ENABLE_IFIX24IS_VEC_OR_SWIZZLE_BF16_VIT_EENS5_ISXT0_EEEE4TYPEES9__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32 +// CHECK-NEXT: [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi4EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]], !noalias [[META32]] +// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10]], !noalias [[META32]] +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext i16 [[TMP1]] to i32 +// CHECK-NEXT: [[AND_I_I:%.*]] = and i32 [[CONV_I_I]], 32640 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[AND_I_I]], 32640 +// CHECK-NEXT: [[AND2_I_I:%.*]] = and i32 [[CONV_I_I]], 127 +// CHECK-NEXT: [[TOBOOL_I_I:%.*]] = icmp ne i32 [[AND2_I_I]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = and i1 [[CMP_I_I]], [[TOBOOL_I_I]] +// CHECK-NEXT: [[CONV2_I:%.*]] = sext i1 [[TMP2]] to i16 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(4) [[AGG_RESULT]], i64 [[I_0_I]] +// CHECK-NEXT: store i16 [[CONV2_I]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA10]], !alias.scope [[META32]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP35:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental5isnanINS0_3vecINS2_8bfloat16ELi4EEELi4EEENSt9enable_ifIX24is_vec_or_swizzle_bf16_vIT_EENS5_IsXT0_EEEE4typeES9_.exit: +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[AGG_TMP1]]) +// CHECK-NEXT: ret void +// +SYCL_EXTERNAL auto TestIsNan(vec a) { + return experimental::isnan(a); +} + +// CHECK-LABEL: define dso_local spir_func void @_Z8TestFabsN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEE( +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias writable sret(%"class.sycl::_V1::vec.3") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.3") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR3]] !srcloc [[META36:![0-9]+]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[AGG_TMP1:%.*]] = alloca %"class.sycl::_V1::vec.3", align 16 +// CHECK-NEXT: [[AGG_TMP_SROA_0_0_COPYLOAD:%.*]] = load <8 x i16>, ptr [[A]], align 16, !tbaa [[TBAA7]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[AGG_TMP1]]) +// CHECK-NEXT: store <8 x i16> [[AGG_TMP_SROA_0_0_COPYLOAD]], ptr [[AGG_TMP1]], align 1 +// CHECK-NEXT: [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP1]] to ptr addrspace(4) +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 8 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FABSINS0_3VECINS2_8BFLOAT16ELI8EEELI8EEENST9ENABLE_IFIX24IS_VEC_OR_SWIZZLE_BF16_VIT_EENS5_IS6_XT0_EEEE4TYPEES9__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32 +// CHECK-NEXT: [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 16 dereferenceable_or_null(16) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: [[CONV_I_I_I:%.*]] = zext i16 [[TMP0]] to i32 +// CHECK-NEXT: [[AND_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 32640 +// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp eq i32 [[AND_I_I_I]], 32640 +// CHECK-NEXT: [[AND2_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 127 +// CHECK-NEXT: [[TOBOOL_I_I_I:%.*]] = icmp ne i32 [[AND2_I_I_I]], 0 +// CHECK-NEXT: [[TMP1:%.*]] = and i1 [[CMP_I_I_I]], [[TOBOOL_I_I_I]] +// CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP0]], 32767 +// CHECK-NEXT: [[SPEC_SELECT_I_I:%.*]] = select i1 [[TMP1]], i16 [[TMP0]], i16 [[TMP2]] +// CHECK-NEXT: [[CALL2_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 16 dereferenceable_or_null(16) [[AGG_RESULT]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: store i16 [[SPEC_SELECT_I_I]], ptr addrspace(4) [[CALL2_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP37:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental4fabsINS0_3vecINS2_8bfloat16ELi8EEELi8EEENSt9enable_ifIX24is_vec_or_swizzle_bf16_vIT_EENS5_IS6_XT0_EEEE4typeES9_.exit: +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr nonnull [[AGG_TMP1]]) +// CHECK-NEXT: ret void +// +SYCL_EXTERNAL auto TestFabs(vec a) { + return experimental::fabs(a); +} + +// CHECK-LABEL: define dso_local spir_func void @_Z8TestCeilN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEE( +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias writable sret(%"class.sycl::_V1::vec.3") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.3") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META38:![0-9]+]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[REF_TMP_I_I:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[AGG_TMP6_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8 +// CHECK-NEXT: [[AGG_TMP1:%.*]] = alloca %"class.sycl::_V1::vec.3", align 16 +// CHECK-NEXT: [[AGG_TMP_SROA_0_0_COPYLOAD:%.*]] = load <8 x i16>, ptr [[A]], align 16, !tbaa [[TBAA7]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[AGG_TMP1]]) +// CHECK-NEXT: store <8 x i16> [[AGG_TMP_SROA_0_0_COPYLOAD]], ptr [[AGG_TMP1]], align 1 +// CHECK-NEXT: [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP1]] to ptr addrspace(4) +// CHECK-NEXT: [[REF_TMP_ASCAST_I_I:%.*]] = addrspacecast ptr [[REF_TMP_I_I]] to ptr addrspace(4) +// CHECK-NEXT: [[X_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP6_I]] to ptr addrspace(4) +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 8 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4CEILINS0_3VECINS2_8BFLOAT16ELI8EEELI8EEENST9ENABLE_IFIX24IS_VEC_OR_SWIZZLE_BF16_VIT_EENS5_IS6_XT0_EEEE4TYPEES9__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32 +// CHECK-NEXT: [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 16 dereferenceable_or_null(16) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP6_I]]), !noalias [[META39:![0-9]+]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META39]] +// CHECK-NEXT: store i16 [[TMP0]], ptr [[AGG_TMP6_I]], align 1, !noalias [[META39]] +// CHECK-NEXT: [[CALL_I_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[X_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META42:![0-9]+]] +// CHECK-NEXT: [[CALL_I_I_I:%.*]] = call spir_func noundef float @_Z16__spirv_ocl_ceilf(float noundef [[CALL_I_I_I_I]]) #[[ATTR9:[0-9]+]] +// CHECK-NEXT: store float [[CALL_I_I_I]], ptr [[REF_TMP_I_I]], align 4, !tbaa [[TBAA45:![0-9]+]], !noalias [[META47:![0-9]+]] +// CHECK-NEXT: [[CALL_I_I2_I_I:%.*]] = call spir_func noundef zeroext i16 @__devicelib_ConvertFToBF16INTEL(ptr addrspace(4) noundef align 4 dereferenceable(4) [[REF_TMP_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META42]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP6_I]]), !noalias [[META39]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META39]] +// CHECK-NEXT: [[CALL2_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 16 dereferenceable_or_null(16) [[AGG_RESULT]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: store i16 [[CALL_I_I2_I_I]], ptr addrspace(4) [[CALL2_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP48:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental4ceilINS0_3vecINS2_8bfloat16ELi8EEELi8EEENSt9enable_ifIX24is_vec_or_swizzle_bf16_vIT_EENS5_IS6_XT0_EEEE4typeES9_.exit: +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr nonnull [[AGG_TMP1]]) +// CHECK-NEXT: ret void +// +SYCL_EXTERNAL auto TestCeil(vec a) { + return experimental::ceil(a); +} + +// CHECK-LABEL: define dso_local spir_func void @_Z7TestFMAN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEES5_S5_( +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias writable sret(%"class.sycl::_V1::vec.4") align 32 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.4") align 32 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.4") align 32 [[B:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.4") align 32 [[C:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META49:![0-9]+]] !sycl_fixed_targets [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[REF_TMP_I_I:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[AGG_TMP416_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8 +// CHECK-NEXT: [[AGG_TMP115_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8 +// CHECK-NEXT: [[AGG_TMP14_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8 +// CHECK-NEXT: [[AGG_TMP25:%.*]] = alloca %"class.sycl::_V1::vec.4", align 32 +// CHECK-NEXT: [[AGG_TMP14:%.*]] = alloca %"class.sycl::_V1::vec.4", align 32 +// CHECK-NEXT: [[AGG_TMP3:%.*]] = alloca %"class.sycl::_V1::vec.4", align 32 +// CHECK-NEXT: [[AGG_TMP_SROA_0_0_COPYLOAD:%.*]] = load <16 x i16>, ptr [[A]], align 32, !tbaa [[TBAA7]] +// CHECK-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load <16 x i16>, ptr [[B]], align 32, !tbaa [[TBAA7]] +// CHECK-NEXT: [[AGG_TMP2_SROA_0_0_COPYLOAD:%.*]] = load <16 x i16>, ptr [[C]], align 32, !tbaa [[TBAA7]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[AGG_TMP3]]) +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[AGG_TMP14]]) +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[AGG_TMP25]]) +// CHECK-NEXT: store <16 x i16> [[AGG_TMP2_SROA_0_0_COPYLOAD]], ptr [[AGG_TMP25]], align 1 +// CHECK-NEXT: store <16 x i16> [[AGG_TMP1_SROA_0_0_COPYLOAD]], ptr [[AGG_TMP14]], align 1 +// CHECK-NEXT: store <16 x i16> [[AGG_TMP_SROA_0_0_COPYLOAD]], ptr [[AGG_TMP3]], align 1 +// CHECK-NEXT: [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP3]] to ptr addrspace(4) +// CHECK-NEXT: [[Y_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP14]] to ptr addrspace(4) +// CHECK-NEXT: [[Z_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP25]] to ptr addrspace(4) +// CHECK-NEXT: [[REF_TMP_ASCAST_I_I:%.*]] = addrspacecast ptr [[REF_TMP_I_I]] to ptr addrspace(4) +// CHECK-NEXT: [[X_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP14_I]] to ptr addrspace(4) +// CHECK-NEXT: [[Y_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP115_I]] to ptr addrspace(4) +// CHECK-NEXT: [[Z_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP416_I]] to ptr addrspace(4) +// CHECK-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK: for.cond.i: +// CHECK-NEXT: [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 16 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL3FMAINS0_3VECINS2_8BFLOAT16ELI16EEES7_S7_LI16ELI16ELI16EEENST9ENABLE_IFIXAAAAAAAA24IS_VEC_OR_SWIZZLE_BF16_VIT_E24IS_VEC_OR_SWIZZLE_BF16_VIT0_E24IS_VEC_OR_SWIZZLE_BF16_VIT1_EEQT2_T3_EQT3_T4_ENS5_IS6_XT2_EEEE4TYPEES9_SA_SB__EXIT:%.*]] +// CHECK: for.body.i: +// CHECK-NEXT: [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32 +// CHECK-NEXT: [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 32 dereferenceable_or_null(32) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: [[CALL3_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 32 dereferenceable_or_null(32) [[Y_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr addrspace(4) [[CALL3_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: [[CALL6_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 32 dereferenceable_or_null(32) [[Z_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[CALL6_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP14_I]]), !noalias [[META50:![0-9]+]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP115_I]]), !noalias [[META50]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP416_I]]), !noalias [[META50]] +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META50]] +// CHECK-NEXT: store i16 [[TMP2]], ptr [[AGG_TMP416_I]], align 1, !noalias [[META50]] +// CHECK-NEXT: store i16 [[TMP1]], ptr [[AGG_TMP115_I]], align 1, !noalias [[META50]] +// CHECK-NEXT: store i16 [[TMP0]], ptr [[AGG_TMP14_I]], align 1, !noalias [[META50]] +// CHECK-NEXT: [[CALL_I_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[X_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META53:![0-9]+]] +// CHECK-NEXT: [[CALL_I_I4_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[Y_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META53]] +// CHECK-NEXT: [[CALL_I_I5_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[Z_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META53]] +// CHECK-NEXT: [[CALL_I_I_I:%.*]] = call spir_func noundef float @_Z15__spirv_ocl_fmafff(float noundef [[CALL_I_I_I_I]], float noundef [[CALL_I_I4_I_I]], float noundef [[CALL_I_I5_I_I]]) #[[ATTR9]] +// CHECK-NEXT: store float [[CALL_I_I_I]], ptr [[REF_TMP_I_I]], align 4, !tbaa [[TBAA45]], !noalias [[META56:![0-9]+]] +// CHECK-NEXT: [[CALL_I_I6_I_I:%.*]] = call spir_func noundef zeroext i16 @__devicelib_ConvertFToBF16INTEL(ptr addrspace(4) noundef align 4 dereferenceable(4) [[REF_TMP_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META53]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP14_I]]), !noalias [[META50]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP115_I]]), !noalias [[META50]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP416_I]]), !noalias [[META50]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META50]] +// CHECK-NEXT: [[CALL8_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 32 dereferenceable_or_null(32) [[AGG_RESULT]], i32 noundef [[CONV_I]]) #[[ATTR7]] +// CHECK-NEXT: store i16 [[CALL_I_I6_I_I]], ptr addrspace(4) [[CALL8_I]], align 2, !tbaa [[TBAA10]] +// CHECK-NEXT: [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP57:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental3fmaINS0_3vecINS2_8bfloat16ELi16EEES7_S7_Li16ELi16ELi16EEENSt9enable_ifIXaaaaaaaa24is_vec_or_swizzle_bf16_vIT_E24is_vec_or_swizzle_bf16_vIT0_E24is_vec_or_swizzle_bf16_vIT1_EeqT2_T3_eqT3_T4_ENS5_IS6_XT2_EEEE4typeES9_SA_SB_.exit: +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[AGG_TMP3]]) +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[AGG_TMP14]]) +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[AGG_TMP25]]) +// CHECK-NEXT: ret void +// +SYCL_EXTERNAL auto TestFMA(vec a, vec b, + vec c) { + return experimental::fma(a, b, c); +}